<?php

namespace dokuwiki\plugin\dokullm;

use RecursiveIteratorIterator;
use RecursiveDirectoryIterator;

/**
 * HTTP client for a ChromaDB server (v2 REST API) that uses Ollama to compute embeddings.
 *
 * The client keeps two reusable cURL handles: one for ChromaDB and one for Ollama.
 * On construction it makes sure the configured tenant and database exist.
 */
class ChromaDBClient {
    private $baseUrl;
    private $client;
    private $ollamaClient;
    private $tenant;
    private $database;
    // NOTE(review): stored by the constructor but not consulted anywhere in this file;
    // methods given an empty collection name fall back to the literal 'documents'.
    private $defaultCollection;
    private $ollamaHost;
    private $ollamaPort;
    private $ollamaModel;

    /**
     * Initialize the ChromaDB client
     *
     * Creates a new ChromaDB client instance with the specified connection parameters.
     * Also ensures that the specified tenant and database exist (this performs HTTP requests).
     *
     * @param string $host ChromaDB server host
     * @param int $port ChromaDB server port
     * @param string $tenant ChromaDB tenant name
     * @param string $database ChromaDB database name
     * @param string $defaultCollection Default collection name
     * @param string $ollamaHost Ollama server host
     * @param int $ollamaPort Ollama server port
     * @param string $ollamaModel Ollama embeddings model ('nomic-embed-text' is used if empty or not a string)
     * @throws \Exception If the tenant or database can neither be fetched nor created
     */
    public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
        $this->tenant = $tenant;
        $this->database = $database;
        $this->defaultCollection = $defaultCollection;
        $this->ollamaHost = $ollamaHost;
        $this->ollamaPort = $ollamaPort;

        // Ensure ollamaModel is a non-empty string, otherwise use the default embedding model
        if (!is_string($ollamaModel) || empty($ollamaModel)) {
            $this->ollamaModel = 'nomic-embed-text';
        } else {
            $this->ollamaModel = $ollamaModel;
        }

        // ChromaDB handle
        $this->baseUrl = "http://{$host}:{$port}";
        $this->client = curl_init();
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // Ollama handle
        $this->ollamaClient = curl_init();
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up the cURL handles when the object is destroyed
     *
     * @return void
     */
    public function __destruct() {
        curl_close($this->client);
        curl_close($this->ollamaClient);
    }

    /**
     * Encode a value as JSON, failing loudly instead of sending an empty request body
     *
     * json_encode() returns false for e.g. invalid UTF-8 input; passing that on to cURL
     * would silently send a request without a payload.
     *
     * @param mixed $data The value to encode
     * @return string The JSON document
     * @throws \Exception If the value cannot be encoded
     */
    private function encodeJson($data) {
        $json = json_encode($data);
        if ($json === false) {
            throw new \Exception('JSON encoding error: ' . json_last_error_msg());
        }
        return $json;
    }

    /**
     * Build the collections endpoint path for the configured tenant and database
     *
     * Path segments are URL-encoded so unusual tenant/database/collection identifiers
     * cannot alter the request path.
     *
     * @param string|null $collectionId Optional collection ID to append
     * @param string|null $action Optional trailing action segment (e.g. 'get', 'upsert', 'query')
     * @return string The endpoint path, relative to the API root
     */
    private function collectionsPath($collectionId = null, $action = null) {
        $path = '/tenants/' . rawurlencode((string) $this->tenant)
            . '/databases/' . rawurlencode((string) $this->database)
            . '/collections';
        if ($collectionId !== null) {
            $path .= '/' . rawurlencode((string) $collectionId);
        }
        if ($action !== null) {
            $path .= '/' . $action;
        }
        return $path;
    }

    /**
     * Apply the collection-name fallback shared by all collection methods
     *
     * @param string $name The collection name supplied by the caller
     * @return string The supplied name, or 'documents' if it is empty
     */
    private function resolveCollectionName($name) {
        return empty($name) ? 'documents' : $name;
    }

    /**
     * Look up the ID of a collection by name
     *
     * @param string $collectionName The collection name (empty falls back to 'documents')
     * @return string The collection ID
     * @throws \Exception If the collection does not exist or has no ID
     */
    private function resolveCollectionId($collectionName) {
        $collectionName = $this->resolveCollectionName($collectionName);
        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }
        return $collection['id'];
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * Sends a JSON request to the v2 API on the shared cURL handle. Tenant and database
     * are part of the endpoint path built by the callers.
     *
     * @param string $endpoint The API endpoint to call (relative to /api/v2)
     * @param string $method The HTTP method to use (default: 'GET')
     * @param array|null $data The data to send with the request (default: null)
     * @return array|null The JSON response decoded as an array (null if the body is not valid JSON)
     * @throws \Exception If there's a cURL error, an HTTP status >= 400, or the payload cannot be encoded
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        $headers = [
            'Content-Type: application/json',
            'Accept: application/json'
        ];
        $url = $this->baseUrl . '/api/v2' . $endpoint;
        curl_setopt($this->client, CURLOPT_URL, $url);
        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
        // The handle is reused, so the body of a previous request must be cleared explicitly
        if ($data) {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, $this->encodeJson($data));
        } else {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
        }
        $response = curl_exec($this->client);
        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
        if ($response === false || curl_error($this->client)) {
            throw new \Exception('Curl error: ' . curl_error($this->client));
        }
        if ($httpCode >= 400) {
            throw new \Exception("HTTP Error: $httpCode, Response: $response");
        }
        return json_decode($response, true);
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector
     * @throws \Exception On cURL/HTTP errors, an unencodable payload, or a response without 'embedding'
     */
    public function generateEmbeddings($text) {
        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);

        $model = $this->ollamaModel;
        if (!is_string($model)) {
            throw new \Exception("Ollama model must be a string, got: " . gettype($model));
        }

        $data = [
            'model' => $model,
            'prompt' => $text,
            'keep_alive' => '30m'
        ];
        // Setting POSTFIELDS makes cURL issue a POST request
        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, $this->encodeJson($data));
        $response = curl_exec($this->ollamaClient);
        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
        if ($response === false || curl_error($this->ollamaClient)) {
            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
        }
        if ($httpCode >= 400) {
            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }
        $result = json_decode($response, true);
        if (!isset($result['embedding'])) {
            throw new \Exception("Ollama response missing embedding: " . $response);
        }
        return $result['embedding'];
    }

    /**
     * List all collections in the configured tenant and database
     *
     * @return array List of collections
     * @throws \Exception On request failure
     */
    public function listCollections() {
        return $this->makeRequest($this->collectionsPath());
    }

    /**
     * Get a collection by name
     *
     * Lists the collections of the configured tenant/database and returns the first
     * one whose 'name' matches exactly.
     *
     * @param string $name The name of the collection to retrieve (empty falls back to 'documents')
     * @return array The collection information
     * @throws \Exception If the collection is not found or the request fails
     */
    public function getCollection($name) {
        $name = $this->resolveCollectionName($name);
        $collections = $this->makeRequest($this->collectionsPath());
        // A non-JSON response decodes to null; treat that as "no collections"
        if (is_array($collections)) {
            foreach ($collections as $collection) {
                if (isset($collection['name']) && $collection['name'] === $name) {
                    return $collection;
                }
            }
        }
        throw new \Exception("Collection '{$name}' not found");
    }

    /**
     * Create a new collection
     *
     * @param string $name The name of the collection to create (empty falls back to 'documents')
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws \Exception On request failure
     */
    public function createCollection($name, $metadata = null) {
        $data = ['name' => $this->resolveCollectionName($name)];
        if ($metadata) {
            $data['metadata'] = $metadata;
        }
        return $this->makeRequest($this->collectionsPath(), 'POST', $data);
    }

    /**
     * Delete a collection by name
     *
     * @param string $name The name of the collection to delete (empty falls back to 'documents')
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function deleteCollection($name) {
        $collectionId = $this->resolveCollectionId($name);
        return $this->makeRequest($this->collectionsPath($collectionId), 'DELETE');
    }

    /**
     * Get a document by its ID from a collection
     *
     * @param string $collectionName The name of the collection to get the document from
     * @param string $documentId The document ID to retrieve
     * @param array $include What to include in the response (default: ["metadatas", "documents"])
     * @return array The retrieved document
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        $collectionId = $this->resolveCollectionId($collectionName);
        $data = [
            'ids' => [$documentId],
            'include' => $include
        ];
        return $this->makeRequest($this->collectionsPath($collectionId, 'get'), 'POST', $data);
    }

    /**
     * Add documents to a collection
     *
     * Sends the documents to the collection's upsert endpoint. Each document must have a
     * corresponding ID. Optional metadata and pre-computed embeddings can also be provided.
     *
     * @param string $collectionName The name of the collection to add documents to
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        $collectionId = $this->resolveCollectionId($collectionName);
        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];
        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }
        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }
        return $this->makeRequest($this->collectionsPath($collectionId, 'upsert'), 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Compares the file's last modification time with the 'processed_at' timestamp stored
     * in the metadata of the first stored chunk found among IDs @1, @2 and @3 (the first
     * chunks might be titles, which are not stored).
     *
     * @param string $collectionId The ID of the collection to check documents in
     * @param string $documentId The base document ID to check (without chunk suffixes)
     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if the document needs to be updated (not found, no or unparsable
     *              timestamp, outdated, or the check itself failed), false if up to date
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $data = [
                'ids' => [
                    $documentId . '@1',
                    $documentId . '@2',
                    $documentId . '@3'
                ],
                'include' => [
                    "metadatas"
                ],
                'limit' => 1
            ];
            $result = $this->makeRequest($this->collectionsPath($collectionId, 'get'), 'POST', $data);
            // No chunk stored yet: the document has to be added
            if (empty($result['ids'])) {
                return true;
            }
            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
                $metadata = isset($result['metadatas'][0]) ? $result['metadatas'][0] : null;
                if (!isset($metadata['processed_at'])) {
                    return true;
                }
                $processedTimestamp = strtotime($metadata['processed_at']);
                // An unparsable timestamp is treated like a missing one
                if ($processedTimestamp === false) {
                    return true;
                }
                if ($fileModifiedTime > $processedTimestamp) {
                    return true;
                }
            }
            // Document exists and is up to date
            return false;
        } catch (\Exception $e) {
            // Best effort: if the check fails, reprocess the document rather than skip it
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * Generates an embedding for every query text via Ollama and sends them to ChromaDB.
     * Supports filtering results by metadata using the where parameter.
     *
     * @param string $collectionName The name of the collection to query
     * @param array $queryTexts The query texts to search for
     * @param int $nResults The number of results to return (default: 5)
     * @param array|null $where Optional filter conditions for metadata
     * @return array The query results
     * @throws \Exception If the collection or its ID is not found, or a request fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        $collectionId = $this->resolveCollectionId($collectionName);
        $queryEmbeddings = [];
        foreach ($queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }
        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }
        return $this->makeRequest($this->collectionsPath($collectionId, 'query'), 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * @return array The response from the heartbeat endpoint
     * @throws \Exception On request failure
     */
    public function heartbeat() {
        return $this->makeRequest('/heartbeat', 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * @return array The response from the auth/identity endpoint
     * @throws \Exception On request failure
     */
    public function getIdentity() {
        return $this->makeRequest('/auth/identity', 'GET');
    }

    /**
     * Ensure that the configured tenant and database exist
     *
     * Any failure while fetching (including connection errors) is treated as "missing"
     * and followed by a create attempt; a failing create attempt propagates.
     *
     * @return void
     * @throws \Exception If creating the tenant or database fails
     */
    private function ensureTenantAndDatabase() {
        try {
            $this->getTenant($this->tenant);
        } catch (\Exception $e) {
            $this->createTenant($this->tenant);
        }
        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (\Exception $e) {
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws \Exception On request failure
     */
    public function getTenant($tenantName) {
        return $this->makeRequest('/tenants/' . rawurlencode((string) $tenantName), 'GET');
    }

    /**
     * Create a new tenant
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception On request failure
     */
    public function createTenant($tenantName) {
        return $this->makeRequest('/tenants', 'POST', ['name' => $tenantName]);
    }

    /**
     * Get database information
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The database information
     * @throws \Exception On request failure
     */
    public function getDatabase($databaseName, $tenantName) {
        $endpoint = '/tenants/' . rawurlencode((string) $tenantName)
            . '/databases/' . rawurlencode((string) $databaseName);
        return $this->makeRequest($endpoint, 'GET');
    }

    /**
     * Create a new database
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception On request failure
     */
    public function createDatabase($databaseName, $tenantName) {
        $endpoint = '/tenants/' . rawurlencode((string) $tenantName) . '/databases';
        return $this->makeRequest($endpoint, 'POST', ['name' => $databaseName]);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     * @throws \Exception If the collection has to be created and creation fails
     */
    public function ensureCollectionExists($collectionName) {
        try {
            $this->getCollection($collectionName);
            return "Collection '$collectionName' already exists.";
        } catch (\Exception $e) {
            // Lookup failed: assume the collection doesn't exist and create it
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }
    }

    /**
     * Split the last ID part into an optional registration and a name
     *
     * A leading alphanumeric token followed by '-' counts as a registration only if it
     * contains at least one digit; otherwise the whole part is the name. Dashes in the
     * name are replaced by spaces.
     *
     * @param string $lastPart The last colon-separated part of the DokuWiki ID
     * @return array Map with 'name' and, when detected, 'registration'
     */
    private function extractRegistrationAndName($lastPart) {
        if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
            && preg_match('/[0-9]/', $matches[1])) {
            return [
                'registration' => $matches[1],
                'name' => str_replace('-', ' ', $matches[2])
            ];
        }
        return ['name' => str_replace('-', ' ', $lastPart)];
    }

    /**
     * Derive the metadata shared by all chunks of a document from its DokuWiki ID
     *
     * Supported ID formats:
     * - reports:mri:institution:250620-name-surname (third part is an institution name)
     * - reports:mri:2024:g287-name-surname (third part is numeric: a year)
     * - reports:mri:templates:name-surname (any part equal to 'templates')
     *
     * @param string $id The DokuWiki ID
     * @return array The base metadata (document_id, processed_at, type and what could be parsed)
     */
    private function buildBaseMetadata($id) {
        $parts = explode(':', $id);
        $lastPart = end($parts);
        $isTemplate = in_array('templates', $parts, true);

        $metadata = [
            'document_id' => $id,
            // Compared against the file modification time by needsUpdate()
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report'
        ];
        if (isset($parts[1])) {
            $metadata['modality'] = $parts[1];
        }

        // Templates carry no institution, date or year; only a name (and maybe a registration)
        if ($isTemplate) {
            return array_merge($metadata, $this->extractRegistrationAndName($lastPart));
        }
        if (!isset($parts[2])) {
            return $metadata;
        }

        if (is_numeric($parts[2])) {
            // Year format: reports:mri:2024:g287-name-surname
            $metadata['year'] = $parts[2];
            global $conf;
            $metadata['institution'] = isset($conf['plugin']['dokullm']['default_institution'])
                ? $conf['plugin']['dokullm']['default_institution']
                : 'default';
            return array_merge($metadata, $this->extractRegistrationAndName($lastPart));
        }

        // Institution format: reports:mri:institution:250620-name-surname
        $metadata['institution'] = $parts[2];
        if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
            // The six digits are read as day, month, two-digit year (DDMMYY).
            // NOTE(review): an earlier comment gave the example 250620 -> 2025-06-20,
            // which would be YYMMDD - confirm which layout the wiki really uses.
            $day = substr($matches[1], 0, 2);
            $month = substr($matches[1], 2, 2);
            $year = substr($matches[1], 4, 2);
            // Two-digit years 00-69 map to 20xx, 70-99 to 19xx
            $fullYear = ((int) $year < 70 ? '20' : '19') . $year;
            $metadata['date'] = $fullYear . '-' . $month . '-' . $day;
            $metadata['name'] = str_replace('-', ' ', $matches[2]);
        }
        return $metadata;
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with update checking
     *
     * Steps:
     * 1. Converts the file path to a DokuWiki ID
     * 2. Ensures the collection exists (unless already checked) and resolves its ID
     * 3. Skips the file if its stored 'processed_at' is not older than the file
     * 4. Splits the content into paragraphs (separated by blank lines)
     * 5. Turns DokuWiki titles into tags for the following chunks instead of storing them
     * 6. Generates an embedding per chunk and upserts all chunks with their metadata
     *
     * Chunk IDs are "<id>@<n>" where n is the 1-based paragraph position (titles and
     * empty paragraphs included in the count).
     *
     * @param string $filePath The path to the file to process
     * @param string $collectionName The name of the collection to use
     * @param bool $collectionChecked Whether the collection has already been checked/created
     * @return array Result with 'status' ('success', 'skipped' or 'error'), 'message' and,
     *               on success, 'details' and 'collection_status'
     */
    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
        $id = parseFilePath($filePath);
        try {
            // Create collection if it doesn't exist (only if not already checked)
            $collectionStatus = '';
            if (!$collectionChecked) {
                $collectionStatus = $this->ensureCollectionExists($collectionName);
            }
            $collection = $this->getCollection($collectionName);
            if (!isset($collection['id'])) {
                return [
                    'status' => 'error',
                    'message' => "Collection ID not found for '{$collectionName}'"
                ];
            }

            $fileModifiedTime = filemtime($filePath);
            if ($fileModifiedTime === false) {
                return [
                    'status' => 'error',
                    'message' => "Cannot read modification time of file '$filePath'"
                ];
            }
            if (!$this->needsUpdate($collection['id'], $id, $fileModifiedTime)) {
                return [
                    'status' => 'skipped',
                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
                ];
            }

            $content = file_get_contents($filePath);
            if ($content === false) {
                return [
                    'status' => 'error',
                    'message' => "Cannot read file '$filePath'"
                ];
            }
            // Paragraphs are separated by blank lines
            $paragraphs = preg_split('/\n\s*\n/', $content);
            $baseMetadata = $this->buildBaseMetadata($id);

            $chunkIds = [];
            $chunkContents = [];
            $chunkMetadatas = [];
            $chunkEmbeddings = [];
            $currentTags = [];
            foreach ($paragraphs as $index => $paragraph) {
                $paragraph = trim($paragraph);
                if (empty($paragraph)) {
                    continue;
                }
                // A DokuWiki title (starts and ends with '=') becomes tags for the chunks
                // that follow it and is not stored as a chunk itself
                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
                    $tags = [];
                    foreach (preg_split('/\s+/', trim($matches[1])) as $word) {
                        // Only keep words of at least 3 characters to reduce noise
                        if (strlen($word) >= 3) {
                            $tags[] = strtolower($word);
                        }
                    }
                    $currentTags = array_unique($tags);
                    continue;
                }

                $chunkId = $id . '@' . ($index + 1);
                $metadata = $baseMetadata;
                $metadata['chunk_id'] = $chunkId;
                $metadata['chunk_number'] = $index + 1;
                $metadata['total_chunks'] = count($paragraphs);
                if (!empty($currentTags)) {
                    $metadata['tags'] = implode(',', $currentTags);
                }

                $chunkIds[] = $chunkId;
                $chunkContents[] = $paragraph;
                $chunkMetadatas[] = $metadata;
                $chunkEmbeddings[] = $this->generateEmbeddings($paragraph);
            }

            if (empty($chunkIds)) {
                return [
                    'status' => 'skipped',
                    'message' => "No valid chunks found in file '$id'. Skipping..."
                ];
            }

            // Send all chunks to ChromaDB in one request
            $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
            return [
                'status' => 'success',
                'message' => "Successfully sent file to ChromaDB",
                'details' => [
                    'document_id' => $id,
                    'chunks' => count($chunkIds),
                    'collection' => $collectionName
                ],
                'collection_status' => $collectionStatus
            ];
        } catch (\Exception $e) {
            return [
                'status' => 'error',
                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
            ];
        }
    }

}

/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory (DOKU_INC . 'data/pages/' if DOKU_INC is defined,
 *    otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons (empty segments are dropped)
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to a common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }
    $relativePath = str_replace($pagesDir, '', $filePath);
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);
    // Compare against '' explicitly: empty()/truthiness would also drop a segment named "0"
    $idParts = array_filter(explode('/', $relativePath), function ($part) {
        return $part !== '';
    });
    return implode(':', $idParts);
}