<?php

namespace dokuwiki\plugin\dokullm;

/**
 * Small cURL-based client for the ChromaDB v2 HTTP API that uses an Ollama
 * server to compute embeddings.
 *
 * Besides the thin API wrappers (tenants, databases, collections, documents),
 * the class can ingest DokuWiki page files: each file is split into paragraph
 * chunks, embedded through Ollama and upserted into a collection.
 */
class ChromaDBClient {
    private $baseUrl;
    private $client;
    private $ollamaClient;
    private $tenant;
    private $database;
    private $ollamaHost;
    private $ollamaPort;
    private $ollamaModel;

    /**
     * Initialize the ChromaDB client
     *
     * Creates a new ChromaDB client instance with the specified connection parameters.
     * Also ensures that the specified tenant and database exist, which means the
     * constructor already talks to the ChromaDB server.
     *
     * @param string $host ChromaDB server host
     * @param int $port ChromaDB server port
     * @param string $tenant ChromaDB tenant name
     * @param string $database ChromaDB database name
     * @param string $defaultCollection Default collection name (accepted for interface
     *                                  compatibility; not used by this class)
     * @param string $ollamaHost Ollama server host
     * @param int $ollamaPort Ollama server port
     * @param string $ollamaModel Ollama embeddings model
     * @throws \Exception If the tenant or database can neither be found nor created
     */
    public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
        $this->tenant = $tenant;
        $this->database = $database;
        $this->ollamaHost = $ollamaHost;
        $this->ollamaPort = $ollamaPort;
        $this->ollamaModel = $ollamaModel;

        $this->baseUrl = "http://{$host}:{$port}";

        // One reusable handle for ChromaDB requests
        $this->client = curl_init();
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // A second reusable handle for Ollama requests
        $this->ollamaClient = curl_init();
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up the cURL clients when the object is destroyed
     *
     * @return void
     */
    public function __destruct() {
        curl_close($this->client);
        curl_close($this->ollamaClient);
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * Sends a JSON request to "{baseUrl}/api/v2{endpoint}" on the shared cURL handle.
     * Tenant and database are not sent as headers; callers embed them in the endpoint path.
     *
     * @param string $endpoint The API endpoint to call (path below /api/v2)
     * @param string $method The HTTP method to use (default: 'GET')
     * @param array|null $data The data to send with the request (default: null)
     * @return array|null The JSON response decoded as an array (null if the body is not valid JSON)
     * @throws \Exception If there's a cURL error or the HTTP status is 400 or above
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        $headers = [
            'Content-Type: application/json',
            'Accept: application/json'
        ];

        $url = $this->baseUrl . '/api/v2' . $endpoint;

        curl_setopt($this->client, CURLOPT_URL, $url);
        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);

        // The handle is reused between calls, so the body has to be reset every time
        // to avoid leaking the payload of a previous request into this one.
        if ($data) {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
        } else {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
        }

        $response = curl_exec($this->client);
        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);

        // Transport-level failure (connection refused, timeout, ...)
        if ($response === false || curl_error($this->client)) {
            throw new \Exception('Curl error: ' . curl_error($this->client));
        }

        if ($httpCode >= 400) {
            throw new \Exception("HTTP Error: $httpCode, Response: $response");
        }

        return json_decode($response, true);
    }

    /**
     * Resolve a collection name to its ChromaDB collection ID
     *
     * @param string $collectionName The collection name ('documents' is used if empty)
     * @return string The collection ID
     * @throws \Exception If the collection does not exist or carries no ID
     */
    private function resolveCollectionId($collectionName) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($collectionName)) {
            $collectionName = 'documents';
        }

        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }

        return $collection['id'];
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * Posts the text to Ollama's /api/embeddings endpoint using the configured model.
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector
     * @throws \Exception On cURL errors, HTTP status >= 400, or a response without 'embedding'
     */
    public function generateEmbeddings($text) {
        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";

        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);

        $data = [
            'model' => $this->ollamaModel,
            'prompt' => $text,
            'keep_alive' => '30m'
        ];

        // Setting POSTFIELDS switches the handle to a POST request
        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));

        $response = curl_exec($this->ollamaClient);
        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);

        if ($response === false || curl_error($this->ollamaClient)) {
            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
        }

        if ($httpCode >= 400) {
            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }

        $result = json_decode($response, true);

        if (!isset($result['embedding'])) {
            throw new \Exception("Ollama response missing embedding: " . $response);
        }

        return $result['embedding'];
    }

    /**
     * List all collections in the database
     *
     * Retrieves a list of all collections in the configured tenant and database.
     *
     * @return array List of collections
     * @throws \Exception If the request fails
     */
    public function listCollections() {
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
        return $this->makeRequest($endpoint);
    }

    /**
     * Get a collection by name
     *
     * Lists the collections of the configured tenant/database and returns the
     * first one whose 'name' equals the requested name.
     *
     * @param string $name The name of the collection to retrieve ('documents' is used if empty)
     * @return array The collection information
     * @throws \Exception If the collection is not found or the request fails
     */
    public function getCollection($name) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($name)) {
            $name = 'documents';
        }

        $collections = $this->listCollections();

        // A non-array response (e.g. an undecodable body) simply means "not found"
        if (is_array($collections)) {
            foreach ($collections as $collection) {
                if (isset($collection['name']) && $collection['name'] === $name) {
                    return $collection;
                }
            }
        }

        throw new \Exception("Collection '{$name}' not found");
    }

    /**
     * Create a new collection
     *
     * Creates a new collection with the specified name and optional metadata.
     *
     * @param string $name The name of the collection to create ('documents' is used if empty)
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createCollection($name, $metadata = null) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($name)) {
            $name = 'documents';
        }

        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
        $data = ['name' => $name];
        if ($metadata) {
            $data['metadata'] = $metadata;
        }
        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Delete a collection by name
     *
     * Looks up the collection ID by name and deletes that collection.
     *
     * @param string $name The name of the collection to delete ('documents' is used if empty)
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found
     */
    public function deleteCollection($name) {
        $collectionId = $this->resolveCollectionId($name);
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
        return $this->makeRequest($endpoint, 'DELETE');
    }

    /**
     * Get a document by its ID from a collection
     *
     * Retrieves a document from the specified collection using its ID.
     *
     * @param string $collectionName The name of the collection to get the document from
     * @param string $documentId The document ID to retrieve
     * @param array $include What to include in the response (default: ["metadatas", "documents"])
     * @return array The retrieved document
     * @throws \Exception If the collection or its ID is not found
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        $collectionId = $this->resolveCollectionId($collectionName);
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
        $data = [
            'ids' => [$documentId],
            'include' => $include
        ];

        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Add documents to a collection
     *
     * Upserts documents into the specified collection. Each document must have a corresponding ID.
     * Optional metadata and pre-computed embeddings can also be provided.
     *
     * @param string $collectionName The name of the collection to add documents to
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        $collectionId = $this->resolveCollectionId($collectionName);
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];

        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }

        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }

        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Determines whether a document should be reprocessed by comparing the file's last modification
     * time with the processed_at timestamp stored in the document's metadata. The function checks
     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
     * not included in the database.
     *
     * Any error while querying ChromaDB is swallowed and reported as "needs update".
     *
     * @param string $collectionId The ID of the collection to check documents in
     * @param string $documentId The base document ID to check (without chunk suffixes)
     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if document needs to be updated (doesn't exist, has no usable timestamp,
     *              is outdated, or could not be checked), false if up to date
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";

            $data = [
                'ids' => [
                    $documentId . '@1',
                    $documentId . '@2',
                    $documentId . '@3'
                ],
                'include' => [
                    "metadatas"
                ],
                'limit' => 1
            ];

            $result = $this->makeRequest($endpoint, 'POST', $data);

            // If no documents found, the document still has to be added
            if (empty($result['ids'])) {
                return true;
            }

            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
                // Only one entry is requested (limit 1), so inspect the first one
                $metadata = $result['metadatas'][0];

                if (!isset($metadata['processed_at'])) {
                    return true;
                }

                $processedTimestamp = strtotime($metadata['processed_at']);

                // An unparsable timestamp is treated like a missing one
                if ($processedTimestamp === false) {
                    return true;
                }

                // File changed after it was last processed
                if ($fileModifiedTime > $processedTimestamp) {
                    return true;
                }
            }

            // Document exists and is up to date
            return false;
        } catch (\Exception $e) {
            // Best effort: if the check itself fails, reprocess the document
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * Queries the specified collection for documents similar to the provided query texts.
     * The function generates embeddings for the query texts and sends them to ChromaDB.
     * Supports filtering results by metadata using the where parameter.
     *
     * @param string $collectionName The name of the collection to query
     * @param array $queryTexts The query texts to search for
     * @param int $nResults The number of results to return (default: 5)
     * @param array|null $where Optional filter conditions for metadata
     * @return array The query results
     * @throws \Exception If the collection or its ID is not found, or embedding generation fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        $collectionId = $this->resolveCollectionId($collectionName);
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";

        // One embedding per query text, in the same order
        $queryEmbeddings = [];
        foreach ($queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }

        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];

        // Add where clause for metadata filtering if provided
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }

        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * Sends a heartbeat request to verify that the ChromaDB server is running.
     *
     * @return array The response from the heartbeat endpoint
     * @throws \Exception If the request fails
     */
    public function heartbeat() {
        return $this->makeRequest("/heartbeat", 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * Retrieves authentication and identity information from the ChromaDB server.
     *
     * @return array The response from the identity endpoint
     * @throws \Exception If the request fails
     */
    public function getIdentity() {
        return $this->makeRequest("/identity", 'GET');
    }

    /**
     * Ensure that the specified tenant and database exist
     *
     * Tries to fetch the tenant and the database; any failure of the lookup
     * (including transport errors) is answered with a creation attempt.
     *
     * @return void
     * @throws \Exception If a creation attempt fails
     */
    private function ensureTenantAndDatabase() {
        try {
            $this->getTenant($this->tenant);
        } catch (\Exception $e) {
            // Lookup failed, assume the tenant is missing and create it
            $this->createTenant($this->tenant);
        }

        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (\Exception $e) {
            // Lookup failed, assume the database is missing and create it
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * Retrieves information about the specified tenant.
     *
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws \Exception If the request fails
     */
    public function getTenant($tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}", 'GET');
    }

    /**
     * Create a new tenant
     *
     * Creates a new tenant with the specified name.
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createTenant($tenantName) {
        return $this->makeRequest("/tenants", 'POST', ['name' => $tenantName]);
    }

    /**
     * Get database information
     *
     * Retrieves information about the specified database within a tenant.
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The database information
     * @throws \Exception If the request fails
     */
    public function getDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases/{$databaseName}", 'GET');
    }

    /**
     * Create a new database
     *
     * Creates a new database with the specified name within a tenant.
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases", 'POST', ['name' => $databaseName]);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * This helper function checks if a collection exists and creates it if it doesn't.
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     * @throws \Exception If the collection has to be created and creation fails
     */
    public function ensureCollectionExists($collectionName) {
        try {
            $this->getCollection($collectionName);
            return "Collection '$collectionName' already exists.";
        } catch (\Exception $e) {
            // Lookup failed, assume the collection is missing and create it
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }
    }

    /**
     * Fill 'registration' and 'name' from the last part of a DokuWiki ID
     *
     * A leading alphanumeric token followed by '-' is taken as registration only
     * if it contains at least one digit; otherwise the whole part is the name.
     * Dashes in the name are replaced by spaces.
     *
     * @param array $metadata Metadata collected so far
     * @param string $lastPart The last colon-separated part of the ID
     * @return array The metadata with 'name' (and possibly 'registration') set
     */
    private function applyRegistrationAndName(array $metadata, $lastPart) {
        if (
            preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
            && preg_match('/[0-9]/', $matches[1])
        ) {
            $metadata['registration'] = $matches[1];
            $metadata['name'] = str_replace('-', ' ', $matches[2]);
            return $metadata;
        }

        // No registration pattern found, treat the entire part as the name
        $metadata['name'] = str_replace('-', ' ', $lastPart);
        return $metadata;
    }

    /**
     * Derive the metadata shared by all chunks of a document from its DokuWiki ID
     *
     * Supported ID formats:
     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
     *
     * @param string $id The DokuWiki ID of the document
     * @return array The base metadata (document_id, processed_at, type, ...)
     */
    private function buildBaseMetadata($id) {
        $parts = explode(':', $id);
        $lastPart = end($parts);
        $isTemplate = in_array('templates', $parts, true);

        $baseMetadata = [
            'document_id' => $id,
            // Compared against the file mtime by needsUpdate() on later runs
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report'
        ];

        // Modality is the second part of the ID
        if (isset($parts[1])) {
            $baseMetadata['modality'] = $parts[1];
        }

        // Templates carry no institution, date or year; only registration/name
        if ($isTemplate) {
            return $this->applyRegistrationAndName($baseMetadata, $lastPart);
        }

        if (!isset($parts[2])) {
            return $baseMetadata;
        }

        if (is_numeric($parts[2])) {
            // Format 2: reports:mri:2024:g287-name-surname (third part is the year)
            $baseMetadata['year'] = $parts[2];

            // Institution is not part of the ID here, take the configured default
            global $conf;
            $baseMetadata['institution'] = isset($conf['plugin']['dokullm']['default_institution'])
                ? $conf['plugin']['dokullm']['default_institution']
                : 'default';

            return $this->applyRegistrationAndName($baseMetadata, $lastPart);
        }

        // Format 1: reports:mri:institution:250620-name-surname (third part is the institution)
        $baseMetadata['institution'] = $parts[2];

        if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
            $dateStr = $matches[1];

            // NOTE(review): the six digits are read as DDMMYY, so 250620 becomes 2020-06-25.
            // An earlier comment gave 250620 -> 2025-06-20 (YYMMDD); confirm the intended format.
            $day = substr($dateStr, 0, 2);
            $month = substr($dateStr, 2, 2);
            $year = substr($dateStr, 4, 2);

            // 20xx for years 00-69 and 19xx for years 70-99
            $fullYear = ((int)$year < 70 ? '20' : '19') . $year;

            $baseMetadata['date'] = $fullYear . '-' . $month . '-' . $day;
            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
        }

        return $baseMetadata;
    }

    /**
     * Turn the text of a DokuWiki title into a list of lowercase tags
     *
     * @param string $titleContent The title text without the surrounding '=' characters
     * @return array Unique lowercase words of at least 3 characters
     */
    private function extractTitleTags($titleContent) {
        $tags = [];
        foreach (preg_split('/\s+/', trim($titleContent)) as $word) {
            // Only use words of at least 3 characters to reduce noise
            if (strlen($word) >= 3) {
                $tags[] = strtolower($word);
            }
        }

        return array_unique($tags);
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
     *
     * This function handles the complete processing of a single DokuWiki file:
     * 1. Converts the file path into a DokuWiki document ID
     * 2. Ensures the collection exists (unless already checked) and resolves its ID
     * 3. Checks if the document needs updating using timestamp comparison
     * 4. Reads file content only if an update is needed
     * 5. Splits the document into chunks (paragraphs separated by blank lines)
     * 6. Extracts metadata from the DokuWiki ID format
     * 7. Generates embeddings for each chunk
     * 8. Sends all chunks to ChromaDB with metadata
     *
     * Title paragraphs (wrapped in '=') are not stored; their words become the 'tags'
     * metadata of the chunks that follow. Chunk IDs are "<id>@<paragraph number>", where
     * the number counts every paragraph of the file, including skipped ones.
     *
     * Errors are not thrown but reported through the returned status array.
     *
     * @param string $filePath The path to the file to process
     * @param string $collectionName The name of the collection to use
     * @param bool $collectionChecked Whether the collection has already been checked/created
     * @return array Result with 'status' ('success', 'skipped' or 'error'), 'message' and,
     *               on success, 'details' and 'collection_status'
     */
    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
        // Parse file path to get the DokuWiki ID
        $id = parseFilePath($filePath);

        try {
            // Create collection if it doesn't exist (only if not already checked)
            $collectionStatus = '';
            if (!$collectionChecked) {
                $collectionStatus = $this->ensureCollectionExists($collectionName);
            }

            // Get collection ID
            $collection = $this->getCollection($collectionName);
            if (!isset($collection['id'])) {
                return [
                    'status' => 'error',
                    'message' => "Collection ID not found for '{$collectionName}'"
                ];
            }
            $collectionId = $collection['id'];

            // Without a modification time the freshness check would be meaningless
            $fileModifiedTime = filemtime($filePath);
            if ($fileModifiedTime === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read modification time of file '$filePath'"
                ];
            }

            // If document is up to date, skip processing
            if (!$this->needsUpdate($collectionId, $id, $fileModifiedTime)) {
                return [
                    'status' => 'skipped',
                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
                ];
            }

            $content = file_get_contents($filePath);
            if ($content === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read file '$filePath'"
                ];
            }

            // Split document into chunks (paragraphs separated by blank lines)
            $paragraphs = preg_split('/\n\s*\n/', $content);
            $totalParagraphs = count($paragraphs);

            $baseMetadata = $this->buildBaseMetadata($id);

            $chunkIds = [];
            $chunkContents = [];
            $chunkMetadatas = [];
            $chunkEmbeddings = [];
            $currentTags = [];

            foreach ($paragraphs as $index => $paragraph) {
                // Skip empty paragraphs to avoid processing whitespace-only content
                $paragraph = trim($paragraph);
                if (empty($paragraph)) {
                    continue;
                }

                // DokuWiki titles (start and end with '=') become tags for the
                // following chunks but are not stored as content chunks themselves
                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
                    $currentTags = $this->extractTitleTags($matches[1]);
                    continue;
                }

                $chunkNumber = $index + 1;
                $chunkId = $id . '@' . $chunkNumber;

                // Chunk-specific metadata on top of the shared base metadata
                $metadata = $baseMetadata;
                $metadata['chunk_id'] = $chunkId;
                $metadata['chunk_number'] = $chunkNumber;
                $metadata['total_chunks'] = $totalParagraphs;

                if (!empty($currentTags)) {
                    $metadata['tags'] = implode(',', $currentTags);
                }

                $chunkIds[] = $chunkId;
                $chunkContents[] = $paragraph;
                $chunkMetadatas[] = $metadata;
                $chunkEmbeddings[] = $this->generateEmbeddings($paragraph);
            }

            // If no chunks were created, skip this file
            if (empty($chunkIds)) {
                return [
                    'status' => 'skipped',
                    'message' => "No valid chunks found in file '$id'. Skipping..."
                ];
            }

            // Send all chunks to ChromaDB
            $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);

            return [
                'status' => 'success',
                'message' => "Successfully sent file to ChromaDB",
                'details' => [
                    'document_id' => $id,
                    'chunks' => count($chunkIds),
                    'collection' => $collectionName
                ],
                'collection_status' => $collectionStatus
            ];
        } catch (\Exception $e) {
            return [
                'status' => 'error',
                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
            ];
        }
    }

    /**
     * Process all DokuWiki files in a directory and send them to ChromaDB
     *
     * This function recursively processes all .txt files in a directory and its subdirectories,
     * ignoring files whose name starts with an underscore. The collection name is the first
     * part of the first file's document ID ('documents' if empty); the collection is created
     * if needed and every file is then processed individually into that collection.
     *
     * @param string $dirPath The directory path to process
     * @return array Result with status and details (per-file results under 'results')
     */
    public function processDirectory($dirPath) {
        if (!is_dir($dirPath)) {
            return [
                'status' => 'error',
                'message' => "Directory does not exist: $dirPath"
            ];
        }

        // SPL classes live in the global namespace, hence the leading backslash
        $iterator = new \RecursiveIteratorIterator(
            new \RecursiveDirectoryIterator($dirPath, \RecursiveDirectoryIterator::SKIP_DOTS),
            \RecursiveIteratorIterator::LEAVES_ONLY
        );

        $files = [];
        foreach ($iterator as $file) {
            // Process only .txt files that don't start with underscore
            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
                $files[] = $file->getPathname();
            }
        }

        if (empty($files)) {
            return [
                'status' => 'skipped',
                'message' => "No .txt files found in directory: $dirPath"
            ];
        }

        // Use the first part of the document ID as collection name, fallback to 'documents'
        $idParts = explode(':', parseFilePath($files[0]));
        $collectionName = !empty($idParts[0]) ? $idParts[0] : 'documents';

        try {
            $this->ensureCollectionExists($collectionName);
        } catch (\Exception $e) {
            // Deliberately ignored: a broken collection surfaces as an error
            // in the result of every single file below.
        }
        // Either way the check is not repeated for each file
        $collectionChecked = true;

        $results = [];
        foreach ($files as $file) {
            $results[] = [
                'file' => $file,
                'result' => $this->processSingleFile($file, $collectionName, $collectionChecked)
            ];
        }

        return [
            'status' => 'success',
            'message' => "Finished processing directory.",
            'files_count' => count($files),
            'results' => $results
        ];
    }
}

/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory (DOKU_INC . 'data/pages/' when DOKU_INC is defined,
 *    otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Converting directory separators ('/') to colons, ignoring empty segments
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Remove the base path and the .txt extension
    $relativePath = str_replace($pagesDir, '', $filePath);
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Drop empty segments only. A plain array_filter()/empty() would also
    // discard a segment named "0", which is a valid namespace or page name.
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}