<?php

namespace dokuwiki\plugin\dokullm;

/**
 * Minimal HTTP client for a ChromaDB server (v2 REST API) that uses an Ollama
 * server to generate embeddings.
 *
 * Besides thin wrappers around the ChromaDB endpoints, the class knows how to
 * turn DokuWiki page files into chunked, embedded and metadata-annotated
 * documents (see processSingleFile() and processDirectory()).
 */
class ChromaDBClient {
    private $baseUrl;
    private $client;
    private $ollamaClient;
    private $tenant;
    private $database;
    private $ollamaHost;
    private $ollamaPort;
    private $ollamaModel;

    /**
     * Initialize the ChromaDB client
     *
     * Creates the cURL handles for ChromaDB and Ollama and then makes sure the
     * configured tenant and database exist (this performs HTTP requests).
     *
     * @param string $host ChromaDB server host
     * @param int $port ChromaDB server port
     * @param string $tenant ChromaDB tenant name
     * @param string $database ChromaDB database name
     * @param string $ollamaHost Ollama server host
     * @param int $ollamaPort Ollama server port
     * @param string $ollamaModel Ollama embeddings model
     * @throws \Exception If the tenant or database can neither be found nor created
     */
    public function __construct($host, $port, $tenant, $database, $ollamaHost, $ollamaPort, $ollamaModel) {
        // All parameters are mandatory, so there are no fallbacks here
        $this->tenant = $tenant;
        $this->database = $database;
        $this->ollamaHost = $ollamaHost;
        $this->ollamaPort = $ollamaPort;
        $this->ollamaModel = $ollamaModel;

        $this->baseUrl = "http://{$host}:{$port}";
        $this->client = curl_init();
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // Separate handle for Ollama so both connections can be reused independently
        $this->ollamaClient = curl_init();
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up the cURL handles when the object is destroyed
     *
     * The handles are checked first so that destroying a partially
     * initialized instance does not raise an error.
     *
     * @return void
     */
    public function __destruct() {
        if ($this->client) {
            curl_close($this->client);
        }
        if ($this->ollamaClient) {
            curl_close($this->ollamaClient);
        }
    }

    /**
     * Build an endpoint path below the configured tenant/database collections
     *
     * Tenant and database are URL-encoded so that unusual names cannot
     * corrupt the request path.
     *
     * @param string $suffix Optional path to append after ".../collections" (must start with '/')
     * @return string The endpoint path (relative to /api/v2)
     */
    private function collectionsEndpoint($suffix = '') {
        return '/tenants/' . rawurlencode((string)$this->tenant)
            . '/databases/' . rawurlencode((string)$this->database)
            . '/collections' . $suffix;
    }

    /**
     * Resolve a collection name to its ChromaDB collection ID
     *
     * @param string $collectionName The collection name ('documents' is used if empty)
     * @return string The collection ID
     * @throws \Exception If the collection does not exist or has no ID
     */
    private function resolveCollectionId($collectionName) {
        if (empty($collectionName)) {
            $collectionName = 'documents';
        }

        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }

        return $collection['id'];
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * Sends a JSON request to "<baseUrl>/api/v2<endpoint>" using the shared
     * cURL handle and decodes the JSON response.
     *
     * @param string $endpoint The API endpoint to call
     * @param string $method The HTTP method to use (default: 'GET')
     * @param array|null $data The data to send with the request (default: null)
     * @return array|null The JSON response decoded as an array (null if the body is empty or not valid JSON)
     * @throws \Exception If the body cannot be encoded, or there's a cURL error or HTTP error
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        $url = $this->baseUrl . '/api/v2' . $endpoint;

        // An empty body is set explicitly because the handle is reused and
        // would otherwise keep the payload of the previous request.
        $payload = '';
        if ($data) {
            $payload = json_encode($data);
            if ($payload === false) {
                throw new \Exception('Failed to encode request body as JSON: ' . json_last_error_msg());
            }
        }

        curl_setopt($this->client, CURLOPT_URL, $url);
        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);
        curl_setopt($this->client, CURLOPT_POSTFIELDS, $payload);

        $response = curl_exec($this->client);
        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);

        if ($response === false || curl_error($this->client)) {
            throw new \Exception('Curl error: ' . curl_error($this->client));
        }

        if ($httpCode >= 400) {
            throw new \Exception("HTTP Error: $httpCode, Response: $response");
        }

        return json_decode($response, true);
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * Posts the text to Ollama's /api/embeddings endpoint using the configured model.
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector
     * @throws \Exception On encoding, transport or HTTP errors, or if the response has no 'embedding'
     */
    public function generateEmbeddings($text) {
        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";

        $payload = json_encode([
            'model' => $this->ollamaModel,
            'prompt' => $text,
            'keep_alive' => '30m'
        ]);
        if ($payload === false) {
            // Typically caused by text that is not valid UTF-8
            throw new \Exception('Failed to encode Ollama request as JSON: ' . json_last_error_msg());
        }

        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
        // Setting POSTFIELDS makes cURL send the request as POST
        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, $payload);

        $response = curl_exec($this->ollamaClient);
        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);

        if ($response === false || curl_error($this->ollamaClient)) {
            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
        }

        if ($httpCode >= 400) {
            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }

        $result = json_decode($response, true);

        if (!isset($result['embedding'])) {
            throw new \Exception("Ollama response missing embedding: " . $response);
        }

        return $result['embedding'];
    }

    /**
     * List all collections in the database
     *
     * Retrieves the collections of the configured tenant and database.
     *
     * @return array List of collections
     * @throws \Exception On transport or HTTP errors
     */
    public function listCollections() {
        return $this->makeRequest($this->collectionsEndpoint());
    }

    /**
     * Get a collection by name
     *
     * Lists the collections and returns the first one whose 'name' matches.
     * NOTE(review): only what the list endpoint returns in a single response is
     * searched; confirm whether the server paginates that list.
     *
     * @param string $name The name of the collection to retrieve ('documents' is used if empty)
     * @return array The collection information
     * @throws \Exception If the collection is not found or the request fails
     */
    public function getCollection($name) {
        if (empty($name)) {
            $name = 'documents';
        }

        $collections = $this->makeRequest($this->collectionsEndpoint());

        // The decoded body may be null for an empty/invalid response
        if (is_array($collections)) {
            foreach ($collections as $collection) {
                if (isset($collection['name']) && $collection['name'] === $name) {
                    return $collection;
                }
            }
        }

        throw new \Exception("Collection '{$name}' not found");
    }

    /**
     * Create a new collection
     *
     * @param string $name The name of the collection to create ('documents' is used if empty)
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws \Exception On transport or HTTP errors
     */
    public function createCollection($name, $metadata = null) {
        if (empty($name)) {
            $name = 'documents';
        }

        $data = ['name' => $name];
        if ($metadata) {
            $data['metadata'] = $metadata;
        }

        return $this->makeRequest($this->collectionsEndpoint(), 'POST', $data);
    }

    /**
     * Delete a collection by name
     *
     * The name is first resolved to a collection ID, which is what the
     * DELETE request is issued against.
     *
     * @param string $name The name of the collection to delete ('documents' is used if empty)
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function deleteCollection($name) {
        $collectionId = $this->resolveCollectionId($name);
        $endpoint = $this->collectionsEndpoint('/' . rawurlencode((string)$collectionId));

        return $this->makeRequest($endpoint, 'DELETE');
    }

    /**
     * Get a document by its ID from a collection
     *
     * @param string $collectionName The name of the collection to get the document from
     * @param string $documentId The document ID to retrieve
     * @param array $include What to include in the response (default: ["metadatas", "documents"])
     * @return array The retrieved document
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        $collectionId = $this->resolveCollectionId($collectionName);
        $endpoint = $this->collectionsEndpoint('/' . rawurlencode((string)$collectionId) . '/get');

        return $this->makeRequest($endpoint, 'POST', [
            'ids' => [$documentId],
            'include' => $include
        ]);
    }

    /**
     * Add documents to a collection
     *
     * Sends the documents to the collection's "upsert" endpoint. Each document
     * must have a corresponding ID. Optional metadata and pre-computed
     * embeddings can also be provided.
     *
     * @param string $collectionName The name of the collection to add documents to
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        $collectionId = $this->resolveCollectionId($collectionName);

        return $this->upsertDocuments($collectionId, $documents, $ids, $metadatas, $embeddings);
    }

    /**
     * Upsert documents into a collection identified by its ID
     *
     * @param string $collectionId The collection ID
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception On transport or HTTP errors
     */
    private function upsertDocuments($collectionId, $documents, $ids, $metadatas = null, $embeddings = null) {
        $endpoint = $this->collectionsEndpoint('/' . rawurlencode((string)$collectionId) . '/upsert');
        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];

        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }

        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }

        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Compares the file's modification time with the 'processed_at' timestamp
     * stored in the metadata of one of the document's chunks. The first three
     * chunk IDs (@1, @2, @3) are requested because leading paragraphs may be
     * titles, which are never stored as chunks.
     *
     * This method never throws: any error while checking is treated as
     * "needs update".
     *
     * @param string $collectionId The ID of the collection to check documents in
     * @param string $documentId The base document ID to check (without chunk suffixes)
     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if the document needs to be updated (doesn't exist, has no usable
     *              timestamp, is outdated, or could not be checked), false if up to date
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $endpoint = $this->collectionsEndpoint('/' . rawurlencode((string)$collectionId) . '/get');

            $result = $this->makeRequest($endpoint, 'POST', [
                'ids' => [
                    $documentId . '@1',
                    $documentId . '@2',
                    $documentId . '@3'
                ],
                'include' => [
                    "metadatas"
                ],
                // One matching chunk is enough to read the timestamp
                'limit' => 1
            ]);

            // No chunk stored yet: the document has to be added
            if (empty($result['ids'])) {
                return true;
            }

            // Without metadata there is no timestamp to compare against
            if (empty($result['metadatas']) || !is_array($result['metadatas'])) {
                return true;
            }

            $metadata = isset($result['metadatas'][0]) ? $result['metadatas'][0] : null;
            if (!isset($metadata['processed_at'])) {
                return true;
            }

            $processedTimestamp = strtotime($metadata['processed_at']);
            if ($processedTimestamp === false) {
                // Unparseable timestamp: reprocess rather than trust it
                return true;
            }

            // Outdated only if the file changed after it was processed
            return $fileModifiedTime > $processedTimestamp;
        } catch (\Exception $e) {
            // If there's an error checking the document, assume it needs to be updated
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * Generates an embedding for every query text via Ollama and sends the
     * embeddings to the collection's "query" endpoint. Results can be filtered
     * by metadata using the where parameter.
     *
     * @param string $collectionName The name of the collection to query
     * @param array $queryTexts The query texts to search for
     * @param int $nResults The number of results to return (default: 5)
     * @param array|null $where Optional filter conditions for metadata
     * @return array The query results
     * @throws \Exception If the collection or its ID is not found, or a request fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        $collectionId = $this->resolveCollectionId($collectionName);
        $endpoint = $this->collectionsEndpoint('/' . rawurlencode((string)$collectionId) . '/query');

        $queryEmbeddings = [];
        foreach ($queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }

        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];

        // Add where clause for metadata filtering if provided
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }

        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * @return array The response from the heartbeat endpoint
     * @throws \Exception On transport or HTTP errors
     */
    public function heartbeat() {
        return $this->makeRequest('/heartbeat', 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * @return array The response from the identity endpoint
     * @throws \Exception On transport or HTTP errors
     */
    public function getIdentity() {
        return $this->makeRequest('/identity', 'GET');
    }

    /**
     * Ensure that the configured tenant and database exist
     *
     * Any failure while fetching the tenant/database is interpreted as
     * "does not exist" and triggers a create request.
     *
     * @return void
     * @throws \Exception If creating the tenant or database fails
     */
    private function ensureTenantAndDatabase() {
        try {
            $this->getTenant($this->tenant);
        } catch (\Exception $e) {
            // Tenant doesn't exist, create it
            $this->createTenant($this->tenant);
        }

        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (\Exception $e) {
            // Database doesn't exist, create it
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws \Exception On transport or HTTP errors (including a missing tenant)
     */
    public function getTenant($tenantName) {
        return $this->makeRequest('/tenants/' . rawurlencode((string)$tenantName), 'GET');
    }

    /**
     * Create a new tenant
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception On transport or HTTP errors
     */
    public function createTenant($tenantName) {
        return $this->makeRequest('/tenants', 'POST', ['name' => $tenantName]);
    }

    /**
     * Get database information
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The database information
     * @throws \Exception On transport or HTTP errors (including a missing database)
     */
    public function getDatabase($databaseName, $tenantName) {
        $endpoint = '/tenants/' . rawurlencode((string)$tenantName)
            . '/databases/' . rawurlencode((string)$databaseName);

        return $this->makeRequest($endpoint, 'GET');
    }

    /**
     * Create a new database
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception On transport or HTTP errors
     */
    public function createDatabase($databaseName, $tenantName) {
        $endpoint = '/tenants/' . rawurlencode((string)$tenantName) . '/databases';

        return $this->makeRequest($endpoint, 'POST', ['name' => $databaseName]);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     * @throws \Exception If the collection cannot be created
     */
    public function ensureCollectionExists($collectionName) {
        try {
            $this->getCollection($collectionName);
            return "Collection '$collectionName' already exists.";
        } catch (\Exception $e) {
            // Collection doesn't exist (or could not be looked up), create it
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }
    }

    /**
     * Fill 'registration' and 'name' metadata from the last part of a page ID
     *
     * A last part such as "g287-name-surname" is split at the first '-' into a
     * registration ("g287") and a name ("name surname"), but only if the
     * leading token contains at least one digit. Otherwise the whole part is
     * used as the name, with '-' replaced by spaces.
     *
     * @param array $metadata The metadata collected so far
     * @param string $lastPart The last colon-separated part of the page ID
     * @return array The metadata with 'name' (and possibly 'registration') set
     */
    private function withRegistrationAndName(array $metadata, $lastPart) {
        if (
            preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
            && preg_match('/[0-9]/', $matches[1])
        ) {
            $metadata['registration'] = $matches[1];
            $metadata['name'] = str_replace('-', ' ', $matches[2]);
            return $metadata;
        }

        // No registration pattern found, treat the entire part as the name
        $metadata['name'] = str_replace('-', ' ', $lastPart);
        return $metadata;
    }

    /**
     * Derive the metadata shared by all chunks of a page from its DokuWiki ID
     *
     * Supported ID formats:
     * - reports:mri:institution:250620-name-surname (third part is an institution name)
     * - reports:mri:2024:g287-name-surname (third part is numeric, i.e. a year)
     * - reports:mri:templates:name-surname (any part equal to 'templates')
     *
     * @param string $id The DokuWiki page ID
     * @return array The base metadata (document_id, processed_at, type, ...)
     */
    private function buildBaseMetadata($id) {
        global $conf;

        $parts = explode(':', $id);
        $lastPart = end($parts);
        $isTemplate = in_array('templates', $parts, true);

        $metadata = [
            'document_id' => $id,
            // Compared against the file modification time by needsUpdate()
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report'
        ];

        if (isset($parts[1])) {
            $metadata['modality'] = $parts[1];
        }

        // Templates carry no institution, date or year; only the name is extracted
        if ($isTemplate) {
            return $this->withRegistrationAndName($metadata, $lastPart);
        }

        if (!isset($parts[2])) {
            return $metadata;
        }

        if (is_numeric($parts[2])) {
            // Year format: reports:mri:2024:g287-name-surname
            $metadata['year'] = $parts[2];
            $metadata['institution'] = isset($conf['plugin']['dokullm']['default_institution'])
                ? $conf['plugin']['dokullm']['default_institution']
                : 'default';

            return $this->withRegistrationAndName($metadata, $lastPart);
        }

        // Institution format: reports:mri:institution:250620-name-surname
        $metadata['institution'] = $parts[2];

        if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
            // The six digits are read as DDMMYY (250620 -> 2020-06-25).
            // NOTE(review): an earlier comment described this as
            // "250620 -> 2025-06-20" (YYMMDD); confirm which layout the page
            // names really use.
            $day = substr($matches[1], 0, 2);
            $month = substr($matches[1], 2, 2);
            $year = substr($matches[1], 4, 2);

            // Two-digit years 00-69 map to 20xx, 70-99 map to 19xx
            $fullYear = ((int)$year < 70 ? '20' : '19') . $year;

            $metadata['date'] = $fullYear . '-' . $month . '-' . $day;
            $metadata['name'] = str_replace('-', ' ', $matches[2]);
        }

        return $metadata;
    }

    /**
     * Turn the text of a DokuWiki heading into a list of search tags
     *
     * @param string $titleContent The heading text without the surrounding '=' characters
     * @return array Unique lower-cased words of at least 3 characters
     */
    private function extractTitleTags($titleContent) {
        $tags = [];
        foreach (preg_split('/\s+/', trim($titleContent)) as $word) {
            // Very short words are dropped to reduce noise
            if (strlen($word) >= 3) {
                $tags[] = strtolower($word);
            }
        }

        return array_unique($tags);
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with update checking
     *
     * Steps:
     * 1. Converts the file path to a DokuWiki ID
     * 2. Ensures the collection exists (unless already checked) and resolves its ID
     * 3. Skips the file if its stored 'processed_at' is not older than the file
     * 4. Splits the content into paragraphs (separated by blank lines)
     * 5. Converts headings into tags that are attached to the following chunks
     * 6. Generates an embedding for every remaining paragraph
     * 7. Upserts all chunks with their metadata in one request
     *
     * Chunk IDs have the form "<id>@<n>", where n is the 1-based position of
     * the paragraph in the file (headings and empty paragraphs keep their
     * number but are not stored).
     *
     * @param string $filePath The path to the file to process
     * @param string $collectionName The name of the collection to use
     * @param bool $collectionChecked Whether the collection has already been checked/created
     * @return array Result with 'status' ('success', 'skipped' or 'error'), 'message' and,
     *               on success, 'details' and 'collection_status'
     */
    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
        $id = parseFilePath($filePath);

        try {
            // Create collection if it doesn't exist (only if not already checked)
            $collectionStatus = '';
            if (!$collectionChecked) {
                $collectionStatus = $this->ensureCollectionExists($collectionName);
            }

            $collection = $this->getCollection($collectionName);
            if (!isset($collection['id'])) {
                return [
                    'status' => 'error',
                    'message' => "Collection ID not found for '{$collectionName}'"
                ];
            }
            $collectionId = $collection['id'];

            // Fail explicitly instead of comparing against a bogus timestamp
            $fileModifiedTime = is_file($filePath) ? filemtime($filePath) : false;
            if ($fileModifiedTime === false) {
                return [
                    'status' => 'error',
                    'message' => "Cannot read modification time of file: $filePath"
                ];
            }

            if (!$this->needsUpdate($collectionId, $id, $fileModifiedTime)) {
                return [
                    'status' => 'skipped',
                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
                ];
            }

            $content = file_get_contents($filePath);
            if ($content === false) {
                return [
                    'status' => 'error',
                    'message' => "Cannot read file: $filePath"
                ];
            }

            // Split document into chunks (paragraphs separated by blank lines)
            $paragraphs = preg_split('/\n\s*\n/', $content);
            if ($paragraphs === false) {
                return [
                    'status' => 'error',
                    'message' => "Cannot split file into paragraphs: $filePath"
                ];
            }

            $baseMetadata = $this->buildBaseMetadata($id);
            $totalChunks = count($paragraphs);

            $chunkIds = [];
            $chunkContents = [];
            $chunkMetadatas = [];
            $chunkEmbeddings = [];
            $currentTags = [];

            foreach ($paragraphs as $index => $paragraph) {
                // Skip whitespace-only paragraphs
                $paragraph = trim($paragraph);
                if ($paragraph === '') {
                    continue;
                }

                // DokuWiki headings (text wrapped in '=') become tags for the
                // chunks that follow and are not stored as content themselves
                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
                    $currentTags = $this->extractTitleTags($matches[1]);
                    continue;
                }

                $chunkId = $id . '@' . ($index + 1);

                $metadata = $baseMetadata;
                $metadata['chunk_id'] = $chunkId;
                $metadata['chunk_number'] = $index + 1;
                $metadata['total_chunks'] = $totalChunks;

                if (!empty($currentTags)) {
                    $metadata['tags'] = implode(',', $currentTags);
                }

                $chunkIds[] = $chunkId;
                $chunkContents[] = $paragraph;
                $chunkMetadatas[] = $metadata;
                $chunkEmbeddings[] = $this->generateEmbeddings($paragraph);
            }

            // If no chunks were created, skip this file
            if (empty($chunkIds)) {
                return [
                    'status' => 'skipped',
                    'message' => "No valid chunks found in file '$id'. Skipping..."
                ];
            }

            // The collection ID is already known, so upsert directly instead of
            // resolving the collection name a second time
            $this->upsertDocuments($collectionId, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);

            return [
                'status' => 'success',
                'message' => "Successfully sent file to ChromaDB",
                'details' => [
                    'document_id' => $id,
                    'chunks' => count($chunkIds),
                    'collection' => $collectionName
                ],
                'collection_status' => $collectionStatus
            ];
        } catch (\Exception $e) {
            return [
                'status' => 'error',
                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
            ];
        }
    }

    /**
     * Process all DokuWiki files in a directory and send them to ChromaDB
     *
     * Recursively collects all .txt files whose name does not start with an
     * underscore. The collection name is the first part of the DokuWiki ID of
     * the first file found (fallback: 'documents'); it is created once if
     * needed and then every file is processed individually.
     *
     * @param string $dirPath The directory path to process
     * @return array Result with 'status' and 'message'; when files were processed also
     *               'files_count' and per-file 'results'
     */
    public function processDirectory($dirPath) {
        if (!is_dir($dirPath)) {
            return [
                'status' => 'error',
                'message' => "Directory does not exist: $dirPath"
            ];
        }

        // SPL classes live in the global namespace, hence the leading backslash
        $iterator = new \RecursiveIteratorIterator(
            new \RecursiveDirectoryIterator($dirPath, \RecursiveDirectoryIterator::SKIP_DOTS),
            \RecursiveIteratorIterator::LEAVES_ONLY
        );

        $files = [];
        foreach ($iterator as $file) {
            // Process only .txt files that don't start with underscore
            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
                $files[] = $file->getPathname();
            }
        }

        if (empty($files)) {
            return [
                'status' => 'skipped',
                'message' => "No .txt files found in directory: $dirPath"
            ];
        }

        // Use the first part of the document ID as collection name, fallback to 'documents'
        $idParts = explode(':', parseFilePath($files[0]));
        $collectionName = !empty($idParts[0]) ? $idParts[0] : 'documents';

        try {
            $this->ensureCollectionExists($collectionName);
        } catch (\Exception $e) {
            // Deliberately ignored: every file below reports its own error if
            // the collection is really unavailable.
        }
        // Checked once here, so the per-file check is skipped in either case
        $collectionChecked = true;

        $results = [];
        foreach ($files as $file) {
            $results[] = [
                'file' => $file,
                'result' => $this->processSingleFile($file, $collectionName, $collectionChecked)
            ];
        }

        return [
            'status' => 'success',
            'message' => "Finished processing directory.",
            'files_count' => count($files),
            'results' => $results
        ];
    }
}

/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory (DOKU_INC . 'data/pages/' when DOKU_INC is
 *    defined, otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Joining the non-empty path segments with colons
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    $relativePath = str_replace($pagesDir, '', $filePath);
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        // Compare against '' explicitly: empty() would also drop a segment named "0"
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}