<?php

namespace dokuwiki\plugin\dokullm;

/**
 * HTTP client for a ChromaDB server, with embeddings generated by Ollama.
 *
 * Requests to ChromaDB go to "<baseUrl>/api/v2<endpoint>" and requests to Ollama
 * go to "/api/embeddings"; each server has its own reusable cURL handle.
 * Besides thin wrappers around the tenant, database, collection and document
 * endpoints, the class can index DokuWiki page files: a file is split into
 * paragraph chunks, an embedding is computed for every chunk, and all chunks are
 * upserted into a collection together with metadata derived from the page ID.
 */
class ChromaDBClient {
    /** @var string Base URL of the ChromaDB server ("http://host:port") */
    private $baseUrl;

    /** @var resource|\CurlHandle cURL handle used for ChromaDB requests */
    private $client;

    /** @var resource|\CurlHandle cURL handle used for Ollama requests */
    private $ollamaClient;

    /** @var string ChromaDB tenant name */
    private $tenant;

    /** @var string ChromaDB database name */
    private $database;

    /**
     * Default collection name passed to the constructor.
     *
     * Kept public because it used to be created as a dynamic (and therefore
     * public) property; no method of this class reads it.
     *
     * @var string
     */
    public $defaultCollection;

    /** @var string Ollama server host */
    private $ollamaHost;

    /** @var int Ollama server port */
    private $ollamaPort;

    /** @var string Ollama embeddings model */
    private $ollamaModel;

    /**
     * Initialize the ChromaDB client
     *
     * Stores the connection parameters, prepares one cURL handle for ChromaDB and
     * one for Ollama, and makes sure that the tenant and database exist (creating
     * them if the lookup fails).
     *
     * @param string $host ChromaDB server host
     * @param int $port ChromaDB server port
     * @param string $tenant ChromaDB tenant name
     * @param string $database ChromaDB database name
     * @param string $defaultCollection Default collection name
     * @param string $ollamaHost Ollama server host
     * @param int $ollamaPort Ollama server port
     * @param string $ollamaModel Ollama embeddings model
     * @throws \Exception If the tenant or database can neither be found nor created
     */
    public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
        // All parameters are mandatory, so they are used as given (no fallbacks)
        $this->tenant = $tenant;
        $this->database = $database;
        $this->defaultCollection = $defaultCollection;
        $this->ollamaHost = $ollamaHost;
        $this->ollamaPort = $ollamaPort;
        $this->ollamaModel = $ollamaModel;

        $this->baseUrl = "http://{$host}:{$port}";

        // ChromaDB handle
        $this->client = curl_init();
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // Ollama handle
        $this->ollamaClient = curl_init();
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up the cURL handles when the object is destroyed
     *
     * Each handle is only closed if it was actually initialised, so destroying a
     * partially constructed object does not raise an error.
     *
     * @return void
     */
    public function __destruct() {
        foreach ([$this->client, $this->ollamaClient] as $handle) {
            if (is_resource($handle) || $handle instanceof \CurlHandle) {
                curl_close($handle);
            }
        }
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * Builds the URL as "<baseUrl>/api/v2<endpoint>", sends the optional payload
     * as JSON and decodes the JSON response.
     *
     * @param string $endpoint The API endpoint to call (starting with "/")
     * @param string $method The HTTP method to use (default: 'GET')
     * @param array|null $data The data to send with the request (default: null)
     * @return array|null The JSON response decoded as an array (null if the body is not valid JSON)
     * @throws \Exception If there's a cURL error or the HTTP status is 400 or higher
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        $url = $this->baseUrl . '/api/v2' . $endpoint;

        curl_setopt($this->client, CURLOPT_URL, $url);
        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // The handle is reused between calls, so the body is always set explicitly;
        // otherwise the payload of a previous request would be sent again.
        curl_setopt($this->client, CURLOPT_POSTFIELDS, $data ? json_encode($data) : null);

        $response = curl_exec($this->client);

        if ($response === false || curl_error($this->client)) {
            throw new \Exception('Curl error: ' . curl_error($this->client));
        }

        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
        if ($httpCode >= 400) {
            throw new \Exception("HTTP Error: $httpCode, Response: $response");
        }

        return json_decode($response, true);
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * Posts the text to Ollama's "/api/embeddings" endpoint using the configured
     * model and a 'keep_alive' value of '30m'.
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector
     * @throws \Exception On cURL errors, HTTP status >= 400, or a response without an 'embedding' key
     */
    public function generateEmbeddings($text) {
        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";

        $payload = [
            'model' => $this->ollamaModel,
            'prompt' => $text,
            'keep_alive' => '30m'
        ];

        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($payload));

        $response = curl_exec($this->ollamaClient);

        if ($response === false || curl_error($this->ollamaClient)) {
            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
        }

        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
        if ($httpCode >= 400) {
            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }

        $result = json_decode($response, true);

        if (!isset($result['embedding'])) {
            throw new \Exception("Ollama response missing embedding: " . $response);
        }

        return $result['embedding'];
    }

    /**
     * Build the collections endpoint for the configured tenant and database
     *
     * @param string $suffix Optional path appended after ".../collections" (e.g. "/<id>/get")
     * @return string The endpoint path, relative to "/api/v2"
     */
    private function collectionsEndpoint($suffix = '') {
        return "/tenants/{$this->tenant}/databases/{$this->database}/collections" . $suffix;
    }

    /**
     * Resolve the effective collection name
     *
     * @param string $name The requested collection name
     * @return string The given name, or 'documents' if the name is empty
     */
    private function resolveCollectionName($name) {
        return empty($name) ? 'documents' : $name;
    }

    /**
     * Look up the ID of a collection by name
     *
     * @param string $collectionName The collection name (empty falls back to 'documents')
     * @return string The collection ID
     * @throws \Exception If the collection is not found or has no ID
     */
    private function requireCollectionId($collectionName) {
        $collectionName = $this->resolveCollectionName($collectionName);

        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }

        return $collection['id'];
    }

    /**
     * List all collections in the database
     *
     * Retrieves a list of all collections in the configured tenant and database.
     *
     * @return array List of collections
     * @throws \Exception If the request fails
     */
    public function listCollections() {
        return $this->makeRequest($this->collectionsEndpoint());
    }

    /**
     * Get a collection by name
     *
     * Fetches the collection list and returns the first entry whose 'name'
     * matches exactly.
     *
     * @param string $name The name of the collection to retrieve (empty falls back to 'documents')
     * @return array The collection information
     * @throws \Exception If the request fails or the collection is not found
     */
    public function getCollection($name) {
        $name = $this->resolveCollectionName($name);

        $collections = $this->makeRequest($this->collectionsEndpoint());

        // A non-JSON response decodes to null; treat that like "no collections"
        if (is_array($collections)) {
            foreach ($collections as $collection) {
                if (isset($collection['name']) && $collection['name'] === $name) {
                    return $collection;
                }
            }
        }

        throw new \Exception("Collection '{$name}' not found");
    }

    /**
     * Create a new collection
     *
     * Creates a new collection with the specified name and optional metadata.
     *
     * @param string $name The name of the collection to create (empty falls back to 'documents')
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createCollection($name, $metadata = null) {
        $data = ['name' => $this->resolveCollectionName($name)];
        if ($metadata) {
            $data['metadata'] = $metadata;
        }

        return $this->makeRequest($this->collectionsEndpoint(), 'POST', $data);
    }

    /**
     * Delete a collection by name
     *
     * Resolves the collection's ID from its name and deletes it.
     *
     * @param string $name The name of the collection to delete (empty falls back to 'documents')
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function deleteCollection($name) {
        $collectionId = $this->requireCollectionId($name);

        return $this->makeRequest($this->collectionsEndpoint("/{$collectionId}"), 'DELETE');
    }

    /**
     * Get a document by its ID from a collection
     *
     * Retrieves a document from the specified collection using its ID.
     *
     * @param string $collectionName The name of the collection to get the document from
     * @param string $documentId The document ID to retrieve
     * @param array $include What to include in the response (default: ["metadatas", "documents"])
     * @return array The retrieved document
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        $collectionId = $this->requireCollectionId($collectionName);

        $data = [
            'ids' => [$documentId],
            'include' => $include
        ];

        return $this->makeRequest($this->collectionsEndpoint("/{$collectionId}/get"), 'POST', $data);
    }

    /**
     * Add documents to a collection
     *
     * Sends the documents to the collection's "upsert" endpoint. Each document
     * must have a corresponding ID. Optional metadata and pre-computed embeddings
     * can also be provided.
     *
     * @param string $collectionName The name of the collection to add documents to
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        $collectionId = $this->requireCollectionId($collectionName);

        return $this->upsertDocuments($collectionId, $documents, $ids, $metadatas, $embeddings);
    }

    /**
     * Upsert documents into a collection identified by its ID
     *
     * @param string $collectionId The collection ID
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    private function upsertDocuments($collectionId, $documents, $ids, $metadatas = null, $embeddings = null) {
        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];

        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }

        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }

        return $this->makeRequest($this->collectionsEndpoint("/{$collectionId}/upsert"), 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Compares the file's last modification time with the 'processed_at' timestamp
     * stored in the metadata of one of the document's chunks. The first 3 chunk IDs
     * (@1, @2, @3) are requested because leading chunks may be titles, which are not
     * stored; only the first returned entry is inspected.
     *
     * Any error while checking is treated as "needs update", so this method does
     * not throw.
     *
     * @param string $collectionId The ID of the collection to check documents in
     * @param string $documentId The base document ID to check (without chunk suffixes)
     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if the document needs to be updated (doesn't exist, has no usable
     *              timestamp, is outdated, or the check failed), false if it is up to date
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $data = [
                'ids' => [
                    $documentId . '@1',
                    $documentId . '@2',
                    $documentId . '@3'
                ],
                'include' => [
                    "metadatas"
                ],
                'limit' => 1
            ];

            $result = $this->makeRequest($this->collectionsEndpoint("/{$collectionId}/get"), 'POST', $data);

            // No chunk stored yet: the document has to be added
            if (empty($result['ids'])) {
                return true;
            }

            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
                $metadata = $result['metadatas'][0];

                // Without a processed_at value there is nothing to compare against
                if (!is_array($metadata) || !isset($metadata['processed_at'])) {
                    return true;
                }

                $processedTimestamp = strtotime($metadata['processed_at']);

                // An unparseable timestamp is as good as a missing one
                if ($processedTimestamp === false) {
                    return true;
                }

                // File changed after it was last processed
                if ($fileModifiedTime > $processedTimestamp) {
                    return true;
                }
            }

            // Document exists and is up to date
            return false;
        } catch (\Exception $e) {
            // If the check itself fails, assume the document needs to be updated
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * Generates an embedding for every query text via Ollama and sends the
     * embeddings to the collection's "query" endpoint. Results can be filtered by
     * metadata using the where parameter.
     *
     * @param string $collectionName The name of the collection to query
     * @param array $queryTexts The query texts to search for
     * @param int $nResults The number of results to return (default: 5)
     * @param array|null $where Optional filter conditions for metadata
     * @return array The query results
     * @throws \Exception If the collection or its ID is not found, or a request fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        $collectionId = $this->requireCollectionId($collectionName);

        $queryEmbeddings = [];
        foreach ($queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }

        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];

        // Add where clause for metadata filtering if provided
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }

        return $this->makeRequest($this->collectionsEndpoint("/{$collectionId}/query"), 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * Sends a request to the "/heartbeat" endpoint.
     *
     * @return array The response from the heartbeat endpoint
     * @throws \Exception If the request fails
     */
    public function heartbeat() {
        return $this->makeRequest("/heartbeat", 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * Sends a request to the "/identity" endpoint.
     *
     * @return array The response from the identity endpoint
     * @throws \Exception If the request fails
     */
    public function getIdentity() {
        return $this->makeRequest("/identity", 'GET');
    }

    /**
     * Ensure that the configured tenant and database exist
     *
     * Looks each of them up and creates it when the lookup throws. Any lookup
     * failure (not only "not found") triggers the creation attempt.
     *
     * @return void
     * @throws \Exception If creating the tenant or database fails
     */
    private function ensureTenantAndDatabase() {
        try {
            $this->getTenant($this->tenant);
        } catch (\Exception $e) {
            // Lookup failed, try to create the tenant
            $this->createTenant($this->tenant);
        }

        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (\Exception $e) {
            // Lookup failed, try to create the database
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws \Exception If the request fails
     */
    public function getTenant($tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}", 'GET');
    }

    /**
     * Create a new tenant
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createTenant($tenantName) {
        return $this->makeRequest("/tenants", 'POST', ['name' => $tenantName]);
    }

    /**
     * Get database information
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The database information
     * @throws \Exception If the request fails
     */
    public function getDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases/{$databaseName}", 'GET');
    }

    /**
     * Create a new database
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases", 'POST', ['name' => $databaseName]);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * Tries to look the collection up and creates it when the lookup throws.
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     * @throws \Exception If creating the collection fails
     */
    public function ensureCollectionExists($collectionName) {
        try {
            $this->getCollection($collectionName);
            return "Collection '$collectionName' already exists.";
        } catch (\Exception $e) {
            // Lookup failed, try to create the collection
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }
    }

    /**
     * Split the last part of a page ID into registration and name
     *
     * A leading alphanumeric token followed by '-' counts as a registration only
     * if it contains at least one digit; otherwise the whole part is the name.
     * Dashes in the name are replaced by spaces.
     *
     * @param string $lastPart The last colon-separated part of the page ID
     * @return array Array with 'name' and, when detected, 'registration'
     */
    private function parseRegistrationAndName($lastPart) {
        if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
            && preg_match('/[0-9]/', $matches[1])
        ) {
            return [
                'registration' => $matches[1],
                'name' => str_replace('-', ' ', $matches[2])
            ];
        }

        return ['name' => str_replace('-', ' ', $lastPart)];
    }

    /**
     * Build the metadata shared by all chunks of a document
     *
     * Supported ID formats:
     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
     *
     * @param string $id The DokuWiki page ID
     * @return array Metadata with 'document_id', 'processed_at', 'type' and, depending on the
     *               ID format, 'modality', 'year', 'institution', 'registration', 'date', 'name'
     */
    private function buildBaseMetadata($id) {
        $parts = explode(':', $id);
        $lastPart = end($parts);
        $isTemplate = in_array('templates', $parts, true);

        $metadata = [
            'document_id' => $id,
            // Later compared against the file's mtime by needsUpdate()
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report'
        ];

        // Modality is the second part of the ID
        if (isset($parts[1])) {
            $metadata['modality'] = $parts[1];
        }

        // Templates carry no institution, date or year; only a name (and registration)
        if ($isTemplate) {
            return array_merge($metadata, $this->parseRegistrationAndName($lastPart));
        }

        if (!isset($parts[2])) {
            return $metadata;
        }

        if (is_numeric($parts[2])) {
            // Format 2: the third part is the year
            $metadata['year'] = $parts[2];

            // Institution is not part of the ID in this format; take it from the config
            global $conf;
            $metadata['institution'] = isset($conf['plugin']['dokullm']['default_institution'])
                ? $conf['plugin']['dokullm']['default_institution']
                : 'default';

            return array_merge($metadata, $this->parseRegistrationAndName($lastPart));
        }

        // Format 1: the third part is the institution
        $metadata['institution'] = $parts[2];

        if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
            $dateStr = $matches[1];

            // The six digits are read as DDMMYY (e.g. 250620 -> 2020-06-25).
            // NOTE(review): an earlier comment gave this example as 2025-06-20 (YYMMDD);
            // confirm which digit order the page names really use.
            $day = substr($dateStr, 0, 2);
            $month = substr($dateStr, 2, 2);
            $year = substr($dateStr, 4, 2);

            // Two-digit years 00-69 are 20xx, 70-99 are 19xx
            $fullYear = ((int)$year < 70 ? '20' : '19') . $year;

            $metadata['date'] = $fullYear . '-' . $month . '-' . $day;
            $metadata['name'] = str_replace('-', ' ', $matches[2]);
        }

        return $metadata;
    }

    /**
     * Convert a DokuWiki title paragraph into search tags
     *
     * A paragraph is a title when it starts and ends with '='. Its words of at
     * least 3 bytes are lower-cased and de-duplicated.
     *
     * @param string $paragraph The trimmed paragraph
     * @return array|null The tags (possibly empty) if the paragraph is a title, null otherwise
     */
    private function extractTitleTags($paragraph) {
        if (!preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
            return null;
        }

        $tags = [];
        foreach (preg_split('/\s+/', trim($matches[1])) as $word) {
            // Very short words only add noise
            if (strlen($word) >= 3) {
                $tags[] = strtolower($word);
            }
        }

        return array_unique($tags);
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
     *
     * Steps:
     * 1. Converts the file path to a DokuWiki page ID
     * 2. Ensures the collection exists (unless already checked) and resolves its ID
     * 3. Skips the file if its stored 'processed_at' timestamp is not older than the file
     * 4. Splits the content into paragraphs (separated by blank lines)
     * 5. Turns title paragraphs into tags that are attached to the following chunks
     * 6. Generates an embedding for every remaining paragraph
     * 7. Upserts all chunks with their metadata in a single request
     *
     * Chunk IDs have the form "<page id>@<paragraph number>", where the number counts
     * all paragraphs, including titles and empty ones that are not stored.
     *
     * This method does not throw \Exception; failures are reported in the returned array.
     *
     * @param string $filePath The path to the file to process
     * @param string $collectionName The name of the collection to use
     * @param bool $collectionChecked Whether the collection has already been checked/created
     * @return array Result with 'status' ('success', 'skipped' or 'error'), 'message' and,
     *               on success, 'details' and 'collection_status'
     */
    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
        $id = parseFilePath($filePath);

        try {
            // Create collection if it doesn't exist (only if not already checked)
            $collectionStatus = '';
            if (!$collectionChecked) {
                $collectionStatus = $this->ensureCollectionExists($collectionName);
            }

            $collection = $this->getCollection($collectionName);
            if (!isset($collection['id'])) {
                return [
                    'status' => 'error',
                    'message' => "Collection ID not found for '{$collectionName}'"
                ];
            }
            $collectionId = $collection['id'];

            // A missing file must be reported, not silently treated as "up to date"
            $fileModifiedTime = is_file($filePath) ? filemtime($filePath) : false;
            if ($fileModifiedTime === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read modification time of file '$filePath'"
                ];
            }

            if (!$this->needsUpdate($collectionId, $id, $fileModifiedTime)) {
                return [
                    'status' => 'skipped',
                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
                ];
            }

            $content = file_get_contents($filePath);
            if ($content === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read file '$filePath'"
                ];
            }

            // Split document into chunks (paragraphs separated by blank lines)
            $paragraphs = preg_split('/\n\s*\n/', $content);
            if ($paragraphs === false) {
                $paragraphs = [];
            }

            $baseMetadata = $this->buildBaseMetadata($id);

            $chunkIds = [];
            $chunkContents = [];
            $chunkMetadatas = [];
            $chunkEmbeddings = [];
            $currentTags = [];

            foreach ($paragraphs as $index => $paragraph) {
                // Skip whitespace-only paragraphs
                $paragraph = trim($paragraph);
                if (empty($paragraph)) {
                    continue;
                }

                // Titles become tags for the chunks that follow and are not stored themselves
                $titleTags = $this->extractTitleTags($paragraph);
                if ($titleTags !== null) {
                    $currentTags = $titleTags;
                    continue;
                }

                $chunkId = $id . '@' . ($index + 1);

                $metadata = $baseMetadata;
                $metadata['chunk_id'] = $chunkId;
                $metadata['chunk_number'] = $index + 1;
                $metadata['total_chunks'] = count($paragraphs);

                if (!empty($currentTags)) {
                    $metadata['tags'] = implode(',', $currentTags);
                }

                $chunkIds[] = $chunkId;
                $chunkContents[] = $paragraph;
                $chunkMetadatas[] = $metadata;
                $chunkEmbeddings[] = $this->generateEmbeddings($paragraph);
            }

            // If no chunks were created, skip this file
            if (empty($chunkIds)) {
                return [
                    'status' => 'skipped',
                    'message' => "No valid chunks found in file '$id'. Skipping..."
                ];
            }

            // The collection ID is already known, so upsert directly instead of
            // resolving the collection by name a second time
            $this->upsertDocuments($collectionId, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);

            return [
                'status' => 'success',
                'message' => "Successfully sent file to ChromaDB",
                'details' => [
                    'document_id' => $id,
                    'chunks' => count($chunkIds),
                    'collection' => $collectionName
                ],
                'collection_status' => $collectionStatus
            ];
        } catch (\Exception $e) {
            return [
                'status' => 'error',
                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
            ];
        }
    }

    /**
     * Process all DokuWiki files in a directory and send them to ChromaDB
     *
     * Recursively collects all .txt files whose name does not start with an
     * underscore. The collection name is the first part of the page ID of the
     * first file found ('documents' if that part is empty); the collection is
     * created if needed and then every file is processed individually.
     *
     * @param string $dirPath The directory path to process
     * @return array Result with 'status' and 'message'; after processing also 'files_count'
     *               and 'results' (one entry per file with 'file' and 'result')
     */
    public function processDirectory($dirPath) {
        if (!is_dir($dirPath)) {
            return [
                'status' => 'error',
                'message' => "Directory does not exist: $dirPath"
            ];
        }

        // SPL classes live in the global namespace and must be fully qualified here
        $iterator = new \RecursiveIteratorIterator(
            new \RecursiveDirectoryIterator($dirPath, \RecursiveDirectoryIterator::SKIP_DOTS),
            \RecursiveIteratorIterator::LEAVES_ONLY
        );

        $files = [];
        foreach ($iterator as $file) {
            // Process only .txt files that don't start with underscore
            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
                $files[] = $file->getPathname();
            }
        }

        if (empty($files)) {
            return [
                'status' => 'skipped',
                'message' => "No .txt files found in directory: $dirPath"
            ];
        }

        // Use the first part of the document ID as collection name, fallback to 'documents'
        $idParts = explode(':', parseFilePath($files[0]));
        $collectionName = !empty($idParts[0]) ? $idParts[0] : 'documents';

        try {
            $this->ensureCollectionExists($collectionName);
        } catch (\Exception $e) {
            // Best effort: if the collection cannot be created, each file reports
            // its own error when it looks the collection up
        }
        $collectionChecked = true;

        $results = [];
        foreach ($files as $file) {
            $results[] = [
                'file' => $file,
                'result' => $this->processSingleFile($file, $collectionName, $collectionChecked)
            ];
        }

        return [
            'status' => 'success',
            'message' => "Finished processing directory.",
            'files_count' => count($files),
            'results' => $results
        ];
    }
}

/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory (DOKU_INC . 'data/pages/' when DOKU_INC is defined,
 *    '/var/www/html/dokuwiki/data/pages/' otherwise)
 * 2. Removing the .txt extension
 * 3. Joining the non-empty path segments with colons
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Remove the base path and the .txt extension
    $relativePath = str_replace($pagesDir, '', $filePath);
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Keep every non-empty segment. The comparison is against '' on purpose:
    // a truthiness test would also drop a segment named "0".
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}