<?php

namespace dokuwiki\plugin\dokullm;

// Global classes must be imported explicitly: inside a namespace an unqualified
// class name is NOT looked up in the global namespace.
use Exception;
use RecursiveDirectoryIterator;
use RecursiveIteratorIterator;

/**
 * HTTP client for the ChromaDB v2 REST API that uses an Ollama server to
 * compute embeddings.
 *
 * Two cURL handles are kept for the lifetime of the object: one for ChromaDB
 * requests and one for Ollama requests.
 */
class ChromaDBClient {
    /** @var string Base URL of the ChromaDB server ("http://host:port") */
    private $baseUrl;
    /** @var resource|\CurlHandle cURL handle used for ChromaDB requests */
    private $client;
    /** @var resource|\CurlHandle cURL handle used for Ollama requests */
    private $ollamaClient;
    /** @var string ChromaDB tenant name */
    private $tenant;
    /** @var string ChromaDB database name */
    private $database;
    /** @var string Ollama server host */
    private $ollamaHost;
    /** @var int Ollama server port */
    private $ollamaPort;
    /** @var string Name of the Ollama model used for embeddings */
    private $ollamaModel;

    /**
     * Initialize the ChromaDB client
     *
     * Stores the connection parameters, prepares both cURL handles and then makes
     * sure the configured tenant and database exist (this performs HTTP requests).
     *
     * @param string $host ChromaDB server host (default: CHROMA_HOST)
     * @param int $port ChromaDB server port (default: CHROMA_PORT)
     * @param string $tenant ChromaDB tenant name (default: CHROMA_TENANT)
     * @param string $database ChromaDB database name (default: CHROMA_DATABASE)
     * @param string $ollamaHost Ollama server host (default: OLLAMA_HOST)
     * @param int $ollamaPort Ollama server port (default: OLLAMA_PORT)
     * @param string $ollamaModel Ollama embeddings model (default: OLLAMA_EMBEDDINGS_MODEL)
     * @throws Exception If the tenant or database can neither be found nor created
     */
    public function __construct($host = CHROMA_HOST, $port = CHROMA_PORT, $tenant = CHROMA_TENANT, $database = CHROMA_DATABASE, $ollamaHost = OLLAMA_HOST, $ollamaPort = OLLAMA_PORT, $ollamaModel = OLLAMA_EMBEDDINGS_MODEL) {
        $this->baseUrl = "http://{$host}:{$port}";
        $this->tenant = $tenant;
        $this->database = $database;
        $this->ollamaHost = $ollamaHost;
        $this->ollamaPort = $ollamaPort;
        $this->ollamaModel = $ollamaModel;

        // ChromaDB handle
        $this->client = curl_init();
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // Ollama handle
        $this->ollamaClient = curl_init();
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Release both cURL handles when the object is destroyed
     *
     * @return void
     */
    public function __destruct() {
        // Guard against handles that were never created (curl_init() can fail).
        if ($this->client) {
            curl_close($this->client);
        }
        if ($this->ollamaClient) {
            curl_close($this->ollamaClient);
        }
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * Builds the URL as base URL + "/api/v2" + $endpoint, sends $data as a JSON
     * body when it is non-empty, and decodes the JSON response.
     *
     * @param string $endpoint The API endpoint to call (starting with "/")
     * @param string $method The HTTP method to use (default: 'GET')
     * @param array|null $data The data to send with the request (default: null)
     * @return array|null The JSON response decoded as an array (null if the body is not valid JSON)
     * @throws Exception If there's a cURL error or an HTTP status >= 400
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        $headers = [
            'Content-Type: application/json',
            'Accept: application/json'
        ];

        $url = $this->baseUrl . '/api/v2' . $endpoint;

        curl_setopt($this->client, CURLOPT_URL, $url);
        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);

        // The handle is reused across calls, so the body is reset explicitly when
        // this request carries no payload.
        curl_setopt($this->client, CURLOPT_POSTFIELDS, $data ? json_encode($data) : null);

        $response = curl_exec($this->client);

        // curl_exec() returns false on transport failure; check that before
        // looking at the HTTP status code.
        if ($response === false || curl_error($this->client)) {
            throw new Exception('Curl error: ' . curl_error($this->client));
        }

        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
        if ($httpCode >= 400) {
            throw new Exception("HTTP Error: $httpCode, Response: $response");
        }

        return json_decode($response, true);
    }

    /**
     * Resolve a collection name to its ChromaDB collection ID
     *
     * @param string $collectionName The collection name ('documents' is used if empty)
     * @return string The collection ID
     * @throws Exception If the collection does not exist or has no ID
     */
    private function getCollectionId($collectionName) {
        if (empty($collectionName)) {
            $collectionName = 'documents';
        }

        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new Exception("Collection ID not found for '{$collectionName}'");
        }

        return $collection['id'];
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * Posts the text to the Ollama "/api/embeddings" endpoint with the configured
     * model and a keep_alive of 30 minutes.
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector
     * @throws Exception On cURL error, HTTP status >= 400 or a response without 'embedding'
     */
    public function generateEmbeddings($text) {
        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";

        $payload = [
            'model' => $this->ollamaModel,
            'prompt' => $text,
            'keep_alive' => '30m'
        ];

        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($payload));

        $response = curl_exec($this->ollamaClient);

        if ($response === false || curl_error($this->ollamaClient)) {
            throw new Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
        }

        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
        if ($httpCode >= 400) {
            throw new Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }

        $result = json_decode($response, true);

        if (!isset($result['embedding'])) {
            throw new Exception("Ollama response missing embedding: " . $response);
        }

        return $result['embedding'];
    }

    /**
     * List all collections in the configured tenant and database
     *
     * @return array List of collections
     * @throws Exception If the request fails
     */
    public function listCollections() {
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
        return $this->makeRequest($endpoint);
    }

    /**
     * Get a collection by name
     *
     * Lists all collections and returns the first one whose 'name' matches.
     *
     * @param string $name The name of the collection to retrieve ('documents' is used if empty)
     * @return array The collection information
     * @throws Exception If the collection is not found or the request fails
     */
    public function getCollection($name) {
        if (empty($name)) {
            $name = 'documents';
        }

        $collections = $this->listCollections();

        // A non-array response (e.g. an empty body) is treated as "no collections".
        if (is_array($collections)) {
            foreach ($collections as $collection) {
                if (isset($collection['name']) && $collection['name'] === $name) {
                    return $collection;
                }
            }
        }

        throw new Exception("Collection '{$name}' not found");
    }

    /**
     * Create a new collection
     *
     * @param string $name The name of the collection to create ('documents' is used if empty)
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws Exception If the request fails
     */
    public function createCollection($name, $metadata = null) {
        if (empty($name)) {
            $name = 'documents';
        }

        $data = ['name' => $name];
        if ($metadata) {
            $data['metadata'] = $metadata;
        }

        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Delete a collection by name
     *
     * The name is first resolved to a collection ID, which is what the delete
     * endpoint is called with.
     *
     * @param string $name The name of the collection to delete ('documents' is used if empty)
     * @return array The response from the API
     * @throws Exception If the collection or its ID is not found, or the request fails
     */
    public function deleteCollection($name) {
        $collectionId = $this->getCollectionId($name);
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
        return $this->makeRequest($endpoint, 'DELETE');
    }

    /**
     * Get a document by its ID from a collection
     *
     * @param string $collectionName The name of the collection ('documents' is used if empty)
     * @param string $documentId The document ID to retrieve
     * @param array $include What to include in the response (default: ["metadatas", "documents"])
     * @return array The retrieved document
     * @throws Exception If the collection or its ID is not found, or the request fails
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        $collectionId = $this->getCollectionId($collectionName);
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";

        return $this->makeRequest($endpoint, 'POST', [
            'ids' => [$documentId],
            'include' => $include
        ]);
    }

    /**
     * Add documents to a collection
     *
     * Sends the documents to the collection's "upsert" endpoint. Each document
     * must have a corresponding ID. Metadata and pre-computed embeddings are only
     * included in the request when they are non-empty.
     *
     * @param string $collectionName The name of the collection ('documents' is used if empty)
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws Exception If the collection or its ID is not found, or the request fails
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        $collectionId = $this->getCollectionId($collectionName);
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";

        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];

        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }

        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }

        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Looks up the chunk IDs "<documentId>@1", "@2" and "@3" (the first chunks may
     * be titles, which are never stored) and compares the 'processed_at' metadata
     * of the first hit with the file's modification time.
     *
     * Any error while checking is swallowed on purpose and reported as
     * "needs update", so that a failed check leads to reprocessing.
     *
     * @param string $collectionId The ID of the collection to check documents in
     * @param string $documentId The base document ID to check (without chunk suffixes)
     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if the document is missing, has no usable timestamp, is outdated,
     *              or could not be checked; false if it is up to date
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";

            $data = [
                'ids' => [
                    $documentId . '@1',
                    $documentId . '@2',
                    $documentId . '@3'
                ],
                'include' => [
                    "metadatas"
                ],
                // One hit is enough to read the timestamp
                'limit' => 1
            ];

            $result = $this->makeRequest($endpoint, 'POST', $data);

            // Nothing stored yet: the document has to be added
            if (empty($result['ids'])) {
                return true;
            }

            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
                $metadata = $result['metadatas'][0];

                // No timestamp recorded: reprocess
                if (!isset($metadata['processed_at'])) {
                    return true;
                }

                $processedTimestamp = strtotime($metadata['processed_at']);

                // Unparseable timestamp: reprocess rather than compare against false
                if ($processedTimestamp === false) {
                    return true;
                }

                // File changed after it was last processed
                if ($fileModifiedTime > $processedTimestamp) {
                    return true;
                }
            }

            // Document exists and is up to date
            return false;
        } catch (Exception $e) {
            // Deliberate best effort: if the check itself fails, reprocess.
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * Generates one embedding per query text via Ollama and sends them to the
     * collection's "query" endpoint, optionally with a metadata filter.
     *
     * @param string $collectionName The name of the collection to query ('documents' is used if empty)
     * @param array $queryTexts The query texts to search for
     * @param int $nResults The number of results to return (default: 5)
     * @param array|null $where Optional filter conditions for metadata
     * @return array The query results
     * @throws Exception If the collection or its ID is not found, or a request fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        $collectionId = $this->getCollectionId($collectionName);
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";

        $queryEmbeddings = [];
        foreach ($queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }

        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];

        // Add where clause for metadata filtering if provided
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }

        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * @return array The response from the heartbeat endpoint
     * @throws Exception If the request fails
     */
    public function heartbeat() {
        return $this->makeRequest("/heartbeat", 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * @return array The response from the identity endpoint
     * @throws Exception If the request fails
     */
    public function getIdentity() {
        return $this->makeRequest("/identity", 'GET');
    }

    /**
     * Ensure that the configured tenant and database exist
     *
     * A failed lookup is interpreted as "does not exist" and triggers creation;
     * a failure of the creation request itself is not caught here.
     *
     * @return void
     * @throws Exception If creating the tenant or database fails
     */
    private function ensureTenantAndDatabase() {
        try {
            $this->getTenant($this->tenant);
        } catch (Exception $e) {
            // Lookup failed: assume the tenant is missing and create it
            $this->createTenant($this->tenant);
        }

        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (Exception $e) {
            // Lookup failed: assume the database is missing and create it
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws Exception If the request fails
     */
    public function getTenant($tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}", 'GET');
    }

    /**
     * Create a new tenant
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws Exception If the request fails
     */
    public function createTenant($tenantName) {
        return $this->makeRequest("/tenants", 'POST', ['name' => $tenantName]);
    }

    /**
     * Get database information
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The database information
     * @throws Exception If the request fails
     */
    public function getDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases/{$databaseName}", 'GET');
    }

    /**
     * Create a new database
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws Exception If the request fails
     */
    public function createDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases", 'POST', ['name' => $databaseName]);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * A failed lookup is interpreted as "does not exist" and triggers creation.
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     * @throws Exception If creating the collection fails
     */
    public function ensureCollectionExists($collectionName) {
        try {
            $this->getCollection($collectionName);
            return "Collection '$collectionName' already exists.";
        } catch (Exception $e) {
            // Lookup failed: assume the collection is missing and create it
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }
    }

    /**
     * Split the last ID part into an optional registration and a name
     *
     * The part before the first '-' counts as a registration only if it is
     * alphanumeric and contains at least one digit; otherwise the whole part is
     * the name. Dashes in the name are replaced by spaces.
     *
     * @param string $lastPart The last colon-separated part of a DokuWiki ID
     * @return array ['registration' => ..., 'name' => ...] or ['name' => ...]
     */
    private function parseRegistrationAndName($lastPart) {
        if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
            && preg_match('/[0-9]/', $matches[1])) {
            return [
                'registration' => $matches[1],
                'name' => str_replace('-', ' ', $matches[2])
            ];
        }

        return ['name' => str_replace('-', ' ', $lastPart)];
    }

    /**
     * Derive the metadata shared by all chunks of a document from its DokuWiki ID
     *
     * @param string $id The DokuWiki ID (colon-separated)
     * @return array Metadata with 'document_id', 'processed_at', 'type' and, depending on
     *               the ID format, 'modality', 'year', 'institution', 'date',
     *               'registration' and 'name'
     */
    private function buildBaseMetadata($id) {
        $parts = explode(':', $id);
        $lastPart = end($parts);

        // Any ID part equal to 'templates' marks the document as a template
        $isTemplate = in_array('templates', $parts, true);

        $metadata = [
            'document_id' => $id,
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report'
        ];

        // The second part is the modality
        if (isset($parts[1])) {
            $metadata['modality'] = $parts[1];
        }

        // Templates carry no institution, date or year; only registration/name
        if ($isTemplate) {
            return array_merge($metadata, $this->parseRegistrationAndName($lastPart));
        }

        if (!isset($parts[2])) {
            return $metadata;
        }

        if (is_numeric($parts[2])) {
            // Format: reports:mri:2024:g287-name-surname (third part is the year)
            $metadata['year'] = $parts[2];
            $metadata['institution'] = DEFAULT_INSTITUTION;
            return array_merge($metadata, $this->parseRegistrationAndName($lastPart));
        }

        // Format: reports:mri:institution:250620-name-surname (third part is the institution)
        $metadata['institution'] = $parts[2];

        if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
            $dateStr = $matches[1];

            // The six digits are read as DDMMYY, so 250620 becomes 2020-06-25.
            // NOTE(review): the previous comment gave the example
            // "250620 -> 2025-06-20", which would mean YYMMDD. The digit order is
            // left unchanged here; confirm the intended file naming scheme.
            $day = substr($dateStr, 0, 2);
            $month = substr($dateStr, 2, 2);
            $year = substr($dateStr, 4, 2);

            // Two-digit years 00-69 map to 20xx, 70-99 map to 19xx
            $fullYear = ((int)$year < 70 ? '20' : '19') . $year;

            $metadata['date'] = $fullYear . '-' . $month . '-' . $day;
            $metadata['name'] = str_replace('-', ' ', $matches[2]);
        }

        return $metadata;
    }

    /**
     * Turn the text of a DokuWiki title into a list of search tags
     *
     * @param string $titleContent The title text without the surrounding '=' characters
     * @return array Unique lower-cased words of at least 3 bytes
     */
    private function extractTitleTags($titleContent) {
        $tags = [];
        foreach (preg_split('/\s+/', $titleContent) as $word) {
            // Words shorter than 3 bytes are dropped to reduce noise
            if (strlen($word) >= 3) {
                $tags[] = strtolower($word);
            }
        }

        return array_unique($tags);
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with update checking
     *
     * Steps:
     * 1. Converts the file path to a DokuWiki ID
     * 2. Ensures the collection exists (unless $collectionChecked) and resolves its ID
     * 3. Skips the file if needsUpdate() reports it as up to date
     * 4. Splits the content into paragraphs (separated by blank lines)
     * 5. Turns title paragraphs (=== Title ===) into tags for the following chunks
     *    instead of storing them
     * 6. Generates an embedding per remaining paragraph and upserts all chunks
     *    with their metadata
     *
     * Supported ID formats:
     * - reports:mri:institution:250620-name-surname (third part is institution name)
     * - reports:mri:2024:g287-name-surname (third part is year)
     * - reports:mri:templates:name-surname (contains a 'templates' part)
     *
     * Chunk IDs are "<id>@<n>" where n is the 1-based paragraph position, counting
     * empty and title paragraphs as well.
     *
     * @param string $filePath The path to the file to process
     * @param string $collectionName The name of the collection to use
     * @param bool $collectionChecked Whether the collection has already been checked/created
     * @return array Result with 'status' ('success', 'skipped' or 'error'), 'message' and,
     *               on success, 'details' and 'collection_status'
     */
    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
        $id = parseFilePath($filePath);

        try {
            // Create collection if it doesn't exist (only if not already checked)
            $collectionStatus = '';
            if (!$collectionChecked) {
                $collectionStatus = $this->ensureCollectionExists($collectionName);
            }

            $collection = $this->getCollection($collectionName);
            if (!isset($collection['id'])) {
                return [
                    'status' => 'error',
                    'message' => "Collection ID not found for '{$collectionName}'"
                ];
            }
            $collectionId = $collection['id'];

            // An unreadable file must not be reported as "up to date" or "no chunks"
            $fileModifiedTime = is_readable($filePath) ? filemtime($filePath) : false;
            if ($fileModifiedTime === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read file: $filePath"
                ];
            }

            if (!$this->needsUpdate($collectionId, $id, $fileModifiedTime)) {
                return [
                    'status' => 'skipped',
                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
                ];
            }

            $content = file_get_contents($filePath);
            if ($content === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read file: $filePath"
                ];
            }

            // Split document into chunks (paragraphs separated by a blank line)
            $paragraphs = preg_split('/\n\s*\n/', $content);
            $totalChunks = count($paragraphs);

            $baseMetadata = $this->buildBaseMetadata($id);

            $chunkIds = [];
            $chunkContents = [];
            $chunkMetadatas = [];
            $chunkEmbeddings = [];
            $currentTags = [];

            foreach ($paragraphs as $index => $paragraph) {
                // Skip whitespace-only paragraphs
                $paragraph = trim($paragraph);
                if ($paragraph === '') {
                    continue;
                }

                // A DokuWiki title (starts and ends with '=') is not stored; its words
                // become the tags of the chunks that follow it
                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
                    $currentTags = $this->extractTitleTags(trim($matches[1]));
                    continue;
                }

                $chunkId = $id . '@' . ($index + 1);

                $metadata = $baseMetadata;
                $metadata['chunk_id'] = $chunkId;
                $metadata['chunk_number'] = $index + 1;
                $metadata['total_chunks'] = $totalChunks;

                if (!empty($currentTags)) {
                    $metadata['tags'] = implode(',', $currentTags);
                }

                $chunkIds[] = $chunkId;
                $chunkContents[] = $paragraph;
                $chunkMetadatas[] = $metadata;
                $chunkEmbeddings[] = $this->generateEmbeddings($paragraph);
            }

            // If no chunks were created, skip this file
            if (empty($chunkIds)) {
                return [
                    'status' => 'skipped',
                    'message' => "No valid chunks found in file '$id'. Skipping..."
                ];
            }

            // Send all chunks to ChromaDB
            $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);

            return [
                'status' => 'success',
                'message' => "Successfully sent file to ChromaDB",
                'details' => [
                    'document_id' => $id,
                    'chunks' => count($chunkIds),
                    'collection' => $collectionName
                ],
                'collection_status' => $collectionStatus
            ];
        } catch (Exception $e) {
            return [
                'status' => 'error',
                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
            ];
        }
    }

    /**
     * Process all DokuWiki files in a directory and send them to ChromaDB
     *
     * Recursively collects all .txt files whose name does not start with an
     * underscore. The collection name is the first part of the DokuWiki ID of the
     * first file found ('documents' if that part is empty); the collection is
     * ensured once and then every file is processed individually.
     *
     * @param string $dirPath The directory path to process
     * @return array Result with 'status' and 'message'; when files were processed also
     *               'files_count' and per-file 'results'
     */
    public function processDirectory($dirPath) {
        if (!is_dir($dirPath)) {
            return [
                'status' => 'error',
                'message' => "Directory does not exist: $dirPath"
            ];
        }

        $iterator = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS),
            RecursiveIteratorIterator::LEAVES_ONLY
        );

        $files = [];
        foreach ($iterator as $file) {
            // Process only .txt files that don't start with underscore
            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
                $files[] = $file->getPathname();
            }
        }

        if (empty($files)) {
            return [
                'status' => 'skipped',
                'message' => "No .txt files found in directory: $dirPath"
            ];
        }

        // Use the first part of the document ID as collection name, fallback to 'documents'
        $idParts = explode(':', parseFilePath($files[0]));
        $collectionName = !empty($idParts[0]) ? $idParts[0] : 'documents';

        try {
            $this->ensureCollectionExists($collectionName);
        } catch (Exception $e) {
            // Deliberately ignored: if the collection is unavailable, each file below
            // reports its own error when it tries to look the collection up.
        }

        $results = [];
        foreach ($files as $file) {
            $results[] = [
                'file' => $file,
                // The collection was handled above, so it is not re-checked per file
                'result' => $this->processSingleFile($file, $collectionName, true)
            ];
        }

        return [
            'status' => 'success',
            'message' => "Finished processing directory.",
            'files_count' => count($files),
            'results' => $results
        ];
    }
}

/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory (DOKU_INC . 'data/pages/' when DOKU_INC is
 *    defined, otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Joining the non-empty path segments with colons
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    $relativePath = str_replace($pagesDir, '', $filePath);
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        // Compare against '' explicitly: empty() would also drop a segment named "0"
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}