<?php

namespace dokuwiki\plugin\dokullm;

/**
 * Minimal HTTP client for a ChromaDB server (v2 REST API) that uses an Ollama
 * server to compute embeddings.
 *
 * Two cURL handles are created in the constructor and reused for every request:
 * one for ChromaDB and one for Ollama. The constructor also makes sure the
 * configured tenant and database exist (network calls happen at construction time).
 */
class ChromaDBClient {
    /** @var string Base URL of the ChromaDB server, e.g. "http://127.0.0.1:8000" */
    private $baseUrl;
    /** @var resource|\CurlHandle|null Reusable cURL handle for ChromaDB requests */
    private $client;
    /** @var resource|\CurlHandle|null Reusable cURL handle for Ollama requests */
    private $ollamaClient;
    /** @var string ChromaDB tenant name */
    private $tenant;
    /** @var string ChromaDB database name */
    private $database;
    /** @var string Ollama server host */
    private $ollamaHost;
    /** @var int|string Ollama server port */
    private $ollamaPort;
    /** @var string Ollama model used for embeddings */
    private $ollamaModel;

    /**
     * Get configuration value for the dokullm plugin
     *
     * Reads $conf['plugin']['dokullm'][$key] from the global DokuWiki configuration.
     *
     * @param string $key Configuration key
     * @param mixed $default Default value if key not found (or set to null)
     * @return mixed Configuration value
     */
    private function getConf($key, $default = null) {
        global $conf;
        return $conf['plugin']['dokullm'][$key] ?? $default;
    }

    /**
     * Initialize the ChromaDB client
     *
     * Creates a new ChromaDB client instance with the specified connection parameters.
     * Every parameter left as null falls back to the plugin configuration and then to
     * a built-in default. Also ensures that the specified tenant and database exist.
     *
     * @param string|null $host ChromaDB server host
     * @param int|null $port ChromaDB server port
     * @param string|null $tenant ChromaDB tenant name
     * @param string|null $database ChromaDB database name
     * @param string|null $ollamaHost Ollama server host
     * @param int|null $ollamaPort Ollama server port
     * @param string|null $ollamaModel Ollama embeddings model
     * @throws \Exception If the tenant or database can neither be found nor created
     */
    public function __construct($host = null, $port = null, $tenant = null, $database = null, $ollamaHost = null, $ollamaPort = null, $ollamaModel = null) {
        // Use provided parameters or fall back to configuration values
        $chromaHost = $host ?? $this->getConf('chroma_host', '127.0.0.1');
        $chromaPort = $port ?? $this->getConf('chroma_port', 8000);
        $this->tenant = $tenant ?? $this->getConf('chroma_tenant', 'dokullm');
        $this->database = $database ?? $this->getConf('chroma_database', 'dokullm');
        $this->ollamaHost = $ollamaHost ?? $this->getConf('ollama_host', '127.0.0.1');
        $this->ollamaPort = $ollamaPort ?? $this->getConf('ollama_port', 11434);
        $this->ollamaModel = $ollamaModel ?? $this->getConf('ollama_embeddings_model', 'nomic-embed-text');

        $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";
        $this->client = curl_init();
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // Initialize Ollama client
        $this->ollamaClient = curl_init();
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up the cURL clients when the object is destroyed
     *
     * The handles are only closed when they were actually initialised, so destroying
     * a partially constructed instance does not raise an error.
     *
     * @return void
     */
    public function __destruct() {
        if ($this->client) {
            curl_close($this->client);
        }
        if ($this->ollamaClient) {
            curl_close($this->ollamaClient);
        }
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * Sends a JSON request to "{baseUrl}/api/v2{endpoint}" using the shared cURL handle.
     * Tenant and database are not added here; callers encode them in the endpoint path.
     *
     * @param string $endpoint The API endpoint to call (path below /api/v2, starting with "/")
     * @param string $method The HTTP method to use (default: 'GET')
     * @param array|null $data The data to send with the request (default: null)
     * @return array|null The JSON response decoded as an array (null if the body is not valid JSON)
     * @throws \Exception If there's a cURL error or an HTTP status >= 400
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        $headers = [
            'Content-Type: application/json',
            'Accept: application/json'
        ];

        $url = $this->baseUrl . '/api/v2' . $endpoint;

        curl_setopt($this->client, CURLOPT_URL, $url);
        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);

        // The handle is reused between calls, so the body must be reset explicitly;
        // otherwise the payload of a previous POST would be sent again.
        if ($data) {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
        } else {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
        }

        $response = curl_exec($this->client);

        // Check the transport result first: the HTTP code is meaningless if the transfer failed
        if ($response === false || curl_error($this->client)) {
            throw new \Exception('Curl error: ' . curl_error($this->client));
        }

        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
        if ($httpCode >= 400) {
            throw new \Exception("HTTP Error: $httpCode, Response: $response");
        }

        return json_decode($response, true);
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * Posts the text to the Ollama "/api/embeddings" endpoint with the configured model.
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector (the "embedding" field of the Ollama response)
     * @throws \Exception On cURL errors, HTTP status >= 400, or a response without "embedding"
     */
    public function generateEmbeddings($text) {
        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";

        $data = [
            'model' => $this->ollamaModel,
            'prompt' => $text,
            'keep_alive' => '30m'
        ];

        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
        // Setting POSTFIELDS makes cURL issue a POST request
        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));

        $response = curl_exec($this->ollamaClient);

        if ($response === false || curl_error($this->ollamaClient)) {
            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
        }

        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
        if ($httpCode >= 400) {
            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }

        $result = json_decode($response, true);

        if (!isset($result['embedding'])) {
            throw new \Exception("Ollama response missing embedding: " . $response);
        }

        return $result['embedding'];
    }

    /**
     * Build the endpoint path of the collections resource for the configured tenant/database
     *
     * @param string $suffix Optional path appended after ".../collections" (e.g. "/{id}/get")
     * @return string Endpoint path relative to /api/v2
     */
    private function collectionsEndpoint($suffix = '') {
        return "/tenants/{$this->tenant}/databases/{$this->database}/collections" . $suffix;
    }

    /**
     * Resolve a collection name to its ChromaDB collection ID
     *
     * @param string $collectionName The collection name
     * @return string The collection ID
     * @throws \Exception If the collection does not exist or has no "id" field
     */
    private function resolveCollectionId($collectionName) {
        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }
        return $collection['id'];
    }

    /**
     * List all collections in the database
     *
     * Retrieves a list of all collections in the configured tenant and database.
     *
     * @return array List of collections
     * @throws \Exception If the request fails
     */
    public function listCollections() {
        return $this->makeRequest($this->collectionsEndpoint());
    }

    /**
     * Get a collection by name
     *
     * Lists the collections of the configured tenant/database and returns the first
     * one whose "name" matches.
     *
     * @param string $name The name of the collection to retrieve ('documents' if empty)
     * @return array The collection information
     * @throws \Exception If the collection is not found or the request fails
     */
    public function getCollection($name) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($name)) {
            $name = 'documents';
        }

        $collections = $this->makeRequest($this->collectionsEndpoint());

        // A non-JSON / unexpected response decodes to a non-array; treat it as "no collections"
        if (is_array($collections)) {
            foreach ($collections as $collection) {
                if (isset($collection['name']) && $collection['name'] === $name) {
                    return $collection;
                }
            }
        }

        throw new \Exception("Collection '{$name}' not found");
    }

    /**
     * Create a new collection
     *
     * Creates a new collection with the specified name and optional metadata.
     *
     * @param string $name The name of the collection to create ('documents' if empty)
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createCollection($name, $metadata = null) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($name)) {
            $name = 'documents';
        }

        $data = ['name' => $name];
        if ($metadata) {
            $data['metadata'] = $metadata;
        }
        return $this->makeRequest($this->collectionsEndpoint(), 'POST', $data);
    }

    /**
     * Delete a collection by name
     *
     * Looks the collection up by name and issues a DELETE on its ID.
     *
     * @param string $name The name of the collection to delete ('documents' if empty)
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function deleteCollection($name) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($name)) {
            $name = 'documents';
        }

        $collectionId = $this->resolveCollectionId($name);
        return $this->makeRequest($this->collectionsEndpoint("/{$collectionId}"), 'DELETE');
    }

    /**
     * Get a document by its ID from a collection
     *
     * @param string $collectionName The name of the collection to get the document from ('documents' if empty)
     * @param string $documentId The document ID to retrieve
     * @param array $include What to include in the response (default: ["metadatas", "documents"])
     * @return array The retrieved document
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($collectionName)) {
            $collectionName = 'documents';
        }

        $collectionId = $this->resolveCollectionId($collectionName);
        $data = [
            'ids' => [$documentId],
            'include' => $include
        ];

        return $this->makeRequest($this->collectionsEndpoint("/{$collectionId}/get"), 'POST', $data);
    }

    /**
     * Add documents to a collection
     *
     * Sends the documents to the collection's "upsert" endpoint, so existing IDs are
     * overwritten. Each document must have a corresponding ID. Optional metadata and
     * pre-computed embeddings can also be provided.
     *
     * @param string $collectionName The name of the collection to add documents to ('documents' if empty)
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($collectionName)) {
            $collectionName = 'documents';
        }

        $collectionId = $this->resolveCollectionId($collectionName);
        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];

        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }

        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }

        return $this->makeRequest($this->collectionsEndpoint("/{$collectionId}/upsert"), 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Determines whether a document should be reprocessed by comparing the file's last modification
     * time with the processed_at timestamp stored in the document's metadata. The function checks
     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
     * not included in the database.
     *
     * Any error while querying is treated as "needs update" instead of being propagated.
     *
     * @param string $collectionId The ID of the collection to check documents in
     * @param string $documentId The base document ID to check (without chunk suffixes)
     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if document needs to be updated (doesn't exist, has no usable timestamp, or is outdated), false if up to date
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $data = [
                // Check first 3 chunk numbers since first chunks might be titles and skipped
                'ids' => [
                    $documentId . '@1',
                    $documentId . '@2',
                    $documentId . '@3'
                ],
                'include' => [
                    "metadatas"
                ],
                'limit' => 1
            ];

            $result = $this->makeRequest($this->collectionsEndpoint("/{$collectionId}/get"), 'POST', $data);

            // If no documents found, it needs to be added
            if (empty($result['ids'])) {
                return true;
            }

            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
                // Only the first returned metadata entry is inspected
                $metadata = $result['metadatas'][0];

                if (!isset($metadata['processed_at'])) {
                    return true;
                }

                $processedTimestamp = strtotime($metadata['processed_at']);

                // An unparsable timestamp cannot prove the document is current
                if ($processedTimestamp === false) {
                    return true;
                }

                if ($fileModifiedTime > $processedTimestamp) {
                    return true;
                }
            }

            // Document exists and is up to date
            return false;
        } catch (\Exception $e) {
            // If there's an error checking the document, assume it needs to be updated
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * Generates an embedding for every query text via Ollama and sends them to the
     * collection's "query" endpoint. Supports filtering results by metadata using
     * the where parameter.
     *
     * @param string $collectionName The name of the collection to query ('documents' if empty)
     * @param array $queryTexts The query texts to search for
     * @param int $nResults The number of results to return (default: 5)
     * @param array|null $where Optional filter conditions for metadata
     * @return array The query results
     * @throws \Exception If the collection or its ID is not found, or a request fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($collectionName)) {
            $collectionName = 'documents';
        }

        $collectionId = $this->resolveCollectionId($collectionName);

        // Generate embeddings for query texts
        $queryEmbeddings = [];
        foreach ($queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }

        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];

        // Add where clause for metadata filtering if provided
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }

        return $this->makeRequest($this->collectionsEndpoint("/{$collectionId}/query"), 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * @return array The response from the heartbeat endpoint
     * @throws \Exception If the request fails
     */
    public function heartbeat() {
        return $this->makeRequest('/heartbeat', 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * @return array The response from the identity endpoint
     * @throws \Exception If the request fails
     */
    public function getIdentity() {
        return $this->makeRequest('/identity', 'GET');
    }

    /**
     * Ensure that the specified tenant and database exist
     *
     * Any failure of the lookup request (not only "not found") triggers a create attempt;
     * a failure of the create request is propagated to the caller.
     *
     * @return void
     * @throws \Exception If creating the tenant or database fails
     */
    private function ensureTenantAndDatabase() {
        try {
            $this->getTenant($this->tenant);
        } catch (\Exception $e) {
            // Tenant lookup failed, try to create it
            $this->createTenant($this->tenant);
        }

        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (\Exception $e) {
            // Database lookup failed, try to create it
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws \Exception If the request fails
     */
    public function getTenant($tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}", 'GET');
    }

    /**
     * Create a new tenant
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createTenant($tenantName) {
        return $this->makeRequest('/tenants', 'POST', ['name' => $tenantName]);
    }

    /**
     * Get database information
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The database information
     * @throws \Exception If the request fails
     */
    public function getDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases/{$databaseName}", 'GET');
    }

    /**
     * Create a new database
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases", 'POST', ['name' => $databaseName]);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     * @throws \Exception If the collection cannot be created
     */
    public function ensureCollectionExists($collectionName) {
        try {
            $this->getCollection($collectionName);
            return "Collection '$collectionName' already exists.";
        } catch (\Exception $e) {
            // Lookup failed (collection missing or request error), try to create it
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }
    }

    /**
     * Split the last part of a DokuWiki ID into a registration number and a name
     *
     * The part before the first '-' is treated as a registration only when it is
     * alphanumeric and contains at least one digit (e.g. "g287-name-surname").
     * Otherwise the whole part is used as the name. Dashes in the name become spaces.
     *
     * @param string $lastPart The last colon-separated part of the document ID
     * @return array Array with key 'name' and, when detected, 'registration' (listed first)
     */
    private function parseRegistrationAndName($lastPart) {
        if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
            && preg_match('/[0-9]/', $matches[1])) {
            return [
                'registration' => $matches[1],
                'name' => str_replace('-', ' ', $matches[2])
            ];
        }

        return ['name' => str_replace('-', ' ', $lastPart)];
    }

    /**
     * Derive the metadata shared by all chunks of a document from its DokuWiki ID
     *
     * Supported ID formats:
     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
     *
     * @param string $id The DokuWiki document ID
     * @return array Metadata with document_id, processed_at, type and, depending on the
     *               format, modality, year, institution, registration, date and name
     */
    private function buildBaseMetadata($id) {
        $parts = explode(':', $id);
        $lastPart = end($parts);
        $isTemplate = in_array('templates', $parts, true);

        $baseMetadata = [
            'document_id' => $id,
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report'
        ];

        // The second part of the ID is the modality
        if (isset($parts[1])) {
            $baseMetadata['modality'] = $parts[1];
        }

        // For templates, don't set institution, date or year; only extract the name
        if ($isTemplate) {
            return array_merge($baseMetadata, $this->parseRegistrationAndName($lastPart));
        }

        if (!isset($parts[2])) {
            return $baseMetadata;
        }

        if (is_numeric($parts[2])) {
            // Format 2: third part is the year, institution comes from the configuration
            $baseMetadata['year'] = $parts[2];
            $baseMetadata['institution'] = $this->getConf('default_institution', 'default');
            return array_merge($baseMetadata, $this->parseRegistrationAndName($lastPart));
        }

        // Format 1: third part is the institution, last part is "<6 digits>-<name>"
        $baseMetadata['institution'] = $parts[2];
        if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
            // The six digits are read as DDMMYY (e.g. 250620 -> 2020-06-25).
            // NOTE(review): an earlier comment claimed "250620 -> 2025-06-20" (YYMMDD),
            // which the code never implemented — confirm the intended format.
            $day = substr($matches[1], 0, 2);
            $month = substr($matches[1], 2, 2);
            $year = substr($matches[1], 4, 2);
            // 20xx for years 00-69 and 19xx for years 70-99
            $fullYear = ((int)$year < 70 ? '20' : '19') . $year;

            $baseMetadata['date'] = $fullYear . '-' . $month . '-' . $day;
            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
        }

        return $baseMetadata;
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
     *
     * This function handles the complete processing of a single DokuWiki file:
     * 1. Converts the file path to a DokuWiki document ID
     * 2. Makes sure the collection exists (unless already checked) and resolves its ID
     * 3. Checks if the document needs updating using timestamp comparison
     * 4. Reads the file content only if an update is needed
     * 5. Splits the document into chunks (paragraphs separated by blank lines)
     * 6. Turns DokuWiki titles into tags for the chunks that follow them
     * 7. Generates embeddings for each chunk
     * 8. Sends all chunks to ChromaDB with metadata
     *
     * Chunk IDs have the form "<document id>@<paragraph number>", where the number counts
     * all paragraphs, including titles and empty ones that are not stored.
     *
     * Errors are reported through the returned array; no exception is thrown.
     *
     * @param string $filePath The path to the file to process
     * @param string $collectionName The name of the collection to use
     * @param bool $collectionChecked Whether the collection has already been checked/created
     * @return array Result with 'status' ('success', 'skipped' or 'error'), 'message' and,
     *               on success, 'details' and 'collection_status'
     */
    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
        // Convert the file path to the DokuWiki document ID
        $id = parseFilePath($filePath);

        try {
            // Fail early: without a readable file filemtime() would return false and the
            // timestamp comparison could wrongly report the document as up to date
            if (!is_file($filePath) || !is_readable($filePath)) {
                return [
                    'status' => 'error',
                    'message' => "File not found or not readable: $filePath"
                ];
            }

            // Create collection if it doesn't exist (only if not already checked)
            $collectionStatus = '';
            if (!$collectionChecked) {
                $collectionStatus = $this->ensureCollectionExists($collectionName);
            }

            // Get collection ID
            $collection = $this->getCollection($collectionName);
            if (!isset($collection['id'])) {
                return [
                    'status' => 'error',
                    'message' => "Collection ID not found for '{$collectionName}'"
                ];
            }

            // If document is up to date, skip processing
            if (!$this->needsUpdate($collection['id'], $id, filemtime($filePath))) {
                return [
                    'status' => 'skipped',
                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
                ];
            }

            $content = file_get_contents($filePath);
            if ($content === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read file: $filePath"
                ];
            }

            // Split document into chunks (paragraphs separated by blank lines)
            $paragraphs = preg_split('/\n\s*\n/', $content);
            $totalParagraphs = count($paragraphs);
            $baseMetadata = $this->buildBaseMetadata($id);

            $chunkIds = [];
            $chunkContents = [];
            $chunkMetadatas = [];
            $chunkEmbeddings = [];
            $currentTags = [];

            foreach ($paragraphs as $index => $paragraph) {
                // Skip empty paragraphs to avoid processing whitespace-only content
                $paragraph = trim($paragraph);
                if (empty($paragraph)) {
                    continue;
                }

                // DokuWiki titles (start and end with '=') are converted to tags for the
                // following chunks but are not stored as content chunks themselves
                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
                    $tags = [];
                    foreach (preg_split('/\s+/', trim($matches[1])) as $word) {
                        // Only use words of at least 3 characters to reduce noise
                        if (strlen($word) >= 3) {
                            $tags[] = strtolower($word);
                        }
                    }
                    $currentTags = array_unique($tags);
                    continue;
                }

                $chunkNumber = $index + 1;
                $chunkId = $id . '@' . $chunkNumber;

                // Add chunk-specific metadata
                $metadata = $baseMetadata;
                $metadata['chunk_id'] = $chunkId;
                $metadata['chunk_number'] = $chunkNumber;
                $metadata['total_chunks'] = $totalParagraphs;
                if (!empty($currentTags)) {
                    $metadata['tags'] = implode(',', $currentTags);
                }

                $chunkIds[] = $chunkId;
                $chunkContents[] = $paragraph;
                $chunkMetadatas[] = $metadata;
                $chunkEmbeddings[] = $this->generateEmbeddings($paragraph);
            }

            // If no chunks were created, skip this file
            if (empty($chunkIds)) {
                return [
                    'status' => 'skipped',
                    'message' => "No valid chunks found in file '$id'. Skipping..."
                ];
            }

            // Send all chunks to ChromaDB
            $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);

            return [
                'status' => 'success',
                'message' => "Successfully sent file to ChromaDB",
                'details' => [
                    'document_id' => $id,
                    'chunks' => count($chunkIds),
                    'collection' => $collectionName
                ],
                'collection_status' => $collectionStatus
            ];
        } catch (\Exception $e) {
            return [
                'status' => 'error',
                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
            ];
        }
    }

    /**
     * Process all DokuWiki files in a directory and send them to ChromaDB
     *
     * Recursively collects all .txt files (skipping names that start with an underscore)
     * in the directory and its subdirectories. The collection name is the first part of
     * the document ID of the first file found ('documents' if empty), and all files
     * are sent to that collection. Each file is then processed individually with
     * processSingleFile().
     *
     * @param string $dirPath The directory path to process
     * @return array Result with 'status', 'message' and, when files were processed,
     *               'files_count' and per-file 'results'
     */
    public function processDirectory($dirPath) {
        if (!is_dir($dirPath)) {
            return [
                'status' => 'error',
                'message' => "Directory does not exist: $dirPath"
            ];
        }

        // SPL classes live in the global namespace and must be fully qualified here:
        // class names, unlike functions, have no global fallback inside a namespace
        $iterator = new \RecursiveIteratorIterator(
            new \RecursiveDirectoryIterator($dirPath, \RecursiveDirectoryIterator::SKIP_DOTS),
            \RecursiveIteratorIterator::LEAVES_ONLY
        );

        $files = [];
        foreach ($iterator as $file) {
            // Process only .txt files that don't start with underscore
            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
                $files[] = $file->getPathname();
            }
        }

        if (empty($files)) {
            return [
                'status' => 'skipped',
                'message' => "No .txt files found in directory: $dirPath"
            ];
        }

        // Use the first part of the document ID as collection name, fallback to 'documents'
        $idParts = explode(':', parseFilePath($files[0]));
        $collectionName = !empty($idParts[0]) ? $idParts[0] : 'documents';

        try {
            $this->ensureCollectionExists($collectionName);
            $collectionChecked = true;
        } catch (\Exception $e) {
            // The collection could not be verified; let each file retry and report its own error
            $collectionChecked = false;
        }

        $results = [];
        foreach ($files as $file) {
            $results[] = [
                'file' => $file,
                'result' => $this->processSingleFile($file, $collectionName, $collectionChecked)
            ];
        }

        return [
            'status' => 'success',
            'message' => "Finished processing directory.",
            'files_count' => count($files),
            'results' => $results
        ];
    }
}

/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory prefix (DOKU_INC . 'data/pages/' when DOKU_INC is
 *    defined, otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Converting directory separators ('/') to colons, ignoring empty segments
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Remove the base path only when it is a leading prefix, so an identical
    // sequence deeper inside the path is left untouched
    $relativePath = (string)$filePath;
    $prefixLength = strlen($pagesDir);
    if (strncmp($relativePath, $pagesDir, $prefixLength) === 0) {
        $relativePath = (string)substr($relativePath, $prefixLength);
    }

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Drop only empty segments; a segment named "0" is a valid namespace/page name
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}