<?php

namespace dokuwiki\plugin\dokullm;

/**
 * Thin cURL-based client for the ChromaDB v2 HTTP API.
 *
 * Embeddings are not computed by ChromaDB itself: every text is first sent to
 * an Ollama server (see generateEmbeddings()) and the resulting vectors are
 * passed to ChromaDB explicitly.
 *
 * Two cURL handles are kept for the lifetime of the object, one per backend.
 */
class ChromaDBClient {
    /** @var string Scheme, host and port of the ChromaDB server, without trailing slash */
    private $baseUrl;
    /** @var resource|\CurlHandle|null cURL handle used for ChromaDB requests */
    private $client;
    /** @var resource|\CurlHandle|null cURL handle used for Ollama requests */
    private $ollamaClient;
    /** @var string ChromaDB tenant name */
    private $tenant;
    /** @var string ChromaDB database name */
    private $database;
    /** @var string Ollama server host */
    private $ollamaHost;
    /** @var int|string Ollama server port */
    private $ollamaPort;
    /** @var string Ollama model used to compute embeddings */
    private $ollamaModel;

    /**
     * Get configuration value for the dokullm plugin
     *
     * Reads $conf['plugin']['dokullm'][$key] from DokuWiki's global configuration.
     *
     * @param string $key Configuration key
     * @param mixed $default Default value if key not found (or set to null)
     * @return mixed Configuration value
     */
    private function getConf($key, $default = null) {
        global $conf;
        return $conf['plugin']['dokullm'][$key] ?? $default;
    }

    /**
     * Initialize the ChromaDB client
     *
     * Every argument left as null is taken from the plugin configuration, and
     * from a hard-coded default if the configuration has no value either.
     * After both cURL handles are prepared, the tenant and database are looked
     * up on the server and created if the lookup fails, so constructing the
     * object performs network requests.
     *
     * @param string|null $host ChromaDB server host
     * @param int|null $port ChromaDB server port
     * @param string|null $tenant ChromaDB tenant name
     * @param string|null $database ChromaDB database name
     * @param string|null $ollamaHost Ollama server host
     * @param int|null $ollamaPort Ollama server port
     * @param string|null $ollamaModel Ollama embeddings model
     * @throws \Exception If the tenant or database can neither be found nor created
     */
    public function __construct($host = null, $port = null, $tenant = null, $database = null, $ollamaHost = null, $ollamaPort = null, $ollamaModel = null) {
        // Use provided parameters or fall back to configuration values
        $chromaHost = $host ?? $this->getConf('chroma_host', '127.0.0.1');
        $chromaPort = $port ?? $this->getConf('chroma_port', 8000);
        $this->tenant = $tenant ?? $this->getConf('chroma_tenant', 'dokullm');
        $this->database = $database ?? $this->getConf('chroma_database', 'dokullm');
        $this->ollamaHost = $ollamaHost ?? $this->getConf('ollama_host', '127.0.0.1');
        $this->ollamaPort = $ollamaPort ?? $this->getConf('ollama_port', 11434);
        $this->ollamaModel = $ollamaModel ?? $this->getConf('ollama_embeddings_model', 'nomic-embed-text');

        $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";

        // ChromaDB handle
        $this->client = curl_init();
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // Ollama handle
        $this->ollamaClient = curl_init();
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up the cURL handles when the object is destroyed
     *
     * The handles are checked first so that an instance whose constructor did
     * not run to the point of creating them can still be destroyed safely.
     *
     * @return void
     */
    public function __destruct() {
        foreach ([$this->client, $this->ollamaClient] as $handle) {
            if ($handle) {
                curl_close($handle);
            }
        }
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * The endpoint is appended to "<baseUrl>/api/v2". Tenant and database are
     * not sent as headers; callers embed them in the endpoint path.
     *
     * @param string $endpoint The API endpoint to call (starting with "/")
     * @param string $method The HTTP method to use (default: 'GET')
     * @param array|null $data The data to send as a JSON body (default: null); an empty value sends no body
     * @return array|null The JSON response decoded as an array (null if the body is not valid JSON)
     * @throws \Exception If there's a cURL error or the HTTP status is 400 or above
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        $headers = [
            'Content-Type: application/json',
            'Accept: application/json'
        ];

        $url = $this->baseUrl . '/api/v2' . $endpoint;

        curl_setopt($this->client, CURLOPT_URL, $url);
        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);

        // The handle is reused, so the body must be reset explicitly when this
        // request carries none; otherwise the previous request's body is resent.
        curl_setopt($this->client, CURLOPT_POSTFIELDS, $data ? json_encode($data) : '');

        $response = curl_exec($this->client);
        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);

        if ($response === false || curl_error($this->client)) {
            throw new \Exception('Curl error: ' . curl_error($this->client));
        }

        if ($httpCode >= 400) {
            throw new \Exception("HTTP Error: $httpCode, Response: $response");
        }

        return json_decode($response, true);
    }

    /**
     * Build the endpoint path of the collections resource for the configured tenant and database
     *
     * @return string Path relative to the API root, without trailing slash
     */
    private function collectionsEndpoint() {
        return "/tenants/{$this->tenant}/databases/{$this->database}/collections";
    }

    /**
     * Resolve a collection name to its ChromaDB collection ID
     *
     * @param string $collectionName The collection name; an empty value means 'documents'
     * @return string The collection ID
     * @throws \Exception If the collection does not exist or carries no ID
     */
    private function resolveCollectionId($collectionName) {
        $name = empty($collectionName) ? 'documents' : $collectionName;

        $collection = $this->getCollection($name);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$name}'");
        }

        return $collection['id'];
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * Posts the text to Ollama's /api/embeddings endpoint with the configured
     * model and asks Ollama to keep the model loaded for 30 minutes.
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector
     * @throws \Exception On cURL failure, HTTP status >= 400, or a response without an 'embedding' key
     */
    public function generateEmbeddings($text) {
        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";

        $payload = [
            'model' => $this->ollamaModel,
            'prompt' => $text,
            'keep_alive' => '30m'
        ];

        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($payload));

        $response = curl_exec($this->ollamaClient);
        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);

        if ($response === false || curl_error($this->ollamaClient)) {
            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
        }

        if ($httpCode >= 400) {
            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }

        $result = json_decode($response, true);

        if (!isset($result['embedding'])) {
            throw new \Exception("Ollama response missing embedding: " . $response);
        }

        return $result['embedding'];
    }

    /**
     * List all collections in the database
     *
     * @return array List of collections in the configured tenant and database
     * @throws \Exception If the request fails
     */
    public function listCollections() {
        return $this->makeRequest($this->collectionsEndpoint());
    }

    /**
     * Get a collection by name
     *
     * Fetches the full collection list and returns the first entry whose
     * 'name' equals the requested one.
     *
     * @param string $name The name of the collection to retrieve; an empty value means 'documents'
     * @return array The collection information
     * @throws \Exception If the request fails or the collection is not found
     */
    public function getCollection($name) {
        if (empty($name)) {
            $name = 'documents';
        }

        $collections = $this->makeRequest($this->collectionsEndpoint());

        // A non-JSON or non-list response is treated the same as "no collections"
        if (is_array($collections)) {
            foreach ($collections as $collection) {
                if (isset($collection['name']) && $collection['name'] === $name) {
                    return $collection;
                }
            }
        }

        throw new \Exception("Collection '{$name}' not found");
    }

    /**
     * Create a new collection
     *
     * @param string $name The name of the collection to create; an empty value means 'documents'
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createCollection($name, $metadata = null) {
        if (empty($name)) {
            $name = 'documents';
        }

        $data = ['name' => $name];
        if ($metadata) {
            $data['metadata'] = $metadata;
        }

        return $this->makeRequest($this->collectionsEndpoint(), 'POST', $data);
    }

    /**
     * Delete a collection by name
     *
     * The name is first resolved to a collection ID, which is then used in the
     * DELETE request path.
     *
     * @param string $name The name of the collection to delete; an empty value means 'documents'
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function deleteCollection($name) {
        $collectionId = $this->resolveCollectionId($name);

        return $this->makeRequest($this->collectionsEndpoint() . "/{$collectionId}", 'DELETE');
    }

    /**
     * Get a document by its ID from a collection
     *
     * @param string $collectionName The name of the collection to get the document from; empty means 'documents'
     * @param string $documentId The document ID to retrieve
     * @param array $include What to include in the response (default: ["metadatas", "documents"])
     * @return array The retrieved document
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        $collectionId = $this->resolveCollectionId($collectionName);

        $data = [
            'ids' => [$documentId],
            'include' => $include
        ];

        return $this->makeRequest($this->collectionsEndpoint() . "/{$collectionId}/get", 'POST', $data);
    }

    /**
     * Add documents to a collection
     *
     * Uses the collection's "upsert" endpoint, so an entry whose ID already
     * exists is sent to the same call as a new one. Each document must have a
     * corresponding ID. Metadata and embeddings are only included in the
     * request when non-empty.
     *
     * @param string $collectionName The name of the collection to add documents to; empty means 'documents'
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        $collectionId = $this->resolveCollectionId($collectionName);

        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];

        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }

        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }

        return $this->makeRequest($this->collectionsEndpoint() . "/{$collectionId}/upsert", 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Requests the chunk IDs "<documentId>@1" to "@3" (limit 1) and compares the
     * 'processed_at' metadata of the first returned entry with the file's
     * modification time. Three IDs are tried because leading paragraphs may be
     * titles, which are not stored as chunks.
     *
     * Any \Exception raised while checking is swallowed and reported as
     * "needs update", so this method itself does not throw \Exception.
     *
     * @param string $collectionId The ID of the collection to check documents in
     * @param string $documentId The base document ID to check (without chunk suffixes)
     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if the document is missing, has no timestamp, is outdated or could not be checked
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $endpoint = $this->collectionsEndpoint() . "/{$collectionId}/get";

            $data = [
                'ids' => [
                    $documentId . '@1',
                    $documentId . '@2',
                    $documentId . '@3'
                ],
                'include' => [
                    "metadatas"
                ],
                'limit' => 1
            ];

            $result = $this->makeRequest($endpoint, 'POST', $data);

            // Nothing stored under any of the probed IDs: needs to be added
            if (empty($result['ids'])) {
                return true;
            }

            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
                $metadata = $result['metadatas'][0];

                // No timestamp recorded: cannot prove it is current
                if (!isset($metadata['processed_at'])) {
                    return true;
                }

                // File modified after it was last processed
                if ($fileModifiedTime > strtotime($metadata['processed_at'])) {
                    return true;
                }
            }

            // Document exists and is up to date
            return false;
        } catch (\Exception $e) {
            // Deliberate best effort: when in doubt, reprocess
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * An embedding is generated through Ollama for each query text, and the
     * embeddings (not the texts) are sent to ChromaDB's query endpoint.
     *
     * @param string $collectionName The name of the collection to query; empty means 'documents'
     * @param array $queryTexts The query texts to search for
     * @param int $nResults The number of results to return (default: 5)
     * @param array|null $where Optional filter conditions for metadata
     * @return array The query results
     * @throws \Exception If the collection or its ID is not found, or any request fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        $collectionId = $this->resolveCollectionId($collectionName);

        $queryEmbeddings = [];
        foreach ($queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }

        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];

        // Add where clause for metadata filtering if provided
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }

        return $this->makeRequest($this->collectionsEndpoint() . "/{$collectionId}/query", 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * @return array The response from the heartbeat endpoint
     * @throws \Exception If the request fails
     */
    public function heartbeat() {
        return $this->makeRequest("/heartbeat", 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * @return array The response from the identity endpoint
     * @throws \Exception If the request fails
     */
    public function getIdentity() {
        return $this->makeRequest("/identity", 'GET');
    }

    /**
     * Ensure that the specified tenant and database exist
     *
     * A failed lookup of either one (for any reason, including connection
     * errors) is followed by an attempt to create it.
     *
     * @return void
     * @throws \Exception If creating the tenant or database fails
     */
    private function ensureTenantAndDatabase() {
        try {
            $this->getTenant($this->tenant);
        } catch (\Exception $e) {
            $this->createTenant($this->tenant);
        }

        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (\Exception $e) {
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws \Exception If the request fails
     */
    public function getTenant($tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}", 'GET');
    }

    /**
     * Create a new tenant
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createTenant($tenantName) {
        return $this->makeRequest("/tenants", 'POST', ['name' => $tenantName]);
    }

    /**
     * Get database information
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The database information
     * @throws \Exception If the request fails
     */
    public function getDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases/{$databaseName}", 'GET');
    }

    /**
     * Create a new database
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases", 'POST', ['name' => $databaseName]);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * Any failure of the lookup is treated as "does not exist" and followed by
     * a create request.
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     * @throws \Exception If creating the collection fails
     */
    public function ensureCollectionExists($collectionName) {
        try {
            $this->getCollection($collectionName);
            return "Collection '$collectionName' already exists.";
        } catch (\Exception $e) {
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }
    }

    /**
     * Split the last ID segment into an optional registration and a name
     *
     * The segment is treated as "<registration>-<name>" only when the text
     * before the first hyphen is alphanumeric and contains at least one digit.
     * Otherwise the whole segment is the name. Hyphens in the name become spaces.
     *
     * @param string $lastPart The last colon-separated segment of a DokuWiki ID
     * @return array Map with 'name' and, when detected, 'registration' (listed first)
     */
    private function parseRegistrationAndName($lastPart) {
        if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
            && preg_match('/[0-9]/', $matches[1])) {
            return [
                'registration' => $matches[1],
                'name' => str_replace('-', ' ', $matches[2])
            ];
        }

        return ['name' => str_replace('-', ' ', $lastPart)];
    }

    /**
     * Derive the metadata shared by all chunks of a document from its DokuWiki ID
     *
     * Supported ID formats:
     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
     *
     * @param string $id The DokuWiki ID
     * @return array Metadata map (document_id, processed_at, type, and whatever else the ID yields)
     */
    private function buildBaseMetadata($id) {
        $parts = explode(':', $id);
        $lastPart = end($parts);
        $isTemplate = in_array('templates', $parts, true);

        $baseMetadata = [
            'document_id' => $id,
            // Compared against the file mtime by needsUpdate() on later runs
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report'
        ];

        // Modality is the second part
        if (isset($parts[1])) {
            $baseMetadata['modality'] = $parts[1];
        }

        // Templates carry no institution, date or year; only a name
        if ($isTemplate) {
            return array_merge($baseMetadata, $this->parseRegistrationAndName($lastPart));
        }

        if (!isset($parts[2])) {
            return $baseMetadata;
        }

        if (is_numeric($parts[2])) {
            // Format 2: third part is the year, institution comes from config
            $baseMetadata['year'] = $parts[2];
            $baseMetadata['institution'] = $this->getConf('default_institution', 'default');

            return array_merge($baseMetadata, $this->parseRegistrationAndName($lastPart));
        }

        // Format 1: third part is the institution, last part is "<6 digits>-<name>"
        $baseMetadata['institution'] = $parts[2];

        if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
            $dateStr = $matches[1];

            // The six digits are read as DDMMYY.
            // NOTE(review): an earlier inline comment gave the example
            // "250620 -> 2025-06-20" (i.e. YYMMDD), which this code does not
            // produce (it yields 2020-06-25). Confirm the intended digit order.
            $day = substr($dateStr, 0, 2);
            $month = substr($dateStr, 2, 2);
            $year = substr($dateStr, 4, 2);

            // 00-69 => 20xx, 70-99 => 19xx
            $fullYear = ((int)$year < 70 ? '20' : '19') . $year;

            $baseMetadata['date'] = $fullYear . '-' . $month . '-' . $day;
            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
        }

        return $baseMetadata;
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
     *
     * Steps:
     * 1. Converts the file path to a DokuWiki ID
     * 2. Ensures the collection exists (unless the caller already did) and resolves its ID
     * 3. Skips the file if needsUpdate() reports it as current
     * 4. Splits the content into paragraphs on blank lines
     * 5. Turns DokuWiki headings into tags applied to the paragraphs that follow;
     *    headings themselves are not stored
     * 6. Generates an embedding per remaining paragraph
     * 7. Upserts all chunks with their metadata in one request
     *
     * Chunk IDs are "<id>@<n>" where n is the 1-based paragraph position, counting
     * skipped paragraphs too, so the numbering may have gaps.
     *
     * No \Exception escapes from the processing steps: failures are reported
     * through the returned array.
     *
     * @param string $filePath The path to the file to process
     * @param string $collectionName The name of the collection to use
     * @param bool $collectionChecked Whether the collection has already been checked/created
     * @return array Result with 'status' ('success', 'skipped' or 'error'), 'message' and, on success, details
     */
    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
        // Parse file path to extract the document ID
        $id = parseFilePath($filePath);

        try {
            // Create collection if it doesn't exist (only if not already checked)
            $collectionStatus = '';
            if (!$collectionChecked) {
                $collectionStatus = $this->ensureCollectionExists($collectionName);
            }

            // Get collection ID
            $collection = $this->getCollection($collectionName);
            if (!isset($collection['id'])) {
                return [
                    'status' => 'error',
                    'message' => "Collection ID not found for '{$collectionName}'"
                ];
            }
            $collectionId = $collection['id'];

            // Without a readable file neither the mtime nor the content is meaningful
            if (!is_file($filePath) || !is_readable($filePath)) {
                return [
                    'status' => 'error',
                    'message' => "File is missing or not readable: $filePath"
                ];
            }

            // If document is up to date, skip processing
            if (!$this->needsUpdate($collectionId, $id, filemtime($filePath))) {
                return [
                    'status' => 'skipped',
                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
                ];
            }

            $content = file_get_contents($filePath);
            if ($content === false) {
                throw new \Exception("Unable to read file '$filePath'");
            }

            // Split document into chunks (paragraphs separated by a blank line)
            $paragraphs = preg_split('/\n\s*\n/', $content);
            $totalChunks = count($paragraphs);

            $baseMetadata = $this->buildBaseMetadata($id);

            $chunkIds = [];
            $chunkContents = [];
            $chunkMetadatas = [];
            $chunkEmbeddings = [];
            $currentTags = [];

            foreach ($paragraphs as $index => $paragraph) {
                // Skip whitespace-only paragraphs
                $paragraph = trim($paragraph);
                if ($paragraph === '') {
                    continue;
                }

                // DokuWiki heading (starts and ends with '='): becomes the tag set
                // for the following chunks and is not stored as content
                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
                    $tags = [];
                    foreach (preg_split('/\s+/', trim($matches[1])) as $word) {
                        // Only words of at least 3 bytes, to reduce noise
                        if (strlen($word) >= 3) {
                            $tags[] = strtolower($word);
                        }
                    }
                    $currentTags = array_unique($tags);
                    continue;
                }

                $chunkNumber = $index + 1;
                $chunkId = $id . '@' . $chunkNumber;

                $metadata = $baseMetadata;
                $metadata['chunk_id'] = $chunkId;
                $metadata['chunk_number'] = $chunkNumber;
                $metadata['total_chunks'] = $totalChunks;

                if (!empty($currentTags)) {
                    $metadata['tags'] = implode(',', $currentTags);
                }

                $chunkIds[] = $chunkId;
                $chunkContents[] = $paragraph;
                $chunkMetadatas[] = $metadata;
                $chunkEmbeddings[] = $this->generateEmbeddings($paragraph);
            }

            // If no chunks were created, skip this file
            if (empty($chunkIds)) {
                return [
                    'status' => 'skipped',
                    'message' => "No valid chunks found in file '$id'. Skipping..."
                ];
            }

            // Send all chunks to ChromaDB
            $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);

            return [
                'status' => 'success',
                'message' => "Successfully sent file to ChromaDB",
                'details' => [
                    'document_id' => $id,
                    'chunks' => count($chunkIds),
                    'collection' => $collectionName
                ],
                'collection_status' => $collectionStatus
            ];
        } catch (\Exception $e) {
            return [
                'status' => 'error',
                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
            ];
        }
    }

    /**
     * Process all DokuWiki files in a directory and send them to ChromaDB
     *
     * Recursively collects every .txt file whose name does not start with an
     * underscore. The collection name is the first ID segment of the first file
     * found and is used for all files. The collection is ensured once up front
     * and each file is then handled by processSingleFile().
     *
     * @param string $dirPath The directory path to process
     * @return array Result with status and, when files were processed, per-file results
     */
    public function processDirectory($dirPath) {
        if (!is_dir($dirPath)) {
            return [
                'status' => 'error',
                'message' => "Directory does not exist: $dirPath"
            ];
        }

        // SPL classes live in the global namespace and must be fully qualified here
        $iterator = new \RecursiveIteratorIterator(
            new \RecursiveDirectoryIterator($dirPath, \RecursiveDirectoryIterator::SKIP_DOTS),
            \RecursiveIteratorIterator::LEAVES_ONLY
        );

        $files = [];
        foreach ($iterator as $file) {
            // Process only .txt files that don't start with underscore
            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
                $files[] = $file->getPathname();
            }
        }

        if (empty($files)) {
            return [
                'status' => 'skipped',
                'message' => "No .txt files found in directory: $dirPath"
            ];
        }

        // Use the first part of the document ID as collection name, fallback to 'documents'
        $idParts = explode(':', parseFilePath($files[0]));
        $collectionName = !empty($idParts[0]) ? $idParts[0] : 'documents';

        try {
            $this->ensureCollectionExists($collectionName);
        } catch (\Exception $e) {
            // Best effort: continue anyway; each file reports its own error
            // if the collection is really unavailable.
        }
        $collectionChecked = true;

        $results = [];
        foreach ($files as $file) {
            $results[] = [
                'file' => $file,
                'result' => $this->processSingleFile($file, $collectionName, $collectionChecked)
            ];
        }

        return [
            'status' => 'success',
            'message' => "Finished processing directory.",
            'files_count' => count($files),
            'results' => $results
        ];
    }
}

/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory (DOKU_INC . 'data/pages/' when DOKU_INC is
 *    defined, otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Joining the non-empty '/'-separated segments with colons
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    $pagesDir = defined('DOKU_INC')
        ? DOKU_INC . 'data/pages/'
        : '/var/www/html/dokuwiki/data/pages/';

    // Remove the base path
    $relativePath = str_replace($pagesDir, '', $filePath);

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Drop empty segments only. A strict comparison is required because a
    // truthiness test would also discard a segment named "0".
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}