<?php

namespace dokuwiki\plugin\dokullm;

/**
 * Minimal HTTP client for a ChromaDB server (v2 REST API) that uses an Ollama
 * server to compute embeddings.
 *
 * Besides thin wrappers around the ChromaDB endpoints, the class knows how to
 * turn DokuWiki page files into paragraph chunks with metadata derived from
 * the page ID and upsert them into a collection.
 */
class ChromaDBClient {
    private $baseUrl;
    private $client;
    private $ollamaClient;
    private $tenant;
    private $database;
    private $ollamaHost;
    private $ollamaPort;
    // Declared explicitly: assigning an undeclared property is deprecated since PHP 8.2.
    private $ollamaModel;

    /**
     * Initialize the ChromaDB client
     *
     * Creates a new ChromaDB client instance with the specified connection parameters.
     * Every parameter left as null falls back to the DokuWiki plugin configuration
     * ($conf['plugin']['dokullm'][...]) and then to a built-in default.
     * Also ensures that the specified tenant and database exist (this performs
     * HTTP requests against the ChromaDB server).
     *
     * @param string|null $host ChromaDB server host
     * @param int|null $port ChromaDB server port
     * @param string|null $tenant ChromaDB tenant name
     * @param string|null $database ChromaDB database name
     * @param string|null $ollamaHost Ollama server host
     * @param int|null $ollamaPort Ollama server port
     * @param string|null $ollamaModel Ollama embeddings model
     * @throws \Exception If the tenant or database can neither be found nor created
     */
    public function __construct($host = null, $port = null, $tenant = null, $database = null, $ollamaHost = null, $ollamaPort = null, $ollamaModel = null) {
        // Load DokuWiki plugin configuration
        global $conf;

        // Use provided parameters or fall back to configuration values
        $chromaHost = $host ?? ($conf['plugin']['dokullm']['chroma_host'] ?? '127.0.0.1');
        $chromaPort = $port ?? ($conf['plugin']['dokullm']['chroma_port'] ?? 8000);
        $this->tenant = $tenant ?? ($conf['plugin']['dokullm']['chroma_tenant'] ?? 'dokullm');
        $this->database = $database ?? ($conf['plugin']['dokullm']['chroma_database'] ?? 'dokullm');
        $this->ollamaHost = $ollamaHost ?? ($conf['plugin']['dokullm']['ollama_host'] ?? '127.0.0.1');
        $this->ollamaPort = $ollamaPort ?? ($conf['plugin']['dokullm']['ollama_port'] ?? 11434);
        $this->ollamaModel = $ollamaModel ?? ($conf['plugin']['dokullm']['ollama_embeddings_model'] ?? 'nomic-embed-text');

        $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";
        $this->client = curl_init();
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // Initialize Ollama client
        $this->ollamaClient = curl_init();
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up the cURL handles when the object is destroyed
     *
     * @return void
     */
    public function __destruct() {
        curl_close($this->client);
        curl_close($this->ollamaClient);
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * Helper that sends a JSON request to "<baseUrl>/api/v2<endpoint>" on the shared
     * cURL handle. Tenant and database are not added here; callers embed them in
     * the endpoint path.
     *
     * @param string $endpoint The API endpoint to call (path below /api/v2, with leading slash)
     * @param string $method The HTTP method to use (default: 'GET')
     * @param array|null $data The data to send as JSON body (default: null; an empty array sends no body)
     * @return array|null The JSON response decoded as an array (null if the body is not valid JSON)
     * @throws \Exception If there's a cURL error or an HTTP status >= 400
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        $headers = [
            'Content-Type: application/json',
            'Accept: application/json'
        ];

        $url = $this->baseUrl . '/api/v2' . $endpoint;

        curl_setopt($this->client, CURLOPT_URL, $url);
        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);

        // The handle is reused between calls, so the body must be reset explicitly
        // when this request has none.
        if ($data) {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
        } else {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
        }

        $response = curl_exec($this->client);
        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);

        if (curl_error($this->client)) {
            throw new \Exception('Curl error: ' . curl_error($this->client));
        }

        if ($httpCode >= 400) {
            throw new \Exception("HTTP Error: $httpCode, Response: $response");
        }

        return json_decode($response, true);
    }

    /**
     * Resolve a collection name to its ChromaDB collection ID
     *
     * @param string $collectionName The collection name ('documents' is used when empty)
     * @return string The collection ID
     * @throws \Exception If the collection does not exist or carries no ID
     */
    private function resolveCollectionId($collectionName) {
        if (empty($collectionName)) {
            $collectionName = 'documents';
        }

        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }

        return $collection['id'];
    }

    /**
     * Build the endpoint prefix shared by all collection-level calls
     *
     * @return string "/tenants/<tenant>/databases/<database>/collections"
     */
    private function collectionsEndpoint() {
        return "/tenants/{$this->tenant}/databases/{$this->database}/collections";
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * Posts the text to Ollama's /api/embeddings endpoint with the configured model.
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector
     * @throws \Exception On cURL errors, HTTP status >= 400, or a response without 'embedding'
     */
    public function generateEmbeddings($text) {
        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";

        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);

        $data = [
            'model' => $this->ollamaModel,
            'prompt' => $text,
            'keep_alive' => '30m'
        ];

        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));

        $response = curl_exec($this->ollamaClient);
        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);

        if (curl_error($this->ollamaClient)) {
            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
        }

        if ($httpCode >= 400) {
            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }

        $result = json_decode($response, true);

        if (!isset($result['embedding'])) {
            throw new \Exception("Ollama response missing embedding: " . $response);
        }

        return $result['embedding'];
    }

    /**
     * List all collections in the database
     *
     * Retrieves a list of all collections in the configured tenant and database.
     *
     * @return array List of collections
     * @throws \Exception If the request fails
     */
    public function listCollections() {
        return $this->makeRequest($this->collectionsEndpoint());
    }

    /**
     * Get a collection by name
     *
     * Lists the collections of the configured tenant/database and returns the
     * first one whose 'name' equals the requested name.
     *
     * @param string $name The name of the collection to retrieve ('documents' is used when empty)
     * @return array The collection information
     * @throws \Exception If the request fails or the collection is not found
     */
    public function getCollection($name) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($name)) {
            $name = 'documents';
        }

        $collections = $this->makeRequest($this->collectionsEndpoint());

        // makeRequest() returns null for an undecodable body; treat that as "no collections"
        if (is_array($collections)) {
            foreach ($collections as $collection) {
                if (isset($collection['name']) && $collection['name'] === $name) {
                    return $collection;
                }
            }
        }

        // If not found, throw exception
        throw new \Exception("Collection '{$name}' not found");
    }

    /**
     * Create a new collection
     *
     * Creates a new collection with the specified name and optional metadata.
     *
     * @param string $name The name of the collection to create ('documents' is used when empty)
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createCollection($name, $metadata = null) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($name)) {
            $name = 'documents';
        }

        $data = ['name' => $name];
        if ($metadata) {
            $data['metadata'] = $metadata;
        }
        return $this->makeRequest($this->collectionsEndpoint(), 'POST', $data);
    }

    /**
     * Delete a collection by name
     *
     * Looks up the collection's ID by name and issues a DELETE for it.
     *
     * @param string $name The name of the collection to delete ('documents' is used when empty)
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function deleteCollection($name) {
        $collectionId = $this->resolveCollectionId($name);
        return $this->makeRequest($this->collectionsEndpoint() . "/{$collectionId}", 'DELETE');
    }

    /**
     * Get a document by its ID from a collection
     *
     * Retrieves a document from the specified collection using its ID.
     *
     * @param string $collectionName The name of the collection to get the document from
     * @param string $documentId The document ID to retrieve
     * @param array $include What to include in the response (default: ["metadatas", "documents"])
     * @return array The retrieved document
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        $collectionId = $this->resolveCollectionId($collectionName);
        $data = [
            'ids' => [$documentId],
            'include' => $include
        ];

        return $this->makeRequest($this->collectionsEndpoint() . "/{$collectionId}/get", 'POST', $data);
    }

    /**
     * Add documents to a collection
     *
     * Upserts documents into the specified collection. Each document must have a corresponding ID.
     * Optional metadata and pre-computed embeddings can also be provided.
     *
     * @param string $collectionName The name of the collection to add documents to
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        $collectionId = $this->resolveCollectionId($collectionName);
        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];

        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }

        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }

        return $this->makeRequest($this->collectionsEndpoint() . "/{$collectionId}/upsert", 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Determines whether a document should be reprocessed by comparing the file's last modification
     * time with the processed_at timestamp stored in the document's metadata. The function checks
     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
     * not included in the database.
     *
     * Any error while querying ChromaDB is swallowed and reported as "needs update".
     *
     * @param string $collectionId The ID of the collection to check documents in
     * @param string $documentId The base document ID to check (without chunk suffixes)
     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, is outdated,
     *              or the check itself failed), false if up to date
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $endpoint = $this->collectionsEndpoint() . "/{$collectionId}/get";

            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
            $data = [
                'ids' => [
                    $documentId . '@1',
                    $documentId . '@2',
                    $documentId . '@3'
                ],
                'include' => [
                    "metadatas"
                ],
                'limit' => 1
            ];

            $result = $this->makeRequest($endpoint, 'POST', $data);

            // If no documents found, return true (needs to be added)
            if (empty($result['ids'])) {
                return true;
            }

            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
                // Only the first metadata entry is inspected
                $metadata = $result['metadatas'][0];

                // If processed_at is not set, return true (needs update)
                if (!isset($metadata['processed_at'])) {
                    return true;
                }

                $processedTimestamp = strtotime($metadata['processed_at']);

                // If file is newer than processed time, return true (needs update)
                if ($fileModifiedTime > $processedTimestamp) {
                    return true;
                }
            }

            // Document exists and is up to date
            return false;
        } catch (\Exception $e) {
            // If there's an error checking the document, assume it needs to be updated
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * Queries the specified collection for documents similar to the provided query texts.
     * The function generates embeddings for the query texts and sends them to ChromaDB.
     * Supports filtering results by metadata using the where parameter.
     *
     * @param string $collectionName The name of the collection to query
     * @param array $queryTexts The query texts to search for
     * @param int $nResults The number of results to return (default: 5)
     * @param array|null $where Optional filter conditions for metadata
     * @return array The query results
     * @throws \Exception If the collection or its ID is not found, or a request fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        $collectionId = $this->resolveCollectionId($collectionName);

        // Generate embeddings for query texts
        $queryEmbeddings = [];
        foreach ($queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }

        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];

        // Add where clause for metadata filtering if provided
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }

        return $this->makeRequest($this->collectionsEndpoint() . "/{$collectionId}/query", 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * Sends a heartbeat request to verify that the ChromaDB server is running.
     *
     * @return array The response from the heartbeat endpoint
     * @throws \Exception If the request fails
     */
    public function heartbeat() {
        return $this->makeRequest("/heartbeat", 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * Retrieves authentication and identity information from the ChromaDB server.
     *
     * @return array The response from the identity endpoint
     * @throws \Exception If the request fails
     */
    public function getIdentity() {
        return $this->makeRequest("/identity", 'GET');
    }

    /**
     * Ensure that the specified tenant and database exist
     *
     * Tries to fetch the tenant and the database; any failure of the lookup is
     * treated as "does not exist" and answered with a create request.
     *
     * @return void
     * @throws \Exception If a create request fails
     */
    private function ensureTenantAndDatabase() {
        // Check if tenant exists, create if it doesn't
        try {
            $this->getTenant($this->tenant);
        } catch (\Exception $e) {
            $this->createTenant($this->tenant);
        }

        // Check if database exists, create if it doesn't
        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (\Exception $e) {
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * Retrieves information about the specified tenant.
     *
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws \Exception If the request fails
     */
    public function getTenant($tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}", 'GET');
    }

    /**
     * Create a new tenant
     *
     * Creates a new tenant with the specified name.
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createTenant($tenantName) {
        return $this->makeRequest("/tenants", 'POST', ['name' => $tenantName]);
    }

    /**
     * Get database information
     *
     * Retrieves information about the specified database within a tenant.
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The database information
     * @throws \Exception If the request fails
     */
    public function getDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases/{$databaseName}", 'GET');
    }

    /**
     * Create a new database
     *
     * Creates a new database with the specified name within a tenant.
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases", 'POST', ['name' => $databaseName]);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * Any failure of the lookup (including transport errors) is treated as
     * "collection missing" and answered with a create request.
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     * @throws \Exception If the create request fails
     */
    public function ensureCollectionExists($collectionName) {
        try {
            $this->getCollection($collectionName);
            return "Collection '$collectionName' already exists.";
        } catch (\Exception $e) {
            // Collection doesn't exist, create it
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }
    }

    /**
     * Split the last part of a page ID into registration and name
     *
     * "g287-name-surname" yields registration "g287" and name "name surname".
     * The prefix before the first '-' only counts as a registration when it
     * contains at least one digit; otherwise the whole part becomes the name.
     *
     * @param string $lastPart The last colon-separated part of the DokuWiki ID
     * @return array Metadata with 'name' and, when detected, 'registration'
     */
    private function parseRegistrationAndName($lastPart) {
        if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
            && preg_match('/[0-9]/', $matches[1])) {
            return [
                'registration' => $matches[1],
                'name' => str_replace('-', ' ', $matches[2])
            ];
        }

        // No registration pattern found: treat the entire part as the name
        return ['name' => str_replace('-', ' ', $lastPart)];
    }

    /**
     * Derive the metadata shared by all chunks of a page from its DokuWiki ID
     *
     * Supported ID formats:
     * - reports:mri:institution:250620-name-surname (third part is an institution name)
     * - reports:mri:2024:g287-name-surname (third part is numeric: a year)
     * - reports:mri:templates:name-surname (any part equals 'templates')
     *
     * @param string $id The DokuWiki page ID
     * @return array Base metadata (document_id, processed_at, type, and whatever the ID format provides)
     */
    private function buildBaseMetadata($id) {
        global $conf;

        $parts = explode(':', $id);
        $lastPart = end($parts);
        $isTemplate = in_array('templates', $parts, true);

        $metadata = [
            'document_id' => $id,
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report'
        ];

        // Modality is the second part of the ID
        if (isset($parts[1])) {
            $metadata['modality'] = $parts[1];
        }

        // Templates carry no institution, date or year; only registration/name
        if ($isTemplate) {
            return array_merge($metadata, $this->parseRegistrationAndName($lastPart));
        }

        if (!isset($parts[2])) {
            return $metadata;
        }

        if (is_numeric($parts[2])) {
            // Year format: reports:mri:2024:g287-name-surname
            $metadata['year'] = $parts[2];
            $metadata['institution'] = $conf['plugin']['dokullm']['default_institution'] ?? 'default';
            return array_merge($metadata, $this->parseRegistrationAndName($lastPart));
        }

        // Institution format: reports:mri:institution:250620-name-surname
        $metadata['institution'] = $parts[2];

        if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
            $dateStr = $matches[1];

            // The six digits are read as DDMMYY, so 250620 becomes 2020-06-25.
            // NOTE(review): an earlier comment here claimed "250620 -> 2025-06-20" (YYMMDD);
            // confirm which layout the page names really use.
            $day = substr($dateStr, 0, 2);
            $month = substr($dateStr, 2, 2);
            $year = substr($dateStr, 4, 2);

            // 20xx for years 00-69 and 19xx for years 70-99
            $fullYear = ((int)$year < 70 ? '20' : '19') . $year;

            $metadata['date'] = $fullYear . '-' . $month . '-' . $day;
            $metadata['name'] = str_replace('-', ' ', $matches[2]);
        }

        return $metadata;
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
     *
     * This function handles the complete processing of a single DokuWiki file:
     * 1. Parses the file path to extract the document ID
     * 2. Ensures the collection exists (unless already checked) and resolves its ID
     * 3. Checks if the document needs updating using timestamp comparison
     * 4. Reads and processes file content only if update is needed
     * 5. Splits the document into chunks (paragraphs separated by blank lines)
     * 6. Extracts metadata from the DokuWiki ID format
     * 7. Generates embeddings for each chunk
     * 8. Sends all chunks to ChromaDB with metadata
     *
     * Paragraphs that are DokuWiki headings (wrapped in '=') are not stored; their words
     * become the 'tags' metadata of the chunks that follow them.
     *
     * Exceptions raised while talking to ChromaDB/Ollama are converted into an 'error' result.
     *
     * @param string $filePath The path to the file to process
     * @param string $collectionName The name of the collection to use
     * @param bool $collectionChecked Whether the collection has already been checked/created
     * @return array Result with 'status' ('success', 'skipped' or 'error'), 'message' and, on success, details
     */
    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
        // Parse file path to extract the document ID
        $id = parseFilePath($filePath);

        try {
            // Create collection if it doesn't exist (only if not already checked)
            $collectionStatus = '';
            if (!$collectionChecked) {
                $collectionStatus = $this->ensureCollectionExists($collectionName);
            }

            // Get collection ID
            $collection = $this->getCollection($collectionName);
            if (!isset($collection['id'])) {
                return [
                    'status' => 'error',
                    'message' => "Collection ID not found for '{$collectionName}'"
                ];
            }
            $collectionId = $collection['id'];

            // A missing/unreadable file must not be mistaken for an up-to-date or empty document
            $fileModifiedTime = is_readable($filePath) ? filemtime($filePath) : false;
            if ($fileModifiedTime === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read file '$filePath'"
                ];
            }

            // If document is up to date, skip processing
            if (!$this->needsUpdate($collectionId, $id, $fileModifiedTime)) {
                return [
                    'status' => 'skipped',
                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
                ];
            }

            $content = file_get_contents($filePath);
            if ($content === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read file '$filePath'"
                ];
            }

            // Split document into chunks (paragraphs separated by a blank line)
            $paragraphs = preg_split('/\n\s*\n/', $content);
            $baseMetadata = $this->buildBaseMetadata($id);

            $chunkIds = [];
            $chunkContents = [];
            $chunkMetadatas = [];
            $chunkEmbeddings = [];
            $currentTags = [];

            foreach ($paragraphs as $index => $paragraph) {
                // Skip empty paragraphs to avoid processing whitespace-only content
                $paragraph = trim($paragraph);
                if (empty($paragraph)) {
                    continue;
                }

                // DokuWiki titles (start and end with =) become tags for the following
                // chunks and are not stored as content themselves
                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
                    $tags = [];
                    foreach (preg_split('/\s+/', trim($matches[1])) as $word) {
                        // Only use words of at least 3 characters to reduce noise
                        if (strlen($word) >= 3) {
                            $tags[] = strtolower($word);
                        }
                    }

                    $currentTags = array_unique($tags);
                    continue;
                }

                // Chunk numbers follow the paragraph position, so skipped titles leave gaps
                $chunkNumber = $index + 1;
                $chunkId = $id . '@' . $chunkNumber;

                $metadata = $baseMetadata;
                $metadata['chunk_id'] = $chunkId;
                $metadata['chunk_number'] = $chunkNumber;
                // Counts every split paragraph, including titles and empty ones
                $metadata['total_chunks'] = count($paragraphs);

                if (!empty($currentTags)) {
                    $metadata['tags'] = implode(',', $currentTags);
                }

                $chunkIds[] = $chunkId;
                $chunkContents[] = $paragraph;
                $chunkMetadatas[] = $metadata;
                $chunkEmbeddings[] = $this->generateEmbeddings($paragraph);
            }

            // If no chunks were created, skip this file
            if (empty($chunkIds)) {
                return [
                    'status' => 'skipped',
                    'message' => "No valid chunks found in file '$id'. Skipping..."
                ];
            }

            // Send all chunks to ChromaDB
            $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);

            return [
                'status' => 'success',
                'message' => "Successfully sent file to ChromaDB",
                'details' => [
                    'document_id' => $id,
                    'chunks' => count($chunkIds),
                    'collection' => $collectionName
                ],
                'collection_status' => $collectionStatus
            ];
        } catch (\Exception $e) {
            return [
                'status' => 'error',
                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
            ];
        }
    }

    /**
     * Process all DokuWiki files in a directory and send them to ChromaDB
     *
     * Recursively collects all .txt files (whose name does not start with '_') below the
     * directory. The collection name is the first part of the first file's DokuWiki ID
     * ('documents' when empty); the collection is created if needed, then every file is
     * handed to processSingleFile().
     *
     * @param string $dirPath The directory path to process
     * @return array Result with status and details (per-file results under 'results')
     */
    public function processDirectory($dirPath) {
        // Check if directory exists
        if (!is_dir($dirPath)) {
            return [
                'status' => 'error',
                'message' => "Directory does not exist: $dirPath"
            ];
        }

        // SPL classes live in the global namespace and must be fully qualified here
        $iterator = new \RecursiveIteratorIterator(
            new \RecursiveDirectoryIterator($dirPath, \RecursiveDirectoryIterator::SKIP_DOTS),
            \RecursiveIteratorIterator::LEAVES_ONLY
        );

        $files = [];
        foreach ($iterator as $file) {
            // Process only .txt files that don't start with underscore
            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
                $files[] = $file->getPathname();
            }
        }

        if (empty($files)) {
            return [
                'status' => 'skipped',
                'message' => "No .txt files found in directory: $dirPath"
            ];
        }

        // Use the first part of the document ID as collection name, fallback to 'documents'
        $idParts = explode(':', parseFilePath($files[0]));
        $collectionName = !empty($idParts[0]) ? $idParts[0] : 'documents';

        try {
            $this->ensureCollectionExists($collectionName);
        } catch (\Exception $e) {
            // Best effort: a failure here resurfaces as a per-file error result below
        }
        $collectionChecked = true;

        $results = [];
        foreach ($files as $file) {
            $results[] = [
                'file' => $file,
                'result' => $this->processSingleFile($file, $collectionName, $collectionChecked)
            ];
        }

        return [
            'status' => 'success',
            'message' => "Finished processing directory.",
            'files_count' => count($files),
            'results' => $results
        ];
    }
}

/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory (DOKU_INC . 'data/pages/' when DOKU_INC is defined,
 *    otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Converting directory separators ('/') to colons, ignoring empty segments
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Remove the base path
    $relativePath = str_replace($pagesDir, '', $filePath);

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Keep every non-empty segment. The comparison is against '' on purpose:
    // empty()/array_filter() would also discard a namespace literally named "0".
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}