<?php

namespace dokuwiki\plugin\dokullm;

/**
 * Small HTTP client for the ChromaDB v2 REST API that uses Ollama to compute embeddings.
 *
 * The class keeps two reusable cURL handles: one for ChromaDB and one for Ollama.
 * Besides thin wrappers around the ChromaDB endpoints (tenants, databases, collections,
 * documents) it contains the logic that converts DokuWiki page files into chunks with
 * metadata and uploads them to a collection.
 */
class ChromaDBClient {
    /** @var string Base URL of the ChromaDB server (scheme, host and port) */
    private $baseUrl;
    /** @var resource|\CurlHandle cURL handle used for ChromaDB requests */
    private $client;
    /** @var resource|\CurlHandle cURL handle used for Ollama requests */
    private $ollamaClient;
    /** @var string ChromaDB tenant name */
    private $tenant;
    /** @var string ChromaDB database name */
    private $database;
    /** @var string Default collection name (stored by the constructor; not read by the methods of this class) */
    private $defaultCollection;
    /** @var string Ollama server host */
    private $ollamaHost;
    /** @var int Ollama server port */
    private $ollamaPort;
    /** @var string Ollama embeddings model */
    private $ollamaModel;

    /**
     * Initialize the ChromaDB client
     *
     * Creates a new ChromaDB client instance with the specified connection parameters.
     * Also ensures that the specified tenant and database exist, which means the
     * constructor already talks to the ChromaDB server and may throw.
     *
     * @param string $host              ChromaDB server host
     * @param int    $port              ChromaDB server port
     * @param string $tenant            ChromaDB tenant name
     * @param string $database          ChromaDB database name
     * @param string $defaultCollection Default collection name
     * @param string $ollamaHost        Ollama server host
     * @param int    $ollamaPort        Ollama server port
     * @param string $ollamaModel       Ollama embeddings model
     * @throws \Exception If the tenant or database can neither be fetched nor created
     */
    public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
        // All parameters are mandatory, so there are no fallbacks here
        $this->tenant = $tenant;
        $this->database = $database;
        $this->defaultCollection = $defaultCollection;
        $this->ollamaHost = $ollamaHost;
        $this->ollamaPort = $ollamaPort;
        $this->ollamaModel = $ollamaModel;
        $this->baseUrl = "http://{$host}:{$port}";

        // ChromaDB handle
        $this->client = curl_init();
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // Ollama handle
        $this->ollamaClient = curl_init();
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up the cURL handles when the object is destroyed
     *
     * @return void
     */
    public function __destruct() {
        // Only close handles that were actually created
        if ($this->client) {
            curl_close($this->client);
        }
        if ($this->ollamaClient) {
            curl_close($this->ollamaClient);
        }
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * Sends a JSON request to "<baseUrl>/api/v2<endpoint>" using the shared cURL handle.
     * Tenant and database are not sent as headers; callers encode them in the endpoint path.
     *
     * @param string     $endpoint The API endpoint to call (path below /api/v2)
     * @param string     $method   The HTTP method to use (default: 'GET')
     * @param array|null $data     The data to send as JSON body (default: null, empty values send no body)
     * @return array|null The JSON response decoded as an array (null if the body is not valid JSON)
     * @throws \Exception If there's a cURL error or an HTTP status of 400 or above
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        $headers = [
            'Content-Type: application/json',
            'Accept: application/json'
        ];
        // Version 2 of the API
        $url = $this->baseUrl . '/api/v2' . $endpoint;
        curl_setopt($this->client, CURLOPT_URL, $url);
        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
        // The handle is reused, so the body must be reset when there is nothing to send
        if ($data) {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
        } else {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
        }
        // Call
        $response = curl_exec($this->client);
        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
        $curlError = curl_error($this->client);
        // Check the result
        if ($response === false || $curlError !== '') {
            throw new \Exception('Curl error: ' . $curlError);
        }
        if ($httpCode >= 400) {
            throw new \Exception("HTTP Error: $httpCode, Response: $response");
        }
        // Return the decoded response
        return json_decode($response, true);
    }

    /**
     * Build the collections endpoint for the configured tenant and database
     *
     * @return string Endpoint path ending in "/collections"
     */
    private function collectionsEndpoint() {
        return '/tenants/' . rawurlencode($this->tenant)
            . '/databases/' . rawurlencode($this->database)
            . '/collections';
    }

    /**
     * Apply the collection name fallback used by all collection methods
     *
     * @param string $name The requested collection name
     * @return string The given name, or 'documents' if it is empty
     */
    private function normalizeCollectionName($name) {
        return empty($name) ? 'documents' : $name;
    }

    /**
     * Look up the ID of a collection by its name
     *
     * @param string $collectionName The collection name (already normalized)
     * @return string The collection ID
     * @throws \Exception If the collection does not exist or has no ID
     */
    private function resolveCollectionId($collectionName) {
        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }
        return $collection['id'];
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * Posts the text to the Ollama "/api/embeddings" endpoint using the configured model
     * and asks Ollama to keep the model loaded for 30 minutes.
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector
     * @throws \Exception On cURL errors, HTTP errors or a response without an 'embedding' key
     */
    public function generateEmbeddings($text) {
        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
        $data = [
            'model' => $this->ollamaModel,
            'prompt' => $text,
            'keep_alive' => '30m'
        ];
        // Setting POSTFIELDS makes cURL issue a POST request
        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
        $response = curl_exec($this->ollamaClient);
        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
        $curlError = curl_error($this->ollamaClient);
        if ($response === false || $curlError !== '') {
            throw new \Exception('Ollama Curl error: ' . $curlError);
        }
        if ($httpCode >= 400) {
            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }
        $result = json_decode($response, true);
        if (!isset($result['embedding'])) {
            throw new \Exception("Ollama response missing embedding: " . $response);
        }
        return $result['embedding'];
    }

    /**
     * List all collections in the database
     *
     * Retrieves a list of all collections in the configured tenant and database.
     *
     * @return array List of collections
     * @throws \Exception If the request fails
     */
    public function listCollections() {
        return $this->makeRequest($this->collectionsEndpoint());
    }

    /**
     * Get a collection by name
     *
     * Lists all collections and returns the first one whose 'name' equals the given name.
     *
     * @param string $name The name of the collection to retrieve ('documents' is used if empty)
     * @return array The collection information
     * @throws \Exception If the request fails or the collection is not found
     */
    public function getCollection($name) {
        $name = $this->normalizeCollectionName($name);
        $collections = $this->makeRequest($this->collectionsEndpoint());
        // A response that is not a JSON list is treated as "no collections"
        if (!is_array($collections)) {
            $collections = [];
        }
        // Find collection by name
        foreach ($collections as $collection) {
            if (isset($collection['name']) && $collection['name'] === $name) {
                return $collection;
            }
        }
        // If not found, throw exception
        throw new \Exception("Collection '{$name}' not found");
    }

    /**
     * Create a new collection
     *
     * Creates a new collection with the specified name and optional metadata.
     *
     * @param string     $name     The name of the collection to create ('documents' is used if empty)
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createCollection($name, $metadata = null) {
        $data = ['name' => $this->normalizeCollectionName($name)];
        if ($metadata) {
            $data['metadata'] = $metadata;
        }
        return $this->makeRequest($this->collectionsEndpoint(), 'POST', $data);
    }

    /**
     * Delete a collection by name
     *
     * Looks up the collection to find its ID and deletes it by that ID.
     *
     * @param string $name The name of the collection to delete ('documents' is used if empty)
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function deleteCollection($name) {
        $name = $this->normalizeCollectionName($name);
        $collectionId = $this->resolveCollectionId($name);
        $endpoint = $this->collectionsEndpoint() . '/' . rawurlencode($collectionId);
        return $this->makeRequest($endpoint, 'DELETE');
    }

    /**
     * Get a document by its ID from a collection
     *
     * Retrieves a document from the specified collection using its ID.
     *
     * @param string $collectionName The name of the collection to get the document from
     * @param string $documentId     The document ID to retrieve
     * @param array  $include        What to include in the response (default: ["metadatas", "documents"])
     * @return array The response of the collection "get" endpoint
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        $collectionName = $this->normalizeCollectionName($collectionName);
        $collectionId = $this->resolveCollectionId($collectionName);
        $endpoint = $this->collectionsEndpoint() . '/' . rawurlencode($collectionId) . '/get';
        $data = [
            'ids' => [$documentId],
            'include' => $include
        ];
        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Add documents to a collection
     *
     * Sends the documents to the collection's "upsert" endpoint, so existing IDs are
     * overwritten. Each document must have a corresponding ID. Optional metadata and
     * pre-computed embeddings can also be provided.
     *
     * @param string     $collectionName The name of the collection to add documents to
     * @param array      $documents      The document contents
     * @param array      $ids            The document IDs
     * @param array|null $metadatas      Optional metadata for each document
     * @param array|null $embeddings     Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        $collectionName = $this->normalizeCollectionName($collectionName);
        $collectionId = $this->resolveCollectionId($collectionName);
        $endpoint = $this->collectionsEndpoint() . '/' . rawurlencode($collectionId) . '/upsert';
        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];
        // Optional per-document metadata
        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }
        // Optional pre-computed embeddings
        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }
        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Determines whether a document should be reprocessed by comparing the file's last modification
     * time with the processed_at timestamp stored in the document's metadata. The function checks
     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
     * not included in the database.
     *
     * Any error while querying ChromaDB is swallowed and reported as "needs update".
     *
     * @param string $collectionId     The ID of the collection to check documents in
     * @param string $documentId       The base document ID to check (without chunk suffixes)
     * @param int    $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if document needs to be updated (doesn't exist, has no usable timestamp,
     *              is outdated, or the check failed), false if up to date
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $endpoint = $this->collectionsEndpoint() . '/' . rawurlencode($collectionId) . '/get';
            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
            $data = [
                'ids' => [
                    $documentId . '@1',
                    $documentId . '@2',
                    $documentId . '@3'
                ],
                'include' => [
                    "metadatas"
                ],
                'limit' => 1
            ];
            $result = $this->makeRequest($endpoint, 'POST', $data);
            // If no documents found, the document needs to be added
            if (empty($result['ids'])) {
                return true;
            }
            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
                // Only the first metadata entry is inspected
                $metadata = $result['metadatas'][0];
                // Without a processed_at timestamp the document has to be reprocessed
                if (!isset($metadata['processed_at'])) {
                    return true;
                }
                $processedTimestamp = strtotime($metadata['processed_at']);
                // An unparsable timestamp is treated like a missing one
                if ($processedTimestamp === false) {
                    return true;
                }
                // If file is newer than processed time, it needs an update
                if ($fileModifiedTime > $processedTimestamp) {
                    return true;
                }
            }
            // Document exists and is up to date
            return false;
        } catch (\Exception $e) {
            // If there's an error checking the document, assume it needs to be updated
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * Queries the specified collection for documents similar to the provided query texts.
     * The function generates embeddings for the query texts and sends them to ChromaDB.
     * Supports filtering results by metadata using the where parameter.
     *
     * @param string     $collectionName The name of the collection to query
     * @param array      $queryTexts     The query texts to search for
     * @param int        $nResults       The number of results to return (default: 5)
     * @param array|null $where          Optional filter conditions for metadata
     * @return array The query results
     * @throws \Exception If the collection or its ID is not found, or a request fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        $collectionName = $this->normalizeCollectionName($collectionName);
        $collectionId = $this->resolveCollectionId($collectionName);
        $endpoint = $this->collectionsEndpoint() . '/' . rawurlencode($collectionId) . '/query';
        // Generate one embedding per query text
        $queryEmbeddings = [];
        foreach ($queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }
        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];
        // Add where clause for metadata filtering if provided
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }
        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * Sends a heartbeat request to verify that the ChromaDB server is running.
     *
     * @return array The response from the heartbeat endpoint
     * @throws \Exception If the request fails
     */
    public function heartbeat() {
        return $this->makeRequest('/heartbeat', 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * Retrieves authentication and identity information from the ChromaDB server.
     *
     * @return array The response from the identity endpoint
     * @throws \Exception If the request fails
     */
    public function getIdentity() {
        return $this->makeRequest('/identity', 'GET');
    }

    /**
     * Ensure that the specified tenant and database exist
     *
     * Tries to fetch the tenant and the database and creates each of them if fetching fails.
     * Any failure of the fetch (not only "not found") triggers the creation attempt; errors
     * of the creation itself are propagated to the caller.
     *
     * @return void
     * @throws \Exception If creating the tenant or database fails
     */
    private function ensureTenantAndDatabase() {
        try {
            $this->getTenant($this->tenant);
        } catch (\Exception $e) {
            // Tenant could not be fetched, try to create it
            $this->createTenant($this->tenant);
        }
        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (\Exception $e) {
            // Database could not be fetched, try to create it
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * Retrieves information about the specified tenant.
     *
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws \Exception If the request fails
     */
    public function getTenant($tenantName) {
        return $this->makeRequest('/tenants/' . rawurlencode($tenantName), 'GET');
    }

    /**
     * Create a new tenant
     *
     * Creates a new tenant with the specified name.
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createTenant($tenantName) {
        return $this->makeRequest('/tenants', 'POST', ['name' => $tenantName]);
    }

    /**
     * Get database information
     *
     * Retrieves information about the specified database within a tenant.
     *
     * @param string $databaseName The database name
     * @param string $tenantName   The tenant name
     * @return array The database information
     * @throws \Exception If the request fails
     */
    public function getDatabase($databaseName, $tenantName) {
        $endpoint = '/tenants/' . rawurlencode($tenantName) . '/databases/' . rawurlencode($databaseName);
        return $this->makeRequest($endpoint, 'GET');
    }

    /**
     * Create a new database
     *
     * Creates a new database with the specified name within a tenant.
     *
     * @param string $databaseName The database name
     * @param string $tenantName   The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createDatabase($databaseName, $tenantName) {
        $endpoint = '/tenants/' . rawurlencode($tenantName) . '/databases';
        return $this->makeRequest($endpoint, 'POST', ['name' => $databaseName]);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * Tries to fetch the collection; if that fails for any reason, tries to create it.
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     * @throws \Exception If the collection could not be created
     */
    public function ensureCollectionExists($collectionName) {
        try {
            $this->getCollection($collectionName);
            return "Collection '$collectionName' already exists.";
        } catch (\Exception $e) {
            // Collection could not be fetched, try to create it
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }
    }

    /**
     * Split a "registration-name" ID part into registration and name
     *
     * The part before the first '-' is only treated as a registration if it contains
     * at least one digit; otherwise the whole part is used as the name. Dashes in the
     * name are replaced by spaces.
     *
     * @param string $lastPart The last part of the DokuWiki ID
     * @return array Array with key 'name' and, if detected, key 'registration'
     */
    private function parseRegistrationAndName($lastPart) {
        if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
            && preg_match('/[0-9]/', $matches[1])) {
            return [
                'registration' => $matches[1],
                'name' => str_replace('-', ' ', $matches[2])
            ];
        }
        // No registration pattern found, treat the entire part as the name
        return ['name' => str_replace('-', ' ', $lastPart)];
    }

    /**
     * Build the metadata shared by all chunks of a document from its DokuWiki ID
     *
     * Supported ID formats:
     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
     *
     * @param string $id The DokuWiki ID of the document
     * @return array The base metadata (document_id, processed_at, type and, depending on the
     *               ID format, modality, year, institution, registration, date and name)
     */
    private function buildBaseMetadata($id) {
        $parts = explode(':', $id);
        $lastPart = end($parts);
        // Any ID part named 'templates' marks the document as a template
        $isTemplate = in_array('templates', $parts, true);

        $baseMetadata = [
            'document_id' => $id,
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report'
        ];
        // The second ID part is the modality
        if (isset($parts[1])) {
            $baseMetadata['modality'] = $parts[1];
        }

        // For templates, don't set institution, date or year; only registration/name
        if ($isTemplate) {
            return array_merge($baseMetadata, $this->parseRegistrationAndName($lastPart));
        }
        if (!isset($parts[2])) {
            return $baseMetadata;
        }

        if (is_numeric($parts[2])) {
            // Format: reports:mri:2024:g287-name-surname (year format)
            $baseMetadata['year'] = $parts[2];
            // The institution is not part of the ID, so take the default from the config
            global $conf;
            $baseMetadata['institution'] = isset($conf['plugin']['dokullm']['default_institution'])
                ? $conf['plugin']['dokullm']['default_institution']
                : 'default';
            return array_merge($baseMetadata, $this->parseRegistrationAndName($lastPart));
        }

        // Format: reports:mri:institution:250620-name-surname (institution format)
        $baseMetadata['institution'] = $parts[2];
        if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
            $dateStr = $matches[1];
            // The six digits are read as DDMMYY (250620 -> 2020-06-25)
            // NOTE(review): an earlier comment claimed 250620 -> 2025-06-20 (YYMMDD); confirm the intended order
            $day = substr($dateStr, 0, 2);
            $month = substr($dateStr, 2, 2);
            $year = substr($dateStr, 4, 2);
            // Assuming 20xx for years 00-69 and 19xx for years 70-99
            $fullYear = (int)$year < 70 ? '20' . $year : '19' . $year;
            $baseMetadata['date'] = $fullYear . '-' . $month . '-' . $day;
            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
        }
        return $baseMetadata;
    }

    /**
     * Extract search tags from a DokuWiki title paragraph
     *
     * A paragraph is a title if it starts and ends with '='. Its words are lowercased and
     * words shorter than 3 bytes are dropped to reduce noise.
     *
     * @param string $paragraph The trimmed paragraph
     * @return array|null The unique tags of the title, or null if the paragraph is not a title
     */
    private function extractTitleTags($paragraph) {
        if (!preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
            return null;
        }
        $tags = [];
        foreach (preg_split('/\s+/', trim($matches[1])) as $word) {
            // Only use words with at least 3 characters to reduce noise
            if (strlen($word) >= 3) {
                $tags[] = strtolower($word);
            }
        }
        return array_unique($tags);
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
     *
     * This function handles the complete processing of a single DokuWiki file:
     * 1. Parses the file path to get the document ID
     * 2. Ensures the collection exists (unless already checked) and looks up its ID
     * 3. Checks if the document needs updating using timestamp comparison
     * 4. Reads the file content only if an update is needed
     * 5. Splits the document into chunks (paragraphs separated by blank lines)
     * 6. Extracts metadata from the DokuWiki ID format
     * 7. Generates embeddings for each chunk
     * 8. Sends all chunks to ChromaDB with metadata
     *
     * Title paragraphs are not stored as chunks; their words become the 'tags' metadata of
     * the chunks that follow. Chunk numbers are paragraph positions (1-based) and
     * 'total_chunks' is the number of paragraphs, both counted including titles and
     * empty paragraphs.
     *
     * Exceptions are not propagated; they are converted into an 'error' result.
     *
     * @param string $filePath          The path to the file to process
     * @param string $collectionName    The name of the collection to use
     * @param bool   $collectionChecked Whether the collection has already been checked/created
     * @return array Result with 'status' ('success', 'skipped' or 'error'), 'message' and,
     *               on success, 'details' and 'collection_status'
     */
    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
        // Parse file path to get the DokuWiki ID
        $id = parseFilePath($filePath);
        try {
            // Create collection if it doesn't exist (only if not already checked)
            $collectionStatus = '';
            if (!$collectionChecked) {
                $collectionStatus = $this->ensureCollectionExists($collectionName);
            }
            // Get collection ID
            $collection = $this->getCollection($collectionName);
            if (!isset($collection['id'])) {
                return [
                    'status' => 'error',
                    'message' => "Collection ID not found for '{$collectionName}'"
                ];
            }
            $collectionId = $collection['id'];

            // Without a readable file neither the timestamp nor the content can be trusted
            if (!is_file($filePath) || !is_readable($filePath)) {
                return [
                    'status' => 'error',
                    'message' => "File not found or not readable: $filePath"
                ];
            }
            $fileModifiedTime = filemtime($filePath);
            // If document is up to date, skip processing
            if (!$this->needsUpdate($collectionId, $id, $fileModifiedTime)) {
                return [
                    'status' => 'skipped',
                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
                ];
            }

            $content = file_get_contents($filePath);
            if ($content === false) {
                return [
                    'status' => 'error',
                    'message' => "Could not read file: $filePath"
                ];
            }
            // Split document into chunks (paragraphs separated by blank lines)
            $paragraphs = preg_split('/\n\s*\n/', $content);
            $totalParagraphs = count($paragraphs);
            $baseMetadata = $this->buildBaseMetadata($id);

            $chunkIds = [];
            $chunkContents = [];
            $chunkMetadatas = [];
            $chunkEmbeddings = [];
            $currentTags = [];
            foreach ($paragraphs as $index => $paragraph) {
                // Skip empty paragraphs to avoid processing whitespace-only content
                $paragraph = trim($paragraph);
                if (empty($paragraph)) {
                    continue;
                }
                // Titles only update the tags used for the following chunks
                $titleTags = $this->extractTitleTags($paragraph);
                if ($titleTags !== null) {
                    $currentTags = $titleTags;
                    continue;
                }
                $chunkId = $id . '@' . ($index + 1);
                // Add chunk-specific metadata
                $metadata = $baseMetadata;
                $metadata['chunk_id'] = $chunkId;
                $metadata['chunk_number'] = $index + 1;
                $metadata['total_chunks'] = $totalParagraphs;
                if (!empty($currentTags)) {
                    $metadata['tags'] = implode(',', $currentTags);
                }
                // Store chunk data
                $chunkIds[] = $chunkId;
                $chunkContents[] = $paragraph;
                $chunkMetadatas[] = $metadata;
                $chunkEmbeddings[] = $this->generateEmbeddings($paragraph);
            }
            // If no chunks were created, skip this file
            if (empty($chunkIds)) {
                return [
                    'status' => 'skipped',
                    'message' => "No valid chunks found in file '$id'. Skipping..."
                ];
            }
            // Send all chunks to ChromaDB
            $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
            return [
                'status' => 'success',
                'message' => "Successfully sent file to ChromaDB",
                'details' => [
                    'document_id' => $id,
                    'chunks' => count($chunkIds),
                    'collection' => $collectionName
                ],
                'collection_status' => $collectionStatus
            ];
        } catch (\Exception $e) {
            return [
                'status' => 'error',
                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
            ];
        }
    }

    /**
     * Process all DokuWiki files in a directory and send them to ChromaDB
     *
     * This function recursively collects all .txt files in a directory and its subdirectories
     * (files whose name starts with an underscore are ignored). The collection name is the
     * first part of the DokuWiki ID of the first file found ('documents' if it is empty).
     * The collection is created if needed, then each file is processed individually.
     *
     * @param string $dirPath The directory path to process
     * @return array Result with status and details; per-file results are in 'results'
     */
    public function processDirectory($dirPath) {
        // Check if directory exists
        if (!is_dir($dirPath)) {
            return [
                'status' => 'error',
                'message' => "Directory does not exist: $dirPath"
            ];
        }
        // The SPL classes live in the global namespace, hence the leading backslash
        $iterator = new \RecursiveIteratorIterator(
            new \RecursiveDirectoryIterator($dirPath, \RecursiveDirectoryIterator::SKIP_DOTS),
            \RecursiveIteratorIterator::LEAVES_ONLY
        );
        $files = [];
        foreach ($iterator as $file) {
            // Process only .txt files that don't start with underscore
            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
                $files[] = $file->getPathname();
            }
        }
        // Skip if no files
        if (empty($files)) {
            return [
                'status' => 'skipped',
                'message' => "No .txt files found in directory: $dirPath"
            ];
        }
        // Use the first part of the document ID as collection name, fallback to 'documents'
        $idParts = explode(':', parseFilePath($files[0]));
        $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents';
        try {
            $this->ensureCollectionExists($collectionName);
        } catch (\Exception $e) {
            // Best effort: a failure here will surface as an error result for each file below
        }
        // Send each file; the collection check is not repeated per file
        $results = [];
        foreach ($files as $file) {
            $results[] = [
                'file' => $file,
                'result' => $this->processSingleFile($file, $collectionName, true)
            ];
        }
        return [
            'status' => 'success',
            'message' => "Finished processing directory.",
            'files_count' => count($files),
            'results' => $results
        ];
    }
}

/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory (DOKU_INC . 'data/pages/' if DOKU_INC is defined,
 *    otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Joining the non-empty path components (split on '/') with colons
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }
    // Remove the base path
    $relativePath = str_replace($pagesDir, '', $filePath);
    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);
    // Keep every non-empty path component; compare against '' so that a
    // component named "0" is not dropped
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }
    // Return the ID
    return implode(':', $idParts);
}