<?php

namespace dokuwiki\plugin\dokullm;

use RecursiveIteratorIterator;
use RecursiveDirectoryIterator;

/**
 * HTTP client for a ChromaDB server (v2 REST API) that uses an Ollama server
 * to compute embeddings.
 *
 * Two cURL handles are kept for the lifetime of the object: one for ChromaDB
 * requests and one for Ollama requests.
 */
class ChromaDBClient {
    private $baseUrl;
    private $client;
    private $ollamaClient;
    private $tenant;
    private $database;
    private $defaultCollection;
    private $ollamaHost;
    private $ollamaPort;
    private $ollamaModel;

    /**
     * Initialize the ChromaDB client
     *
     * Creates a new ChromaDB client instance with the specified connection parameters.
     * Also ensures that the specified tenant and database exist, which means the
     * constructor already talks to the ChromaDB server.
     *
     * @param string $host ChromaDB server host
     * @param int $port ChromaDB server port
     * @param string $tenant ChromaDB tenant name
     * @param string $database ChromaDB database name
     * @param string $defaultCollection Collection name used when a method receives an empty name
     * @param string $ollamaHost Ollama server host
     * @param int $ollamaPort Ollama server port
     * @param string $ollamaModel Ollama embeddings model
     * @throws \Exception If cURL cannot be initialized or the tenant/database cannot be created
     */
    public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
        // All parameters are mandatory; there are no fallbacks here
        $this->tenant = $tenant;
        $this->database = $database;
        $this->defaultCollection = $defaultCollection;
        $this->ollamaHost = $ollamaHost;
        $this->ollamaPort = $ollamaPort;
        $this->ollamaModel = $ollamaModel;
        $this->baseUrl = "http://{$host}:{$port}";

        $this->client = curl_init();
        $this->ollamaClient = curl_init();
        if ($this->client === false || $this->ollamaClient === false) {
            throw new \Exception('Unable to initialize cURL');
        }

        // ChromaDB handle
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // Ollama handle
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up the cURL handles when the object is destroyed
     *
     * Only resource-type handles are closed explicitly. Object handles (and
     * handles that were never initialized) are skipped, so destruction never
     * fails on a partially constructed instance.
     *
     * @return void
     */
    public function __destruct() {
        if (is_resource($this->client)) {
            curl_close($this->client);
        }
        if (is_resource($this->ollamaClient)) {
            curl_close($this->ollamaClient);
        }
    }

    /**
     * Resolve the collection name to use for a request
     *
     * Order of precedence: the given name, the default collection passed to the
     * constructor, and finally the literal 'documents'.
     *
     * @param string|null $name Collection name supplied by the caller
     * @return string The collection name to use
     */
    private function resolveCollectionName($name) {
        if (!empty($name)) {
            return $name;
        }
        if (!empty($this->defaultCollection)) {
            return $this->defaultCollection;
        }
        return 'documents';
    }

    /**
     * Build the endpoint prefix for the collections of the configured tenant and database
     *
     * @return string Endpoint path ending in "/collections"
     */
    private function collectionsPath() {
        return "/tenants/{$this->tenant}/databases/{$this->database}/collections";
    }

    /**
     * Encode a request payload as JSON
     *
     * @param mixed $data The payload to encode
     * @return string The JSON document
     * @throws \Exception If the payload cannot be encoded (e.g. invalid UTF-8)
     */
    private function encodeJson($data) {
        $json = json_encode($data);
        if ($json === false) {
            throw new \Exception('JSON encode error: ' . json_last_error_msg());
        }
        return $json;
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * Sends the request to "<baseUrl>/api/v2<endpoint>". Tenant and database are
     * not sent as headers; callers place them in the endpoint path.
     *
     * @param string $endpoint The API endpoint to call
     * @param string $method The HTTP method to use (default: 'GET')
     * @param array|null $data The data to send with the request (default: null)
     * @return array|null The JSON response decoded as an array (null if the body is not valid JSON)
     * @throws \Exception If there's a cURL error or HTTP error
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        $url = $this->baseUrl . '/api/v2' . $endpoint;
        curl_setopt($this->client, CURLOPT_URL, $url);
        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // The handle is reused, so the body must be reset when there is nothing to send
        if ($data) {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, $this->encodeJson($data));
        } else {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, '');
        }

        $response = curl_exec($this->client);
        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);

        if ($response === false || curl_errno($this->client) !== 0) {
            throw new \Exception('Curl error: ' . curl_error($this->client));
        }
        if ($httpCode >= 400) {
            throw new \Exception("HTTP Error: $httpCode, Response: $response");
        }

        return json_decode($response, true);
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector
     * @throws \Exception On cURL/HTTP errors or when the response has no 'embedding' key
     */
    public function generateEmbeddings($text) {
        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);

        $data = [
            'model' => $this->ollamaModel,
            'prompt' => $text,
            'keep_alive' => '30m'
        ];
        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, $this->encodeJson($data));

        $response = curl_exec($this->ollamaClient);
        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);

        if ($response === false || curl_errno($this->ollamaClient) !== 0) {
            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
        }
        if ($httpCode >= 400) {
            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }

        $result = json_decode($response, true);
        if (!isset($result['embedding'])) {
            throw new \Exception("Ollama response missing embedding: " . $response);
        }
        return $result['embedding'];
    }

    /**
     * List all collections in the database
     *
     * Retrieves a list of all collections in the configured tenant and database.
     *
     * @return array List of collections
     * @throws \Exception If the request fails
     */
    public function listCollections() {
        return $this->makeRequest($this->collectionsPath());
    }

    /**
     * Get a collection by name
     *
     * Lists the collections of the configured tenant/database and returns the
     * entry whose 'name' matches.
     *
     * @param string $name The name of the collection to retrieve (empty: default collection)
     * @return array The collection information
     * @throws \Exception If the request fails or the collection is not found
     */
    public function getCollection($name) {
        $name = $this->resolveCollectionName($name);
        $collections = $this->makeRequest($this->collectionsPath());

        // A non-array response (e.g. an undecodable body) is treated as "not found"
        if (is_array($collections)) {
            foreach ($collections as $collection) {
                if (isset($collection['name']) && $collection['name'] === $name) {
                    return $collection;
                }
            }
        }

        throw new \Exception("Collection '{$name}' not found");
    }

    /**
     * Look up a collection by name and return its ID
     *
     * @param string $collectionName The (already resolved) collection name
     * @return string The collection ID
     * @throws \Exception If the collection is not found or has no ID
     */
    private function requireCollectionId($collectionName) {
        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }
        return $collection['id'];
    }

    /**
     * Create a new collection
     *
     * Creates a new collection with the specified name and optional metadata.
     *
     * @param string $name The name of the collection to create (empty: default collection)
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createCollection($name, $metadata = null) {
        $data = ['name' => $this->resolveCollectionName($name)];
        if ($metadata) {
            $data['metadata'] = $metadata;
        }
        return $this->makeRequest($this->collectionsPath(), 'POST', $data);
    }

    /**
     * Delete a collection by name
     *
     * @param string $name The name of the collection to delete (empty: default collection)
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function deleteCollection($name) {
        $collectionId = $this->requireCollectionId($this->resolveCollectionName($name));
        return $this->makeRequest($this->collectionsPath() . "/{$collectionId}", 'DELETE');
    }

    /**
     * Get a document by its ID from a collection
     *
     * @param string $collectionName The name of the collection to get the document from
     * @param string $documentId The document ID to retrieve
     * @param array $include What to include in the response (default: ["metadatas", "documents"])
     * @return array The retrieved document
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        $collectionId = $this->requireCollectionId($this->resolveCollectionName($collectionName));
        $data = [
            'ids' => [$documentId],
            'include' => $include
        ];
        return $this->makeRequest($this->collectionsPath() . "/{$collectionId}/get", 'POST', $data);
    }

    /**
     * Add documents to a collection
     *
     * Sends the documents to the collection's "upsert" endpoint. Each document must
     * have a corresponding ID. Optional metadata and pre-computed embeddings can
     * also be provided.
     *
     * @param string $collectionName The name of the collection to add documents to
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        $collectionId = $this->requireCollectionId($this->resolveCollectionName($collectionName));
        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];
        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }
        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }
        return $this->makeRequest($this->collectionsPath() . "/{$collectionId}/upsert", 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Compares the file's last modification time with the 'processed_at' timestamp
     * stored in the document's metadata. The first 3 chunk IDs (@1, @2, @3) are
     * requested since the first chunks might be titles and therefore not stored.
     *
     * Any error while checking is swallowed on purpose and reported as "needs
     * update", so this method does not throw.
     *
     * @param string $collectionId The ID of the collection to check documents in
     * @param string $documentId The base document ID to check (without chunk suffixes)
     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if document needs to be updated (doesn't exist, has no usable timestamp, is outdated, or the check failed), false if up to date
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $data = [
                'ids' => [
                    $documentId . '@1',
                    $documentId . '@2',
                    $documentId . '@3'
                ],
                'include' => [
                    "metadatas"
                ],
                'limit' => 1
            ];
            $result = $this->makeRequest($this->collectionsPath() . "/{$collectionId}/get", 'POST', $data);

            // No stored chunk: the document has to be added
            if (empty($result['ids'])) {
                return true;
            }

            // A stored chunk without metadata carries no timestamp to compare against
            if (empty($result['metadatas']) || !is_array($result['metadatas'])) {
                return true;
            }
            $metadata = isset($result['metadatas'][0]) ? $result['metadatas'][0] : null;
            if (!isset($metadata['processed_at'])) {
                return true;
            }

            // An unparsable timestamp is treated like a missing one
            $processedTimestamp = strtotime($metadata['processed_at']);
            if ($processedTimestamp === false) {
                return true;
            }

            return $fileModifiedTime > $processedTimestamp;
        } catch (\Exception $e) {
            // If there's an error checking the document, assume it needs to be updated
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * Generates an embedding for every query text via Ollama and sends them to
     * the collection's "query" endpoint. Results can be filtered by metadata
     * using the where parameter.
     *
     * @param string $collectionName The name of the collection to query
     * @param array $queryTexts The query texts to search for
     * @param int $nResults The number of results to return (default: 5)
     * @param array|null $where Optional filter conditions for metadata
     * @return array The query results
     * @throws \Exception If the collection or its ID is not found, or a request fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        $collectionId = $this->requireCollectionId($this->resolveCollectionName($collectionName));

        $queryEmbeddings = [];
        foreach ($queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }

        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }

        return $this->makeRequest($this->collectionsPath() . "/{$collectionId}/query", 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * @return array The response from the heartbeat endpoint
     * @throws \Exception If the request fails
     */
    public function heartbeat() {
        return $this->makeRequest('/heartbeat', 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * @return array The response from the identity endpoint
     * @throws \Exception If the request fails
     */
    public function getIdentity() {
        return $this->makeRequest('/identity', 'GET');
    }

    /**
     * Ensure that the specified tenant and database exist
     *
     * Any failure of the lookup request (not only "not found") triggers a create
     * attempt; a failing create request propagates its exception.
     *
     * @return void
     * @throws \Exception If the tenant or database cannot be created
     */
    private function ensureTenantAndDatabase() {
        try {
            $this->getTenant($this->tenant);
        } catch (\Exception $e) {
            $this->createTenant($this->tenant);
        }

        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (\Exception $e) {
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws \Exception If the request fails
     */
    public function getTenant($tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}", 'GET');
    }

    /**
     * Create a new tenant
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createTenant($tenantName) {
        return $this->makeRequest('/tenants', 'POST', ['name' => $tenantName]);
    }

    /**
     * Get database information
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The database information
     * @throws \Exception If the request fails
     */
    public function getDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases/{$databaseName}", 'GET');
    }

    /**
     * Create a new database
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases", 'POST', ['name' => $databaseName]);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * Any failure of the lookup (not only "not found") triggers a create attempt.
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     * @throws \Exception If the collection has to be created and the request fails
     */
    public function ensureCollectionExists($collectionName) {
        try {
            $this->getCollection($collectionName);
            return "Collection '$collectionName' already exists.";
        } catch (\Exception $e) {
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }
    }

    /**
     * Split the last part of a DokuWiki ID into registration number and name
     *
     * A leading "<token>-" is taken as the registration only when the token
     * contains at least one digit; otherwise the whole part is the name.
     * Dashes in the name are replaced by spaces.
     *
     * @param string $lastPart The last colon-separated part of the ID
     * @return array Array with 'name' and, when found, 'registration'
     */
    private function parseRegistrationAndName($lastPart) {
        if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
            && preg_match('/[0-9]/', $matches[1])
        ) {
            return [
                'registration' => $matches[1],
                'name' => str_replace('-', ' ', $matches[2])
            ];
        }
        return ['name' => str_replace('-', ' ', $lastPart)];
    }

    /**
     * Derive the metadata shared by all chunks of a document from its DokuWiki ID
     *
     * @param string $id The DokuWiki ID (colon-separated)
     * @return array The base metadata (document_id, processed_at, type, and format-dependent keys)
     */
    private function buildBaseMetadata($id) {
        $parts = explode(':', $id);
        $lastPart = end($parts);
        $isTemplate = in_array('templates', $parts, true);

        $metadata = [
            'document_id' => $id,
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report'
        ];

        // The second part of the ID is the modality
        if (isset($parts[1])) {
            $metadata['modality'] = $parts[1];
        }

        // Templates carry no institution, date or year; only the name is extracted
        if ($isTemplate) {
            return array_merge($metadata, $this->parseRegistrationAndName($lastPart));
        }

        if (!isset($parts[2])) {
            return $metadata;
        }

        if (is_numeric($parts[2])) {
            // Format: reports:mri:2024:g287-name-surname (third part is the year)
            global $conf;
            $metadata['year'] = $parts[2];
            $metadata['institution'] = isset($conf['plugin']['dokullm']['default_institution'])
                ? $conf['plugin']['dokullm']['default_institution']
                : 'default';
            return array_merge($metadata, $this->parseRegistrationAndName($lastPart));
        }

        // Format: reports:mri:institution:250620-name-surname (third part is the institution)
        $metadata['institution'] = $parts[2];
        if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
            // The six digits are read as DDMMYY, so 250620 becomes 2020-06-25.
            // NOTE(review): an earlier comment gave the example "250620 -> 2025-06-20"
            // (YYMMDD), which does not match this parsing - confirm the intended order.
            $day = substr($matches[1], 0, 2);
            $month = substr($matches[1], 2, 2);
            $year = substr($matches[1], 4, 2);
            // 20xx for years 00-69 and 19xx for years 70-99
            $fullYear = ((int)$year < 70 ? '20' : '19') . $year;
            $metadata['date'] = $fullYear . '-' . $month . '-' . $day;
            $metadata['name'] = str_replace('-', ' ', $matches[2]);
        }

        return $metadata;
    }

    /**
     * Turn a DokuWiki title paragraph into a list of tags
     *
     * @param string $paragraph A trimmed paragraph
     * @return array|null Lower-cased unique words of at least 3 characters, or null if the paragraph is not a title
     */
    private function extractTitleTags($paragraph) {
        // A title starts and ends with '='
        if (!preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
            return null;
        }

        $tags = [];
        foreach (preg_split('/\s+/', trim($matches[1])) as $word) {
            // Drop very short words to reduce noise
            if (strlen($word) >= 3) {
                $tags[] = strtolower($word);
            }
        }
        return array_unique($tags);
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
     *
     * This function handles the complete processing of a single DokuWiki file:
     * 1. Parses the file path to obtain the document ID
     * 2. Verifies the file is readable (before any request is made)
     * 3. Ensures the collection exists (unless already checked) and looks up its ID
     * 4. Checks if the document needs updating using timestamp comparison
     * 5. Splits the content into chunks (paragraphs separated by blank lines)
     * 6. Extracts metadata from the DokuWiki ID format
     * 7. Generates embeddings for each chunk
     * 8. Sends all chunks to ChromaDB with metadata
     *
     * Title paragraphs are not stored as chunks; their words become the 'tags'
     * metadata of the chunks that follow. Chunk numbers are paragraph positions
     * (1-based), and 'total_chunks' is the number of paragraphs including titles
     * and empty ones.
     *
     * Supported ID formats:
     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
     *
     * @param string $filePath The path to the file to process
     * @param string $collectionName The name of the collection to use
     * @param bool $collectionChecked Whether the collection has already been checked/created
     * @return array Result with 'status' ('success', 'skipped' or 'error'), 'message' and, on success, 'details' and 'collection_status'
     */
    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
        $id = parseFilePath($filePath);

        // Fail early on unreadable files instead of storing nothing after several requests
        if (!is_file($filePath) || !is_readable($filePath)) {
            return [
                'status' => 'error',
                'message' => "File '$filePath' does not exist or is not readable"
            ];
        }

        try {
            // Create collection if it doesn't exist (only if not already checked)
            $collectionStatus = '';
            if (!$collectionChecked) {
                $collectionStatus = $this->ensureCollectionExists($collectionName);
            }

            $collection = $this->getCollection($collectionName);
            if (!isset($collection['id'])) {
                return [
                    'status' => 'error',
                    'message' => "Collection ID not found for '{$collectionName}'"
                ];
            }
            $collectionId = $collection['id'];

            $fileModifiedTime = filemtime($filePath);
            if ($fileModifiedTime === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read modification time of '$filePath'"
                ];
            }

            // If document is up to date, skip processing
            if (!$this->needsUpdate($collectionId, $id, $fileModifiedTime)) {
                return [
                    'status' => 'skipped',
                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
                ];
            }

            $content = file_get_contents($filePath);
            if ($content === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read file '$filePath'"
                ];
            }

            // Paragraphs are separated by at least one blank line
            $paragraphs = preg_split('/\n\s*\n/', $content);
            $baseMetadata = $this->buildBaseMetadata($id);

            $chunkIds = [];
            $chunkContents = [];
            $chunkMetadatas = [];
            $chunkEmbeddings = [];
            $currentTags = [];
            foreach ($paragraphs as $index => $paragraph) {
                // Skip whitespace-only paragraphs
                $paragraph = trim($paragraph);
                if ($paragraph === '') {
                    continue;
                }

                // Titles only update the tags applied to the following chunks
                $titleTags = $this->extractTitleTags($paragraph);
                if ($titleTags !== null) {
                    $currentTags = $titleTags;
                    continue;
                }

                $chunkId = $id . '@' . ($index + 1);
                $embeddings = $this->generateEmbeddings($paragraph);

                $metadata = $baseMetadata;
                $metadata['chunk_id'] = $chunkId;
                $metadata['chunk_number'] = $index + 1;
                $metadata['total_chunks'] = count($paragraphs);
                if (!empty($currentTags)) {
                    $metadata['tags'] = implode(',', $currentTags);
                }

                $chunkIds[] = $chunkId;
                $chunkContents[] = $paragraph;
                $chunkMetadatas[] = $metadata;
                $chunkEmbeddings[] = $embeddings;
            }

            // If no chunks were created, skip this file
            if (empty($chunkIds)) {
                return [
                    'status' => 'skipped',
                    'message' => "No valid chunks found in file '$id'. Skipping..."
                ];
            }

            // Send all chunks to ChromaDB
            $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);

            return [
                'status' => 'success',
                'message' => "Successfully sent file to ChromaDB",
                'details' => [
                    'document_id' => $id,
                    'chunks' => count($chunkIds),
                    'collection' => $collectionName
                ],
                'collection_status' => $collectionStatus
            ];
        } catch (\Exception $e) {
            return [
                'status' => 'error',
                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
            ];
        }
    }

}

/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory (DOKU_INC . 'data/pages/' when DOKU_INC is
 *    defined, otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons (empty segments are dropped)
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    $relativePath = str_replace($pagesDir, '', $filePath);
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Compare against '' explicitly: empty()/array_filter() would also drop a segment named "0"
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}