<?php

namespace dokuwiki\plugin\dokullm;

use RecursiveIteratorIterator;
use RecursiveDirectoryIterator;

/**
 * HTTP client for a ChromaDB server (v2 REST API) that uses Ollama to compute embeddings.
 *
 * Two cURL handles are created in the constructor and reused for every request:
 * one for ChromaDB and one for Ollama.
 */
class ChromaDBClient {
    /** @var string Base URL of the ChromaDB server, e.g. "http://host:port" */
    private $baseUrl;
    /** @var resource|\CurlHandle|null cURL handle used for ChromaDB requests */
    private $client;
    /** @var resource|\CurlHandle|null cURL handle used for Ollama requests */
    private $ollamaClient;
    /** @var string ChromaDB tenant name */
    private $tenant;
    /** @var string ChromaDB database name */
    private $database;
    /**
     * @var string Default collection name passed to the constructor.
     *             NOTE(review): stored but not read anywhere in this class; the
     *             methods below fall back to the literal 'documents' instead.
     */
    private $defaultCollection;
    /** @var string Ollama server host */
    private $ollamaHost;
    /** @var int Ollama server port */
    private $ollamaPort;
    /** @var string Ollama embeddings model */
    private $ollamaModel;

    /**
     * Initialize the ChromaDB client
     *
     * Creates the cURL handles for ChromaDB and Ollama and makes sure that the
     * specified tenant and database exist (creating them when the lookup fails).
     *
     * @param string $host ChromaDB server host
     * @param int $port ChromaDB server port
     * @param string $tenant ChromaDB tenant name
     * @param string $database ChromaDB database name
     * @param string $defaultCollection Default collection name
     * @param string $ollamaHost Ollama server host
     * @param int $ollamaPort Ollama server port
     * @param string $ollamaModel Ollama embeddings model
     * @throws \Exception If the tenant or database can be neither found nor created
     */
    public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
        // All parameters are mandatory, so there are no fallbacks here
        $this->tenant = $tenant;
        $this->database = $database;
        $this->defaultCollection = $defaultCollection;
        $this->ollamaHost = $ollamaHost;
        $this->ollamaPort = $ollamaPort;
        $this->ollamaModel = $ollamaModel;
        $this->baseUrl = "http://{$host}:{$port}";
        // ChromaDB handle
        $this->client = curl_init();
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);
        // Ollama handle
        $this->ollamaClient = curl_init();
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);
        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up the cURL handles when the object is destroyed
     *
     * @return void
     */
    public function __destruct() {
        // A handle is falsy when it was never initialised (e.g. curl_init() failed)
        foreach ([$this->client, $this->ollamaClient] as $handle) {
            if ($handle) {
                curl_close($handle);
            }
        }
    }

    /**
     * Encode a value as JSON, failing loudly instead of sending a broken body
     *
     * @param mixed $data The value to encode
     * @return string The JSON document
     * @throws \Exception If the value cannot be encoded (e.g. invalid UTF-8)
     */
    private function encodeJson($data) {
        $json = json_encode($data);
        if ($json === false) {
            throw new \Exception('JSON encode error: ' . json_last_error_msg());
        }
        return $json;
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * Builds the URL from the base URL, the '/api/v2' prefix and the endpoint,
     * sends the optional JSON body and decodes the JSON response. Tenant and
     * database are part of the endpoint path supplied by the caller.
     *
     * @param string $endpoint The API endpoint to call (path below /api/v2)
     * @param string $method The HTTP method to use (default: 'GET')
     * @param array|null $data The data to send with the request (default: null)
     * @return array|null The JSON response decoded as an array (null if the body is not valid JSON)
     * @throws \Exception If there's a cURL error, an HTTP status >= 400 or the body cannot be encoded
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        $headers = [
            'Content-Type: application/json',
            'Accept: application/json'
        ];
        // Version 2 of the API
        $url = $this->baseUrl . '/api/v2' . $endpoint;
        curl_setopt($this->client, CURLOPT_URL, $url);
        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
        if ($data) {
            curl_setopt($this->client, CURLOPT_POSTFIELDS, $this->encodeJson($data));
        } else {
            // The handle is reused, so clear any body left over from a previous request
            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
        }
        // Call
        $response = curl_exec($this->client);
        if ($response === false || curl_error($this->client)) {
            throw new \Exception('Curl error: ' . curl_error($this->client));
        }
        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
        if ($httpCode >= 400) {
            throw new \Exception("HTTP Error: $httpCode, Response: $response");
        }
        // Return the decoded response
        return json_decode($response, true);
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * Posts the text to Ollama's /api/embeddings endpoint using the configured model.
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector
     * @throws \Exception If the model is not a string, the request fails or the response has no embedding
     */
    public function generateEmbeddings($text) {
        // Ensure model is a string
        $model = $this->ollamaModel;
        if (!is_string($model)) {
            throw new \Exception("Ollama model must be a string, got: " . gettype($model));
        }

        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
        $data = [
            'model' => $model,
            'prompt' => $text,
            'keep_alive' => '30m'
        ];
        // Setting POSTFIELDS makes cURL issue a POST
        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, $this->encodeJson($data));
        $response = curl_exec($this->ollamaClient);
        if ($response === false || curl_error($this->ollamaClient)) {
            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
        }
        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
        if ($httpCode >= 400) {
            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }
        $result = json_decode($response, true);
        if (!isset($result['embedding'])) {
            throw new \Exception("Ollama response missing embedding: " . $response);
        }
        return $result['embedding'];
    }

    /**
     * Resolve the collection name to use, falling back to 'documents' if empty
     *
     * @param string $name The collection name supplied by the caller
     * @return string The supplied name, or 'documents' when it is empty
     */
    private function resolveCollectionName($name) {
        return empty($name) ? 'documents' : $name;
    }

    /**
     * Look up the ID of a collection by its name
     *
     * @param string $collectionName The (already resolved) collection name
     * @return string The collection ID
     * @throws \Exception If the collection is not found or has no ID
     */
    private function getCollectionId($collectionName) {
        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }
        return $collection['id'];
    }

    /**
     * List all collections in the database
     *
     * Retrieves a list of all collections in the configured tenant and database.
     *
     * @return array List of collections
     * @throws \Exception If the request fails
     */
    public function listCollections() {
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
        return $this->makeRequest($endpoint);
    }

    /**
     * Get a collection by name
     *
     * Lists the collections of the configured database and returns the entry
     * whose name matches.
     *
     * @param string $name The name of the collection to retrieve ('documents' if empty)
     * @return array The collection information
     * @throws \Exception If the request fails or the collection is not found
     */
    public function getCollection($name) {
        $name = $this->resolveCollectionName($name);
        $collections = $this->listCollections();
        // A non-JSON response decodes to null; treat that as "nothing found"
        if (is_array($collections)) {
            foreach ($collections as $collection) {
                if (isset($collection['name']) && $collection['name'] === $name) {
                    return $collection;
                }
            }
        }
        // If not found, throw exception
        throw new \Exception("Collection '{$name}' not found");
    }

    /**
     * Create a new collection
     *
     * Creates a new collection with the specified name and optional metadata.
     *
     * @param string $name The name of the collection to create ('documents' if empty)
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createCollection($name, $metadata = null) {
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
        $data = ['name' => $this->resolveCollectionName($name)];
        if ($metadata) {
            $data['metadata'] = $metadata;
        }
        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Delete a collection by name
     *
     * Resolves the collection ID from the name and deletes that collection.
     *
     * @param string $name The name of the collection to delete ('documents' if empty)
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function deleteCollection($name) {
        $collectionId = $this->getCollectionId($this->resolveCollectionName($name));
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
        return $this->makeRequest($endpoint, 'DELETE');
    }

    /**
     * Get a document by its ID from a collection
     *
     * Retrieves a document from the specified collection using its ID.
     *
     * @param string $collectionName The name of the collection to get the document from ('documents' if empty)
     * @param string $documentId The document ID to retrieve
     * @param array $include What to include in the response (default: ["metadatas", "documents"])
     * @return array The retrieved document
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        $collectionId = $this->getCollectionId($this->resolveCollectionName($collectionName));
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
        $data = [
            'ids' => [$documentId],
            'include' => $include
        ];
        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Add documents to a collection
     *
     * Sends the documents to the collection's upsert endpoint. Each document must
     * have a corresponding ID. Optional metadata and pre-computed embeddings can
     * also be provided.
     *
     * @param string $collectionName The name of the collection to add documents to ('documents' if empty)
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        $collectionId = $this->getCollectionId($this->resolveCollectionName($collectionName));
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];
        // Optional per-document metadata
        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }
        // Optional pre-computed embeddings
        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }
        // Return the response
        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Determines whether a document should be reprocessed by comparing the file's last modification
     * time with the processed_at timestamp stored in the document's metadata. The function checks
     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
     * not included in the database.
     *
     * Any error while checking is treated as "needs update"; no exception escapes this method.
     *
     * @param string $collectionId The ID of the collection to check documents in
     * @param string $documentId The base document ID to check (without chunk suffixes)
     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if document needs to be updated (doesn't exist, has no usable timestamp, or is outdated), false if up to date
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
            $data = [
                // First 3 chunk numbers, since leading chunks might be titles and skipped
                'ids' => [
                    $documentId . '@1',
                    $documentId . '@2',
                    $documentId . '@3'
                ],
                'include' => [
                    "metadatas"
                ],
                'limit' => 1
            ];
            $result = $this->makeRequest($endpoint, 'POST', $data);
            // If no documents found, it needs to be added
            if (empty($result['ids'])) {
                return true;
            }
            // Without metadata there is no timestamp to compare against
            if (empty($result['metadatas']) || !is_array($result['metadatas'])) {
                return true;
            }
            // Only the first metadata entry is inspected
            $metadata = isset($result['metadatas'][0]) ? $result['metadatas'][0] : null;
            if (!is_array($metadata) || !isset($metadata['processed_at'])) {
                return true;
            }
            $processedTimestamp = strtotime($metadata['processed_at']);
            // An unparsable timestamp cannot prove the document is current
            if ($processedTimestamp === false) {
                return true;
            }
            // Outdated when the file is newer than the processed time
            return $fileModifiedTime > $processedTimestamp;
        } catch (\Exception $e) {
            // If there's an error checking the document, assume it needs to be updated
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * Generates an embedding for each query text via Ollama and sends them to the
     * collection's query endpoint. Results can be filtered by metadata using the
     * where parameter.
     *
     * @param string $collectionName The name of the collection to query ('documents' if empty)
     * @param array|string $queryTexts The query texts to search for (a single string is accepted too)
     * @param int $nResults The number of results to return (default: 5)
     * @param array|null $where Optional filter conditions for metadata
     * @return array The query results
     * @throws \Exception If the collection or its ID is not found, or a request fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        $collectionId = $this->getCollectionId($this->resolveCollectionName($collectionName));
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
        // Generate embeddings for query texts
        $queryEmbeddings = [];
        foreach ((array)$queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }
        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];
        // Add where clause for metadata filtering if provided
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }
        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * Sends a heartbeat request to verify that the ChromaDB server is running.
     *
     * @return array The response from the heartbeat endpoint
     * @throws \Exception If the request fails
     */
    public function heartbeat() {
        return $this->makeRequest('/heartbeat', 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * Retrieves authentication and identity information from the ChromaDB server.
     *
     * @return array The response from the auth/identity endpoint
     * @throws \Exception If the request fails
     */
    public function getIdentity() {
        // The v2 API exposes the identity below /auth
        return $this->makeRequest('/auth/identity', 'GET');
    }

    /**
     * Ensure that the specified tenant and database exist
     *
     * Looks up the tenant and the database; when a lookup throws (for any reason),
     * a create request is attempted instead.
     *
     * @return void
     * @throws \Exception If a create request fails
     */
    private function ensureTenantAndDatabase() {
        try {
            $this->getTenant($this->tenant);
        } catch (\Exception $e) {
            // Lookup failed, assume the tenant doesn't exist and create it
            $this->createTenant($this->tenant);
        }
        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (\Exception $e) {
            // Lookup failed, assume the database doesn't exist and create it
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws \Exception If the request fails
     */
    public function getTenant($tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}", 'GET');
    }

    /**
     * Create a new tenant
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createTenant($tenantName) {
        return $this->makeRequest('/tenants', 'POST', ['name' => $tenantName]);
    }

    /**
     * Get database information
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The database information
     * @throws \Exception If the request fails
     */
    public function getDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases/{$databaseName}", 'GET');
    }

    /**
     * Create a new database
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases", 'POST', ['name' => $databaseName]);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * Tries to look the collection up; when the lookup throws (for any reason),
     * a create request is attempted instead.
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     * @throws \Exception If the create request fails
     */
    public function ensureCollectionExists($collectionName) {
        try {
            $this->getCollection($collectionName);
            return "Collection '$collectionName' already exists.";
        } catch (\Exception $e) {
            // Lookup failed, assume the collection doesn't exist and create it
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }
    }

    /**
     * Split the last ID part into an optional registration and a name
     *
     * The text before the first '-' is treated as a registration only when it
     * contains at least one digit; otherwise the whole part is the name.
     * Dashes in the name are replaced by spaces.
     *
     * @param string $lastPart The last colon-separated part of the DokuWiki ID
     * @return array Array with 'name' and, when detected, 'registration'
     */
    private function parseRegistrationAndName($lastPart) {
        if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
            && preg_match('/[0-9]/', $matches[1])) {
            return [
                'registration' => $matches[1],
                'name' => str_replace('-', ' ', $matches[2])
            ];
        }
        // No registration pattern found, treat the entire part as the name
        return ['name' => str_replace('-', ' ', $lastPart)];
    }

    /**
     * Build the metadata shared by all chunks of a document from its DokuWiki ID
     *
     * @param string $id The DokuWiki ID (colon-separated)
     * @return array The base metadata
     */
    private function buildBaseMetadata($id) {
        $parts = explode(':', $id);
        $lastPart = end($parts);
        // Any ID part equal to 'templates' marks the page as a template
        $isTemplate = in_array('templates', $parts, true);
        $baseMetadata = [
            'document_id' => $id,
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report'
        ];
        // Modality is the second part
        if (isset($parts[1])) {
            $baseMetadata['modality'] = $parts[1];
        }
        // Templates carry no institution, date or year; only registration/name
        if ($isTemplate) {
            return array_merge($baseMetadata, $this->parseRegistrationAndName($lastPart));
        }
        if (!isset($parts[2])) {
            return $baseMetadata;
        }
        if (is_numeric($parts[2])) {
            // Format: reports:mri:2024:g287-name-surname (third part is the year)
            $baseMetadata['year'] = $parts[2];
            // Institution comes from the plugin configuration
            global $conf;
            $baseMetadata['institution'] = isset($conf['plugin']['dokullm']['default_institution'])
                ? $conf['plugin']['dokullm']['default_institution']
                : 'default';
            return array_merge($baseMetadata, $this->parseRegistrationAndName($lastPart));
        }
        // Format: reports:mri:institution:250620-name-surname (third part is the institution)
        $baseMetadata['institution'] = $parts[2];
        if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
            $dateStr = $matches[1];
            // The six digits are read as DDMMYY, so 250620 becomes 2020-06-25.
            // NOTE(review): an earlier comment claimed 250620 -> 2025-06-20 (YYMMDD);
            // confirm which layout the page names really use.
            $day = substr($dateStr, 0, 2);
            $month = substr($dateStr, 2, 2);
            $year = substr($dateStr, 4, 2);
            // 20xx for years 00-69 and 19xx for years 70-99
            $fullYear = ((int)$year < 70 ? '20' : '19') . $year;
            $baseMetadata['date'] = $fullYear . '-' . $month . '-' . $day;
            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
        }
        return $baseMetadata;
    }

    /**
     * Turn a DokuWiki title paragraph into a list of tags
     *
     * @param string $paragraph The trimmed paragraph
     * @return array|null Lower-cased unique words of at least 3 characters, or null if the paragraph is not a title
     */
    private function extractTitleTags($paragraph) {
        // A title starts and ends with '='
        if (!preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
            return null;
        }
        $tags = [];
        foreach (preg_split('/\s+/', trim($matches[1])) as $word) {
            // Only use words of at least 3 characters to reduce noise
            if (strlen($word) >= 3) {
                $tags[] = strtolower($word);
            }
        }
        return array_values(array_unique($tags));
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
     *
     * This function handles the complete processing of a single DokuWiki file:
     * 1. Parses the file path to obtain the document ID
     * 2. Ensures the collection exists (unless already checked) and resolves its ID
     * 3. Checks if the document needs updating using timestamp comparison
     * 4. Reads the file content only if an update is needed
     * 5. Splits the document into chunks (paragraphs separated by blank lines)
     * 6. Extracts metadata from the DokuWiki ID format
     * 7. Generates embeddings for each chunk
     * 8. Sends all chunks to ChromaDB with metadata
     *
     * Title paragraphs are not stored; their words become the 'tags' metadata of the
     * chunks that follow. Chunk numbers are paragraph positions, so they may have gaps,
     * and 'total_chunks' counts all paragraphs including skipped ones.
     *
     * Supported ID formats:
     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
     *
     * Exceptions are caught and reported through the returned array.
     *
     * @param string $filePath The path to the file to process
     * @param string $collectionName The name of the collection to use
     * @param bool $collectionChecked Whether the collection has already been checked/created
     * @return array Result with 'status' ('success', 'skipped' or 'error'), 'message' and, on success, 'details' and 'collection_status'
     */
    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
        // Parse file path to obtain the DokuWiki ID
        $id = parseFilePath($filePath);
        try {
            // Create collection if it doesn't exist (only if not already checked)
            $collectionStatus = '';
            if (!$collectionChecked) {
                $collectionStatus = $this->ensureCollectionExists($collectionName);
            }
            // Get collection ID
            $collection = $this->getCollection($collectionName);
            if (!isset($collection['id'])) {
                return [
                    'status' => 'error',
                    'message' => "Collection ID not found for '{$collectionName}'"
                ];
            }
            $collectionId = $collection['id'];
            // Bail out early on files that cannot be read
            if (!is_file($filePath) || !is_readable($filePath)) {
                return [
                    'status' => 'error',
                    'message' => "File '$filePath' does not exist or is not readable"
                ];
            }
            // Skip documents that are already up to date
            $fileModifiedTime = filemtime($filePath);
            if (!$this->needsUpdate($collectionId, $id, $fileModifiedTime)) {
                return [
                    'status' => 'skipped',
                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
                ];
            }
            // Read file content
            $content = file_get_contents($filePath);
            if ($content === false) {
                return [
                    'status' => 'error',
                    'message' => "Unable to read file '$filePath'"
                ];
            }
            // Split document into chunks (paragraphs separated by blank lines)
            $paragraphs = preg_split('/\n\s*\n/', $content);
            $totalParagraphs = count($paragraphs);
            $baseMetadata = $this->buildBaseMetadata($id);
            $chunkIds = [];
            $chunkContents = [];
            $chunkMetadatas = [];
            $chunkEmbeddings = [];
            $currentTags = [];
            foreach ($paragraphs as $index => $paragraph) {
                // Skip whitespace-only paragraphs
                $paragraph = trim($paragraph);
                if ($paragraph === '') {
                    continue;
                }
                // Titles become tags for the following chunks and are not stored themselves
                $titleTags = $this->extractTitleTags($paragraph);
                if ($titleTags !== null) {
                    $currentTags = $titleTags;
                    continue;
                }
                // Chunk numbering follows the paragraph position (1-based)
                $chunkNumber = $index + 1;
                $chunkId = $id . '@' . $chunkNumber;
                $metadata = $baseMetadata;
                $metadata['chunk_id'] = $chunkId;
                $metadata['chunk_number'] = $chunkNumber;
                $metadata['total_chunks'] = $totalParagraphs;
                if (!empty($currentTags)) {
                    $metadata['tags'] = implode(',', $currentTags);
                }
                // Store chunk data
                $chunkIds[] = $chunkId;
                $chunkContents[] = $paragraph;
                $chunkMetadatas[] = $metadata;
                $chunkEmbeddings[] = $this->generateEmbeddings($paragraph);
            }
            // If no chunks were created, skip this file
            if (empty($chunkIds)) {
                return [
                    'status' => 'skipped',
                    'message' => "No valid chunks found in file '$id'. Skipping..."
                ];
            }
            // Send all chunks to ChromaDB
            $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
            return [
                'status' => 'success',
                'message' => "Successfully sent file to ChromaDB",
                'details' => [
                    'document_id' => $id,
                    'chunks' => count($chunkIds),
                    'collection' => $collectionName
                ],
                'collection_status' => $collectionStatus
            ];
        } catch (\Exception $e) {
            return [
                'status' => 'error',
                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
            ];
        }
    }

}

/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory (DOKU_INC . 'data/pages/' when DOKU_INC is
 *    defined, otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }
    // Remove the base path
    $relativePath = str_replace($pagesDir, '', $filePath);
    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);
    // Keep every non-empty path segment; compare against '' so that a segment
    // named "0" is not discarded as falsy
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }
    // Return the ID
    return implode(':', $idParts);
}