xref: /plugin/dokullm/ChromaDBClient.php (revision 7f9bf09471e813ffab41c15829b843bd35668228)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5class ChromaDBClient {
6    private $baseUrl;
7    private $client;
8    private $ollamaClient;
9    private $tenant;
10    private $database;
11    private $ollamaHost;
12    private $ollamaPort;
13    /**
14     * Initialize the ChromaDB client
15     *
16     * Creates a new ChromaDB client instance with the specified connection parameters.
17     * Also ensures that the specified tenant and database exist.
18     *
19     * @param string $host ChromaDB server host
20     * @param int $port ChromaDB server port
21     * @param string $tenant ChromaDB tenant name
22     * @param string $database ChromaDB database name
23     * @param string $ollamaHost Ollama server host
24     * @param int $ollamaPort Ollama server port
25     * @param string $ollamaModel Ollama embeddings model
26     */
27    public function __construct($host = null, $port = null, $tenant = null, $database = null, $ollamaHost = null, $ollamaPort = null, $ollamaModel = null) {
28        // Load DokuWiki plugin configuration
29        global $conf;
30
31        // Use provided parameters or fall back to configuration values
32        $host = $host ?? ($conf['plugin']['dokullm']['chroma_host'] ?? '127.0.0.1');
33        $port = $port ?? ($conf['plugin']['dokullm']['chroma_port'] ?? 8000);
34        $this->tenant = $tenant ?? ($conf['plugin']['dokullm']['chroma_tenant'] ?? 'dokullm');
35        $this->database = $database ?? ($conf['plugin']['dokullm']['chroma_database'] ?? 'dokullm');
36        $this->ollamaHost = $ollamaHost ?? ($conf['plugin']['dokullm']['ollama_host'] ?? '127.0.0.1');
37        $this->ollamaPort = $ollamaPort ?? ($conf['plugin']['dokullm']['ollama_port'] ?? 11434);
38        $this->ollamaModel = $ollamaModel ?? ($conf['plugin']['dokullm']['ollama_embeddings_model'] ?? 'nomic-embed-text');
39
40        $this->baseUrl = "http://{$host}:{$port}";
41        $this->client = curl_init();
42        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
43        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
44            'Content-Type: application/json',
45            'Accept: application/json'
46        ]);
47
48        // Initialize Ollama client
49        $this->ollamaClient = curl_init();
50        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
51        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
52            'Content-Type: application/json'
53        ]);
54
55        // Check if tenant and database exist, create them if they don't
56        $this->ensureTenantAndDatabase();
57    }
58
59    /**
60     * Clean up the cURL client when the object is destroyed
61     *
62     * @return void
63     */
64    public function __destruct() {
65        curl_close($this->client);
66        curl_close($this->ollamaClient);
67    }
68
69    /**
70     * Make an HTTP request to the ChromaDB API
71     *
72     * This is a helper function that handles making HTTP requests to the ChromaDB API,
73     * including setting the appropriate headers for tenant and database.
74     *
75     * @param string $endpoint The API endpoint to call
76     * @param string $method The HTTP method to use (default: 'GET')
77     * @param array|null $data The data to send with the request (default: null)
78     * @return array The JSON response decoded as an array
79     * @throws Exception If there's a cURL error or HTTP error
80     */
81    private function makeRequest($endpoint, $method = 'GET', $data = null) {
82        // Add tenant and database as headers instead of query parameters for v2 API
83        $headers = [
84            'Content-Type: application/json',
85            'Accept: application/json'
86        ];
87
88        $url = $this->baseUrl . '/api/v2' . $endpoint;
89
90        curl_setopt($this->client, CURLOPT_URL, $url);
91        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
92        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
93
94        if ($data) {
95            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
96        } else {
97            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
98        }
99
100        $response = curl_exec($this->client);
101        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
102
103        if (curl_error($this->client)) {
104            throw new Exception('Curl error: ' . curl_error($this->client));
105        }
106
107        if ($httpCode >= 400) {
108            throw new Exception("HTTP Error: $httpCode, Response: $response");
109        }
110
111        return json_decode($response, true);
112    }
113
114    /**
115     * Generate embeddings for text using Ollama
116     *
117     * @param string $text The text to generate embeddings for
118     * @return array The embeddings vector
119     */
120    public function generateEmbeddings($text) {
121        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
122
123        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
124
125        $data = [
126            'model' => $this->ollamaModel,
127            'prompt' => $text,
128            'keep_alive' => '30m'
129        ];
130
131        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
132
133        $response = curl_exec($this->ollamaClient);
134        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
135
136        if (curl_error($this->ollamaClient)) {
137            throw new Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
138        }
139
140        if ($httpCode >= 400) {
141            throw new Exception("Ollama HTTP Error: $httpCode, Response: $response");
142        }
143
144        $result = json_decode($response, true);
145
146        if (!isset($result['embedding'])) {
147            throw new Exception("Ollama response missing embedding: " . $response);
148        }
149
150        return $result['embedding'];
151    }
152
153    /**
154     * List all collections in the database
155     *
156     * Retrieves a list of all collections in the specified tenant and database.
157     *
158     * @return array List of collections
159     */
160    public function listCollections() {
161        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
162        return $this->makeRequest($endpoint);
163    }
164
165    /**
166     * Get a collection by name
167     *
168     * Retrieves information about a specific collection by its name.
169     *
170     * @param string $name The name of the collection to retrieve
171     * @return array The collection information
172     * @throws Exception If the collection is not found
173     */
174    public function getCollection($name) {
175        // Use provided name, fallback to 'documents' if empty
176        if (empty($name)) {
177            $name = 'documents';
178        }
179
180        // First try to get collection by name
181        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
182        $collections = $this->makeRequest($endpoint);
183
184        // Find collection by name
185        foreach ($collections as $collection) {
186            if (isset($collection['name']) && $collection['name'] === $name) {
187                return $collection;
188            }
189        }
190
191        // If not found, throw exception
192        throw new Exception("Collection '{$name}' not found");
193    }
194
195    /**
196     * Create a new collection
197     *
198     * Creates a new collection with the specified name and optional metadata.
199     *
200     * @param string $name The name of the collection to create
201     * @param array|null $metadata Optional metadata for the collection
202     * @return array The response from the API
203     */
204    public function createCollection($name, $metadata = null) {
205        // Use provided name, fallback to 'documents' if empty
206        if (empty($name)) {
207            $name = 'documents';
208        }
209
210        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
211        $data = ['name' => $name];
212        if ($metadata) {
213            $data['metadata'] = $metadata;
214        }
215        return $this->makeRequest($endpoint, 'POST', $data);
216    }
217
218    /**
219     * Delete a collection by name
220     *
221     * Deletes a collection with the specified name.
222     *
223     * @param string $name The name of the collection to delete
224     * @return array The response from the API
225     * @throws Exception If the collection ID is not found
226     */
227    public function deleteCollection($name) {
228        // Use provided name, fallback to 'documents' if empty
229        if (empty($name)) {
230            $name = 'documents';
231        }
232
233        // First get the collection to find its ID
234        $collection = $this->getCollection($name);
235        if (!isset($collection['id'])) {
236            throw new Exception("Collection ID not found for '{$name}'");
237        }
238
239        $collectionId = $collection['id'];
240        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
241        return $this->makeRequest($endpoint, 'DELETE');
242    }
243
244    /**
245     * Get a document by its ID from a collection
246     *
247     * Retrieves a document from the specified collection using its ID.
248     *
249     * @param string $collectionName The name of the collection to get the document from
250     * @param string $documentId The document ID to retrieve
251     * @param array $include What to include in the response (default: ["metadatas", "documents"])
252     * @return array The retrieved document
253     * @throws Exception If the collection ID is not found
254     */
255    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
256        // Use provided name, fallback to 'documents' if empty
257        if (empty($collectionName)) {
258            $collectionName = 'documents';
259        }
260
261        // First get the collection to find its ID
262        $collection = $this->getCollection($collectionName);
263        if (!isset($collection['id'])) {
264            throw new Exception("Collection ID not found for '{$collectionName}'");
265        }
266
267        $collectionId = $collection['id'];
268        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
269        $data = [
270            'ids' => [$documentId],
271            'include' => $include
272        ];
273
274        return $this->makeRequest($endpoint, 'POST', $data);
275    }
276
277    /**
278     * Add documents to a collection
279     *
280     * Adds documents to the specified collection. Each document must have a corresponding ID.
281     * Optional metadata and pre-computed embeddings can also be provided.
282     *
283     * @param string $collectionName The name of the collection to add documents to
284     * @param array $documents The document contents
285     * @param array $ids The document IDs
286     * @param array|null $metadatas Optional metadata for each document
287     * @param array|null $embeddings Optional pre-computed embeddings for each document
288     * @return array The response from the API
289     * @throws Exception If the collection ID is not found
290     */
291    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
292        // Use provided name, fallback to 'documents' if empty
293        if (empty($collectionName)) {
294            $collectionName = 'documents';
295        }
296
297        // First get the collection to find its ID
298        $collection = $this->getCollection($collectionName);
299        if (!isset($collection['id'])) {
300            throw new Exception("Collection ID not found for '{$collectionName}'");
301        }
302
303        $collectionId = $collection['id'];
304        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
305        $data = [
306            'ids' => $ids,
307            'documents' => $documents
308        ];
309
310        if ($metadatas) {
311            $data['metadatas'] = $metadatas;
312        }
313
314        if ($embeddings) {
315            $data['embeddings'] = $embeddings;
316        }
317
318        return $this->makeRequest($endpoint, 'POST', $data);
319    }
320
321    /**
322     * Check if a document needs to be updated based on timestamp comparison
323     *
324     * Determines whether a document should be reprocessed by comparing the file's last modification
325     * time with the processed_at timestamp stored in the document's metadata. The function checks
326     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
327     * not included in the database.
328     *
329     * @param string $collectionId The ID of the collection to check documents in
330     * @param string $documentId The base document ID to check (without chunk suffixes)
331     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
332     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
333     * @throws Exception If there's an error checking the document
334     */
335    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
336        try {
337            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
338
339            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
340            $chunkIdsToCheck = [
341                $documentId . '@1',
342                $documentId . '@2',
343                $documentId . '@3'
344            ];
345
346            $data = [
347                'ids' => $chunkIdsToCheck,
348                'include' => [
349                    "metadatas"
350                ],
351                'limit' => 1
352            ];
353
354            // Check if document exists
355            $result = $this->makeRequest($endpoint, 'POST', $data);
356
357            // If no documents found, return true (needs to be added)
358            if (empty($result['ids'])) {
359                return true;
360            }
361
362            // Check if any document has a processed_at timestamp
363            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
364                // Check the first metadata entry directly
365                $metadata = $result['metadatas'][0];
366
367                // If processed_at is not set, return true (needs update)
368                if (!isset($metadata['processed_at'])) {
369                    return true;
370                }
371
372                // Parse the processed_at timestamp
373                $processedTimestamp = strtotime($metadata['processed_at']);
374
375                // If file is newer than processed time, return true (needs update)
376                if ($fileModifiedTime > $processedTimestamp) {
377                    return true;
378                }
379            }
380
381            // Document exists and is up to date
382            return false;
383        } catch (Exception $e) {
384            // If there's an error checking the document, assume it needs to be updated
385            return true;
386        }
387    }
388
389    /**
390     * Query a collection for similar documents
391     *
392     * Queries the specified collection for documents similar to the provided query texts.
393     * The function generates embeddings for the query texts and sends them to ChromaDB.
394     * Supports filtering results by metadata using the where parameter.
395     *
396     * @param string $collectionName The name of the collection to query
397     * @param array $queryTexts The query texts to search for
398     * @param int $nResults The number of results to return (default: 5)
399     * @param array|null $where Optional filter conditions for metadata
400     * @return array The query results
401     * @throws Exception If the collection ID is not found
402     */
403    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
404        // Use provided name, fallback to 'documents' if empty
405        if (empty($collectionName)) {
406            $collectionName = 'documents';
407        }
408
409        // First get the collection to find its ID
410        $collection = $this->getCollection($collectionName);
411        if (!isset($collection['id'])) {
412            throw new Exception("Collection ID not found for '{$collectionName}'");
413        }
414
415        $collectionId = $collection['id'];
416        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
417
418        // Generate embeddings for query texts
419        $queryEmbeddings = [];
420        foreach ($queryTexts as $text) {
421            $queryEmbeddings[] = $this->generateEmbeddings($text);
422        }
423
424        $data = [
425            'query_embeddings' => $queryEmbeddings,
426            'n_results' => $nResults
427        ];
428
429        // Add where clause for metadata filtering if provided
430        if ($where && is_array($where)) {
431            $data['where'] = $where;
432        }
433
434        return $this->makeRequest($endpoint, 'POST', $data);
435    }
436
437    /**
438     * Check if the ChromaDB server is alive
439     *
440     * Sends a heartbeat request to verify that the ChromaDB server is running.
441     *
442     * @return array The response from the heartbeat endpoint
443     */
444    public function heartbeat() {
445        $endpoint = "/heartbeat";
446        return $this->makeRequest($endpoint, 'GET');
447    }
448
449    /**
450     * Get authentication and identity information
451     *
452     * Retrieves authentication and identity information from the ChromaDB server.
453     *
454     * @return array The response from the auth/identity endpoint
455     */
456    public function getIdentity() {
457        $endpoint = "/identity";
458        return $this->makeRequest($endpoint, 'GET');
459    }
460
461    /**
462     * Ensure that the specified tenant and database exist
463     *
464     * Checks if the specified tenant and database exist, and creates them if they don't.
465     *
466     * @return void
467     */
468    private function ensureTenantAndDatabase() {
469        // Check if tenant exists, create if it doesn't
470        try {
471            $this->getTenant($this->tenant);
472        } catch (Exception $e) {
473            // Tenant doesn't exist, create it
474            $this->createTenant($this->tenant);
475        }
476
477        // Check if database exists, create if it doesn't
478        try {
479            $this->getDatabase($this->database, $this->tenant);
480        } catch (Exception $e) {
481            // Database doesn't exist, create it
482            $this->createDatabase($this->database, $this->tenant);
483        }
484    }
485
486    /**
487     * Get tenant information
488     *
489     * Retrieves information about the specified tenant.
490     *
491     * @param string $tenantName The tenant name
492     * @return array The tenant information
493     */
494    public function getTenant($tenantName) {
495        $endpoint = "/tenants/{$tenantName}";
496        return $this->makeRequest($endpoint, 'GET');
497    }
498
499    /**
500     * Create a new tenant
501     *
502     * Creates a new tenant with the specified name.
503     *
504     * @param string $tenantName The tenant name
505     * @return array The response from the API
506     */
507    public function createTenant($tenantName) {
508        $endpoint = "/tenants";
509        $data = ['name' => $tenantName];
510        return $this->makeRequest($endpoint, 'POST', $data);
511    }
512
513    /**
514     * Get database information
515     *
516     * Retrieves information about the specified database within a tenant.
517     *
518     * @param string $databaseName The database name
519     * @param string $tenantName The tenant name
520     * @return array The database information
521     */
522    public function getDatabase($databaseName, $tenantName) {
523        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
524        return $this->makeRequest($endpoint, 'GET');
525    }
526
527    /**
528     * Create a new database
529     *
530     * Creates a new database with the specified name within a tenant.
531     *
532     * @param string $databaseName The database name
533     * @param string $tenantName The tenant name
534     * @return array The response from the API
535     */
536    public function createDatabase($databaseName, $tenantName) {
537        $endpoint = "/tenants/{$tenantName}/databases";
538        $data = ['name' => $databaseName];
539        return $this->makeRequest($endpoint, 'POST', $data);
540    }
541
542    /**
543     * Ensure a collection exists, creating it if necessary
544     *
545     * This helper function checks if a collection exists and creates it if it doesn't.
546     *
547     * @param string $collectionName The name of the collection to check/create
548     * @return string Status message indicating what happened
549     */
550    public function ensureCollectionExists($collectionName) {
551        try {
552            $collection = $this->getCollection($collectionName);
553            return "Collection '$collectionName' already exists.";
554        } catch (Exception $e) {
555            // Collection doesn't exist, create it
556            $created = $this->createCollection($collectionName);
557            return "Collection '$collectionName' created.";
558        }
559    }
560
561    /**
562     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
563     *
564     * This function handles the complete processing of a single DokuWiki file:
565     * 1. Parses the file path to extract metadata and document ID
566     * 2. Determines the appropriate collection based on document ID
567     * 3. Checks if the document needs updating using timestamp comparison
568     * 4. Reads and processes file content only if update is needed
569     * 5. Splits the document into chunks (paragraphs)
570     * 6. Extracts rich metadata from the DokuWiki ID format
571     * 7. Generates embeddings for each chunk
572     * 8. Sends all chunks to ChromaDB with metadata
573     *
574     * Supported ID formats:
575     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
576     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
577     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
578     *
579     * The function implements smart update checking by comparing file modification time
580     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
581     *
582     * @param string $filePath The path to the file to process
583     * @param string $collectionName The name of the collection to use
584     * @param bool $collectionChecked Whether the collection has already been checked/created
585     * @return array Result with status and details
586     */
587    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
588        // Parse file path to extract metadata
589        $id = parseFilePath($filePath);
590
591        try {
592            // Create collection if it doesn't exist (only if not already checked)
593            $collectionStatus = '';
594            if (!$collectionChecked) {
595                $collectionStatus = $this->ensureCollectionExists($collectionName);
596            }
597
598            // Get collection ID
599            $collection = $this->getCollection($collectionName);
600            if (!isset($collection['id'])) {
601                return [
602                    'status' => 'error',
603                    'message' => "Collection ID not found for '{$collectionName}'"
604                ];
605            }
606            $collectionId = $collection['id'];
607
608            // Get file modification time
609            $fileModifiedTime = filemtime($filePath);
610
611            // Check if document needs update
612            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
613
614            // If document is up to date, skip processing
615            if (!$needsUpdate) {
616                return [
617                    'status' => 'skipped',
618                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
619                ];
620            }
621
622            // Read file content
623            $content = file_get_contents($filePath);
624
625            // Split document into chunks (paragraphs separated by two newlines)
626            $paragraphs = preg_split('/\n\s*\n/', $content);
627            $chunks = [];
628            $chunkMetadata = [];
629
630            // Parse the DokuWiki ID to extract base metadata
631            $parts = explode(':', $id);
632
633            // Extract metadata from the last part of the ID
634            $lastPart = end($parts);
635            $baseMetadata = [];
636
637            // Add the document ID as metadata
638            $baseMetadata['document_id'] = $id;
639
640            // Add current timestamp
641            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
642
643            // Check if any part of the ID is 'templates' and set template metadata
644            $isTemplate = in_array('templates', $parts);
645            if ($isTemplate) {
646                $baseMetadata['type'] = 'template';
647            } else {
648                $baseMetadata['type'] = 'report';
649            }
650
651            // Extract modality from the second part
652            if (isset($parts[1])) {
653                $baseMetadata['modality'] = $parts[1];
654            }
655
656            // Handle different ID formats based on the third part: word (institution) or numeric (year)
657            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
658            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
659            // For templates, don't set institution, date or year
660            if (isset($parts[2]) && !$isTemplate) {
661                // Check if third part is numeric (year) or word (institution)
662                if (is_numeric($parts[2])) {
663                    // Format: reports:mri:2024:g287-name-surname (year format)
664                    // Extract year from the third part
665                    $baseMetadata['year'] = $parts[2];
666
667                    // Set default institution from config
668                    $baseMetadata['institution'] = DEFAULT_INSTITUTION;
669
670                    // Extract registration and name from the last part
671                    // Registration should start with one letter or number and contain numbers before the '-' character
672                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
673                        // Check if the first part contains at least one digit to be considered a registration
674                        if (preg_match('/[0-9]/', $matches[1])) {
675                            $baseMetadata['registration'] = $matches[1];
676                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
677                        } else {
678                            // If no registration pattern found, treat entire part as patient name
679                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
680                        }
681                    } else {
682                        // If no match, treat entire part as patient name
683                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
684                    }
685                } else {
686                    // Format: reports:mri:institution:250620-name-surname (institution format)
687                    // Extract institution from the third part
688                    $baseMetadata['institution'] = $parts[2];
689
690                    // Extract date and name from the last part
691                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
692                        $dateStr = $matches[1];
693                        $name = $matches[2];
694
695                        // Convert date format (250620 -> 2025-06-20)
696                        $day = substr($dateStr, 0, 2);
697                        $month = substr($dateStr, 2, 2);
698                        $year = substr($dateStr, 4, 2);
699                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
700                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
701                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
702
703                        $baseMetadata['date'] = $formattedDate;
704                        $baseMetadata['name'] = str_replace('-', ' ', $name);
705                    }
706                }
707            }
708
709            // For templates, always extract name from the last part
710            if ($isTemplate && isset($lastPart)) {
711                // Extract name from the last part (everything after the last colon)
712                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
713                    // Check if the first part contains at least one digit to be considered a registration
714                    if (preg_match('/[0-9]/', $matches[1])) {
715                        $baseMetadata['registration'] = $matches[1];
716                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
717                    } else {
718                        // If no registration pattern found, treat entire part as template name
719                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
720                    }
721                } else {
722                    // If no match, treat entire part as template name
723                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
724                }
725            }
726
727            // Process each paragraph as a chunk with intelligent metadata handling
728            $chunkIds = [];
729            $chunkContents = [];
730            $chunkMetadatas = [];
731            $chunkEmbeddings = [];
732            $currentTags = [];
733
734            foreach ($paragraphs as $index => $paragraph) {
735                // Skip empty paragraphs to avoid processing whitespace-only content
736                $paragraph = trim($paragraph);
737                if (empty($paragraph)) {
738                    continue;
739                }
740
741                // Check if this is a DokuWiki title (starts and ends with =)
742                // Titles are converted to tags for better searchability but not stored as content chunks
743                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
744                    // Extract title content and clean it
745                    $titleContent = trim($matches[1]);
746
747                    // Split into words and create searchable tags
748                    $words = preg_split('/\s+/', $titleContent);
749                    $tags = [];
750
751                    foreach ($words as $word) {
                        // Only use words of at least 3 characters to reduce noise
753                        if (strlen($word) >= 3) {
754                            $tags[] = strtolower($word);
755                        }
756                    }
757
758                    // Remove duplicate tags and store for use in subsequent chunks
759                    $currentTags = array_unique($tags);
760                    continue; // Skip storing title chunks as content
761                }
762
763                // Create chunk ID
764                $chunkId = $id . '@' . ($index + 1);
765
766                // Generate embeddings for the chunk
767                $embeddings = $this->generateEmbeddings($paragraph);
768
769                // Add chunk-specific metadata
770                $metadata = $baseMetadata;
771                $metadata['chunk_id'] = $chunkId;
772                $metadata['chunk_number'] = $index + 1;
773                $metadata['total_chunks'] = count($paragraphs);
774
775                // Add current tags to metadata if any exist
776                if (!empty($currentTags)) {
777                    $metadata['tags'] = implode(',', $currentTags);
778                }
779
780                // Store chunk data
781                $chunkIds[] = $chunkId;
782                $chunkContents[] = $paragraph;
783                $chunkMetadatas[] = $metadata;
784                $chunkEmbeddings[] = $embeddings;
785            }
786
787            // If no chunks were created, skip this file
788            if (empty($chunkIds)) {
789                return [
790                    'status' => 'skipped',
791                    'message' => "No valid chunks found in file '$id'. Skipping..."
792                ];
793            }
794
795            // Send all chunks to ChromaDB
796            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
797
798            return [
799                'status' => 'success',
800                'message' => "Successfully sent file to ChromaDB",
801                'details' => [
802                    'document_id' => $id,
803                    'chunks' => count($chunkIds),
804                    'collection' => $collectionName
805                ],
806                'collection_status' => $collectionStatus
807            ];
808        } catch (Exception $e) {
809            return [
810                'status' => 'error',
811                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
812            ];
813        }
814    }
815
816    /**
817     * Process all DokuWiki files in a directory and send them to ChromaDB
818     *
819     * This function recursively processes all .txt files in a directory and its subdirectories.
820     * It first checks if the appropriate collection exists and creates it if needed.
821     * Then it processes each file individually.
822     *
823     * @param string $dirPath The directory path to process
824     * @return array Result with status and details
825     */
826    public function processDirectory($dirPath) {
827        // Check if directory exists
828        if (!is_dir($dirPath)) {
829            return [
830                'status' => 'error',
831                'message' => "Directory does not exist: $dirPath"
832            ];
833        }
834
835        // Create RecursiveIteratorIterator to process directories recursively
836        $iterator = new RecursiveIteratorIterator(
837            new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS),
838            RecursiveIteratorIterator::LEAVES_ONLY
839        );
840
841        $files = [];
842        foreach ($iterator as $file) {
843            // Process only .txt files that don't start with underscore
844            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
845                $files[] = $file->getPathname();
846            }
847        }
848
849        if (empty($files)) {
850            return [
851                'status' => 'skipped',
852                'message' => "No .txt files found in directory: $dirPath"
853            ];
854        }
855
856        // Use the first part of the document ID as collection name, fallback to 'documents'
857        $sampleFile = $files[0];
858        $id = parseFilePath($sampleFile);
859        $idParts = explode(':', $id);
860        $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents';
861
862        try {
863            $this->ensureCollectionExists($collectionName);
864            $collectionChecked = true;
865        } catch (Exception $e) {
866            $collectionChecked = true;
867        }
868
869        $results = [];
870        foreach ($files as $file) {
871            $result = $this->processSingleFile($file, $collectionName, $collectionChecked);
872            $results[] = [
873                'file' => $file,
874                'result' => $result
875            ];
876        }
877
878        return [
879            'status' => 'success',
880            'message' => "Finished processing directory.",
881            'files_count' => count($files),
882            'results' => $results
883        ];
884    }
885}
886
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory when it is the leading prefix of the path
 *    (DOKU_INC . 'data/pages/' if DOKU_INC is defined, otherwise a hard-coded
 *    default installation path)
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons, ignoring empty segments
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * A path that does not start with the pages directory is converted as a whole.
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Normalise platform-specific separators so the splitting below also works
    // where DIRECTORY_SEPARATOR is not '/'. This is a no-op on '/'-based systems.
    if (DIRECTORY_SEPARATOR !== '/') {
        $filePath = str_replace(DIRECTORY_SEPARATOR, '/', $filePath);
        $pagesDir = str_replace(DIRECTORY_SEPARATOR, '/', $pagesDir);
    }

    // Remove the base path, but only where it is a genuine leading prefix;
    // an occurrence further inside the path must not be cut out.
    $prefixLength = strlen($pagesDir);
    if (strncmp($filePath, $pagesDir, $prefixLength) === 0) {
        $relativePath = (string) substr($filePath, $prefixLength);
    } else {
        $relativePath = $filePath;
    }

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Keep every non-empty segment. The comparison is against '' rather than
    // empty(), because empty('0') is true and would drop a segment named "0".
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}
929
930