xref: /plugin/dokullm/ChromaDBClient.php (revision 35d66f98c4f7d3fcf74b945ef372b2cce1d9cf36)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5class ChromaDBClient {
6    private $baseUrl;
7    private $client;
8    private $ollamaClient;
9    private $tenant;
10    private $database;
11    private $ollamaHost;
12    private $ollamaPort;
13    private $ollamaModel;
14
15    /**
16     * Get configuration value for the dokullm plugin
17     *
18     * @param string $key Configuration key
19     * @param mixed $default Default value if key not found
20     * @return mixed Configuration value
21     */
22    /**
23     * Initialize the ChromaDB client
24     *
25     * Creates a new ChromaDB client instance with the specified connection parameters.
26     * Also ensures that the specified tenant and database exist.
27     *
28     * @param string $host ChromaDB server host
29     * @param int $port ChromaDB server port
30     * @param string $tenant ChromaDB tenant name
31     * @param string $database ChromaDB database name
32     * @param string $ollamaHost Ollama server host
33     * @param int $ollamaPort Ollama server port
34     * @param string $ollamaModel Ollama embeddings model
35     */
36    public function __construct($host, $port, $tenant, $database, $ollamaHost, $ollamaPort, $ollamaModel) {
37        // Use provided parameters (no fallback since they're mandatory)
38        $chromaHost = $host;
39        $chromaPort = $port;
40        $this->tenant = $tenant;
41        $this->database = $database;
42        $this->ollamaHost = $ollamaHost;
43        $this->ollamaPort = $ollamaPort;
44        $this->ollamaModel = $ollamaModel;
45
46        $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";
47        $this->client = curl_init();
48        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
49        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
50            'Content-Type: application/json',
51            'Accept: application/json'
52        ]);
53
54        // Initialize Ollama client
55        $this->ollamaClient = curl_init();
56        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
57        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
58            'Content-Type: application/json'
59        ]);
60
61        // Check if tenant and database exist, create them if they don't
62        $this->ensureTenantAndDatabase();
63    }
64
65    /**
66     * Clean up the cURL client when the object is destroyed
67     *
68     * @return void
69     */
70    public function __destruct() {
71        curl_close($this->client);
72        curl_close($this->ollamaClient);
73    }
74
75    /**
76     * Make an HTTP request to the ChromaDB API
77     *
78     * This is a helper function that handles making HTTP requests to the ChromaDB API,
79     * including setting the appropriate headers for tenant and database.
80     *
81     * @param string $endpoint The API endpoint to call
82     * @param string $method The HTTP method to use (default: 'GET')
83     * @param array|null $data The data to send with the request (default: null)
84     * @return array The JSON response decoded as an array
85     * @throws Exception If there's a cURL error or HTTP error
86     */
87    private function makeRequest($endpoint, $method = 'GET', $data = null) {
88        // Add tenant and database as headers instead of query parameters for v2 API
89        $headers = [
90            'Content-Type: application/json',
91            'Accept: application/json'
92        ];
93
94        $url = $this->baseUrl . '/api/v2' . $endpoint;
95
96        curl_setopt($this->client, CURLOPT_URL, $url);
97        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
98        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
99
100        if ($data) {
101            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
102        } else {
103            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
104        }
105
106        $response = curl_exec($this->client);
107        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
108
109        if (curl_error($this->client)) {
110            throw new \Exception('Curl error: ' . curl_error($this->client));
111        }
112
113        if ($httpCode >= 400) {
114            throw new \Exception("HTTP Error: $httpCode, Response: $response");
115        }
116
117        return json_decode($response, true);
118    }
119
120    /**
121     * Generate embeddings for text using Ollama
122     *
123     * @param string $text The text to generate embeddings for
124     * @return array The embeddings vector
125     */
126    public function generateEmbeddings($text) {
127        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
128
129        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
130
131        $data = [
132            'model' => $this->ollamaModel,
133            'prompt' => $text,
134            'keep_alive' => '30m'
135        ];
136
137        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
138
139        $response = curl_exec($this->ollamaClient);
140        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
141
142        if (curl_error($this->ollamaClient)) {
143            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
144        }
145
146        if ($httpCode >= 400) {
147            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
148        }
149
150        $result = json_decode($response, true);
151
152        if (!isset($result['embedding'])) {
153            throw new \Exception("Ollama response missing embedding: " . $response);
154        }
155
156        return $result['embedding'];
157    }
158
159    /**
160     * List all collections in the database
161     *
162     * Retrieves a list of all collections in the specified tenant and database.
163     *
164     * @return array List of collections
165     */
166    public function listCollections() {
167        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
168        return $this->makeRequest($endpoint);
169    }
170
171    /**
172     * Get a collection by name
173     *
174     * Retrieves information about a specific collection by its name.
175     *
176     * @param string $name The name of the collection to retrieve
177     * @return array The collection information
178     * @throws Exception If the collection is not found
179     */
180    public function getCollection($name) {
181        // Use provided name, fallback to 'documents' if empty
182        if (empty($name)) {
183            $name = 'documents';
184        }
185
186        // First try to get collection by name
187        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
188        $collections = $this->makeRequest($endpoint);
189
190        // Find collection by name
191        foreach ($collections as $collection) {
192            if (isset($collection['name']) && $collection['name'] === $name) {
193                return $collection;
194            }
195        }
196
197        // If not found, throw exception
198        throw new \Exception("Collection '{$name}' not found");
199    }
200
201    /**
202     * Create a new collection
203     *
204     * Creates a new collection with the specified name and optional metadata.
205     *
206     * @param string $name The name of the collection to create
207     * @param array|null $metadata Optional metadata for the collection
208     * @return array The response from the API
209     */
210    public function createCollection($name, $metadata = null) {
211        // Use provided name, fallback to 'documents' if empty
212        if (empty($name)) {
213            $name = 'documents';
214        }
215
216        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
217        $data = ['name' => $name];
218        if ($metadata) {
219            $data['metadata'] = $metadata;
220        }
221        return $this->makeRequest($endpoint, 'POST', $data);
222    }
223
224    /**
225     * Delete a collection by name
226     *
227     * Deletes a collection with the specified name.
228     *
229     * @param string $name The name of the collection to delete
230     * @return array The response from the API
231     * @throws Exception If the collection ID is not found
232     */
233    public function deleteCollection($name) {
234        // Use provided name, fallback to 'documents' if empty
235        if (empty($name)) {
236            $name = 'documents';
237        }
238
239        // First get the collection to find its ID
240        $collection = $this->getCollection($name);
241        if (!isset($collection['id'])) {
242            throw new \Exception("Collection ID not found for '{$name}'");
243        }
244
245        $collectionId = $collection['id'];
246        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
247        return $this->makeRequest($endpoint, 'DELETE');
248    }
249
250    /**
251     * Get a document by its ID from a collection
252     *
253     * Retrieves a document from the specified collection using its ID.
254     *
255     * @param string $collectionName The name of the collection to get the document from
256     * @param string $documentId The document ID to retrieve
257     * @param array $include What to include in the response (default: ["metadatas", "documents"])
258     * @return array The retrieved document
259     * @throws Exception If the collection ID is not found
260     */
261    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
262        // Use provided name, fallback to 'documents' if empty
263        if (empty($collectionName)) {
264            $collectionName = 'documents';
265        }
266
267        // First get the collection to find its ID
268        $collection = $this->getCollection($collectionName);
269        if (!isset($collection['id'])) {
270            throw new \Exception("Collection ID not found for '{$collectionName}'");
271        }
272
273        $collectionId = $collection['id'];
274        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
275        $data = [
276            'ids' => [$documentId],
277            'include' => $include
278        ];
279
280        return $this->makeRequest($endpoint, 'POST', $data);
281    }
282
283    /**
284     * Add documents to a collection
285     *
286     * Adds documents to the specified collection. Each document must have a corresponding ID.
287     * Optional metadata and pre-computed embeddings can also be provided.
288     *
289     * @param string $collectionName The name of the collection to add documents to
290     * @param array $documents The document contents
291     * @param array $ids The document IDs
292     * @param array|null $metadatas Optional metadata for each document
293     * @param array|null $embeddings Optional pre-computed embeddings for each document
294     * @return array The response from the API
295     * @throws Exception If the collection ID is not found
296     */
297    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
298        // Use provided name, fallback to 'documents' if empty
299        if (empty($collectionName)) {
300            $collectionName = 'documents';
301        }
302
303        // First get the collection to find its ID
304        $collection = $this->getCollection($collectionName);
305        if (!isset($collection['id'])) {
306            throw new \Exception("Collection ID not found for '{$collectionName}'");
307        }
308
309        $collectionId = $collection['id'];
310        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
311        $data = [
312            'ids' => $ids,
313            'documents' => $documents
314        ];
315
316        if ($metadatas) {
317            $data['metadatas'] = $metadatas;
318        }
319
320        if ($embeddings) {
321            $data['embeddings'] = $embeddings;
322        }
323
324        return $this->makeRequest($endpoint, 'POST', $data);
325    }
326
327    /**
328     * Check if a document needs to be updated based on timestamp comparison
329     *
330     * Determines whether a document should be reprocessed by comparing the file's last modification
331     * time with the processed_at timestamp stored in the document's metadata. The function checks
332     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
333     * not included in the database.
334     *
335     * @param string $collectionId The ID of the collection to check documents in
336     * @param string $documentId The base document ID to check (without chunk suffixes)
337     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
338     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
339     * @throws Exception If there's an error checking the document
340     */
341    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
342        try {
343            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
344
345            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
346            $chunkIdsToCheck = [
347                $documentId . '@1',
348                $documentId . '@2',
349                $documentId . '@3'
350            ];
351
352            $data = [
353                'ids' => $chunkIdsToCheck,
354                'include' => [
355                    "metadatas"
356                ],
357                'limit' => 1
358            ];
359
360            // Check if document exists
361            $result = $this->makeRequest($endpoint, 'POST', $data);
362
363            // If no documents found, return true (needs to be added)
364            if (empty($result['ids'])) {
365                return true;
366            }
367
368            // Check if any document has a processed_at timestamp
369            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
370                // Check the first metadata entry directly
371                $metadata = $result['metadatas'][0];
372
373                // If processed_at is not set, return true (needs update)
374                if (!isset($metadata['processed_at'])) {
375                    return true;
376                }
377
378                // Parse the processed_at timestamp
379                $processedTimestamp = strtotime($metadata['processed_at']);
380
381                // If file is newer than processed time, return true (needs update)
382                if ($fileModifiedTime > $processedTimestamp) {
383                    return true;
384                }
385            }
386
387            // Document exists and is up to date
388            return false;
389        } catch (\Exception $e) {
390            // If there's an error checking the document, assume it needs to be updated
391            return true;
392        }
393    }
394
395    /**
396     * Query a collection for similar documents
397     *
398     * Queries the specified collection for documents similar to the provided query texts.
399     * The function generates embeddings for the query texts and sends them to ChromaDB.
400     * Supports filtering results by metadata using the where parameter.
401     *
402     * @param string $collectionName The name of the collection to query
403     * @param array $queryTexts The query texts to search for
404     * @param int $nResults The number of results to return (default: 5)
405     * @param array|null $where Optional filter conditions for metadata
406     * @return array The query results
407     * @throws Exception If the collection ID is not found
408     */
409    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
410        // Use provided name, fallback to 'documents' if empty
411        if (empty($collectionName)) {
412            $collectionName = 'documents';
413        }
414
415        // First get the collection to find its ID
416        $collection = $this->getCollection($collectionName);
417        if (!isset($collection['id'])) {
418            throw new \Exception("Collection ID not found for '{$collectionName}'");
419        }
420
421        $collectionId = $collection['id'];
422        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
423
424        // Generate embeddings for query texts
425        $queryEmbeddings = [];
426        foreach ($queryTexts as $text) {
427            $queryEmbeddings[] = $this->generateEmbeddings($text);
428        }
429
430        $data = [
431            'query_embeddings' => $queryEmbeddings,
432            'n_results' => $nResults
433        ];
434
435        // Add where clause for metadata filtering if provided
436        if ($where && is_array($where)) {
437            $data['where'] = $where;
438        }
439
440        return $this->makeRequest($endpoint, 'POST', $data);
441    }
442
443    /**
444     * Check if the ChromaDB server is alive
445     *
446     * Sends a heartbeat request to verify that the ChromaDB server is running.
447     *
448     * @return array The response from the heartbeat endpoint
449     */
450    public function heartbeat() {
451        $endpoint = "/heartbeat";
452        return $this->makeRequest($endpoint, 'GET');
453    }
454
455    /**
456     * Get authentication and identity information
457     *
458     * Retrieves authentication and identity information from the ChromaDB server.
459     *
460     * @return array The response from the auth/identity endpoint
461     */
462    public function getIdentity() {
463        $endpoint = "/identity";
464        return $this->makeRequest($endpoint, 'GET');
465    }
466
467    /**
468     * Ensure that the specified tenant and database exist
469     *
470     * Checks if the specified tenant and database exist, and creates them if they don't.
471     *
472     * @return void
473     */
474    private function ensureTenantAndDatabase() {
475        // Check if tenant exists, create if it doesn't
476        try {
477            $this->getTenant($this->tenant);
478        } catch (\Exception $e) {
479            // Tenant doesn't exist, create it
480            $this->createTenant($this->tenant);
481        }
482
483        // Check if database exists, create if it doesn't
484        try {
485            $this->getDatabase($this->database, $this->tenant);
486        } catch (\Exception $e) {
487            // Database doesn't exist, create it
488            $this->createDatabase($this->database, $this->tenant);
489        }
490    }
491
492    /**
493     * Get tenant information
494     *
495     * Retrieves information about the specified tenant.
496     *
497     * @param string $tenantName The tenant name
498     * @return array The tenant information
499     */
500    public function getTenant($tenantName) {
501        $endpoint = "/tenants/{$tenantName}";
502        return $this->makeRequest($endpoint, 'GET');
503    }
504
505    /**
506     * Create a new tenant
507     *
508     * Creates a new tenant with the specified name.
509     *
510     * @param string $tenantName The tenant name
511     * @return array The response from the API
512     */
513    public function createTenant($tenantName) {
514        $endpoint = "/tenants";
515        $data = ['name' => $tenantName];
516        return $this->makeRequest($endpoint, 'POST', $data);
517    }
518
519    /**
520     * Get database information
521     *
522     * Retrieves information about the specified database within a tenant.
523     *
524     * @param string $databaseName The database name
525     * @param string $tenantName The tenant name
526     * @return array The database information
527     */
528    public function getDatabase($databaseName, $tenantName) {
529        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
530        return $this->makeRequest($endpoint, 'GET');
531    }
532
533    /**
534     * Create a new database
535     *
536     * Creates a new database with the specified name within a tenant.
537     *
538     * @param string $databaseName The database name
539     * @param string $tenantName The tenant name
540     * @return array The response from the API
541     */
542    public function createDatabase($databaseName, $tenantName) {
543        $endpoint = "/tenants/{$tenantName}/databases";
544        $data = ['name' => $databaseName];
545        return $this->makeRequest($endpoint, 'POST', $data);
546    }
547
548    /**
549     * Ensure a collection exists, creating it if necessary
550     *
551     * This helper function checks if a collection exists and creates it if it doesn't.
552     *
553     * @param string $collectionName The name of the collection to check/create
554     * @return string Status message indicating what happened
555     */
556    public function ensureCollectionExists($collectionName) {
557        try {
558            $collection = $this->getCollection($collectionName);
559            return "Collection '$collectionName' already exists.";
560        } catch (\Exception $e) {
561            // Collection doesn't exist, create it
562            $created = $this->createCollection($collectionName);
563            return "Collection '$collectionName' created.";
564        }
565    }
566
567    /**
568     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
569     *
570     * This function handles the complete processing of a single DokuWiki file:
571     * 1. Parses the file path to extract metadata and document ID
572     * 2. Determines the appropriate collection based on document ID
573     * 3. Checks if the document needs updating using timestamp comparison
574     * 4. Reads and processes file content only if update is needed
575     * 5. Splits the document into chunks (paragraphs)
576     * 6. Extracts rich metadata from the DokuWiki ID format
577     * 7. Generates embeddings for each chunk
578     * 8. Sends all chunks to ChromaDB with metadata
579     *
580     * Supported ID formats:
581     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
582     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
583     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
584     *
585     * The function implements smart update checking by comparing file modification time
586     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
587     *
588     * @param string $filePath The path to the file to process
589     * @param string $collectionName The name of the collection to use
590     * @param bool $collectionChecked Whether the collection has already been checked/created
591     * @return array Result with status and details
592     */
593    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
594        // Parse file path to extract metadata
595        $id = parseFilePath($filePath);
596
597        try {
598            // Create collection if it doesn't exist (only if not already checked)
599            $collectionStatus = '';
600            if (!$collectionChecked) {
601                $collectionStatus = $this->ensureCollectionExists($collectionName);
602            }
603
604            // Get collection ID
605            $collection = $this->getCollection($collectionName);
606            if (!isset($collection['id'])) {
607                return [
608                    'status' => 'error',
609                    'message' => "Collection ID not found for '{$collectionName}'"
610                ];
611            }
612            $collectionId = $collection['id'];
613
614            // Get file modification time
615            $fileModifiedTime = filemtime($filePath);
616
617            // Check if document needs update
618            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
619
620            // If document is up to date, skip processing
621            if (!$needsUpdate) {
622                return [
623                    'status' => 'skipped',
624                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
625                ];
626            }
627
628            // Read file content
629            $content = file_get_contents($filePath);
630
631            // Split document into chunks (paragraphs separated by two newlines)
632            $paragraphs = preg_split('/\n\s*\n/', $content);
633            $chunks = [];
634            $chunkMetadata = [];
635
636            // Parse the DokuWiki ID to extract base metadata
637            $parts = explode(':', $id);
638
639            // Extract metadata from the last part of the ID
640            $lastPart = end($parts);
641            $baseMetadata = [];
642
643            // Add the document ID as metadata
644            $baseMetadata['document_id'] = $id;
645
646            // Add current timestamp
647            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
648
649            // Check if any part of the ID is 'templates' and set template metadata
650            $isTemplate = in_array('templates', $parts);
651            if ($isTemplate) {
652                $baseMetadata['type'] = 'template';
653            } else {
654                $baseMetadata['type'] = 'report';
655            }
656
657            // Extract modality from the second part
658            if (isset($parts[1])) {
659                $baseMetadata['modality'] = $parts[1];
660            }
661
662            // Handle different ID formats based on the third part: word (institution) or numeric (year)
663            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
664            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
665            // For templates, don't set institution, date or year
666            if (isset($parts[2]) && !$isTemplate) {
667                // Check if third part is numeric (year) or word (institution)
668                if (is_numeric($parts[2])) {
669                    // Format: reports:mri:2024:g287-name-surname (year format)
670                    // Extract year from the third part
671                    $baseMetadata['year'] = $parts[2];
672
673                    // Set default institution from config
674                    global $conf;
675                    $baseMetadata['institution'] = isset($conf['plugin']['dokullm']['default_institution']) ? $conf['plugin']['dokullm']['default_institution'] : 'default';
676
677                    // Extract registration and name from the last part
678                    // Registration should start with one letter or number and contain numbers before the '-' character
679                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
680                        // Check if the first part contains at least one digit to be considered a registration
681                        if (preg_match('/[0-9]/', $matches[1])) {
682                            $baseMetadata['registration'] = $matches[1];
683                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
684                        } else {
685                            // If no registration pattern found, treat entire part as patient name
686                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
687                        }
688                    } else {
689                        // If no match, treat entire part as patient name
690                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
691                    }
692                } else {
693                    // Format: reports:mri:institution:250620-name-surname (institution format)
694                    // Extract institution from the third part
695                    $baseMetadata['institution'] = $parts[2];
696
697                    // Extract date and name from the last part
698                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
699                        $dateStr = $matches[1];
700                        $name = $matches[2];
701
702                        // Convert date format (250620 -> 2025-06-20)
703                        $day = substr($dateStr, 0, 2);
704                        $month = substr($dateStr, 2, 2);
705                        $year = substr($dateStr, 4, 2);
706                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
707                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
708                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
709
710                        $baseMetadata['date'] = $formattedDate;
711                        $baseMetadata['name'] = str_replace('-', ' ', $name);
712                    }
713                }
714            }
715
716            // For templates, always extract name from the last part
717            if ($isTemplate && isset($lastPart)) {
718                // Extract name from the last part (everything after the last colon)
719                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
720                    // Check if the first part contains at least one digit to be considered a registration
721                    if (preg_match('/[0-9]/', $matches[1])) {
722                        $baseMetadata['registration'] = $matches[1];
723                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
724                    } else {
725                        // If no registration pattern found, treat entire part as template name
726                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
727                    }
728                } else {
729                    // If no match, treat entire part as template name
730                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
731                }
732            }
733
734            // Process each paragraph as a chunk with intelligent metadata handling
735            $chunkIds = [];
736            $chunkContents = [];
737            $chunkMetadatas = [];
738            $chunkEmbeddings = [];
739            $currentTags = [];
740
741            foreach ($paragraphs as $index => $paragraph) {
742                // Skip empty paragraphs to avoid processing whitespace-only content
743                $paragraph = trim($paragraph);
744                if (empty($paragraph)) {
745                    continue;
746                }
747
748                // Check if this is a DokuWiki title (starts and ends with =)
749                // Titles are converted to tags for better searchability but not stored as content chunks
750                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
751                    // Extract title content and clean it
752                    $titleContent = trim($matches[1]);
753
754                    // Split into words and create searchable tags
755                    $words = preg_split('/\s+/', $titleContent);
756                    $tags = [];
757
758                    foreach ($words as $word) {
759                        // Only use words longer than 3 characters to reduce noise
760                        if (strlen($word) >= 3) {
761                            $tags[] = strtolower($word);
762                        }
763                    }
764
765                    // Remove duplicate tags and store for use in subsequent chunks
766                    $currentTags = array_unique($tags);
767                    continue; // Skip storing title chunks as content
768                }
769
770                // Create chunk ID
771                $chunkId = $id . '@' . ($index + 1);
772
773                // Generate embeddings for the chunk
774                $embeddings = $this->generateEmbeddings($paragraph);
775
776                // Add chunk-specific metadata
777                $metadata = $baseMetadata;
778                $metadata['chunk_id'] = $chunkId;
779                $metadata['chunk_number'] = $index + 1;
780                $metadata['total_chunks'] = count($paragraphs);
781
782                // Add current tags to metadata if any exist
783                if (!empty($currentTags)) {
784                    $metadata['tags'] = implode(',', $currentTags);
785                }
786
787                // Store chunk data
788                $chunkIds[] = $chunkId;
789                $chunkContents[] = $paragraph;
790                $chunkMetadatas[] = $metadata;
791                $chunkEmbeddings[] = $embeddings;
792            }
793
794            // If no chunks were created, skip this file
795            if (empty($chunkIds)) {
796                return [
797                    'status' => 'skipped',
798                    'message' => "No valid chunks found in file '$id'. Skipping..."
799                ];
800            }
801
802            // Send all chunks to ChromaDB
803            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
804
805            return [
806                'status' => 'success',
807                'message' => "Successfully sent file to ChromaDB",
808                'details' => [
809                    'document_id' => $id,
810                    'chunks' => count($chunkIds),
811                    'collection' => $collectionName
812                ],
813                'collection_status' => $collectionStatus
814            ];
815        } catch (\Exception $e) {
816            return [
817                'status' => 'error',
818                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
819            ];
820        }
821    }
822
823    /**
824     * Process all DokuWiki files in a directory and send them to ChromaDB
825     *
826     * This function recursively processes all .txt files in a directory and its subdirectories.
827     * It first checks if the appropriate collection exists and creates it if needed.
828     * Then it processes each file individually.
829     *
830     * @param string $dirPath The directory path to process
831     * @return array Result with status and details
832     */
833    public function processDirectory($dirPath) {
834        // Check if directory exists
835        if (!is_dir($dirPath)) {
836            return [
837                'status' => 'error',
838                'message' => "Directory does not exist: $dirPath"
839            ];
840        }
841
842        // Create RecursiveIteratorIterator to process directories recursively
843        $iterator = new RecursiveIteratorIterator(
844            new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS),
845            RecursiveIteratorIterator::LEAVES_ONLY
846        );
847
848        $files = [];
849        foreach ($iterator as $file) {
850            // Process only .txt files that don't start with underscore
851            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
852                $files[] = $file->getPathname();
853            }
854        }
855
856        if (empty($files)) {
857            return [
858                'status' => 'skipped',
859                'message' => "No .txt files found in directory: $dirPath"
860            ];
861        }
862
863        // Use the first part of the document ID as collection name, fallback to 'documents'
864        $sampleFile = $files[0];
865        $id = parseFilePath($sampleFile);
866        $idParts = explode(':', $id);
867        $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents';
868
869        try {
870            $this->ensureCollectionExists($collectionName);
871            $collectionChecked = true;
872        } catch (Exception $e) {
873            $collectionChecked = true;
874        }
875
876        $results = [];
877        foreach ($files as $file) {
878            $result = $this->processSingleFile($file, $collectionName, $collectionChecked);
879            $results[] = [
880                'file' => $file,
881                'result' => $result
882            ];
883        }
884
885        return [
886            'status' => 'success',
887            'message' => "Finished processing directory.",
888            'files_count' => count($files),
889            'results' => $results
890        ];
891    }
892}
893
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the base path prefix (DOKU_INC . 'data/pages/' when DOKU_INC is
 *    defined, otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons
 *
 * Empty path segments are dropped; a segment named "0" is kept. Paths that do
 * not start with the base path are converted as-is.
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Normalise backslashes so both the prefix comparison and the split below
    // also work with Windows-style paths.
    $path = str_replace('\\', '/', $filePath);
    $pagesDir = str_replace('\\', '/', $pagesDir);

    // Strip the base path only when it is a real prefix; a blanket str_replace()
    // would also delete the same text if it appeared later in the path.
    $prefixLength = strlen($pagesDir);
    if (strncmp($path, $pagesDir, $prefixLength) === 0) {
        $path = (string) substr($path, $prefixLength);
    }

    // Remove .txt extension
    $path = preg_replace('/\.txt$/', '', $path);

    // Keep every non-empty segment. Compare against '' explicitly: empty() and
    // a callback-less array_filter() would also discard a segment named "0".
    $idParts = [];
    foreach (explode('/', $path) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}
936
937