xref: /plugin/dokullm/ChromaDBClient.php (revision 590368144294a28ecf0e0e39feb976bf79fefb1e)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5class ChromaDBClient {
6    private $baseUrl;
7    private $client;
8    private $ollamaClient;
9    private $tenant;
10    private $database;
11    private $ollamaHost;
12    private $ollamaPort;
13    /**
14     * Initialize the ChromaDB client
15     *
16     * Creates a new ChromaDB client instance with the specified connection parameters.
17     * Also ensures that the specified tenant and database exist.
18     *
19     * @param string $host ChromaDB server host (default: CHROMA_HOST)
20     * @param int $port ChromaDB server port (default: CHROMA_PORT)
21     * @param string $tenant ChromaDB tenant name (default: CHROMA_TENANT)
22     * @param string $database ChromaDB database name (default: CHROMA_DATABASE)
23     * @param string $ollamaHost Ollama server host (default: OLLAMA_HOST)
24     * @param int $ollamaPort Ollama server port (default: OLLAMA_PORT)
25     * @param string $ollamaModel Ollama embeddings model (default: OLLAMA_EMBEDDINGS_MODEL)
26     */
27    public function __construct($host = CHROMA_HOST, $port = CHROMA_PORT, $tenant = CHROMA_TENANT, $database = CHROMA_DATABASE, $ollamaHost = OLLAMA_HOST, $ollamaPort = OLLAMA_PORT, $ollamaModel = OLLAMA_EMBEDDINGS_MODEL) {
28        $this->baseUrl = "http://{$host}:{$port}";
29        $this->tenant = $tenant;
30        $this->database = $database;
31        $this->ollamaHost = $ollamaHost;
32        $this->ollamaPort = $ollamaPort;
33        $this->ollamaModel = $ollamaModel;
34        $this->client = curl_init();
35        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
36        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
37            'Content-Type: application/json',
38            'Accept: application/json'
39        ]);
40
41        // Initialize Ollama client
42        $this->ollamaClient = curl_init();
43        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
44        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
45            'Content-Type: application/json'
46        ]);
47
48        // Check if tenant and database exist, create them if they don't
49        $this->ensureTenantAndDatabase();
50    }
51
52    /**
53     * Clean up the cURL client when the object is destroyed
54     *
55     * @return void
56     */
57    public function __destruct() {
58        curl_close($this->client);
59        curl_close($this->ollamaClient);
60    }
61
62    /**
63     * Make an HTTP request to the ChromaDB API
64     *
65     * This is a helper function that handles making HTTP requests to the ChromaDB API,
66     * including setting the appropriate headers for tenant and database.
67     *
68     * @param string $endpoint The API endpoint to call
69     * @param string $method The HTTP method to use (default: 'GET')
70     * @param array|null $data The data to send with the request (default: null)
71     * @return array The JSON response decoded as an array
72     * @throws Exception If there's a cURL error or HTTP error
73     */
74    private function makeRequest($endpoint, $method = 'GET', $data = null) {
75        // Add tenant and database as headers instead of query parameters for v2 API
76        $headers = [
77            'Content-Type: application/json',
78            'Accept: application/json'
79        ];
80
81        $url = $this->baseUrl . '/api/v2' . $endpoint;
82
83        curl_setopt($this->client, CURLOPT_URL, $url);
84        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
85        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
86
87        if ($data) {
88            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
89        } else {
90            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
91        }
92
93        $response = curl_exec($this->client);
94        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
95
96        if (curl_error($this->client)) {
97            throw new Exception('Curl error: ' . curl_error($this->client));
98        }
99
100        if ($httpCode >= 400) {
101            throw new Exception("HTTP Error: $httpCode, Response: $response");
102        }
103
104        return json_decode($response, true);
105    }
106
107    /**
108     * Generate embeddings for text using Ollama
109     *
110     * @param string $text The text to generate embeddings for
111     * @return array The embeddings vector
112     */
113    public function generateEmbeddings($text) {
114        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
115
116        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
117
118        $data = [
119            'model' => $this->ollamaModel,
120            'prompt' => $text,
121            'keep_alive' => '30m'
122        ];
123
124        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
125
126        $response = curl_exec($this->ollamaClient);
127        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
128
129        if (curl_error($this->ollamaClient)) {
130            throw new Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
131        }
132
133        if ($httpCode >= 400) {
134            throw new Exception("Ollama HTTP Error: $httpCode, Response: $response");
135        }
136
137        $result = json_decode($response, true);
138
139        if (!isset($result['embedding'])) {
140            throw new Exception("Ollama response missing embedding: " . $response);
141        }
142
143        return $result['embedding'];
144    }
145
146    /**
147     * List all collections in the database
148     *
149     * Retrieves a list of all collections in the specified tenant and database.
150     *
151     * @return array List of collections
152     */
153    public function listCollections() {
154        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
155        return $this->makeRequest($endpoint);
156    }
157
158    /**
159     * Get a collection by name
160     *
161     * Retrieves information about a specific collection by its name.
162     *
163     * @param string $name The name of the collection to retrieve
164     * @return array The collection information
165     * @throws Exception If the collection is not found
166     */
167    public function getCollection($name) {
168        // Use provided name, fallback to 'documents' if empty
169        if (empty($name)) {
170            $name = 'documents';
171        }
172
173        // First try to get collection by name
174        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
175        $collections = $this->makeRequest($endpoint);
176
177        // Find collection by name
178        foreach ($collections as $collection) {
179            if (isset($collection['name']) && $collection['name'] === $name) {
180                return $collection;
181            }
182        }
183
184        // If not found, throw exception
185        throw new Exception("Collection '{$name}' not found");
186    }
187
188    /**
189     * Create a new collection
190     *
191     * Creates a new collection with the specified name and optional metadata.
192     *
193     * @param string $name The name of the collection to create
194     * @param array|null $metadata Optional metadata for the collection
195     * @return array The response from the API
196     */
197    public function createCollection($name, $metadata = null) {
198        // Use provided name, fallback to 'documents' if empty
199        if (empty($name)) {
200            $name = 'documents';
201        }
202
203        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
204        $data = ['name' => $name];
205        if ($metadata) {
206            $data['metadata'] = $metadata;
207        }
208        return $this->makeRequest($endpoint, 'POST', $data);
209    }
210
211    /**
212     * Delete a collection by name
213     *
214     * Deletes a collection with the specified name.
215     *
216     * @param string $name The name of the collection to delete
217     * @return array The response from the API
218     * @throws Exception If the collection ID is not found
219     */
220    public function deleteCollection($name) {
221        // Use provided name, fallback to 'documents' if empty
222        if (empty($name)) {
223            $name = 'documents';
224        }
225
226        // First get the collection to find its ID
227        $collection = $this->getCollection($name);
228        if (!isset($collection['id'])) {
229            throw new Exception("Collection ID not found for '{$name}'");
230        }
231
232        $collectionId = $collection['id'];
233        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
234        return $this->makeRequest($endpoint, 'DELETE');
235    }
236
237    /**
238     * Get a document by its ID from a collection
239     *
240     * Retrieves a document from the specified collection using its ID.
241     *
242     * @param string $collectionName The name of the collection to get the document from
243     * @param string $documentId The document ID to retrieve
244     * @param array $include What to include in the response (default: ["metadatas", "documents"])
245     * @return array The retrieved document
246     * @throws Exception If the collection ID is not found
247     */
248    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
249        // Use provided name, fallback to 'documents' if empty
250        if (empty($collectionName)) {
251            $collectionName = 'documents';
252        }
253
254        // First get the collection to find its ID
255        $collection = $this->getCollection($collectionName);
256        if (!isset($collection['id'])) {
257            throw new Exception("Collection ID not found for '{$collectionName}'");
258        }
259
260        $collectionId = $collection['id'];
261        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
262        $data = [
263            'ids' => [$documentId],
264            'include' => $include
265        ];
266
267        return $this->makeRequest($endpoint, 'POST', $data);
268    }
269
270    /**
271     * Add documents to a collection
272     *
273     * Adds documents to the specified collection. Each document must have a corresponding ID.
274     * Optional metadata and pre-computed embeddings can also be provided.
275     *
276     * @param string $collectionName The name of the collection to add documents to
277     * @param array $documents The document contents
278     * @param array $ids The document IDs
279     * @param array|null $metadatas Optional metadata for each document
280     * @param array|null $embeddings Optional pre-computed embeddings for each document
281     * @return array The response from the API
282     * @throws Exception If the collection ID is not found
283     */
284    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
285        // Use provided name, fallback to 'documents' if empty
286        if (empty($collectionName)) {
287            $collectionName = 'documents';
288        }
289
290        // First get the collection to find its ID
291        $collection = $this->getCollection($collectionName);
292        if (!isset($collection['id'])) {
293            throw new Exception("Collection ID not found for '{$collectionName}'");
294        }
295
296        $collectionId = $collection['id'];
297        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
298        $data = [
299            'ids' => $ids,
300            'documents' => $documents
301        ];
302
303        if ($metadatas) {
304            $data['metadatas'] = $metadatas;
305        }
306
307        if ($embeddings) {
308            $data['embeddings'] = $embeddings;
309        }
310
311        return $this->makeRequest($endpoint, 'POST', $data);
312    }
313
314    /**
315     * Check if a document needs to be updated based on timestamp comparison
316     *
317     * Determines whether a document should be reprocessed by comparing the file's last modification
318     * time with the processed_at timestamp stored in the document's metadata. The function checks
319     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
320     * not included in the database.
321     *
322     * @param string $collectionId The ID of the collection to check documents in
323     * @param string $documentId The base document ID to check (without chunk suffixes)
324     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
325     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
326     * @throws Exception If there's an error checking the document
327     */
328    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
329        try {
330            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
331
332            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
333            $chunkIdsToCheck = [
334                $documentId . '@1',
335                $documentId . '@2',
336                $documentId . '@3'
337            ];
338
339            $data = [
340                'ids' => $chunkIdsToCheck,
341                'include' => [
342                    "metadatas"
343                ],
344                'limit' => 1
345            ];
346
347            // Check if document exists
348            $result = $this->makeRequest($endpoint, 'POST', $data);
349
350            // If no documents found, return true (needs to be added)
351            if (empty($result['ids'])) {
352                return true;
353            }
354
355            // Check if any document has a processed_at timestamp
356            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
357                // Check the first metadata entry directly
358                $metadata = $result['metadatas'][0];
359
360                // If processed_at is not set, return true (needs update)
361                if (!isset($metadata['processed_at'])) {
362                    return true;
363                }
364
365                // Parse the processed_at timestamp
366                $processedTimestamp = strtotime($metadata['processed_at']);
367
368                // If file is newer than processed time, return true (needs update)
369                if ($fileModifiedTime > $processedTimestamp) {
370                    return true;
371                }
372            }
373
374            // Document exists and is up to date
375            return false;
376        } catch (Exception $e) {
377            // If there's an error checking the document, assume it needs to be updated
378            return true;
379        }
380    }
381
382    /**
383     * Query a collection for similar documents
384     *
385     * Queries the specified collection for documents similar to the provided query texts.
386     * The function generates embeddings for the query texts and sends them to ChromaDB.
387     * Supports filtering results by metadata using the where parameter.
388     *
389     * @param string $collectionName The name of the collection to query
390     * @param array $queryTexts The query texts to search for
391     * @param int $nResults The number of results to return (default: 5)
392     * @param array|null $where Optional filter conditions for metadata
393     * @return array The query results
394     * @throws Exception If the collection ID is not found
395     */
396    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
397        // Use provided name, fallback to 'documents' if empty
398        if (empty($collectionName)) {
399            $collectionName = 'documents';
400        }
401
402        // First get the collection to find its ID
403        $collection = $this->getCollection($collectionName);
404        if (!isset($collection['id'])) {
405            throw new Exception("Collection ID not found for '{$collectionName}'");
406        }
407
408        $collectionId = $collection['id'];
409        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
410
411        // Generate embeddings for query texts
412        $queryEmbeddings = [];
413        foreach ($queryTexts as $text) {
414            $queryEmbeddings[] = $this->generateEmbeddings($text);
415        }
416
417        $data = [
418            'query_embeddings' => $queryEmbeddings,
419            'n_results' => $nResults
420        ];
421
422        // Add where clause for metadata filtering if provided
423        if ($where && is_array($where)) {
424            $data['where'] = $where;
425        }
426
427        return $this->makeRequest($endpoint, 'POST', $data);
428    }
429
430    /**
431     * Check if the ChromaDB server is alive
432     *
433     * Sends a heartbeat request to verify that the ChromaDB server is running.
434     *
435     * @return array The response from the heartbeat endpoint
436     */
437    public function heartbeat() {
438        $endpoint = "/heartbeat";
439        return $this->makeRequest($endpoint, 'GET');
440    }
441
442    /**
443     * Get authentication and identity information
444     *
445     * Retrieves authentication and identity information from the ChromaDB server.
446     *
447     * @return array The response from the auth/identity endpoint
448     */
449    public function getIdentity() {
450        $endpoint = "/identity";
451        return $this->makeRequest($endpoint, 'GET');
452    }
453
454    /**
455     * Ensure that the specified tenant and database exist
456     *
457     * Checks if the specified tenant and database exist, and creates them if they don't.
458     *
459     * @return void
460     */
461    private function ensureTenantAndDatabase() {
462        // Check if tenant exists, create if it doesn't
463        try {
464            $this->getTenant($this->tenant);
465        } catch (Exception $e) {
466            // Tenant doesn't exist, create it
467            $this->createTenant($this->tenant);
468        }
469
470        // Check if database exists, create if it doesn't
471        try {
472            $this->getDatabase($this->database, $this->tenant);
473        } catch (Exception $e) {
474            // Database doesn't exist, create it
475            $this->createDatabase($this->database, $this->tenant);
476        }
477    }
478
479    /**
480     * Get tenant information
481     *
482     * Retrieves information about the specified tenant.
483     *
484     * @param string $tenantName The tenant name
485     * @return array The tenant information
486     */
487    public function getTenant($tenantName) {
488        $endpoint = "/tenants/{$tenantName}";
489        return $this->makeRequest($endpoint, 'GET');
490    }
491
492    /**
493     * Create a new tenant
494     *
495     * Creates a new tenant with the specified name.
496     *
497     * @param string $tenantName The tenant name
498     * @return array The response from the API
499     */
500    public function createTenant($tenantName) {
501        $endpoint = "/tenants";
502        $data = ['name' => $tenantName];
503        return $this->makeRequest($endpoint, 'POST', $data);
504    }
505
506    /**
507     * Get database information
508     *
509     * Retrieves information about the specified database within a tenant.
510     *
511     * @param string $databaseName The database name
512     * @param string $tenantName The tenant name
513     * @return array The database information
514     */
515    public function getDatabase($databaseName, $tenantName) {
516        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
517        return $this->makeRequest($endpoint, 'GET');
518    }
519
520    /**
521     * Create a new database
522     *
523     * Creates a new database with the specified name within a tenant.
524     *
525     * @param string $databaseName The database name
526     * @param string $tenantName The tenant name
527     * @return array The response from the API
528     */
529    public function createDatabase($databaseName, $tenantName) {
530        $endpoint = "/tenants/{$tenantName}/databases";
531        $data = ['name' => $databaseName];
532        return $this->makeRequest($endpoint, 'POST', $data);
533    }
534
535    /**
536     * Ensure a collection exists, creating it if necessary
537     *
538     * This helper function checks if a collection exists and creates it if it doesn't.
539     *
540     * @param string $collectionName The name of the collection to check/create
541     * @return string Status message indicating what happened
542     */
543    public function ensureCollectionExists($collectionName) {
544        try {
545            $collection = $this->getCollection($collectionName);
546            return "Collection '$collectionName' already exists.";
547        } catch (Exception $e) {
548            // Collection doesn't exist, create it
549            $created = $this->createCollection($collectionName);
550            return "Collection '$collectionName' created.";
551        }
552    }
553
554    /**
555     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
556     *
557     * This function handles the complete processing of a single DokuWiki file:
558     * 1. Parses the file path to extract metadata and document ID
559     * 2. Determines the appropriate collection based on document ID
560     * 3. Checks if the document needs updating using timestamp comparison
561     * 4. Reads and processes file content only if update is needed
562     * 5. Splits the document into chunks (paragraphs)
563     * 6. Extracts rich metadata from the DokuWiki ID format
564     * 7. Generates embeddings for each chunk
565     * 8. Sends all chunks to ChromaDB with metadata
566     *
567     * Supported ID formats:
568     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
569     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
570     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
571     *
572     * The function implements smart update checking by comparing file modification time
573     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
574     *
575     * @param string $filePath The path to the file to process
576     * @param string $collectionName The name of the collection to use
577     * @param bool $collectionChecked Whether the collection has already been checked/created
578     * @return array Result with status and details
579     */
580    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
581        // Parse file path to extract metadata
582        $id = parseFilePath($filePath);
583
584        try {
585            // Create collection if it doesn't exist (only if not already checked)
586            $collectionStatus = '';
587            if (!$collectionChecked) {
588                $collectionStatus = $this->ensureCollectionExists($collectionName);
589            }
590
591            // Get collection ID
592            $collection = $this->getCollection($collectionName);
593            if (!isset($collection['id'])) {
594                return [
595                    'status' => 'error',
596                    'message' => "Collection ID not found for '{$collectionName}'"
597                ];
598            }
599            $collectionId = $collection['id'];
600
601            // Get file modification time
602            $fileModifiedTime = filemtime($filePath);
603
604            // Check if document needs update
605            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
606
607            // If document is up to date, skip processing
608            if (!$needsUpdate) {
609                return [
610                    'status' => 'skipped',
611                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
612                ];
613            }
614
615            // Read file content
616            $content = file_get_contents($filePath);
617
618            // Split document into chunks (paragraphs separated by two newlines)
619            $paragraphs = preg_split('/\n\s*\n/', $content);
620            $chunks = [];
621            $chunkMetadata = [];
622
623            // Parse the DokuWiki ID to extract base metadata
624            $parts = explode(':', $id);
625
626            // Extract metadata from the last part of the ID
627            $lastPart = end($parts);
628            $baseMetadata = [];
629
630            // Add the document ID as metadata
631            $baseMetadata['document_id'] = $id;
632
633            // Add current timestamp
634            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
635
636            // Check if any part of the ID is 'templates' and set template metadata
637            $isTemplate = in_array('templates', $parts);
638            if ($isTemplate) {
639                $baseMetadata['type'] = 'template';
640            } else {
641                $baseMetadata['type'] = 'report';
642            }
643
644            // Extract modality from the second part
645            if (isset($parts[1])) {
646                $baseMetadata['modality'] = $parts[1];
647            }
648
649            // Handle different ID formats based on the third part: word (institution) or numeric (year)
650            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
651            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
652            // For templates, don't set institution, date or year
653            if (isset($parts[2]) && !$isTemplate) {
654                // Check if third part is numeric (year) or word (institution)
655                if (is_numeric($parts[2])) {
656                    // Format: reports:mri:2024:g287-name-surname (year format)
657                    // Extract year from the third part
658                    $baseMetadata['year'] = $parts[2];
659
660                    // Set default institution from config
661                    $baseMetadata['institution'] = DEFAULT_INSTITUTION;
662
663                    // Extract registration and name from the last part
664                    // Registration should start with one letter or number and contain numbers before the '-' character
665                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
666                        // Check if the first part contains at least one digit to be considered a registration
667                        if (preg_match('/[0-9]/', $matches[1])) {
668                            $baseMetadata['registration'] = $matches[1];
669                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
670                        } else {
671                            // If no registration pattern found, treat entire part as patient name
672                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
673                        }
674                    } else {
675                        // If no match, treat entire part as patient name
676                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
677                    }
678                } else {
679                    // Format: reports:mri:institution:250620-name-surname (institution format)
680                    // Extract institution from the third part
681                    $baseMetadata['institution'] = $parts[2];
682
683                    // Extract date and name from the last part
684                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
685                        $dateStr = $matches[1];
686                        $name = $matches[2];
687
688                        // Convert date format (250620 -> 2025-06-20)
689                        $day = substr($dateStr, 0, 2);
690                        $month = substr($dateStr, 2, 2);
691                        $year = substr($dateStr, 4, 2);
692                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
693                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
694                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
695
696                        $baseMetadata['date'] = $formattedDate;
697                        $baseMetadata['name'] = str_replace('-', ' ', $name);
698                    }
699                }
700            }
701
702            // For templates, always extract name from the last part
703            if ($isTemplate && isset($lastPart)) {
704                // Extract name from the last part (everything after the last colon)
705                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
706                    // Check if the first part contains at least one digit to be considered a registration
707                    if (preg_match('/[0-9]/', $matches[1])) {
708                        $baseMetadata['registration'] = $matches[1];
709                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
710                    } else {
711                        // If no registration pattern found, treat entire part as template name
712                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
713                    }
714                } else {
715                    // If no match, treat entire part as template name
716                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
717                }
718            }
719
720            // Process each paragraph as a chunk with intelligent metadata handling
721            $chunkIds = [];
722            $chunkContents = [];
723            $chunkMetadatas = [];
724            $chunkEmbeddings = [];
725            $currentTags = [];
726
727            foreach ($paragraphs as $index => $paragraph) {
728                // Skip empty paragraphs to avoid processing whitespace-only content
729                $paragraph = trim($paragraph);
730                if (empty($paragraph)) {
731                    continue;
732                }
733
734                // Check if this is a DokuWiki title (starts and ends with =)
735                // Titles are converted to tags for better searchability but not stored as content chunks
736                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
737                    // Extract title content and clean it
738                    $titleContent = trim($matches[1]);
739
740                    // Split into words and create searchable tags
741                    $words = preg_split('/\s+/', $titleContent);
742                    $tags = [];
743
744                    foreach ($words as $word) {
745                        // Only use words longer than 3 characters to reduce noise
746                        if (strlen($word) >= 3) {
747                            $tags[] = strtolower($word);
748                        }
749                    }
750
751                    // Remove duplicate tags and store for use in subsequent chunks
752                    $currentTags = array_unique($tags);
753                    continue; // Skip storing title chunks as content
754                }
755
756                // Create chunk ID
757                $chunkId = $id . '@' . ($index + 1);
758
759                // Generate embeddings for the chunk
760                $embeddings = $this->generateEmbeddings($paragraph);
761
762                // Add chunk-specific metadata
763                $metadata = $baseMetadata;
764                $metadata['chunk_id'] = $chunkId;
765                $metadata['chunk_number'] = $index + 1;
766                $metadata['total_chunks'] = count($paragraphs);
767
768                // Add current tags to metadata if any exist
769                if (!empty($currentTags)) {
770                    $metadata['tags'] = implode(',', $currentTags);
771                }
772
773                // Store chunk data
774                $chunkIds[] = $chunkId;
775                $chunkContents[] = $paragraph;
776                $chunkMetadatas[] = $metadata;
777                $chunkEmbeddings[] = $embeddings;
778            }
779
780            // If no chunks were created, skip this file
781            if (empty($chunkIds)) {
782                return [
783                    'status' => 'skipped',
784                    'message' => "No valid chunks found in file '$id'. Skipping..."
785                ];
786            }
787
788            // Send all chunks to ChromaDB
789            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
790
791            return [
792                'status' => 'success',
793                'message' => "Successfully sent file to ChromaDB",
794                'details' => [
795                    'document_id' => $id,
796                    'chunks' => count($chunkIds),
797                    'collection' => $collectionName
798                ],
799                'collection_status' => $collectionStatus
800            ];
801        } catch (Exception $e) {
802            return [
803                'status' => 'error',
804                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
805            ];
806        }
807    }
808
809    /**
810     * Process all DokuWiki files in a directory and send them to ChromaDB
811     *
812     * This function recursively processes all .txt files in a directory and its subdirectories.
813     * It first checks if the appropriate collection exists and creates it if needed.
814     * Then it processes each file individually.
815     *
816     * @param string $dirPath The directory path to process
817     * @return array Result with status and details
818     */
819    public function processDirectory($dirPath) {
820        // Check if directory exists
821        if (!is_dir($dirPath)) {
822            return [
823                'status' => 'error',
824                'message' => "Directory does not exist: $dirPath"
825            ];
826        }
827
828        // Create RecursiveIteratorIterator to process directories recursively
829        $iterator = new RecursiveIteratorIterator(
830            new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS),
831            RecursiveIteratorIterator::LEAVES_ONLY
832        );
833
834        $files = [];
835        foreach ($iterator as $file) {
836            // Process only .txt files that don't start with underscore
837            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
838                $files[] = $file->getPathname();
839            }
840        }
841
842        if (empty($files)) {
843            return [
844                'status' => 'skipped',
845                'message' => "No .txt files found in directory: $dirPath"
846            ];
847        }
848
849        // Use the first part of the document ID as collection name, fallback to 'documents'
850        $sampleFile = $files[0];
851        $id = parseFilePath($sampleFile);
852        $idParts = explode(':', $id);
853        $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents';
854
855        try {
856            $this->ensureCollectionExists($collectionName);
857            $collectionChecked = true;
858        } catch (Exception $e) {
859            $collectionChecked = true;
860        }
861
862        $results = [];
863        foreach ($files as $file) {
864            $result = $this->processSingleFile($file, $collectionName, $collectionChecked);
865            $results[] = [
866                'file' => $file,
867                'result' => $result
868            ];
869        }
870
871        return [
872            'status' => 'success',
873            'message' => "Finished processing directory.",
874            'files_count' => count($files),
875            'results' => $results
876        ];
877    }
878}
879
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Normalising backslashes to forward slashes
 * 2. Removing the base path prefix (DOKU_INC . 'data/pages/' when DOKU_INC is
 *    defined, otherwise a hard-coded default installation path)
 * 3. Removing the .txt extension
 * 4. Joining the non-empty path segments with colons
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID (empty string when the path has no segments)
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Treat both separator styles alike so Windows pathnames are handled too
    $pagesDir = str_replace('\\', '/', $pagesDir);
    $relativePath = str_replace('\\', '/', (string) $filePath);

    // Remove the base path
    $relativePath = str_replace($pagesDir, '', $relativePath);

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Keep every non-empty segment. The comparison is against '' on purpose:
    // empty() / array_filter() would also discard a segment named "0".
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}
922
923