xref: /plugin/dokullm/ChromaDBClient.php (revision db43cc0996840949e34ced7b0f71ad5df44b6d2a)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5class ChromaDBClient {
/** @var string Base URL of the ChromaDB server, built as "http://host:port" in the constructor */
private $baseUrl;
/** @var mixed cURL handle reused for every ChromaDB request */
private $client;
/** @var mixed cURL handle reused for every Ollama request */
private $ollamaClient;
/** @var string ChromaDB tenant name used when building endpoint paths */
private $tenant;
/** @var string ChromaDB database name used when building endpoint paths */
private $database;
/** @var string Ollama server host */
private $ollamaHost;
/** @var int Ollama server port */
private $ollamaPort;
/** @var string Name of the Ollama model used to generate embeddings */
private $ollamaModel;
/**
 * Default collection name passed to the constructor.
 *
 * The constructor assigns this property; declaring it avoids creating a
 * dynamic property (deprecated since PHP 8.2). It is public because the
 * previously implicit dynamic property was public as well.
 *
 * @var string
 */
public $defaultCollection;
14
15    /**
16     * Get configuration value for the dokullm plugin
17     *
18     * @param string $key Configuration key
19     * @param mixed $default Default value if key not found
20     * @return mixed Configuration value
21     */
/**
 * Build a client bound to one ChromaDB tenant/database and one Ollama model.
 *
 * Stores the connection settings, prepares one reusable cURL handle for
 * ChromaDB and one for Ollama, and finally calls ensureTenantAndDatabase(),
 * so constructing the object already issues HTTP requests to ChromaDB.
 *
 * @param string $host ChromaDB server host
 * @param int $port ChromaDB server port
 * @param string $tenant ChromaDB tenant name
 * @param string $database ChromaDB database name
 * @param string $defaultCollection Default collection name
 * @param string $ollamaHost Ollama server host
 * @param int $ollamaPort Ollama server port
 * @param string $ollamaModel Ollama embeddings model
 */
public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
    // All arguments are mandatory, so they are stored without any fallback.
    $this->tenant = $tenant;
    $this->database = $database;
    $this->defaultCollection = $defaultCollection;
    $this->ollamaHost = $ollamaHost;
    $this->ollamaPort = $ollamaPort;
    $this->ollamaModel = $ollamaModel;
    $this->baseUrl = "http://{$host}:{$port}";

    // Handle for ChromaDB: return the body as a string and talk JSON both ways.
    $this->client = curl_init();
    curl_setopt_array($this->client, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HTTPHEADER => [
            'Content-Type: application/json',
            'Accept: application/json'
        ],
    ]);

    // Handle for Ollama: return the body as a string and send JSON.
    $this->ollamaClient = curl_init();
    curl_setopt_array($this->ollamaClient, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HTTPHEADER => [
            'Content-Type: application/json'
        ],
    ]);

    // Create the tenant and the database on the server when they are missing.
    $this->ensureTenantAndDatabase();
}
66
/**
 * Release both cURL handles (ChromaDB and Ollama) when the object is destroyed
 *
 * @return void
 */
public function __destruct() {
    foreach ([$this->client, $this->ollamaClient] as $handle) {
        curl_close($handle);
    }
}
76
/**
 * Send a JSON request to the ChromaDB v2 REST API and decode the reply
 *
 * The endpoint is appended to "<baseUrl>/api/v2". No tenant or database
 * headers are sent by this method; callers place the tenant and database
 * in the endpoint path themselves.
 *
 * @param string $endpoint Path below /api/v2, starting with a slash
 * @param string $method The HTTP method to use (default: 'GET')
 * @param array|null $data Payload to JSON-encode as the request body; null or an empty value sends no body
 * @return array|null The decoded JSON response (null when the body is not valid JSON)
 * @throws \Exception If the payload cannot be JSON-encoded, on a cURL error, or on an HTTP status >= 400
 */
private function makeRequest($endpoint, $method = 'GET', $data = null) {
    // Encode first so an unencodable payload fails loudly instead of
    // being sent as an empty body.
    $body = null;
    if ($data) {
        $body = json_encode($data);
        if ($body === false) {
            throw new \Exception('Failed to encode request body as JSON: ' . json_last_error_msg());
        }
    }

    curl_setopt($this->client, CURLOPT_URL, $this->baseUrl . '/api/v2' . $endpoint);
    curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
    curl_setopt($this->client, CURLOPT_HTTPHEADER, [
        'Content-Type: application/json',
        'Accept: application/json'
    ]);
    // The handle is shared by all calls, so the body is always (re)set;
    // null clears the payload left behind by a previous request.
    curl_setopt($this->client, CURLOPT_POSTFIELDS, $body);

    $response = curl_exec($this->client);

    // curl_exec() returns false on transport failure; the error string is checked as well.
    if ($response === false || curl_error($this->client)) {
        throw new \Exception('Curl error: ' . curl_error($this->client));
    }

    $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
    if ($httpCode >= 400) {
        throw new \Exception("HTTP Error: $httpCode, Response: $response");
    }

    return json_decode($response, true);
}
121
/**
 * Generate embeddings for text using Ollama
 *
 * Posts the text as the "prompt" to the /api/embeddings endpoint of the
 * configured Ollama server, using the configured model and a keep_alive
 * value of '30m'.
 *
 * @param string $text The text to generate embeddings for
 * @return array The value of the "embedding" field of the Ollama response
 * @throws \Exception If the request cannot be encoded, on a cURL error, on an
 *                    HTTP status >= 400, or when the response has no "embedding" field
 */
public function generateEmbeddings($text) {
    $payload = json_encode([
        'model' => $this->ollamaModel,
        'prompt' => $text,
        'keep_alive' => '30m'
    ]);

    // json_encode() returns false e.g. for malformed UTF-8; fail here
    // instead of posting an empty body to Ollama.
    if ($payload === false) {
        throw new \Exception('Ollama request encoding failed: ' . json_last_error_msg());
    }

    $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
    curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
    curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, $payload);

    $response = curl_exec($this->ollamaClient);

    // curl_exec() returns false on transport failure; the error string is checked as well.
    if ($response === false || curl_error($this->ollamaClient)) {
        throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
    }

    $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
    if ($httpCode >= 400) {
        throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
    }

    $result = json_decode($response, true);

    if (!isset($result['embedding'])) {
        throw new \Exception("Ollama response missing embedding: " . $response);
    }

    return $result['embedding'];
}
160
/**
 * Fetch the collections of the configured tenant and database
 *
 * Issues a GET request to the collections endpoint of the tenant/database
 * this client was constructed with.
 *
 * @return array Decoded API response listing the collections
 */
public function listCollections() {
    return $this->makeRequest(
        sprintf('/tenants/%s/databases/%s/collections', $this->tenant, $this->database)
    );
}
172
/**
 * Look up a collection by its name
 *
 * Requests the collection list of the configured tenant/database and
 * returns the first entry whose "name" equals the requested name. An empty
 * name is replaced by 'documents'.
 *
 * @param string $name The name of the collection to retrieve
 * @return array The matching collection entry
 * @throws \Exception If no collection with that name is listed
 */
public function getCollection($name) {
    // An empty name falls back to the 'documents' collection.
    $name = empty($name) ? 'documents' : $name;

    $listed = $this->makeRequest(
        sprintf('/tenants/%s/databases/%s/collections', $this->tenant, $this->database)
    );

    foreach ($listed as $candidate) {
        // Skip entries without a name or with a different name.
        if (!isset($candidate['name']) || $candidate['name'] !== $name) {
            continue;
        }
        return $candidate;
    }

    throw new \Exception("Collection '{$name}' not found");
}
202
/**
 * Create a collection in the configured tenant and database
 *
 * Posts the collection name, plus the metadata when a non-empty value is
 * given, to the collections endpoint. An empty name is replaced by 'documents'.
 *
 * @param string $name The name of the collection to create
 * @param array|null $metadata Optional metadata for the collection
 * @return array The response from the API
 */
public function createCollection($name, $metadata = null) {
    // An empty name falls back to the 'documents' collection.
    $payload = ['name' => empty($name) ? 'documents' : $name];

    // Metadata is only sent when a non-empty value was supplied.
    if ($metadata) {
        $payload['metadata'] = $metadata;
    }

    return $this->makeRequest(
        sprintf('/tenants/%s/databases/%s/collections', $this->tenant, $this->database),
        'POST',
        $payload
    );
}
225
/**
 * Delete the collection with the given name
 *
 * Resolves the name to a collection ID through getCollection() and sends a
 * DELETE request for that ID. An empty name is replaced by 'documents'.
 *
 * @param string $name The name of the collection to delete
 * @return array The response from the API
 * @throws \Exception If the collection is not found or its entry has no ID
 */
public function deleteCollection($name) {
    // An empty name falls back to the 'documents' collection.
    $name = empty($name) ? 'documents' : $name;

    // The API addresses collections by ID, so resolve the name first.
    $found = $this->getCollection($name);
    if (!isset($found['id'])) {
        throw new \Exception("Collection ID not found for '{$name}'");
    }

    return $this->makeRequest(
        sprintf('/tenants/%s/databases/%s/collections/%s', $this->tenant, $this->database, $found['id']),
        'DELETE'
    );
}
251
/**
 * Fetch a single document from a collection by its ID
 *
 * Resolves the collection name to an ID through getCollection() and posts
 * the document ID together with the include list to the collection's "get"
 * endpoint. An empty collection name is replaced by 'documents'.
 *
 * @param string $collectionName The name of the collection to get the document from
 * @param string $documentId The document ID to retrieve
 * @param array $include What to include in the response (default: ["metadatas", "documents"])
 * @return array The response from the API
 * @throws \Exception If the collection is not found or its entry has no ID
 */
public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
    // An empty name falls back to the 'documents' collection.
    $collectionName = empty($collectionName) ? 'documents' : $collectionName;

    // The API addresses collections by ID, so resolve the name first.
    $found = $this->getCollection($collectionName);
    if (!isset($found['id'])) {
        throw new \Exception("Collection ID not found for '{$collectionName}'");
    }

    $path = sprintf(
        '/tenants/%s/databases/%s/collections/%s/get',
        $this->tenant,
        $this->database,
        $found['id']
    );

    return $this->makeRequest($path, 'POST', [
        'ids' => [$documentId],
        'include' => $include
    ]);
}
284
/**
 * Send documents to a collection
 *
 * Resolves the collection name to an ID through getCollection() and posts
 * the IDs and document contents to the collection's "upsert" endpoint.
 * Metadata and pre-computed embeddings are included only when non-empty
 * values are given. An empty collection name is replaced by 'documents'.
 *
 * @param string $collectionName The name of the collection to add documents to
 * @param array $documents The document contents
 * @param array $ids The document IDs
 * @param array|null $metadatas Optional metadata for each document
 * @param array|null $embeddings Optional pre-computed embeddings for each document
 * @return array The response from the API
 * @throws \Exception If the collection is not found or its entry has no ID
 */
public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
    // An empty name falls back to the 'documents' collection.
    $collectionName = empty($collectionName) ? 'documents' : $collectionName;

    // The API addresses collections by ID, so resolve the name first.
    $found = $this->getCollection($collectionName);
    if (!isset($found['id'])) {
        throw new \Exception("Collection ID not found for '{$collectionName}'");
    }

    $payload = [
        'ids' => $ids,
        'documents' => $documents
    ];

    // Optional fields are attached only when a non-empty value was supplied.
    foreach (['metadatas' => $metadatas, 'embeddings' => $embeddings] as $field => $value) {
        if ($value) {
            $payload[$field] = $value;
        }
    }

    $path = sprintf(
        '/tenants/%s/databases/%s/collections/%s/upsert',
        $this->tenant,
        $this->database,
        $found['id']
    );

    return $this->makeRequest($path, 'POST', $payload);
}
328
/**
 * Decide whether a document has to be (re)processed
 *
 * Requests the metadata of the chunk IDs "<documentId>@1", "@2" and "@3"
 * (limit 1) from the collection and compares the file modification time
 * with the "processed_at" value of the first returned metadata entry.
 * Several chunk numbers are requested because the first chunks might be
 * titles, which are not stored.
 *
 * Any \Exception raised while checking is caught and treated as
 * "needs update".
 *
 * @param string $collectionId The ID of the collection to check documents in
 * @param string $documentId The base document ID to check (without chunk suffixes)
 * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
 * @return bool True if no chunk is found, "processed_at" is missing, the file is newer,
 *              or the check itself failed; false otherwise
 */
public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
    try {
        $path = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";

        // Candidate chunk IDs: <documentId>@1, <documentId>@2, <documentId>@3
        $candidateIds = array_map(
            function ($chunkNumber) use ($documentId) {
                return $documentId . '@' . $chunkNumber;
            },
            [1, 2, 3]
        );

        $found = $this->makeRequest($path, 'POST', [
            'ids' => $candidateIds,
            'include' => [
                "metadatas"
            ],
            'limit' => 1
        ]);

        // Nothing stored yet: the document has to be added.
        if (empty($found['ids'])) {
            return true;
        }

        // Chunks exist but no metadata came back: treated as up to date.
        if (empty($found['metadatas']) || !is_array($found['metadatas'])) {
            return false;
        }

        // Only the first metadata entry is inspected.
        $firstMetadata = $found['metadatas'][0];
        if (!isset($firstMetadata['processed_at'])) {
            return true;
        }

        // Outdated when the file changed after it was last processed.
        $processedTimestamp = strtotime($firstMetadata['processed_at']);
        if ($fileModifiedTime > $processedTimestamp) {
            return true;
        }

        return false;
    } catch (\Exception $e) {
        // A failed check is answered with "needs update".
        return true;
    }
}
396
/**
 * Run a similarity query against a collection
 *
 * Resolves the collection name to an ID through getCollection(), generates
 * one embedding per query text with generateEmbeddings(), and posts the
 * embeddings to the collection's "query" endpoint. A non-empty array in
 * $where is passed along as the metadata filter. An empty collection name
 * is replaced by 'documents'.
 *
 * @param string $collectionName The name of the collection to query
 * @param array $queryTexts The query texts to search for
 * @param int $nResults The number of results to return (default: 5)
 * @param array|null $where Optional filter conditions for metadata
 * @return array The response from the API
 * @throws \Exception If the collection is not found or its entry has no ID
 */
public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
    // An empty name falls back to the 'documents' collection.
    $collectionName = empty($collectionName) ? 'documents' : $collectionName;

    // The API addresses collections by ID, so resolve the name first.
    $found = $this->getCollection($collectionName);
    if (!isset($found['id'])) {
        throw new \Exception("Collection ID not found for '{$collectionName}'");
    }

    $path = sprintf(
        '/tenants/%s/databases/%s/collections/%s/query',
        $this->tenant,
        $this->database,
        $found['id']
    );

    // One embedding vector per query text, in input order.
    $vectors = [];
    foreach ($queryTexts as $queryText) {
        $vectors[] = $this->generateEmbeddings($queryText);
    }

    $payload = [
        'query_embeddings' => $vectors,
        'n_results' => $nResults
    ];

    // The metadata filter is only sent when it is a non-empty array.
    if (is_array($where) && $where) {
        $payload['where'] = $where;
    }

    return $this->makeRequest($path, 'POST', $payload);
}
444
/**
 * Call the heartbeat endpoint of the ChromaDB server
 *
 * Sends a GET request to "/heartbeat" below the API base path.
 *
 * @return array The response from the heartbeat endpoint
 */
public function heartbeat() {
    return $this->makeRequest('/heartbeat', 'GET');
}
456
/**
 * Get authentication and identity information
 *
 * Sends a GET request to the auth/identity endpoint below the API base path
 * and returns the decoded response.
 *
 * @return array The response from the auth/identity endpoint
 */
public function getIdentity() {
    // The identity route is nested under "auth" (/api/v2/auth/identity).
    $endpoint = "/auth/identity";
    return $this->makeRequest($endpoint, 'GET');
}
468
/**
 * Make sure the configured tenant and database are present
 *
 * First requests the tenant and creates it when that request raises an
 * exception; then does the same for the database inside that tenant.
 * Exceptions raised by the create calls are not caught here.
 *
 * @return void
 */
private function ensureTenantAndDatabase() {
    // Step 1: tenant. Any exception from the lookup triggers creation.
    try {
        $this->getTenant($this->tenant);
    } catch (\Exception $tenantLookupError) {
        $this->createTenant($this->tenant);
    }

    // Step 2: database. Any exception from the lookup triggers creation.
    try {
        $this->getDatabase($this->database, $this->tenant);
    } catch (\Exception $databaseLookupError) {
        $this->createDatabase($this->database, $this->tenant);
    }
}
493
/**
 * Fetch a tenant by name
 *
 * Sends a GET request to "/tenants/<tenantName>".
 *
 * @param string $tenantName The tenant name
 * @return array The response from the API
 */
public function getTenant($tenantName) {
    return $this->makeRequest(sprintf('/tenants/%s', $tenantName), 'GET');
}
506
/**
 * Create a tenant
 *
 * Posts the tenant name to the "/tenants" endpoint.
 *
 * @param string $tenantName The tenant name
 * @return array The response from the API
 */
public function createTenant($tenantName) {
    return $this->makeRequest('/tenants', 'POST', ['name' => $tenantName]);
}
520
/**
 * Fetch a database of a tenant by name
 *
 * Sends a GET request to "/tenants/<tenantName>/databases/<databaseName>".
 *
 * @param string $databaseName The database name
 * @param string $tenantName The tenant name
 * @return array The response from the API
 */
public function getDatabase($databaseName, $tenantName) {
    return $this->makeRequest(
        sprintf('/tenants/%s/databases/%s', $tenantName, $databaseName),
        'GET'
    );
}
534
/**
 * Create a database inside a tenant
 *
 * Posts the database name to "/tenants/<tenantName>/databases".
 *
 * @param string $databaseName The database name
 * @param string $tenantName The tenant name
 * @return array The response from the API
 */
public function createDatabase($databaseName, $tenantName) {
    return $this->makeRequest(
        sprintf('/tenants/%s/databases', $tenantName),
        'POST',
        ['name' => $databaseName]
    );
}
549
/**
 * Create a collection unless it can already be looked up
 *
 * Tries getCollection(); when that raises an exception the collection is
 * created with createCollection(). Exceptions raised by the create call
 * are not caught here.
 *
 * @param string $collectionName The name of the collection to check/create
 * @return string Status message indicating what happened
 */
public function ensureCollectionExists($collectionName) {
    try {
        $this->getCollection($collectionName);
    } catch (\Exception $lookupError) {
        // Lookup failed: treat the collection as missing and create it.
        $this->createCollection($collectionName);
        return "Collection '$collectionName' created.";
    }

    return "Collection '$collectionName' already exists.";
}
568
569    /**
570     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
571     *
572     * This function handles the complete processing of a single DokuWiki file:
573     * 1. Parses the file path to extract metadata and document ID
574     * 2. Determines the appropriate collection based on document ID
575     * 3. Checks if the document needs updating using timestamp comparison
576     * 4. Reads and processes file content only if update is needed
577     * 5. Splits the document into chunks (paragraphs)
578     * 6. Extracts rich metadata from the DokuWiki ID format
579     * 7. Generates embeddings for each chunk
580     * 8. Sends all chunks to ChromaDB with metadata
581     *
582     * Supported ID formats:
583     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
584     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
585     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
586     *
587     * The function implements smart update checking by comparing file modification time
588     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
589     *
590     * @param string $filePath The path to the file to process
591     * @param string $collectionName The name of the collection to use
592     * @param bool $collectionChecked Whether the collection has already been checked/created
593     * @return array Result with status and details
594     */
595    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
596        // Parse file path to extract metadata
597        $id = parseFilePath($filePath);
598
599        try {
600            // Create collection if it doesn't exist (only if not already checked)
601            $collectionStatus = '';
602            if (!$collectionChecked) {
603                $collectionStatus = $this->ensureCollectionExists($collectionName);
604            }
605
606            // Get collection ID
607            $collection = $this->getCollection($collectionName);
608            if (!isset($collection['id'])) {
609                return [
610                    'status' => 'error',
611                    'message' => "Collection ID not found for '{$collectionName}'"
612                ];
613            }
614            $collectionId = $collection['id'];
615
616            // Get file modification time
617            $fileModifiedTime = filemtime($filePath);
618
619            // Check if document needs update
620            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
621
622            // If document is up to date, skip processing
623            if (!$needsUpdate) {
624                return [
625                    'status' => 'skipped',
626                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
627                ];
628            }
629
630            // Read file content
631            $content = file_get_contents($filePath);
632
633            // Split document into chunks (paragraphs separated by two newlines)
634            $paragraphs = preg_split('/\n\s*\n/', $content);
635            $chunks = [];
636            $chunkMetadata = [];
637
638            // Parse the DokuWiki ID to extract base metadata
639            $parts = explode(':', $id);
640
641            // Extract metadata from the last part of the ID
642            $lastPart = end($parts);
643            $baseMetadata = [];
644
645            // Add the document ID as metadata
646            $baseMetadata['document_id'] = $id;
647
648            // Add current timestamp
649            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
650
651            // Check if any part of the ID is 'templates' and set template metadata
652            $isTemplate = in_array('templates', $parts);
653            if ($isTemplate) {
654                $baseMetadata['type'] = 'template';
655            } else {
656                $baseMetadata['type'] = 'report';
657            }
658
659            // Extract modality from the second part
660            if (isset($parts[1])) {
661                $baseMetadata['modality'] = $parts[1];
662            }
663
664            // Handle different ID formats based on the third part: word (institution) or numeric (year)
665            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
666            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
667            // For templates, don't set institution, date or year
668            if (isset($parts[2]) && !$isTemplate) {
669                // Check if third part is numeric (year) or word (institution)
670                if (is_numeric($parts[2])) {
671                    // Format: reports:mri:2024:g287-name-surname (year format)
672                    // Extract year from the third part
673                    $baseMetadata['year'] = $parts[2];
674
675                    // Set default institution from config
676                    global $conf;
677                    $baseMetadata['institution'] = isset($conf['plugin']['dokullm']['default_institution']) ? $conf['plugin']['dokullm']['default_institution'] : 'default';
678
679                    // Extract registration and name from the last part
680                    // Registration should start with one letter or number and contain numbers before the '-' character
681                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
682                        // Check if the first part contains at least one digit to be considered a registration
683                        if (preg_match('/[0-9]/', $matches[1])) {
684                            $baseMetadata['registration'] = $matches[1];
685                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
686                        } else {
687                            // If no registration pattern found, treat entire part as patient name
688                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
689                        }
690                    } else {
691                        // If no match, treat entire part as patient name
692                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
693                    }
694                } else {
695                    // Format: reports:mri:institution:250620-name-surname (institution format)
696                    // Extract institution from the third part
697                    $baseMetadata['institution'] = $parts[2];
698
699                    // Extract date and name from the last part
700                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
701                        $dateStr = $matches[1];
702                        $name = $matches[2];
703
704                        // Convert date format (250620 -> 2025-06-20)
705                        $day = substr($dateStr, 0, 2);
706                        $month = substr($dateStr, 2, 2);
707                        $year = substr($dateStr, 4, 2);
708                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
709                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
710                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
711
712                        $baseMetadata['date'] = $formattedDate;
713                        $baseMetadata['name'] = str_replace('-', ' ', $name);
714                    }
715                }
716            }
717
718            // For templates, always extract name from the last part
719            if ($isTemplate && isset($lastPart)) {
720                // Extract name from the last part (everything after the last colon)
721                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
722                    // Check if the first part contains at least one digit to be considered a registration
723                    if (preg_match('/[0-9]/', $matches[1])) {
724                        $baseMetadata['registration'] = $matches[1];
725                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
726                    } else {
727                        // If no registration pattern found, treat entire part as template name
728                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
729                    }
730                } else {
731                    // If no match, treat entire part as template name
732                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
733                }
734            }
735
736            // Process each paragraph as a chunk with intelligent metadata handling
737            $chunkIds = [];
738            $chunkContents = [];
739            $chunkMetadatas = [];
740            $chunkEmbeddings = [];
741            $currentTags = [];
742
743            foreach ($paragraphs as $index => $paragraph) {
744                // Skip empty paragraphs to avoid processing whitespace-only content
745                $paragraph = trim($paragraph);
746                if (empty($paragraph)) {
747                    continue;
748                }
749
750                // Check if this is a DokuWiki title (starts and ends with =)
751                // Titles are converted to tags for better searchability but not stored as content chunks
752                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
753                    // Extract title content and clean it
754                    $titleContent = trim($matches[1]);
755
756                    // Split into words and create searchable tags
757                    $words = preg_split('/\s+/', $titleContent);
758                    $tags = [];
759
760                    foreach ($words as $word) {
                        // Only keep words of at least 3 characters to reduce noise
762                        if (strlen($word) >= 3) {
763                            $tags[] = strtolower($word);
764                        }
765                    }
766
767                    // Remove duplicate tags and store for use in subsequent chunks
768                    $currentTags = array_unique($tags);
769                    continue; // Skip storing title chunks as content
770                }
771
772                // Create chunk ID
773                $chunkId = $id . '@' . ($index + 1);
774
775                // Generate embeddings for the chunk
776                $embeddings = $this->generateEmbeddings($paragraph);
777
778                // Add chunk-specific metadata
779                $metadata = $baseMetadata;
780                $metadata['chunk_id'] = $chunkId;
781                $metadata['chunk_number'] = $index + 1;
782                $metadata['total_chunks'] = count($paragraphs);
783
784                // Add current tags to metadata if any exist
785                if (!empty($currentTags)) {
786                    $metadata['tags'] = implode(',', $currentTags);
787                }
788
789                // Store chunk data
790                $chunkIds[] = $chunkId;
791                $chunkContents[] = $paragraph;
792                $chunkMetadatas[] = $metadata;
793                $chunkEmbeddings[] = $embeddings;
794            }
795
796            // If no chunks were created, skip this file
797            if (empty($chunkIds)) {
798                return [
799                    'status' => 'skipped',
800                    'message' => "No valid chunks found in file '$id'. Skipping..."
801                ];
802            }
803
804            // Send all chunks to ChromaDB
805            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
806
807            return [
808                'status' => 'success',
809                'message' => "Successfully sent file to ChromaDB",
810                'details' => [
811                    'document_id' => $id,
812                    'chunks' => count($chunkIds),
813                    'collection' => $collectionName
814                ],
815                'collection_status' => $collectionStatus
816            ];
817        } catch (\Exception $e) {
818            return [
819                'status' => 'error',
820                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
821            ];
822        }
823    }
824
825    /**
826     * Process all DokuWiki files in a directory and send them to ChromaDB
827     *
828     * This function recursively processes all .txt files in a directory and its subdirectories.
829     * It first checks if the appropriate collection exists and creates it if needed.
830     * Then it processes each file individually.
831     *
832     * @param string $dirPath The directory path to process
833     * @return array Result with status and details
834     */
835    public function processDirectory($dirPath) {
836        // Check if directory exists
837        if (!is_dir($dirPath)) {
838            return [
839                'status' => 'error',
840                'message' => "Directory does not exist: $dirPath"
841            ];
842        }
843
844        // Create RecursiveIteratorIterator to process directories recursively
845        $iterator = new RecursiveIteratorIterator(
846            new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS),
847            RecursiveIteratorIterator::LEAVES_ONLY
848        );
849
850        $files = [];
851        foreach ($iterator as $file) {
852            // Process only .txt files that don't start with underscore
853            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
854                $files[] = $file->getPathname();
855            }
856        }
857
858        if (empty($files)) {
859            return [
860                'status' => 'skipped',
861                'message' => "No .txt files found in directory: $dirPath"
862            ];
863        }
864
865        // Use the first part of the document ID as collection name, fallback to 'documents'
866        $sampleFile = $files[0];
867        $id = parseFilePath($sampleFile);
868        $idParts = explode(':', $id);
869        $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents';
870
871        try {
872            $this->ensureCollectionExists($collectionName);
873            $collectionChecked = true;
874        } catch (Exception $e) {
875            $collectionChecked = true;
876        }
877
878        $results = [];
879        foreach ($files as $file) {
880            $result = $this->processSingleFile($file, $collectionName, $collectionChecked);
881            $results[] = [
882                'file' => $file,
883                'result' => $result
884            ];
885        }
886
887        return [
888            'status' => 'success',
889            'message' => "Finished processing directory.",
890            'files_count' => count($files),
891            'results' => $results
892        ];
893    }
894}
895
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the base path (DOKU_INC . 'data/pages/' when DOKU_INC is defined,
 *    otherwise a common default installation path)
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons, ignoring empty path segments
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Normalise native separators so the '/'-based handling below also works
    // on platforms where paths are reported with a different separator
    if (DIRECTORY_SEPARATOR !== '/') {
        $filePath = str_replace(DIRECTORY_SEPARATOR, '/', $filePath);
    }

    // Remove the base path
    $relativePath = str_replace($pagesDir, '', $filePath);

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Keep every non-empty segment. The comparison is explicit because a bare
    // array_filter()/empty() would also discard the legitimate segment "0".
    $idParts = array_filter(explode('/', $relativePath), static function ($part) {
        return $part !== '';
    });

    return implode(':', $idParts);
}
938
939