xref: /plugin/dokullm/ChromaDBClient.php (revision a223cb917daa460951c9e9d8e2bd4803072d62b8)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5class ChromaDBClient {
6    private $baseUrl;
7    private $client;
8    private $ollamaClient;
9    private $tenant;
10    private $database;
11    private $ollamaHost;
12    private $ollamaPort;
13    private $ollamaModel;
14
15    /**
16     * Get configuration value for the dokullm plugin
17     *
18     * @param string $key Configuration key
19     * @param mixed $default Default value if key not found
20     * @return mixed Configuration value
21     */
22    /**
23     * Initialize the ChromaDB client
24     *
25     * Creates a new ChromaDB client instance with the specified connection parameters.
26     * Also ensures that the specified tenant and database exist.
27     *
28     * @param string $host ChromaDB server host
29     * @param int $port ChromaDB server port
30     * @param string $tenant ChromaDB tenant name
31     * @param string $database ChromaDB database name
32     * @param string $defaultCollection Default collection name
33     * @param string $ollamaHost Ollama server host
34     * @param int $ollamaPort Ollama server port
35     * @param string $ollamaModel Ollama embeddings model
36     */
37    public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
38        // Use provided parameters (no fallback since they're mandatory)
39        $chromaHost = $host;
40        $chromaPort = $port;
41        $this->tenant = $tenant;
42        $this->database = $database;
43        $this->ollamaHost = $ollamaHost;
44        $this->ollamaPort = $ollamaPort;
45        $this->ollamaModel = $ollamaModel;
46
47        $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";
48        $this->client = curl_init();
49        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
50        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
51            'Content-Type: application/json',
52            'Accept: application/json'
53        ]);
54
55        // Initialize Ollama client
56        $this->ollamaClient = curl_init();
57        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
58        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
59            'Content-Type: application/json'
60        ]);
61
62        // Check if tenant and database exist, create them if they don't
63        $this->ensureTenantAndDatabase();
64    }
65
66    /**
67     * Clean up the cURL client when the object is destroyed
68     *
69     * @return void
70     */
71    public function __destruct() {
72        curl_close($this->client);
73        curl_close($this->ollamaClient);
74    }
75
76    /**
77     * Make an HTTP request to the ChromaDB API
78     *
79     * This is a helper function that handles making HTTP requests to the ChromaDB API,
80     * including setting the appropriate headers for tenant and database.
81     *
82     * @param string $endpoint The API endpoint to call
83     * @param string $method The HTTP method to use (default: 'GET')
84     * @param array|null $data The data to send with the request (default: null)
85     * @return array The JSON response decoded as an array
86     * @throws Exception If there's a cURL error or HTTP error
87     */
88    private function makeRequest($endpoint, $method = 'GET', $data = null) {
89        // Add tenant and database as headers instead of query parameters for v2 API
90        $headers = [
91            'Content-Type: application/json',
92            'Accept: application/json'
93        ];
94
95        $url = $this->baseUrl . '/api/v2' . $endpoint;
96
97        curl_setopt($this->client, CURLOPT_URL, $url);
98        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
99        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
100
101        if ($data) {
102            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
103        } else {
104            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
105        }
106
107        $response = curl_exec($this->client);
108        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
109
110        if (curl_error($this->client)) {
111            throw new \Exception('Curl error: ' . curl_error($this->client));
112        }
113
114        if ($httpCode >= 400) {
115            throw new \Exception("HTTP Error: $httpCode, Response: $response");
116        }
117
118        return json_decode($response, true);
119    }
120
121    /**
122     * Generate embeddings for text using Ollama
123     *
124     * @param string $text The text to generate embeddings for
125     * @return array The embeddings vector
126     */
127    public function generateEmbeddings($text) {
128        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
129
130        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
131
132        $data = [
133            'model' => $this->ollamaModel,
134            'prompt' => $text,
135            'keep_alive' => '30m'
136        ];
137
138        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
139
140        $response = curl_exec($this->ollamaClient);
141        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
142
143        if (curl_error($this->ollamaClient)) {
144            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
145        }
146
147        if ($httpCode >= 400) {
148            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
149        }
150
151        $result = json_decode($response, true);
152
153        if (!isset($result['embedding'])) {
154            throw new \Exception("Ollama response missing embedding: " . $response);
155        }
156
157        return $result['embedding'];
158    }
159
160    /**
161     * List all collections in the database
162     *
163     * Retrieves a list of all collections in the specified tenant and database.
164     *
165     * @return array List of collections
166     */
167    public function listCollections() {
168        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
169        return $this->makeRequest($endpoint);
170    }
171
172    /**
173     * Get a collection by name
174     *
175     * Retrieves information about a specific collection by its name.
176     *
177     * @param string $name The name of the collection to retrieve
178     * @return array The collection information
179     * @throws Exception If the collection is not found
180     */
181    public function getCollection($name) {
182        // Use provided name, fallback to 'documents' if empty
183        if (empty($name)) {
184            $name = 'documents';
185        }
186
187        // First try to get collection by name
188        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
189        $collections = $this->makeRequest($endpoint);
190
191        // Find collection by name
192        foreach ($collections as $collection) {
193            if (isset($collection['name']) && $collection['name'] === $name) {
194                return $collection;
195            }
196        }
197
198        // If not found, throw exception
199        throw new \Exception("Collection '{$name}' not found");
200    }
201
202    /**
203     * Create a new collection
204     *
205     * Creates a new collection with the specified name and optional metadata.
206     *
207     * @param string $name The name of the collection to create
208     * @param array|null $metadata Optional metadata for the collection
209     * @return array The response from the API
210     */
211    public function createCollection($name, $metadata = null) {
212        // Use provided name, fallback to 'documents' if empty
213        if (empty($name)) {
214            $name = 'documents';
215        }
216
217        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
218        $data = ['name' => $name];
219        if ($metadata) {
220            $data['metadata'] = $metadata;
221        }
222        return $this->makeRequest($endpoint, 'POST', $data);
223    }
224
225    /**
226     * Delete a collection by name
227     *
228     * Deletes a collection with the specified name.
229     *
230     * @param string $name The name of the collection to delete
231     * @return array The response from the API
232     * @throws Exception If the collection ID is not found
233     */
234    public function deleteCollection($name) {
235        // Use provided name, fallback to 'documents' if empty
236        if (empty($name)) {
237            $name = 'documents';
238        }
239
240        // First get the collection to find its ID
241        $collection = $this->getCollection($name);
242        if (!isset($collection['id'])) {
243            throw new \Exception("Collection ID not found for '{$name}'");
244        }
245
246        $collectionId = $collection['id'];
247        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
248        return $this->makeRequest($endpoint, 'DELETE');
249    }
250
251    /**
252     * Get a document by its ID from a collection
253     *
254     * Retrieves a document from the specified collection using its ID.
255     *
256     * @param string $collectionName The name of the collection to get the document from
257     * @param string $documentId The document ID to retrieve
258     * @param array $include What to include in the response (default: ["metadatas", "documents"])
259     * @return array The retrieved document
260     * @throws Exception If the collection ID is not found
261     */
262    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
263        // Use provided name, fallback to 'documents' if empty
264        if (empty($collectionName)) {
265            $collectionName = 'documents';
266        }
267
268        // First get the collection to find its ID
269        $collection = $this->getCollection($collectionName);
270        if (!isset($collection['id'])) {
271            throw new \Exception("Collection ID not found for '{$collectionName}'");
272        }
273
274        $collectionId = $collection['id'];
275        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
276        $data = [
277            'ids' => [$documentId],
278            'include' => $include
279        ];
280
281        return $this->makeRequest($endpoint, 'POST', $data);
282    }
283
284    /**
285     * Add documents to a collection
286     *
287     * Adds documents to the specified collection. Each document must have a corresponding ID.
288     * Optional metadata and pre-computed embeddings can also be provided.
289     *
290     * @param string $collectionName The name of the collection to add documents to
291     * @param array $documents The document contents
292     * @param array $ids The document IDs
293     * @param array|null $metadatas Optional metadata for each document
294     * @param array|null $embeddings Optional pre-computed embeddings for each document
295     * @return array The response from the API
296     * @throws Exception If the collection ID is not found
297     */
298    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
299        // Use provided name, fallback to 'documents' if empty
300        if (empty($collectionName)) {
301            $collectionName = 'documents';
302        }
303
304        // First get the collection to find its ID
305        $collection = $this->getCollection($collectionName);
306        if (!isset($collection['id'])) {
307            throw new \Exception("Collection ID not found for '{$collectionName}'");
308        }
309
310        $collectionId = $collection['id'];
311        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
312        $data = [
313            'ids' => $ids,
314            'documents' => $documents
315        ];
316
317        if ($metadatas) {
318            $data['metadatas'] = $metadatas;
319        }
320
321        if ($embeddings) {
322            $data['embeddings'] = $embeddings;
323        }
324
325        return $this->makeRequest($endpoint, 'POST', $data);
326    }
327
328    /**
329     * Check if a document needs to be updated based on timestamp comparison
330     *
331     * Determines whether a document should be reprocessed by comparing the file's last modification
332     * time with the processed_at timestamp stored in the document's metadata. The function checks
333     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
334     * not included in the database.
335     *
336     * @param string $collectionId The ID of the collection to check documents in
337     * @param string $documentId The base document ID to check (without chunk suffixes)
338     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
339     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
340     * @throws Exception If there's an error checking the document
341     */
342    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
343        try {
344            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
345
346            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
347            $chunkIdsToCheck = [
348                $documentId . '@1',
349                $documentId . '@2',
350                $documentId . '@3'
351            ];
352
353            $data = [
354                'ids' => $chunkIdsToCheck,
355                'include' => [
356                    "metadatas"
357                ],
358                'limit' => 1
359            ];
360
361            // Check if document exists
362            $result = $this->makeRequest($endpoint, 'POST', $data);
363
364            // If no documents found, return true (needs to be added)
365            if (empty($result['ids'])) {
366                return true;
367            }
368
369            // Check if any document has a processed_at timestamp
370            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
371                // Check the first metadata entry directly
372                $metadata = $result['metadatas'][0];
373
374                // If processed_at is not set, return true (needs update)
375                if (!isset($metadata['processed_at'])) {
376                    return true;
377                }
378
379                // Parse the processed_at timestamp
380                $processedTimestamp = strtotime($metadata['processed_at']);
381
382                // If file is newer than processed time, return true (needs update)
383                if ($fileModifiedTime > $processedTimestamp) {
384                    return true;
385                }
386            }
387
388            // Document exists and is up to date
389            return false;
390        } catch (\Exception $e) {
391            // If there's an error checking the document, assume it needs to be updated
392            return true;
393        }
394    }
395
396    /**
397     * Query a collection for similar documents
398     *
399     * Queries the specified collection for documents similar to the provided query texts.
400     * The function generates embeddings for the query texts and sends them to ChromaDB.
401     * Supports filtering results by metadata using the where parameter.
402     *
403     * @param string $collectionName The name of the collection to query
404     * @param array $queryTexts The query texts to search for
405     * @param int $nResults The number of results to return (default: 5)
406     * @param array|null $where Optional filter conditions for metadata
407     * @return array The query results
408     * @throws Exception If the collection ID is not found
409     */
410    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
411        // Use provided name, fallback to 'documents' if empty
412        if (empty($collectionName)) {
413            $collectionName = 'documents';
414        }
415
416        // First get the collection to find its ID
417        $collection = $this->getCollection($collectionName);
418        if (!isset($collection['id'])) {
419            throw new \Exception("Collection ID not found for '{$collectionName}'");
420        }
421
422        $collectionId = $collection['id'];
423        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
424
425        // Generate embeddings for query texts
426        $queryEmbeddings = [];
427        foreach ($queryTexts as $text) {
428            $queryEmbeddings[] = $this->generateEmbeddings($text);
429        }
430
431        $data = [
432            'query_embeddings' => $queryEmbeddings,
433            'n_results' => $nResults
434        ];
435
436        // Add where clause for metadata filtering if provided
437        if ($where && is_array($where)) {
438            $data['where'] = $where;
439        }
440
441        return $this->makeRequest($endpoint, 'POST', $data);
442    }
443
444    /**
445     * Check if the ChromaDB server is alive
446     *
447     * Sends a heartbeat request to verify that the ChromaDB server is running.
448     *
449     * @return array The response from the heartbeat endpoint
450     */
451    public function heartbeat() {
452        $endpoint = "/heartbeat";
453        return $this->makeRequest($endpoint, 'GET');
454    }
455
456    /**
457     * Get authentication and identity information
458     *
459     * Retrieves authentication and identity information from the ChromaDB server.
460     *
461     * @return array The response from the auth/identity endpoint
462     */
463    public function getIdentity() {
464        $endpoint = "/identity";
465        return $this->makeRequest($endpoint, 'GET');
466    }
467
468    /**
469     * Ensure that the specified tenant and database exist
470     *
471     * Checks if the specified tenant and database exist, and creates them if they don't.
472     *
473     * @return void
474     */
475    private function ensureTenantAndDatabase() {
476        // Check if tenant exists, create if it doesn't
477        try {
478            $this->getTenant($this->tenant);
479        } catch (\Exception $e) {
480            // Tenant doesn't exist, create it
481            $this->createTenant($this->tenant);
482        }
483
484        // Check if database exists, create if it doesn't
485        try {
486            $this->getDatabase($this->database, $this->tenant);
487        } catch (\Exception $e) {
488            // Database doesn't exist, create it
489            $this->createDatabase($this->database, $this->tenant);
490        }
491    }
492
493    /**
494     * Get tenant information
495     *
496     * Retrieves information about the specified tenant.
497     *
498     * @param string $tenantName The tenant name
499     * @return array The tenant information
500     */
501    public function getTenant($tenantName) {
502        $endpoint = "/tenants/{$tenantName}";
503        return $this->makeRequest($endpoint, 'GET');
504    }
505
506    /**
507     * Create a new tenant
508     *
509     * Creates a new tenant with the specified name.
510     *
511     * @param string $tenantName The tenant name
512     * @return array The response from the API
513     */
514    public function createTenant($tenantName) {
515        $endpoint = "/tenants";
516        $data = ['name' => $tenantName];
517        return $this->makeRequest($endpoint, 'POST', $data);
518    }
519
520    /**
521     * Get database information
522     *
523     * Retrieves information about the specified database within a tenant.
524     *
525     * @param string $databaseName The database name
526     * @param string $tenantName The tenant name
527     * @return array The database information
528     */
529    public function getDatabase($databaseName, $tenantName) {
530        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
531        return $this->makeRequest($endpoint, 'GET');
532    }
533
534    /**
535     * Create a new database
536     *
537     * Creates a new database with the specified name within a tenant.
538     *
539     * @param string $databaseName The database name
540     * @param string $tenantName The tenant name
541     * @return array The response from the API
542     */
543    public function createDatabase($databaseName, $tenantName) {
544        $endpoint = "/tenants/{$tenantName}/databases";
545        $data = ['name' => $databaseName];
546        return $this->makeRequest($endpoint, 'POST', $data);
547    }
548
549    /**
550     * Ensure a collection exists, creating it if necessary
551     *
552     * This helper function checks if a collection exists and creates it if it doesn't.
553     *
554     * @param string $collectionName The name of the collection to check/create
555     * @return string Status message indicating what happened
556     */
557    public function ensureCollectionExists($collectionName) {
558        try {
559            $collection = $this->getCollection($collectionName);
560            return "Collection '$collectionName' already exists.";
561        } catch (\Exception $e) {
562            // Collection doesn't exist, create it
563            $created = $this->createCollection($collectionName);
564            return "Collection '$collectionName' created.";
565        }
566    }
567
568    /**
569     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
570     *
571     * This function handles the complete processing of a single DokuWiki file:
572     * 1. Parses the file path to extract metadata and document ID
573     * 2. Determines the appropriate collection based on document ID
574     * 3. Checks if the document needs updating using timestamp comparison
575     * 4. Reads and processes file content only if update is needed
576     * 5. Splits the document into chunks (paragraphs)
577     * 6. Extracts rich metadata from the DokuWiki ID format
578     * 7. Generates embeddings for each chunk
579     * 8. Sends all chunks to ChromaDB with metadata
580     *
581     * Supported ID formats:
582     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
583     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
584     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
585     *
586     * The function implements smart update checking by comparing file modification time
587     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
588     *
589     * @param string $filePath The path to the file to process
590     * @param string $collectionName The name of the collection to use
591     * @param bool $collectionChecked Whether the collection has already been checked/created
592     * @return array Result with status and details
593     */
594    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
595        // Parse file path to extract metadata
596        $id = parseFilePath($filePath);
597
598        try {
599            // Create collection if it doesn't exist (only if not already checked)
600            $collectionStatus = '';
601            if (!$collectionChecked) {
602                $collectionStatus = $this->ensureCollectionExists($collectionName);
603            }
604
605            // Get collection ID
606            $collection = $this->getCollection($collectionName);
607            if (!isset($collection['id'])) {
608                return [
609                    'status' => 'error',
610                    'message' => "Collection ID not found for '{$collectionName}'"
611                ];
612            }
613            $collectionId = $collection['id'];
614
615            // Get file modification time
616            $fileModifiedTime = filemtime($filePath);
617
618            // Check if document needs update
619            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
620
621            // If document is up to date, skip processing
622            if (!$needsUpdate) {
623                return [
624                    'status' => 'skipped',
625                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
626                ];
627            }
628
629            // Read file content
630            $content = file_get_contents($filePath);
631
632            // Split document into chunks (paragraphs separated by two newlines)
633            $paragraphs = preg_split('/\n\s*\n/', $content);
634            $chunks = [];
635            $chunkMetadata = [];
636
637            // Parse the DokuWiki ID to extract base metadata
638            $parts = explode(':', $id);
639
640            // Extract metadata from the last part of the ID
641            $lastPart = end($parts);
642            $baseMetadata = [];
643
644            // Add the document ID as metadata
645            $baseMetadata['document_id'] = $id;
646
647            // Add current timestamp
648            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
649
650            // Check if any part of the ID is 'templates' and set template metadata
651            $isTemplate = in_array('templates', $parts);
652            if ($isTemplate) {
653                $baseMetadata['type'] = 'template';
654            } else {
655                $baseMetadata['type'] = 'report';
656            }
657
658            // Extract modality from the second part
659            if (isset($parts[1])) {
660                $baseMetadata['modality'] = $parts[1];
661            }
662
663            // Handle different ID formats based on the third part: word (institution) or numeric (year)
664            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
665            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
666            // For templates, don't set institution, date or year
667            if (isset($parts[2]) && !$isTemplate) {
668                // Check if third part is numeric (year) or word (institution)
669                if (is_numeric($parts[2])) {
670                    // Format: reports:mri:2024:g287-name-surname (year format)
671                    // Extract year from the third part
672                    $baseMetadata['year'] = $parts[2];
673
674                    // Set default institution from config
675                    global $conf;
676                    $baseMetadata['institution'] = isset($conf['plugin']['dokullm']['default_institution']) ? $conf['plugin']['dokullm']['default_institution'] : 'default';
677
678                    // Extract registration and name from the last part
679                    // Registration should start with one letter or number and contain numbers before the '-' character
680                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
681                        // Check if the first part contains at least one digit to be considered a registration
682                        if (preg_match('/[0-9]/', $matches[1])) {
683                            $baseMetadata['registration'] = $matches[1];
684                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
685                        } else {
686                            // If no registration pattern found, treat entire part as patient name
687                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
688                        }
689                    } else {
690                        // If no match, treat entire part as patient name
691                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
692                    }
693                } else {
694                    // Format: reports:mri:institution:250620-name-surname (institution format)
695                    // Extract institution from the third part
696                    $baseMetadata['institution'] = $parts[2];
697
698                    // Extract date and name from the last part
699                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
700                        $dateStr = $matches[1];
701                        $name = $matches[2];
702
703                        // Convert date format (250620 -> 2025-06-20)
704                        $day = substr($dateStr, 0, 2);
705                        $month = substr($dateStr, 2, 2);
706                        $year = substr($dateStr, 4, 2);
707                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
708                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
709                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
710
711                        $baseMetadata['date'] = $formattedDate;
712                        $baseMetadata['name'] = str_replace('-', ' ', $name);
713                    }
714                }
715            }
716
717            // For templates, always extract name from the last part
718            if ($isTemplate && isset($lastPart)) {
719                // Extract name from the last part (everything after the last colon)
720                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
721                    // Check if the first part contains at least one digit to be considered a registration
722                    if (preg_match('/[0-9]/', $matches[1])) {
723                        $baseMetadata['registration'] = $matches[1];
724                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
725                    } else {
726                        // If no registration pattern found, treat entire part as template name
727                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
728                    }
729                } else {
730                    // If no match, treat entire part as template name
731                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
732                }
733            }
734
735            // Process each paragraph as a chunk with intelligent metadata handling
736            $chunkIds = [];
737            $chunkContents = [];
738            $chunkMetadatas = [];
739            $chunkEmbeddings = [];
740            $currentTags = [];
741
742            foreach ($paragraphs as $index => $paragraph) {
743                // Skip empty paragraphs to avoid processing whitespace-only content
744                $paragraph = trim($paragraph);
745                if (empty($paragraph)) {
746                    continue;
747                }
748
749                // Check if this is a DokuWiki title (starts and ends with =)
750                // Titles are converted to tags for better searchability but not stored as content chunks
751                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
752                    // Extract title content and clean it
753                    $titleContent = trim($matches[1]);
754
755                    // Split into words and create searchable tags
756                    $words = preg_split('/\s+/', $titleContent);
757                    $tags = [];
758
759                    foreach ($words as $word) {
760                        // Only use words longer than 3 characters to reduce noise
761                        if (strlen($word) >= 3) {
762                            $tags[] = strtolower($word);
763                        }
764                    }
765
766                    // Remove duplicate tags and store for use in subsequent chunks
767                    $currentTags = array_unique($tags);
768                    continue; // Skip storing title chunks as content
769                }
770
771                // Create chunk ID
772                $chunkId = $id . '@' . ($index + 1);
773
774                // Generate embeddings for the chunk
775                $embeddings = $this->generateEmbeddings($paragraph);
776
777                // Add chunk-specific metadata
778                $metadata = $baseMetadata;
779                $metadata['chunk_id'] = $chunkId;
780                $metadata['chunk_number'] = $index + 1;
781                $metadata['total_chunks'] = count($paragraphs);
782
783                // Add current tags to metadata if any exist
784                if (!empty($currentTags)) {
785                    $metadata['tags'] = implode(',', $currentTags);
786                }
787
788                // Store chunk data
789                $chunkIds[] = $chunkId;
790                $chunkContents[] = $paragraph;
791                $chunkMetadatas[] = $metadata;
792                $chunkEmbeddings[] = $embeddings;
793            }
794
795            // If no chunks were created, skip this file
796            if (empty($chunkIds)) {
797                return [
798                    'status' => 'skipped',
799                    'message' => "No valid chunks found in file '$id'. Skipping..."
800                ];
801            }
802
803            // Send all chunks to ChromaDB
804            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
805
806            return [
807                'status' => 'success',
808                'message' => "Successfully sent file to ChromaDB",
809                'details' => [
810                    'document_id' => $id,
811                    'chunks' => count($chunkIds),
812                    'collection' => $collectionName
813                ],
814                'collection_status' => $collectionStatus
815            ];
816        } catch (\Exception $e) {
817            return [
818                'status' => 'error',
819                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
820            ];
821        }
822    }
823
824    /**
825     * Process all DokuWiki files in a directory and send them to ChromaDB
826     *
827     * This function recursively processes all .txt files in a directory and its subdirectories.
828     * It first checks if the appropriate collection exists and creates it if needed.
829     * Then it processes each file individually.
830     *
831     * @param string $dirPath The directory path to process
832     * @return array Result with status and details
833     */
834    public function processDirectory($dirPath) {
835        // Check if directory exists
836        if (!is_dir($dirPath)) {
837            return [
838                'status' => 'error',
839                'message' => "Directory does not exist: $dirPath"
840            ];
841        }
842
843        // Create RecursiveIteratorIterator to process directories recursively
844        $iterator = new RecursiveIteratorIterator(
845            new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS),
846            RecursiveIteratorIterator::LEAVES_ONLY
847        );
848
849        $files = [];
850        foreach ($iterator as $file) {
851            // Process only .txt files that don't start with underscore
852            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
853                $files[] = $file->getPathname();
854            }
855        }
856
857        if (empty($files)) {
858            return [
859                'status' => 'skipped',
860                'message' => "No .txt files found in directory: $dirPath"
861            ];
862        }
863
864        // Use the first part of the document ID as collection name, fallback to 'documents'
865        $sampleFile = $files[0];
866        $id = parseFilePath($sampleFile);
867        $idParts = explode(':', $id);
868        $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents';
869
870        try {
871            $this->ensureCollectionExists($collectionName);
872            $collectionChecked = true;
873        } catch (Exception $e) {
874            $collectionChecked = true;
875        }
876
877        $results = [];
878        foreach ($files as $file) {
879            $result = $this->processSingleFile($file, $collectionName, $collectionChecked);
880            $results[] = [
881                'file' => $file,
882                'result' => $result
883            ];
884        }
885
886        return [
887            'status' => 'success',
888            'message' => "Finished processing directory.",
889            'files_count' => count($files),
890            'results' => $results
891        ];
892    }
893}
894
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Normalising backslash directory separators to forward slashes
 * 2. Removing the pages directory when the path starts with it
 *    (DOKU_INC . 'data/pages/' if DOKU_INC is defined, otherwise the
 *    hard-coded fallback '/var/www/html/dokuwiki/data/pages/')
 * 3. Removing the .txt extension
 * 4. Joining the remaining non-empty path segments with colons
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * A path that does not start with the pages directory is converted as a whole
 * (every non-empty segment becomes an ID part).
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID (empty string for an empty path)
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Normalise separators so paths produced on Windows split correctly too
    $relativePath = str_replace('\\', '/', (string) $filePath);
    $pagesDir = str_replace('\\', '/', $pagesDir);

    // Strip the base path only when it is a real prefix; removing it from the
    // middle of a path would glue unrelated segments together.
    if (strpos($relativePath, $pagesDir) === 0) {
        $relativePath = (string) substr($relativePath, strlen($pagesDir));
    }

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Keep every non-empty segment. The comparison is against '' on purpose:
    // empty() / array_filter() would also discard a segment named "0".
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}
937
938