xref: /plugin/dokullm/ChromaDBClient.php (revision 15312b8e6a64365ef4eaab5a2ba7d5c4fc2e090b)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5class ChromaDBClient {
6    private $baseUrl;
7    private $client;
8    private $ollamaClient;
9    private $tenant;
10    private $database;
11    private $ollamaHost;
12    private $ollamaPort;
13    private $ollamaModel;
14
15    /**
16     * Get configuration value for the dokullm plugin
17     *
18     * @param string $key Configuration key
19     * @param mixed $default Default value if key not found
20     * @return mixed Configuration value
21     */
22    private function getConf($key, $default = null) {
23        global $conf;
24        return isset($conf['plugin']['dokullm'][$key]) ? $conf['plugin']['dokullm'][$key] : $default;
25    }
26    /**
27     * Initialize the ChromaDB client
28     *
29     * Creates a new ChromaDB client instance with the specified connection parameters.
30     * Also ensures that the specified tenant and database exist.
31     *
32     * @param string $host ChromaDB server host
33     * @param int $port ChromaDB server port
34     * @param string $tenant ChromaDB tenant name
35     * @param string $database ChromaDB database name
36     * @param string $ollamaHost Ollama server host
37     * @param int $ollamaPort Ollama server port
38     * @param string $ollamaModel Ollama embeddings model
39     */
40    public function __construct($host = null, $port = null, $tenant = null, $database = null, $ollamaHost = null, $ollamaPort = null, $ollamaModel = null) {
41        // Use provided parameters or fall back to configuration values
42        $chromaHost = $host ?? $this->getConf('chroma_host', '127.0.0.1');
43        $chromaPort = $port ?? $this->getConf('chroma_port', 8000);
44        $this->tenant = $tenant ?? $this->getConf('chroma_tenant', 'dokullm');
45        $this->database = $database ?? $this->getConf('chroma_database', 'dokullm');
46        $this->ollamaHost = $ollamaHost ?? $this->getConf('ollama_host', '127.0.0.1');
47        $this->ollamaPort = $ollamaPort ?? $this->getConf('ollama_port', 11434);
48        $this->ollamaModel = $ollamaModel ?? $this->getConf('ollama_embeddings_model', 'nomic-embed-text');
49
50        $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";
51        $this->client = curl_init();
52        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
53        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
54            'Content-Type: application/json',
55            'Accept: application/json'
56        ]);
57
58        // Initialize Ollama client
59        $this->ollamaClient = curl_init();
60        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
61        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
62            'Content-Type: application/json'
63        ]);
64
65        // Check if tenant and database exist, create them if they don't
66        $this->ensureTenantAndDatabase();
67    }
68
69    /**
70     * Clean up the cURL client when the object is destroyed
71     *
72     * @return void
73     */
74    public function __destruct() {
75        curl_close($this->client);
76        curl_close($this->ollamaClient);
77    }
78
79    /**
80     * Make an HTTP request to the ChromaDB API
81     *
82     * This is a helper function that handles making HTTP requests to the ChromaDB API,
83     * including setting the appropriate headers for tenant and database.
84     *
85     * @param string $endpoint The API endpoint to call
86     * @param string $method The HTTP method to use (default: 'GET')
87     * @param array|null $data The data to send with the request (default: null)
     * @return array|null The JSON response decoded as an array (null if the body is empty or not valid JSON)
89     * @throws Exception If there's a cURL error or HTTP error
90     */
91    private function makeRequest($endpoint, $method = 'GET', $data = null) {
92        // Add tenant and database as headers instead of query parameters for v2 API
93        $headers = [
94            'Content-Type: application/json',
95            'Accept: application/json'
96        ];
97
98        $url = $this->baseUrl . '/api/v2' . $endpoint;
99
100        curl_setopt($this->client, CURLOPT_URL, $url);
101        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
102        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
103
104        if ($data) {
105            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
106        } else {
107            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
108        }
109
110        $response = curl_exec($this->client);
111        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
112
113        if (curl_error($this->client)) {
114            throw new \Exception('Curl error: ' . curl_error($this->client));
115        }
116
117        if ($httpCode >= 400) {
118            throw new \Exception("HTTP Error: $httpCode, Response: $response");
119        }
120
121        return json_decode($response, true);
122    }
123
124    /**
125     * Generate embeddings for text using Ollama
126     *
127     * @param string $text The text to generate embeddings for
128     * @return array The embeddings vector
129     */
130    public function generateEmbeddings($text) {
131        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
132
133        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
134
135        $data = [
136            'model' => $this->ollamaModel,
137            'prompt' => $text,
138            'keep_alive' => '30m'
139        ];
140
141        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
142
143        $response = curl_exec($this->ollamaClient);
144        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
145
146        if (curl_error($this->ollamaClient)) {
147            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
148        }
149
150        if ($httpCode >= 400) {
151            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
152        }
153
154        $result = json_decode($response, true);
155
156        if (!isset($result['embedding'])) {
157            throw new \Exception("Ollama response missing embedding: " . $response);
158        }
159
160        return $result['embedding'];
161    }
162
163    /**
164     * List all collections in the database
165     *
166     * Retrieves a list of all collections in the specified tenant and database.
167     *
168     * @return array List of collections
169     */
170    public function listCollections() {
171        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
172        return $this->makeRequest($endpoint);
173    }
174
175    /**
176     * Get a collection by name
177     *
178     * Retrieves information about a specific collection by its name.
179     *
180     * @param string $name The name of the collection to retrieve
181     * @return array The collection information
182     * @throws Exception If the collection is not found
183     */
184    public function getCollection($name) {
185        // Use provided name, fallback to 'documents' if empty
186        if (empty($name)) {
187            $name = 'documents';
188        }
189
190        // First try to get collection by name
191        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
192        $collections = $this->makeRequest($endpoint);
193
194        // Find collection by name
195        foreach ($collections as $collection) {
196            if (isset($collection['name']) && $collection['name'] === $name) {
197                return $collection;
198            }
199        }
200
201        // If not found, throw exception
202        throw new \Exception("Collection '{$name}' not found");
203    }
204
205    /**
206     * Create a new collection
207     *
208     * Creates a new collection with the specified name and optional metadata.
209     *
210     * @param string $name The name of the collection to create
211     * @param array|null $metadata Optional metadata for the collection
212     * @return array The response from the API
213     */
214    public function createCollection($name, $metadata = null) {
215        // Use provided name, fallback to 'documents' if empty
216        if (empty($name)) {
217            $name = 'documents';
218        }
219
220        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
221        $data = ['name' => $name];
222        if ($metadata) {
223            $data['metadata'] = $metadata;
224        }
225        return $this->makeRequest($endpoint, 'POST', $data);
226    }
227
228    /**
229     * Delete a collection by name
230     *
231     * Deletes a collection with the specified name.
232     *
233     * @param string $name The name of the collection to delete
234     * @return array The response from the API
235     * @throws Exception If the collection ID is not found
236     */
237    public function deleteCollection($name) {
238        // Use provided name, fallback to 'documents' if empty
239        if (empty($name)) {
240            $name = 'documents';
241        }
242
243        // First get the collection to find its ID
244        $collection = $this->getCollection($name);
245        if (!isset($collection['id'])) {
246            throw new \Exception("Collection ID not found for '{$name}'");
247        }
248
249        $collectionId = $collection['id'];
250        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
251        return $this->makeRequest($endpoint, 'DELETE');
252    }
253
254    /**
255     * Get a document by its ID from a collection
256     *
257     * Retrieves a document from the specified collection using its ID.
258     *
259     * @param string $collectionName The name of the collection to get the document from
260     * @param string $documentId The document ID to retrieve
261     * @param array $include What to include in the response (default: ["metadatas", "documents"])
262     * @return array The retrieved document
263     * @throws Exception If the collection ID is not found
264     */
265    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
266        // Use provided name, fallback to 'documents' if empty
267        if (empty($collectionName)) {
268            $collectionName = 'documents';
269        }
270
271        // First get the collection to find its ID
272        $collection = $this->getCollection($collectionName);
273        if (!isset($collection['id'])) {
274            throw new \Exception("Collection ID not found for '{$collectionName}'");
275        }
276
277        $collectionId = $collection['id'];
278        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
279        $data = [
280            'ids' => [$documentId],
281            'include' => $include
282        ];
283
284        return $this->makeRequest($endpoint, 'POST', $data);
285    }
286
287    /**
288     * Add documents to a collection
289     *
290     * Adds documents to the specified collection. Each document must have a corresponding ID.
291     * Optional metadata and pre-computed embeddings can also be provided.
292     *
293     * @param string $collectionName The name of the collection to add documents to
294     * @param array $documents The document contents
295     * @param array $ids The document IDs
296     * @param array|null $metadatas Optional metadata for each document
297     * @param array|null $embeddings Optional pre-computed embeddings for each document
298     * @return array The response from the API
299     * @throws Exception If the collection ID is not found
300     */
301    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
302        // Use provided name, fallback to 'documents' if empty
303        if (empty($collectionName)) {
304            $collectionName = 'documents';
305        }
306
307        // First get the collection to find its ID
308        $collection = $this->getCollection($collectionName);
309        if (!isset($collection['id'])) {
310            throw new \Exception("Collection ID not found for '{$collectionName}'");
311        }
312
313        $collectionId = $collection['id'];
314        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
315        $data = [
316            'ids' => $ids,
317            'documents' => $documents
318        ];
319
320        if ($metadatas) {
321            $data['metadatas'] = $metadatas;
322        }
323
324        if ($embeddings) {
325            $data['embeddings'] = $embeddings;
326        }
327
328        return $this->makeRequest($endpoint, 'POST', $data);
329    }
330
331    /**
332     * Check if a document needs to be updated based on timestamp comparison
333     *
334     * Determines whether a document should be reprocessed by comparing the file's last modification
335     * time with the processed_at timestamp stored in the document's metadata. The function checks
336     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
337     * not included in the database.
338     *
339     * @param string $collectionId The ID of the collection to check documents in
340     * @param string $documentId The base document ID to check (without chunk suffixes)
341     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
342     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
     * Errors raised while checking are not propagated: any exception is caught and treated as "needs update" (returns true).
344     */
345    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
346        try {
347            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
348
349            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
350            $chunkIdsToCheck = [
351                $documentId . '@1',
352                $documentId . '@2',
353                $documentId . '@3'
354            ];
355
356            $data = [
357                'ids' => $chunkIdsToCheck,
358                'include' => [
359                    "metadatas"
360                ],
361                'limit' => 1
362            ];
363
364            // Check if document exists
365            $result = $this->makeRequest($endpoint, 'POST', $data);
366
367            // If no documents found, return true (needs to be added)
368            if (empty($result['ids'])) {
369                return true;
370            }
371
372            // Check if any document has a processed_at timestamp
373            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
374                // Check the first metadata entry directly
375                $metadata = $result['metadatas'][0];
376
377                // If processed_at is not set, return true (needs update)
378                if (!isset($metadata['processed_at'])) {
379                    return true;
380                }
381
382                // Parse the processed_at timestamp
383                $processedTimestamp = strtotime($metadata['processed_at']);
384
385                // If file is newer than processed time, return true (needs update)
386                if ($fileModifiedTime > $processedTimestamp) {
387                    return true;
388                }
389            }
390
391            // Document exists and is up to date
392            return false;
393        } catch (\Exception $e) {
394            // If there's an error checking the document, assume it needs to be updated
395            return true;
396        }
397    }
398
399    /**
400     * Query a collection for similar documents
401     *
402     * Queries the specified collection for documents similar to the provided query texts.
403     * The function generates embeddings for the query texts and sends them to ChromaDB.
404     * Supports filtering results by metadata using the where parameter.
405     *
406     * @param string $collectionName The name of the collection to query
407     * @param array $queryTexts The query texts to search for
408     * @param int $nResults The number of results to return (default: 5)
409     * @param array|null $where Optional filter conditions for metadata
410     * @return array The query results
411     * @throws Exception If the collection ID is not found
412     */
413    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
414        // Use provided name, fallback to 'documents' if empty
415        if (empty($collectionName)) {
416            $collectionName = 'documents';
417        }
418
419        // First get the collection to find its ID
420        $collection = $this->getCollection($collectionName);
421        if (!isset($collection['id'])) {
422            throw new \Exception("Collection ID not found for '{$collectionName}'");
423        }
424
425        $collectionId = $collection['id'];
426        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
427
428        // Generate embeddings for query texts
429        $queryEmbeddings = [];
430        foreach ($queryTexts as $text) {
431            $queryEmbeddings[] = $this->generateEmbeddings($text);
432        }
433
434        $data = [
435            'query_embeddings' => $queryEmbeddings,
436            'n_results' => $nResults
437        ];
438
439        // Add where clause for metadata filtering if provided
440        if ($where && is_array($where)) {
441            $data['where'] = $where;
442        }
443
444        return $this->makeRequest($endpoint, 'POST', $data);
445    }
446
447    /**
448     * Check if the ChromaDB server is alive
449     *
450     * Sends a heartbeat request to verify that the ChromaDB server is running.
451     *
452     * @return array The response from the heartbeat endpoint
453     */
454    public function heartbeat() {
455        $endpoint = "/heartbeat";
456        return $this->makeRequest($endpoint, 'GET');
457    }
458
459    /**
460     * Get authentication and identity information
461     *
462     * Retrieves authentication and identity information from the ChromaDB server.
463     *
464     * @return array The response from the auth/identity endpoint
465     */
466    public function getIdentity() {
467        $endpoint = "/identity";
468        return $this->makeRequest($endpoint, 'GET');
469    }
470
471    /**
472     * Ensure that the specified tenant and database exist
473     *
474     * Checks if the specified tenant and database exist, and creates them if they don't.
475     *
476     * @return void
477     */
478    private function ensureTenantAndDatabase() {
479        // Check if tenant exists, create if it doesn't
480        try {
481            $this->getTenant($this->tenant);
482        } catch (\Exception $e) {
483            // Tenant doesn't exist, create it
484            $this->createTenant($this->tenant);
485        }
486
487        // Check if database exists, create if it doesn't
488        try {
489            $this->getDatabase($this->database, $this->tenant);
490        } catch (\Exception $e) {
491            // Database doesn't exist, create it
492            $this->createDatabase($this->database, $this->tenant);
493        }
494    }
495
496    /**
497     * Get tenant information
498     *
499     * Retrieves information about the specified tenant.
500     *
501     * @param string $tenantName The tenant name
502     * @return array The tenant information
503     */
504    public function getTenant($tenantName) {
505        $endpoint = "/tenants/{$tenantName}";
506        return $this->makeRequest($endpoint, 'GET');
507    }
508
509    /**
510     * Create a new tenant
511     *
512     * Creates a new tenant with the specified name.
513     *
514     * @param string $tenantName The tenant name
515     * @return array The response from the API
516     */
517    public function createTenant($tenantName) {
518        $endpoint = "/tenants";
519        $data = ['name' => $tenantName];
520        return $this->makeRequest($endpoint, 'POST', $data);
521    }
522
523    /**
524     * Get database information
525     *
526     * Retrieves information about the specified database within a tenant.
527     *
528     * @param string $databaseName The database name
529     * @param string $tenantName The tenant name
530     * @return array The database information
531     */
532    public function getDatabase($databaseName, $tenantName) {
533        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
534        return $this->makeRequest($endpoint, 'GET');
535    }
536
537    /**
538     * Create a new database
539     *
540     * Creates a new database with the specified name within a tenant.
541     *
542     * @param string $databaseName The database name
543     * @param string $tenantName The tenant name
544     * @return array The response from the API
545     */
546    public function createDatabase($databaseName, $tenantName) {
547        $endpoint = "/tenants/{$tenantName}/databases";
548        $data = ['name' => $databaseName];
549        return $this->makeRequest($endpoint, 'POST', $data);
550    }
551
552    /**
553     * Ensure a collection exists, creating it if necessary
554     *
555     * This helper function checks if a collection exists and creates it if it doesn't.
556     *
557     * @param string $collectionName The name of the collection to check/create
558     * @return string Status message indicating what happened
559     */
560    public function ensureCollectionExists($collectionName) {
561        try {
562            $collection = $this->getCollection($collectionName);
563            return "Collection '$collectionName' already exists.";
564        } catch (\Exception $e) {
565            // Collection doesn't exist, create it
566            $created = $this->createCollection($collectionName);
567            return "Collection '$collectionName' created.";
568        }
569    }
570
571    /**
572     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
573     *
574     * This function handles the complete processing of a single DokuWiki file:
575     * 1. Parses the file path to extract metadata and document ID
576     * 2. Determines the appropriate collection based on document ID
577     * 3. Checks if the document needs updating using timestamp comparison
578     * 4. Reads and processes file content only if update is needed
579     * 5. Splits the document into chunks (paragraphs)
580     * 6. Extracts rich metadata from the DokuWiki ID format
581     * 7. Generates embeddings for each chunk
582     * 8. Sends all chunks to ChromaDB with metadata
583     *
584     * Supported ID formats:
585     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
586     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
587     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
588     *
589     * The function implements smart update checking by comparing file modification time
590     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
591     *
592     * @param string $filePath The path to the file to process
593     * @param string $collectionName The name of the collection to use
594     * @param bool $collectionChecked Whether the collection has already been checked/created
595     * @return array Result with status and details
596     */
597    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
598        // Parse file path to extract metadata
599        $id = parseFilePath($filePath);
600
601        try {
602            // Create collection if it doesn't exist (only if not already checked)
603            $collectionStatus = '';
604            if (!$collectionChecked) {
605                $collectionStatus = $this->ensureCollectionExists($collectionName);
606            }
607
608            // Get collection ID
609            $collection = $this->getCollection($collectionName);
610            if (!isset($collection['id'])) {
611                return [
612                    'status' => 'error',
613                    'message' => "Collection ID not found for '{$collectionName}'"
614                ];
615            }
616            $collectionId = $collection['id'];
617
618            // Get file modification time
619            $fileModifiedTime = filemtime($filePath);
620
621            // Check if document needs update
622            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
623
624            // If document is up to date, skip processing
625            if (!$needsUpdate) {
626                return [
627                    'status' => 'skipped',
628                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
629                ];
630            }
631
632            // Read file content
633            $content = file_get_contents($filePath);
634
635            // Split document into chunks (paragraphs separated by two newlines)
636            $paragraphs = preg_split('/\n\s*\n/', $content);
637            $chunks = [];
638            $chunkMetadata = [];
639
640            // Parse the DokuWiki ID to extract base metadata
641            $parts = explode(':', $id);
642
643            // Extract metadata from the last part of the ID
644            $lastPart = end($parts);
645            $baseMetadata = [];
646
647            // Add the document ID as metadata
648            $baseMetadata['document_id'] = $id;
649
650            // Add current timestamp
651            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
652
653            // Check if any part of the ID is 'templates' and set template metadata
654            $isTemplate = in_array('templates', $parts);
655            if ($isTemplate) {
656                $baseMetadata['type'] = 'template';
657            } else {
658                $baseMetadata['type'] = 'report';
659            }
660
661            // Extract modality from the second part
662            if (isset($parts[1])) {
663                $baseMetadata['modality'] = $parts[1];
664            }
665
666            // Handle different ID formats based on the third part: word (institution) or numeric (year)
667            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
668            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
669            // For templates, don't set institution, date or year
670            if (isset($parts[2]) && !$isTemplate) {
671                // Check if third part is numeric (year) or word (institution)
672                if (is_numeric($parts[2])) {
673                    // Format: reports:mri:2024:g287-name-surname (year format)
674                    // Extract year from the third part
675                    $baseMetadata['year'] = $parts[2];
676
677                    // Set default institution from config
678                    $baseMetadata['institution'] = $this->getConf('default_institution', 'default');
679
680                    // Extract registration and name from the last part
681                    // Registration should start with one letter or number and contain numbers before the '-' character
682                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
683                        // Check if the first part contains at least one digit to be considered a registration
684                        if (preg_match('/[0-9]/', $matches[1])) {
685                            $baseMetadata['registration'] = $matches[1];
686                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
687                        } else {
688                            // If no registration pattern found, treat entire part as patient name
689                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
690                        }
691                    } else {
692                        // If no match, treat entire part as patient name
693                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
694                    }
695                } else {
696                    // Format: reports:mri:institution:250620-name-surname (institution format)
697                    // Extract institution from the third part
698                    $baseMetadata['institution'] = $parts[2];
699
700                    // Extract date and name from the last part
701                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
702                        $dateStr = $matches[1];
703                        $name = $matches[2];
704
705                        // Convert date format (250620 -> 2025-06-20)
706                        $day = substr($dateStr, 0, 2);
707                        $month = substr($dateStr, 2, 2);
708                        $year = substr($dateStr, 4, 2);
709                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
710                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
711                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
712
713                        $baseMetadata['date'] = $formattedDate;
714                        $baseMetadata['name'] = str_replace('-', ' ', $name);
715                    }
716                }
717            }
718
719            // For templates, always extract name from the last part
720            if ($isTemplate && isset($lastPart)) {
721                // Extract name from the last part (everything after the last colon)
722                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
723                    // Check if the first part contains at least one digit to be considered a registration
724                    if (preg_match('/[0-9]/', $matches[1])) {
725                        $baseMetadata['registration'] = $matches[1];
726                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
727                    } else {
728                        // If no registration pattern found, treat entire part as template name
729                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
730                    }
731                } else {
732                    // If no match, treat entire part as template name
733                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
734                }
735            }
736
737            // Process each paragraph as a chunk with intelligent metadata handling
738            $chunkIds = [];
739            $chunkContents = [];
740            $chunkMetadatas = [];
741            $chunkEmbeddings = [];
742            $currentTags = [];
743
744            foreach ($paragraphs as $index => $paragraph) {
745                // Skip empty paragraphs to avoid processing whitespace-only content
746                $paragraph = trim($paragraph);
747                if (empty($paragraph)) {
748                    continue;
749                }
750
751                // Check if this is a DokuWiki title (starts and ends with =)
752                // Titles are converted to tags for better searchability but not stored as content chunks
753                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
754                    // Extract title content and clean it
755                    $titleContent = trim($matches[1]);
756
757                    // Split into words and create searchable tags
758                    $words = preg_split('/\s+/', $titleContent);
759                    $tags = [];
760
761                    foreach ($words as $word) {
762                        // Only use words longer than 3 characters to reduce noise
763                        if (strlen($word) >= 3) {
764                            $tags[] = strtolower($word);
765                        }
766                    }
767
768                    // Remove duplicate tags and store for use in subsequent chunks
769                    $currentTags = array_unique($tags);
770                    continue; // Skip storing title chunks as content
771                }
772
773                // Create chunk ID
774                $chunkId = $id . '@' . ($index + 1);
775
776                // Generate embeddings for the chunk
777                $embeddings = $this->generateEmbeddings($paragraph);
778
779                // Add chunk-specific metadata
780                $metadata = $baseMetadata;
781                $metadata['chunk_id'] = $chunkId;
782                $metadata['chunk_number'] = $index + 1;
783                $metadata['total_chunks'] = count($paragraphs);
784
785                // Add current tags to metadata if any exist
786                if (!empty($currentTags)) {
787                    $metadata['tags'] = implode(',', $currentTags);
788                }
789
790                // Store chunk data
791                $chunkIds[] = $chunkId;
792                $chunkContents[] = $paragraph;
793                $chunkMetadatas[] = $metadata;
794                $chunkEmbeddings[] = $embeddings;
795            }
796
797            // If no chunks were created, skip this file
798            if (empty($chunkIds)) {
799                return [
800                    'status' => 'skipped',
801                    'message' => "No valid chunks found in file '$id'. Skipping..."
802                ];
803            }
804
805            // Send all chunks to ChromaDB
806            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
807
808            return [
809                'status' => 'success',
810                'message' => "Successfully sent file to ChromaDB",
811                'details' => [
812                    'document_id' => $id,
813                    'chunks' => count($chunkIds),
814                    'collection' => $collectionName
815                ],
816                'collection_status' => $collectionStatus
817            ];
818        } catch (\Exception $e) {
819            return [
820                'status' => 'error',
821                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
822            ];
823        }
824    }
825
826    /**
827     * Process all DokuWiki files in a directory and send them to ChromaDB
828     *
829     * This function recursively processes all .txt files in a directory and its subdirectories.
830     * It first checks if the appropriate collection exists and creates it if needed.
831     * Then it processes each file individually.
832     *
833     * @param string $dirPath The directory path to process
834     * @return array Result with status and details
835     */
836    public function processDirectory($dirPath) {
837        // Check if directory exists
838        if (!is_dir($dirPath)) {
839            return [
840                'status' => 'error',
841                'message' => "Directory does not exist: $dirPath"
842            ];
843        }
844
845        // Create RecursiveIteratorIterator to process directories recursively
846        $iterator = new RecursiveIteratorIterator(
847            new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS),
848            RecursiveIteratorIterator::LEAVES_ONLY
849        );
850
851        $files = [];
852        foreach ($iterator as $file) {
853            // Process only .txt files that don't start with underscore
854            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
855                $files[] = $file->getPathname();
856            }
857        }
858
859        if (empty($files)) {
860            return [
861                'status' => 'skipped',
862                'message' => "No .txt files found in directory: $dirPath"
863            ];
864        }
865
866        // Use the first part of the document ID as collection name, fallback to 'documents'
867        $sampleFile = $files[0];
868        $id = parseFilePath($sampleFile);
869        $idParts = explode(':', $id);
870        $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents';
871
872        try {
873            $this->ensureCollectionExists($collectionName);
874            $collectionChecked = true;
875        } catch (Exception $e) {
876            $collectionChecked = true;
877        }
878
879        $results = [];
880        foreach ($files as $file) {
881            $result = $this->processSingleFile($file, $collectionName, $collectionChecked);
882            $results[] = [
883                'file' => $file,
884                'result' => $result
885            ];
886        }
887
888        return [
889            'status' => 'success',
890            'message' => "Finished processing directory.",
891            'files_count' => count($files),
892            'results' => $results
893        ];
894    }
895}
896
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory when it is the leading prefix of the path
 *    (DOKU_INC . 'data/pages/' when DOKU_INC is defined, otherwise
 *    '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons, dropping empty segments
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * A path outside the pages directory is converted as a whole, so every one of
 * its directory names becomes a namespace segment.
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID (empty string when the path has no segments)
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Directory iterators return platform-native separators; work on forward
    // slashes only so the prefix comparison and the split below also hold on
    // platforms whose separator is not '/'.
    if (DIRECTORY_SEPARATOR !== '/') {
        $filePath = str_replace(DIRECTORY_SEPARATOR, '/', $filePath);
        $pagesDir = str_replace(DIRECTORY_SEPARATOR, '/', $pagesDir);
    }

    // Strip the base path only where it is a real prefix; a blanket
    // str_replace() would also delete the same text occurring deeper in the path.
    $relativePath = $filePath;
    $prefixLength = strlen($pagesDir);
    if (strncmp($filePath, $pagesDir, $prefixLength) === 0) {
        $relativePath = (string) substr($filePath, $prefixLength);
    }

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Split on slashes and drop only truly empty segments. empty() and a
    // callback-less array_filter() would also discard a segment named "0".
    $idParts = preg_split('#/+#', (string) $relativePath, -1, PREG_SPLIT_NO_EMPTY);

    return implode(':', $idParts);
}
939
940