xref: /plugin/dokullm/ChromaDBClient.php (revision a068a1ba363ef960eac31d32ced19151b291674d)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5class ChromaDBClient {
6    private $baseUrl;
7    private $client;
8    private $ollamaClient;
9    private $tenant;
10    private $database;
11    private $ollamaHost;
12    private $ollamaPort;
13    private $ollamaModel;
14
15    /**
16     * Get configuration value for the dokullm plugin
17     *
18     * @param string $key Configuration key
19     * @param mixed $default Default value if key not found
20     * @return mixed Configuration value
21     */
22    private function getConf($key, $default = null) {
23        global $conf;
24        return isset($conf['plugin']['dokullm'][$key]) ? $conf['plugin']['dokullm'][$key] : $default;
25    }
26    private $baseUrl;
27    private $client;
28    private $ollamaClient;
29    private $tenant;
30    private $database;
31    private $ollamaHost;
32    private $ollamaPort;
33    /**
34     * Initialize the ChromaDB client
35     *
36     * Creates a new ChromaDB client instance with the specified connection parameters.
37     * Also ensures that the specified tenant and database exist.
38     *
39     * @param string $host ChromaDB server host
40     * @param int $port ChromaDB server port
41     * @param string $tenant ChromaDB tenant name
42     * @param string $database ChromaDB database name
43     * @param string $ollamaHost Ollama server host
44     * @param int $ollamaPort Ollama server port
45     * @param string $ollamaModel Ollama embeddings model
46     */
47    public function __construct($host = null, $port = null, $tenant = null, $database = null, $ollamaHost = null, $ollamaPort = null, $ollamaModel = null) {
48        // Use provided parameters or fall back to configuration values
49        $chromaHost = $host ?? $this->getConf('chroma_host', '127.0.0.1');
50        $chromaPort = $port ?? $this->getConf('chroma_port', 8000);
51        $this->tenant = $tenant ?? $this->getConf('chroma_tenant', 'dokullm');
52        $this->database = $database ?? $this->getConf('chroma_database', 'dokullm');
53        $this->ollamaHost = $ollamaHost ?? $this->getConf('ollama_host', '127.0.0.1');
54        $this->ollamaPort = $ollamaPort ?? $this->getConf('ollama_port', 11434);
55        $this->ollamaModel = $ollamaModel ?? $this->getConf('ollama_embeddings_model', 'nomic-embed-text');
56
57        $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";
58        $this->client = curl_init();
59        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
60        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
61            'Content-Type: application/json',
62            'Accept: application/json'
63        ]);
64
65        // Initialize Ollama client
66        $this->ollamaClient = curl_init();
67        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
68        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
69            'Content-Type: application/json'
70        ]);
71
72        // Check if tenant and database exist, create them if they don't
73        $this->ensureTenantAndDatabase();
74    }
75
76    /**
77     * Clean up the cURL client when the object is destroyed
78     *
79     * @return void
80     */
81    public function __destruct() {
82        curl_close($this->client);
83        curl_close($this->ollamaClient);
84    }
85
86    /**
87     * Make an HTTP request to the ChromaDB API
88     *
89     * This is a helper function that handles making HTTP requests to the ChromaDB API,
90     * including setting the appropriate headers for tenant and database.
91     *
92     * @param string $endpoint The API endpoint to call
93     * @param string $method The HTTP method to use (default: 'GET')
94     * @param array|null $data The data to send with the request (default: null)
95     * @return array The JSON response decoded as an array
96     * @throws Exception If there's a cURL error or HTTP error
97     */
98    private function makeRequest($endpoint, $method = 'GET', $data = null) {
99        // Add tenant and database as headers instead of query parameters for v2 API
100        $headers = [
101            'Content-Type: application/json',
102            'Accept: application/json'
103        ];
104
105        $url = $this->baseUrl . '/api/v2' . $endpoint;
106
107        curl_setopt($this->client, CURLOPT_URL, $url);
108        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
109        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
110
111        if ($data) {
112            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
113        } else {
114            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
115        }
116
117        $response = curl_exec($this->client);
118        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
119
120        if (curl_error($this->client)) {
121            throw new \Exception('Curl error: ' . curl_error($this->client));
122        }
123
124        if ($httpCode >= 400) {
125            throw new \Exception("HTTP Error: $httpCode, Response: $response");
126        }
127
128        return json_decode($response, true);
129    }
130
131    /**
132     * Generate embeddings for text using Ollama
133     *
134     * @param string $text The text to generate embeddings for
135     * @return array The embeddings vector
136     */
137    public function generateEmbeddings($text) {
138        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
139
140        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
141
142        $data = [
143            'model' => $this->ollamaModel,
144            'prompt' => $text,
145            'keep_alive' => '30m'
146        ];
147
148        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
149
150        $response = curl_exec($this->ollamaClient);
151        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
152
153        if (curl_error($this->ollamaClient)) {
154            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
155        }
156
157        if ($httpCode >= 400) {
158            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
159        }
160
161        $result = json_decode($response, true);
162
163        if (!isset($result['embedding'])) {
164            throw new \Exception("Ollama response missing embedding: " . $response);
165        }
166
167        return $result['embedding'];
168    }
169
170    /**
171     * List all collections in the database
172     *
173     * Retrieves a list of all collections in the specified tenant and database.
174     *
175     * @return array List of collections
176     */
177    public function listCollections() {
178        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
179        return $this->makeRequest($endpoint);
180    }
181
182    /**
183     * Get a collection by name
184     *
185     * Retrieves information about a specific collection by its name.
186     *
187     * @param string $name The name of the collection to retrieve
188     * @return array The collection information
189     * @throws Exception If the collection is not found
190     */
191    public function getCollection($name) {
192        // Use provided name, fallback to 'documents' if empty
193        if (empty($name)) {
194            $name = 'documents';
195        }
196
197        // First try to get collection by name
198        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
199        $collections = $this->makeRequest($endpoint);
200
201        // Find collection by name
202        foreach ($collections as $collection) {
203            if (isset($collection['name']) && $collection['name'] === $name) {
204                return $collection;
205            }
206        }
207
208        // If not found, throw exception
209        throw new \Exception("Collection '{$name}' not found");
210    }
211
212    /**
213     * Create a new collection
214     *
215     * Creates a new collection with the specified name and optional metadata.
216     *
217     * @param string $name The name of the collection to create
218     * @param array|null $metadata Optional metadata for the collection
219     * @return array The response from the API
220     */
221    public function createCollection($name, $metadata = null) {
222        // Use provided name, fallback to 'documents' if empty
223        if (empty($name)) {
224            $name = 'documents';
225        }
226
227        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
228        $data = ['name' => $name];
229        if ($metadata) {
230            $data['metadata'] = $metadata;
231        }
232        return $this->makeRequest($endpoint, 'POST', $data);
233    }
234
235    /**
236     * Delete a collection by name
237     *
238     * Deletes a collection with the specified name.
239     *
240     * @param string $name The name of the collection to delete
241     * @return array The response from the API
242     * @throws Exception If the collection ID is not found
243     */
244    public function deleteCollection($name) {
245        // Use provided name, fallback to 'documents' if empty
246        if (empty($name)) {
247            $name = 'documents';
248        }
249
250        // First get the collection to find its ID
251        $collection = $this->getCollection($name);
252        if (!isset($collection['id'])) {
253            throw new \Exception("Collection ID not found for '{$name}'");
254        }
255
256        $collectionId = $collection['id'];
257        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
258        return $this->makeRequest($endpoint, 'DELETE');
259    }
260
261    /**
262     * Get a document by its ID from a collection
263     *
264     * Retrieves a document from the specified collection using its ID.
265     *
266     * @param string $collectionName The name of the collection to get the document from
267     * @param string $documentId The document ID to retrieve
268     * @param array $include What to include in the response (default: ["metadatas", "documents"])
269     * @return array The retrieved document
270     * @throws Exception If the collection ID is not found
271     */
272    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
273        // Use provided name, fallback to 'documents' if empty
274        if (empty($collectionName)) {
275            $collectionName = 'documents';
276        }
277
278        // First get the collection to find its ID
279        $collection = $this->getCollection($collectionName);
280        if (!isset($collection['id'])) {
281            throw new \Exception("Collection ID not found for '{$collectionName}'");
282        }
283
284        $collectionId = $collection['id'];
285        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
286        $data = [
287            'ids' => [$documentId],
288            'include' => $include
289        ];
290
291        return $this->makeRequest($endpoint, 'POST', $data);
292    }
293
294    /**
295     * Add documents to a collection
296     *
297     * Adds documents to the specified collection. Each document must have a corresponding ID.
298     * Optional metadata and pre-computed embeddings can also be provided.
299     *
300     * @param string $collectionName The name of the collection to add documents to
301     * @param array $documents The document contents
302     * @param array $ids The document IDs
303     * @param array|null $metadatas Optional metadata for each document
304     * @param array|null $embeddings Optional pre-computed embeddings for each document
305     * @return array The response from the API
306     * @throws Exception If the collection ID is not found
307     */
308    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
309        // Use provided name, fallback to 'documents' if empty
310        if (empty($collectionName)) {
311            $collectionName = 'documents';
312        }
313
314        // First get the collection to find its ID
315        $collection = $this->getCollection($collectionName);
316        if (!isset($collection['id'])) {
317            throw new \Exception("Collection ID not found for '{$collectionName}'");
318        }
319
320        $collectionId = $collection['id'];
321        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
322        $data = [
323            'ids' => $ids,
324            'documents' => $documents
325        ];
326
327        if ($metadatas) {
328            $data['metadatas'] = $metadatas;
329        }
330
331        if ($embeddings) {
332            $data['embeddings'] = $embeddings;
333        }
334
335        return $this->makeRequest($endpoint, 'POST', $data);
336    }
337
338    /**
339     * Check if a document needs to be updated based on timestamp comparison
340     *
341     * Determines whether a document should be reprocessed by comparing the file's last modification
342     * time with the processed_at timestamp stored in the document's metadata. The function checks
343     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
344     * not included in the database.
345     *
346     * @param string $collectionId The ID of the collection to check documents in
347     * @param string $documentId The base document ID to check (without chunk suffixes)
348     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
349     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
350     * @throws Exception If there's an error checking the document
351     */
352    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
353        try {
354            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
355
356            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
357            $chunkIdsToCheck = [
358                $documentId . '@1',
359                $documentId . '@2',
360                $documentId . '@3'
361            ];
362
363            $data = [
364                'ids' => $chunkIdsToCheck,
365                'include' => [
366                    "metadatas"
367                ],
368                'limit' => 1
369            ];
370
371            // Check if document exists
372            $result = $this->makeRequest($endpoint, 'POST', $data);
373
374            // If no documents found, return true (needs to be added)
375            if (empty($result['ids'])) {
376                return true;
377            }
378
379            // Check if any document has a processed_at timestamp
380            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
381                // Check the first metadata entry directly
382                $metadata = $result['metadatas'][0];
383
384                // If processed_at is not set, return true (needs update)
385                if (!isset($metadata['processed_at'])) {
386                    return true;
387                }
388
389                // Parse the processed_at timestamp
390                $processedTimestamp = strtotime($metadata['processed_at']);
391
392                // If file is newer than processed time, return true (needs update)
393                if ($fileModifiedTime > $processedTimestamp) {
394                    return true;
395                }
396            }
397
398            // Document exists and is up to date
399            return false;
400        } catch (\Exception $e) {
401            // If there's an error checking the document, assume it needs to be updated
402            return true;
403        }
404    }
405
406    /**
407     * Query a collection for similar documents
408     *
409     * Queries the specified collection for documents similar to the provided query texts.
410     * The function generates embeddings for the query texts and sends them to ChromaDB.
411     * Supports filtering results by metadata using the where parameter.
412     *
413     * @param string $collectionName The name of the collection to query
414     * @param array $queryTexts The query texts to search for
415     * @param int $nResults The number of results to return (default: 5)
416     * @param array|null $where Optional filter conditions for metadata
417     * @return array The query results
418     * @throws Exception If the collection ID is not found
419     */
420    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
421        // Use provided name, fallback to 'documents' if empty
422        if (empty($collectionName)) {
423            $collectionName = 'documents';
424        }
425
426        // First get the collection to find its ID
427        $collection = $this->getCollection($collectionName);
428        if (!isset($collection['id'])) {
429            throw new \Exception("Collection ID not found for '{$collectionName}'");
430        }
431
432        $collectionId = $collection['id'];
433        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
434
435        // Generate embeddings for query texts
436        $queryEmbeddings = [];
437        foreach ($queryTexts as $text) {
438            $queryEmbeddings[] = $this->generateEmbeddings($text);
439        }
440
441        $data = [
442            'query_embeddings' => $queryEmbeddings,
443            'n_results' => $nResults
444        ];
445
446        // Add where clause for metadata filtering if provided
447        if ($where && is_array($where)) {
448            $data['where'] = $where;
449        }
450
451        return $this->makeRequest($endpoint, 'POST', $data);
452    }
453
454    /**
455     * Check if the ChromaDB server is alive
456     *
457     * Sends a heartbeat request to verify that the ChromaDB server is running.
458     *
459     * @return array The response from the heartbeat endpoint
460     */
461    public function heartbeat() {
462        $endpoint = "/heartbeat";
463        return $this->makeRequest($endpoint, 'GET');
464    }
465
466    /**
467     * Get authentication and identity information
468     *
469     * Retrieves authentication and identity information from the ChromaDB server.
470     *
471     * @return array The response from the auth/identity endpoint
472     */
473    public function getIdentity() {
474        $endpoint = "/identity";
475        return $this->makeRequest($endpoint, 'GET');
476    }
477
478    /**
479     * Ensure that the specified tenant and database exist
480     *
481     * Checks if the specified tenant and database exist, and creates them if they don't.
482     *
483     * @return void
484     */
485    private function ensureTenantAndDatabase() {
486        // Check if tenant exists, create if it doesn't
487        try {
488            $this->getTenant($this->tenant);
489        } catch (\Exception $e) {
490            // Tenant doesn't exist, create it
491            $this->createTenant($this->tenant);
492        }
493
494        // Check if database exists, create if it doesn't
495        try {
496            $this->getDatabase($this->database, $this->tenant);
497        } catch (\Exception $e) {
498            // Database doesn't exist, create it
499            $this->createDatabase($this->database, $this->tenant);
500        }
501    }
502
503    /**
504     * Get tenant information
505     *
506     * Retrieves information about the specified tenant.
507     *
508     * @param string $tenantName The tenant name
509     * @return array The tenant information
510     */
511    public function getTenant($tenantName) {
512        $endpoint = "/tenants/{$tenantName}";
513        return $this->makeRequest($endpoint, 'GET');
514    }
515
516    /**
517     * Create a new tenant
518     *
519     * Creates a new tenant with the specified name.
520     *
521     * @param string $tenantName The tenant name
522     * @return array The response from the API
523     */
524    public function createTenant($tenantName) {
525        $endpoint = "/tenants";
526        $data = ['name' => $tenantName];
527        return $this->makeRequest($endpoint, 'POST', $data);
528    }
529
530    /**
531     * Get database information
532     *
533     * Retrieves information about the specified database within a tenant.
534     *
535     * @param string $databaseName The database name
536     * @param string $tenantName The tenant name
537     * @return array The database information
538     */
539    public function getDatabase($databaseName, $tenantName) {
540        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
541        return $this->makeRequest($endpoint, 'GET');
542    }
543
544    /**
545     * Create a new database
546     *
547     * Creates a new database with the specified name within a tenant.
548     *
549     * @param string $databaseName The database name
550     * @param string $tenantName The tenant name
551     * @return array The response from the API
552     */
553    public function createDatabase($databaseName, $tenantName) {
554        $endpoint = "/tenants/{$tenantName}/databases";
555        $data = ['name' => $databaseName];
556        return $this->makeRequest($endpoint, 'POST', $data);
557    }
558
559    /**
560     * Ensure a collection exists, creating it if necessary
561     *
562     * This helper function checks if a collection exists and creates it if it doesn't.
563     *
564     * @param string $collectionName The name of the collection to check/create
565     * @return string Status message indicating what happened
566     */
567    public function ensureCollectionExists($collectionName) {
568        try {
569            $collection = $this->getCollection($collectionName);
570            return "Collection '$collectionName' already exists.";
571        } catch (\Exception $e) {
572            // Collection doesn't exist, create it
573            $created = $this->createCollection($collectionName);
574            return "Collection '$collectionName' created.";
575        }
576    }
577
578    /**
579     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
580     *
581     * This function handles the complete processing of a single DokuWiki file:
582     * 1. Parses the file path to extract metadata and document ID
583     * 2. Determines the appropriate collection based on document ID
584     * 3. Checks if the document needs updating using timestamp comparison
585     * 4. Reads and processes file content only if update is needed
586     * 5. Splits the document into chunks (paragraphs)
587     * 6. Extracts rich metadata from the DokuWiki ID format
588     * 7. Generates embeddings for each chunk
589     * 8. Sends all chunks to ChromaDB with metadata
590     *
591     * Supported ID formats:
592     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
593     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
594     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
595     *
596     * The function implements smart update checking by comparing file modification time
597     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
598     *
599     * @param string $filePath The path to the file to process
600     * @param string $collectionName The name of the collection to use
601     * @param bool $collectionChecked Whether the collection has already been checked/created
602     * @return array Result with status and details
603     */
604    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
605        // Parse file path to extract metadata
606        $id = parseFilePath($filePath);
607
608        try {
609            // Create collection if it doesn't exist (only if not already checked)
610            $collectionStatus = '';
611            if (!$collectionChecked) {
612                $collectionStatus = $this->ensureCollectionExists($collectionName);
613            }
614
615            // Get collection ID
616            $collection = $this->getCollection($collectionName);
617            if (!isset($collection['id'])) {
618                return [
619                    'status' => 'error',
620                    'message' => "Collection ID not found for '{$collectionName}'"
621                ];
622            }
623            $collectionId = $collection['id'];
624
625            // Get file modification time
626            $fileModifiedTime = filemtime($filePath);
627
628            // Check if document needs update
629            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
630
631            // If document is up to date, skip processing
632            if (!$needsUpdate) {
633                return [
634                    'status' => 'skipped',
635                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
636                ];
637            }
638
639            // Read file content
640            $content = file_get_contents($filePath);
641
642            // Split document into chunks (paragraphs separated by two newlines)
643            $paragraphs = preg_split('/\n\s*\n/', $content);
644            $chunks = [];
645            $chunkMetadata = [];
646
647            // Parse the DokuWiki ID to extract base metadata
648            $parts = explode(':', $id);
649
650            // Extract metadata from the last part of the ID
651            $lastPart = end($parts);
652            $baseMetadata = [];
653
654            // Add the document ID as metadata
655            $baseMetadata['document_id'] = $id;
656
657            // Add current timestamp
658            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
659
660            // Check if any part of the ID is 'templates' and set template metadata
661            $isTemplate = in_array('templates', $parts);
662            if ($isTemplate) {
663                $baseMetadata['type'] = 'template';
664            } else {
665                $baseMetadata['type'] = 'report';
666            }
667
668            // Extract modality from the second part
669            if (isset($parts[1])) {
670                $baseMetadata['modality'] = $parts[1];
671            }
672
673            // Handle different ID formats based on the third part: word (institution) or numeric (year)
674            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
675            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
676            // For templates, don't set institution, date or year
677            if (isset($parts[2]) && !$isTemplate) {
678                // Check if third part is numeric (year) or word (institution)
679                if (is_numeric($parts[2])) {
680                    // Format: reports:mri:2024:g287-name-surname (year format)
681                    // Extract year from the third part
682                    $baseMetadata['year'] = $parts[2];
683
684                    // Set default institution from config
685                    $baseMetadata['institution'] = $this->getConf('default_institution', 'default');
686
687                    // Extract registration and name from the last part
688                    // Registration should start with one letter or number and contain numbers before the '-' character
689                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
690                        // Check if the first part contains at least one digit to be considered a registration
691                        if (preg_match('/[0-9]/', $matches[1])) {
692                            $baseMetadata['registration'] = $matches[1];
693                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
694                        } else {
695                            // If no registration pattern found, treat entire part as patient name
696                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
697                        }
698                    } else {
699                        // If no match, treat entire part as patient name
700                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
701                    }
702                } else {
703                    // Format: reports:mri:institution:250620-name-surname (institution format)
704                    // Extract institution from the third part
705                    $baseMetadata['institution'] = $parts[2];
706
707                    // Extract date and name from the last part
708                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
709                        $dateStr = $matches[1];
710                        $name = $matches[2];
711
712                        // Convert date format (250620 -> 2025-06-20)
713                        $day = substr($dateStr, 0, 2);
714                        $month = substr($dateStr, 2, 2);
715                        $year = substr($dateStr, 4, 2);
716                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
717                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
718                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
719
720                        $baseMetadata['date'] = $formattedDate;
721                        $baseMetadata['name'] = str_replace('-', ' ', $name);
722                    }
723                }
724            }
725
726            // For templates, always extract name from the last part
727            if ($isTemplate && isset($lastPart)) {
728                // Extract name from the last part (everything after the last colon)
729                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
730                    // Check if the first part contains at least one digit to be considered a registration
731                    if (preg_match('/[0-9]/', $matches[1])) {
732                        $baseMetadata['registration'] = $matches[1];
733                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
734                    } else {
735                        // If no registration pattern found, treat entire part as template name
736                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
737                    }
738                } else {
739                    // If no match, treat entire part as template name
740                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
741                }
742            }
743
744            // Process each paragraph as a chunk with intelligent metadata handling
745            $chunkIds = [];
746            $chunkContents = [];
747            $chunkMetadatas = [];
748            $chunkEmbeddings = [];
749            $currentTags = [];
750
751            foreach ($paragraphs as $index => $paragraph) {
752                // Skip empty paragraphs to avoid processing whitespace-only content
753                $paragraph = trim($paragraph);
754                if (empty($paragraph)) {
755                    continue;
756                }
757
758                // Check if this is a DokuWiki title (starts and ends with =)
759                // Titles are converted to tags for better searchability but not stored as content chunks
760                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
761                    // Extract title content and clean it
762                    $titleContent = trim($matches[1]);
763
764                    // Split into words and create searchable tags
765                    $words = preg_split('/\s+/', $titleContent);
766                    $tags = [];
767
768                    foreach ($words as $word) {
769                        // Only use words longer than 3 characters to reduce noise
770                        if (strlen($word) >= 3) {
771                            $tags[] = strtolower($word);
772                        }
773                    }
774
775                    // Remove duplicate tags and store for use in subsequent chunks
776                    $currentTags = array_unique($tags);
777                    continue; // Skip storing title chunks as content
778                }
779
780                // Create chunk ID
781                $chunkId = $id . '@' . ($index + 1);
782
783                // Generate embeddings for the chunk
784                $embeddings = $this->generateEmbeddings($paragraph);
785
786                // Add chunk-specific metadata
787                $metadata = $baseMetadata;
788                $metadata['chunk_id'] = $chunkId;
789                $metadata['chunk_number'] = $index + 1;
790                $metadata['total_chunks'] = count($paragraphs);
791
792                // Add current tags to metadata if any exist
793                if (!empty($currentTags)) {
794                    $metadata['tags'] = implode(',', $currentTags);
795                }
796
797                // Store chunk data
798                $chunkIds[] = $chunkId;
799                $chunkContents[] = $paragraph;
800                $chunkMetadatas[] = $metadata;
801                $chunkEmbeddings[] = $embeddings;
802            }
803
804            // If no chunks were created, skip this file
805            if (empty($chunkIds)) {
806                return [
807                    'status' => 'skipped',
808                    'message' => "No valid chunks found in file '$id'. Skipping..."
809                ];
810            }
811
812            // Send all chunks to ChromaDB
813            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
814
815            return [
816                'status' => 'success',
817                'message' => "Successfully sent file to ChromaDB",
818                'details' => [
819                    'document_id' => $id,
820                    'chunks' => count($chunkIds),
821                    'collection' => $collectionName
822                ],
823                'collection_status' => $collectionStatus
824            ];
825        } catch (\Exception $e) {
826            return [
827                'status' => 'error',
828                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
829            ];
830        }
831    }
832
833    /**
834     * Process all DokuWiki files in a directory and send them to ChromaDB
835     *
836     * This function recursively processes all .txt files in a directory and its subdirectories.
837     * It first checks if the appropriate collection exists and creates it if needed.
838     * Then it processes each file individually.
839     *
840     * @param string $dirPath The directory path to process
841     * @return array Result with status and details
842     */
843    public function processDirectory($dirPath) {
844        // Check if directory exists
845        if (!is_dir($dirPath)) {
846            return [
847                'status' => 'error',
848                'message' => "Directory does not exist: $dirPath"
849            ];
850        }
851
852        // Create RecursiveIteratorIterator to process directories recursively
853        $iterator = new RecursiveIteratorIterator(
854            new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS),
855            RecursiveIteratorIterator::LEAVES_ONLY
856        );
857
858        $files = [];
859        foreach ($iterator as $file) {
860            // Process only .txt files that don't start with underscore
861            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
862                $files[] = $file->getPathname();
863            }
864        }
865
866        if (empty($files)) {
867            return [
868                'status' => 'skipped',
869                'message' => "No .txt files found in directory: $dirPath"
870            ];
871        }
872
873        // Use the first part of the document ID as collection name, fallback to 'documents'
874        $sampleFile = $files[0];
875        $id = parseFilePath($sampleFile);
876        $idParts = explode(':', $id);
877        $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents';
878
879        try {
880            $this->ensureCollectionExists($collectionName);
881            $collectionChecked = true;
882        } catch (Exception $e) {
883            $collectionChecked = true;
884        }
885
886        $results = [];
887        foreach ($files as $file) {
888            $result = $this->processSingleFile($file, $collectionName, $collectionChecked);
889            $results[] = [
890                'file' => $file,
891                'result' => $result
892            ];
893        }
894
895        return [
896            'status' => 'success',
897            'message' => "Finished processing directory.",
898            'files_count' => count($files),
899            'results' => $results
900        ];
901    }
902}
903
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the base path prefix (DOKU_INC . 'data/pages/' when DOKU_INC is
 *    defined, otherwise a hard-coded fallback path)
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons
 *
 * The base path is only stripped when the file path actually starts with it;
 * any other path is converted as-is. Empty path segments are dropped, but a
 * segment named "0" is kept.
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    $path = (string) $filePath;

    // On platforms whose native separator is not '/', directory iterators
    // return paths with that separator; normalise both sides so the prefix
    // check and the split below work there too.
    if (DIRECTORY_SEPARATOR !== '/') {
        $path = str_replace(DIRECTORY_SEPARATOR, '/', $path);
        $pagesDir = str_replace(DIRECTORY_SEPARATOR, '/', $pagesDir);
    }

    // Remove the base path only when it is a leading prefix. A plain
    // str_replace() would also delete the same text occurring deeper in the path.
    $baseLength = strlen($pagesDir);
    if ($baseLength > 0 && strncmp($path, $pagesDir, $baseLength) === 0) {
        // Cast because substr() returns false instead of '' before PHP 8
        $path = (string) substr($path, $baseLength);
    }

    // Remove .txt extension
    $path = preg_replace('/\.txt$/', '', $path);

    // Keep every non-empty segment. The comparison is against '' on purpose:
    // empty() and a callback-less array_filter() treat the string "0" as empty
    // and would drop a page or namespace named "0".
    $idParts = [];
    foreach (explode('/', $path) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    return implode(':', $idParts);
}
946
947