xref: /plugin/dokullm/ChromaDBClient.php (revision a74e859f6dbc39be26ce69d909c7c1753ed29425)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5class ChromaDBClient {
6    private $baseUrl;
7    private $client;
8    private $ollamaClient;
9    private $tenant;
10    private $database;
11    private $ollamaHost;
12    private $ollamaPort;
13    private $ollamaModel;
14
15    /**
16     * Get configuration value for the dokullm plugin
17     *
18     * @param string $key Configuration key
19     * @param mixed $default Default value if key not found
20     * @return mixed Configuration value
21     */
22    /**
23     * Initialize the ChromaDB client
24     *
25     * Creates a new ChromaDB client instance with the specified connection parameters.
26     * Also ensures that the specified tenant and database exist.
27     *
28     * @param string $host ChromaDB server host
29     * @param int $port ChromaDB server port
30     * @param string $tenant ChromaDB tenant name
31     * @param string $database ChromaDB database name
32     * @param string $defaultCollection Default collection name
33     * @param string $ollamaHost Ollama server host
34     * @param int $ollamaPort Ollama server port
35     * @param string $ollamaModel Ollama embeddings model
36     */
37    public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
38        // Use provided parameters (no fallback since they're mandatory)
39        $chromaHost = $host;
40        $chromaPort = $port;
41        $this->tenant = $tenant;
42        $this->database = $database;
43        $this->defaultCollection = $defaultCollection;
44        $this->ollamaHost = $ollamaHost;
45        $this->ollamaPort = $ollamaPort;
46        $this->ollamaModel = $ollamaModel;
47        $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";
48        $this->client = curl_init();
49        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
50        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
51            'Content-Type: application/json',
52            'Accept: application/json'
53        ]);
54        // Initialize Ollama client
55        $this->ollamaClient = curl_init();
56        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
57        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
58            'Content-Type: application/json'
59        ]);
60        // Check if tenant and database exist, create them if they don't
61        $this->ensureTenantAndDatabase();
62    }
63
64    /**
65     * Clean up the cURL client when the object is destroyed
66     *
67     * @return void
68     */
69    public function __destruct() {
70        curl_close($this->client);
71        curl_close($this->ollamaClient);
72    }
73
74    /**
75     * Make an HTTP request to the ChromaDB API
76     *
77     * This is a helper function that handles making HTTP requests to the ChromaDB API,
78     * including setting the appropriate headers for tenant and database.
79     *
80     * @param string $endpoint The API endpoint to call
81     * @param string $method The HTTP method to use (default: 'GET')
82     * @param array|null $data The data to send with the request (default: null)
83     * @return array The JSON response decoded as an array
84     * @throws Exception If there's a cURL error or HTTP error
85     */
86    private function makeRequest($endpoint, $method = 'GET', $data = null) {
87        // Add tenant and database as headers instead of query parameters for v2 API
88        $headers = [
89            'Content-Type: application/json',
90            'Accept: application/json'
91        ];
92        // Version 2
93        $url = $this->baseUrl . '/api/v2' . $endpoint;
94        curl_setopt($this->client, CURLOPT_URL, $url);
95        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
96        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
97        // POST JSON data
98        if ($data) {
99            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
100        } else {
101            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
102        }
103        // Call
104        $response = curl_exec($this->client);
105        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
106        // Check the result
107        if (curl_error($this->client)) {
108            throw new \Exception('Curl error: ' . curl_error($this->client));
109        }
110        if ($httpCode >= 400) {
111            throw new \Exception("HTTP Error: $httpCode, Response: $response");
112        }
113        // Return the decoded response
114        return json_decode($response, true);
115    }
116
117    /**
118     * Generate embeddings for text using Ollama
119     *
120     * @param string $text The text to generate embeddings for
121     * @return array The embeddings vector
122     */
123    public function generateEmbeddings($text) {
124        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
125        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
126        $data = [
127            'model' => $this->ollamaModel,
128            'prompt' => $text,
129            'keep_alive' => '30m'
130        ];
131        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
132        $response = curl_exec($this->ollamaClient);
133        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
134        if (curl_error($this->ollamaClient)) {
135            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
136        }
137        if ($httpCode >= 400) {
138            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
139        }
140        $result = json_decode($response, true);
141        if (!isset($result['embedding'])) {
142            throw new \Exception("Ollama response missing embedding: " . $response);
143        }
144        return $result['embedding'];
145    }
146
147    /**
148     * List all collections in the database
149     *
150     * Retrieves a list of all collections in the specified tenant and database.
151     *
152     * @return array List of collections
153     */
154    public function listCollections() {
155        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
156        return $this->makeRequest($endpoint);
157    }
158
159    /**
160     * Get a collection by name
161     *
162     * Retrieves information about a specific collection by its name.
163     *
164     * @param string $name The name of the collection to retrieve
165     * @return array The collection information
166     * @throws Exception If the collection is not found
167     */
168    public function getCollection($name) {
169        // Use provided name, fallback to 'documents' if empty
170        if (empty($name)) {
171            $name = 'documents';
172        }
173        // First try to get collection by name
174        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
175        $collections = $this->makeRequest($endpoint);
176        // Find collection by name
177        foreach ($collections as $collection) {
178            if (isset($collection['name']) && $collection['name'] === $name) {
179                return $collection;
180            }
181        }
182        // If not found, throw exception
183        throw new \Exception("Collection '{$name}' not found");
184    }
185
186    /**
187     * Create a new collection
188     *
189     * Creates a new collection with the specified name and optional metadata.
190     *
191     * @param string $name The name of the collection to create
192     * @param array|null $metadata Optional metadata for the collection
193     * @return array The response from the API
194     */
195    public function createCollection($name, $metadata = null) {
196        // Use provided name, fallback to 'documents' if empty
197        if (empty($name)) {
198            $name = 'documents';
199        }
200        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
201        $data = ['name' => $name];
202        if ($metadata) {
203            $data['metadata'] = $metadata;
204        }
205        return $this->makeRequest($endpoint, 'POST', $data);
206    }
207
208    /**
209     * Delete a collection by name
210     *
211     * Deletes a collection with the specified name.
212     *
213     * @param string $name The name of the collection to delete
214     * @return array The response from the API
215     * @throws Exception If the collection ID is not found
216     */
217    public function deleteCollection($name) {
218        // Use provided name, fallback to 'documents' if empty
219        if (empty($name)) {
220            $name = 'documents';
221        }
222        // First get the collection to find its ID
223        $collection = $this->getCollection($name);
224        if (!isset($collection['id'])) {
225            throw new \Exception("Collection ID not found for '{$name}'");
226        }
227        $collectionId = $collection['id'];
228        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
229        return $this->makeRequest($endpoint, 'DELETE');
230    }
231
232    /**
233     * Get a document by its ID from a collection
234     *
235     * Retrieves a document from the specified collection using its ID.
236     *
237     * @param string $collectionName The name of the collection to get the document from
238     * @param string $documentId The document ID to retrieve
239     * @param array $include What to include in the response (default: ["metadatas", "documents"])
240     * @return array The retrieved document
241     * @throws Exception If the collection ID is not found
242     */
243    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
244        // Use provided name, fallback to 'documents' if empty
245        if (empty($collectionName)) {
246            $collectionName = 'documents';
247        }
248        // First get the collection to find its ID
249        $collection = $this->getCollection($collectionName);
250        if (!isset($collection['id'])) {
251            throw new \Exception("Collection ID not found for '{$collectionName}'");
252        }
253        $collectionId = $collection['id'];
254        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
255        $data = [
256            'ids' => [$documentId],
257            'include' => $include
258        ];
259        // Return the document
260        return $this->makeRequest($endpoint, 'POST', $data);
261    }
262
263    /**
264     * Add documents to a collection
265     *
266     * Adds documents to the specified collection. Each document must have a corresponding ID.
267     * Optional metadata and pre-computed embeddings can also be provided.
268     *
269     * @param string $collectionName The name of the collection to add documents to
270     * @param array $documents The document contents
271     * @param array $ids The document IDs
272     * @param array|null $metadatas Optional metadata for each document
273     * @param array|null $embeddings Optional pre-computed embeddings for each document
274     * @return array The response from the API
275     * @throws Exception If the collection ID is not found
276     */
277    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
278        // Use provided name, fallback to 'documents' if empty
279        if (empty($collectionName)) {
280            $collectionName = 'documents';
281        }
282        // First get the collection to find its ID
283        $collection = $this->getCollection($collectionName);
284        if (!isset($collection['id'])) {
285            throw new \Exception("Collection ID not found for '{$collectionName}'");
286        }
287        $collectionId = $collection['id'];
288        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
289        $data = [
290            'ids' => $ids,
291            'documents' => $documents
292        ];
293        // Get also the metadata
294        if ($metadatas) {
295            $data['metadatas'] = $metadatas;
296        }
297        // Get the embeddings
298        if ($embeddings) {
299            $data['embeddings'] = $embeddings;
300        }
301        // Return the respnse
302        return $this->makeRequest($endpoint, 'POST', $data);
303    }
304
305    /**
306     * Check if a document needs to be updated based on timestamp comparison
307     *
308     * Determines whether a document should be reprocessed by comparing the file's last modification
309     * time with the processed_at timestamp stored in the document's metadata. The function checks
310     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
311     * not included in the database.
312     *
313     * @param string $collectionId The ID of the collection to check documents in
314     * @param string $documentId The base document ID to check (without chunk suffixes)
315     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
316     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
317     * @throws Exception If there's an error checking the document
318     */
319    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
320        try {
321            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
322            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
323            $chunkIdsToCheck = [
324                $documentId . '@1',
325                $documentId . '@2',
326                $documentId . '@3'
327            ];
328            $data = [
329                'ids' => $chunkIdsToCheck,
330                'include' => [
331                    "metadatas"
332                ],
333                'limit' => 1
334            ];
335            // Check if document exists
336            $result = $this->makeRequest($endpoint, 'POST', $data);
337            // If no documents found, return true (needs to be added)
338            if (empty($result['ids'])) {
339                return true;
340            }
341            // Check if any document has a processed_at timestamp
342            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
343                // Check the first metadata entry directly
344                $metadata = $result['metadatas'][0];
345                // If processed_at is not set, return true (needs update)
346                if (!isset($metadata['processed_at'])) {
347                    return true;
348                }
349                // Parse the processed_at timestamp
350                $processedTimestamp = strtotime($metadata['processed_at']);
351                // If file is newer than processed time, return true (needs update)
352                if ($fileModifiedTime > $processedTimestamp) {
353                    return true;
354                }
355            }
356            // Document exists and is up to date
357            return false;
358        } catch (\Exception $e) {
359            // If there's an error checking the document, assume it needs to be updated
360            return true;
361        }
362    }
363
364    /**
365     * Query a collection for similar documents
366     *
367     * Queries the specified collection for documents similar to the provided query texts.
368     * The function generates embeddings for the query texts and sends them to ChromaDB.
369     * Supports filtering results by metadata using the where parameter.
370     *
371     * @param string $collectionName The name of the collection to query
372     * @param array $queryTexts The query texts to search for
373     * @param int $nResults The number of results to return (default: 5)
374     * @param array|null $where Optional filter conditions for metadata
375     * @return array The query results
376     * @throws Exception If the collection ID is not found
377     */
378    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
379        // Use provided name, fallback to 'documents' if empty
380        if (empty($collectionName)) {
381            $collectionName = 'documents';
382        }
383        // First get the collection to find its ID
384        $collection = $this->getCollection($collectionName);
385        if (!isset($collection['id'])) {
386            throw new \Exception("Collection ID not found for '{$collectionName}'");
387        }
388        $collectionId = $collection['id'];
389        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
390        // Generate embeddings for query texts
391        $queryEmbeddings = [];
392        foreach ($queryTexts as $text) {
393            $queryEmbeddings[] = $this->generateEmbeddings($text);
394        }
395        $data = [
396            'query_embeddings' => $queryEmbeddings,
397            'n_results' => $nResults
398        ];
399        // Add where clause for metadata filtering if provided
400        if ($where && is_array($where)) {
401            $data['where'] = $where;
402        }
403        // Return the response
404        return $this->makeRequest($endpoint, 'POST', $data);
405    }
406
407    /**
408     * Check if the ChromaDB server is alive
409     *
410     * Sends a heartbeat request to verify that the ChromaDB server is running.
411     *
412     * @return array The response from the heartbeat endpoint
413     */
414    public function heartbeat() {
415        $endpoint = "/heartbeat";
416        return $this->makeRequest($endpoint, 'GET');
417    }
418
419    /**
420     * Get authentication and identity information
421     *
422     * Retrieves authentication and identity information from the ChromaDB server.
423     *
424     * @return array The response from the auth/identity endpoint
425     */
426    public function getIdentity() {
427        $endpoint = "/identity";
428        return $this->makeRequest($endpoint, 'GET');
429    }
430
431    /**
432     * Ensure that the specified tenant and database exist
433     *
434     * Checks if the specified tenant and database exist, and creates them if they don't.
435     *
436     * @return void
437     */
438    private function ensureTenantAndDatabase() {
439        // Check if tenant exists, create if it doesn't
440        try {
441            $this->getTenant($this->tenant);
442        } catch (\Exception $e) {
443            // Tenant doesn't exist, create it
444            $this->createTenant($this->tenant);
445        }
446        // Check if database exists, create if it doesn't
447        try {
448            $this->getDatabase($this->database, $this->tenant);
449        } catch (\Exception $e) {
450            // Database doesn't exist, create it
451            $this->createDatabase($this->database, $this->tenant);
452        }
453    }
454
455    /**
456     * Get tenant information
457     *
458     * Retrieves information about the specified tenant.
459     *
460     * @param string $tenantName The tenant name
461     * @return array The tenant information
462     */
463    public function getTenant($tenantName) {
464        $endpoint = "/tenants/{$tenantName}";
465        return $this->makeRequest($endpoint, 'GET');
466    }
467
468    /**
469     * Create a new tenant
470     *
471     * Creates a new tenant with the specified name.
472     *
473     * @param string $tenantName The tenant name
474     * @return array The response from the API
475     */
476    public function createTenant($tenantName) {
477        $endpoint = "/tenants";
478        $data = ['name' => $tenantName];
479        return $this->makeRequest($endpoint, 'POST', $data);
480    }
481
482    /**
483     * Get database information
484     *
485     * Retrieves information about the specified database within a tenant.
486     *
487     * @param string $databaseName The database name
488     * @param string $tenantName The tenant name
489     * @return array The database information
490     */
491    public function getDatabase($databaseName, $tenantName) {
492        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
493        return $this->makeRequest($endpoint, 'GET');
494    }
495
496    /**
497     * Create a new database
498     *
499     * Creates a new database with the specified name within a tenant.
500     *
501     * @param string $databaseName The database name
502     * @param string $tenantName The tenant name
503     * @return array The response from the API
504     */
505    public function createDatabase($databaseName, $tenantName) {
506        $endpoint = "/tenants/{$tenantName}/databases";
507        $data = ['name' => $databaseName];
508        return $this->makeRequest($endpoint, 'POST', $data);
509    }
510
511    /**
512     * Ensure a collection exists, creating it if necessary
513     *
514     * This helper function checks if a collection exists and creates it if it doesn't.
515     *
516     * @param string $collectionName The name of the collection to check/create
517     * @return string Status message indicating what happened
518     */
519    public function ensureCollectionExists($collectionName) {
520        try {
521            $collection = $this->getCollection($collectionName);
522            return "Collection '$collectionName' already exists.";
523        } catch (\Exception $e) {
524            // Collection doesn't exist, create it
525            $created = $this->createCollection($collectionName);
526            return "Collection '$collectionName' created.";
527        }
528    }
529
530    /**
531     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
532     *
533     * This function handles the complete processing of a single DokuWiki file:
534     * 1. Parses the file path to extract metadata and document ID
535     * 2. Determines the appropriate collection based on document ID
536     * 3. Checks if the document needs updating using timestamp comparison
537     * 4. Reads and processes file content only if update is needed
538     * 5. Splits the document into chunks (paragraphs)
539     * 6. Extracts rich metadata from the DokuWiki ID format
540     * 7. Generates embeddings for each chunk
541     * 8. Sends all chunks to ChromaDB with metadata
542     *
543     * Supported ID formats:
544     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
545     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
546     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
547     *
548     * The function implements smart update checking by comparing file modification time
549     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
550     *
551     * @param string $filePath The path to the file to process
552     * @param string $collectionName The name of the collection to use
553     * @param bool $collectionChecked Whether the collection has already been checked/created
554     * @return array Result with status and details
555     */
556    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
557        // Parse file path to extract metadata
558        $id = parseFilePath($filePath);
559        try {
560            // Create collection if it doesn't exist (only if not already checked)
561            $collectionStatus = '';
562            if (!$collectionChecked) {
563                $collectionStatus = $this->ensureCollectionExists($collectionName);
564            }
565            // Get collection ID
566            $collection = $this->getCollection($collectionName);
567            if (!isset($collection['id'])) {
568                return [
569                    'status' => 'error',
570                    'message' => "Collection ID not found for '{$collectionName}'"
571                ];
572            }
573            $collectionId = $collection['id'];
574            // Get file modification time
575            $fileModifiedTime = filemtime($filePath);
576            // Check if document needs update
577            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
578            // If document is up to date, skip processing
579            if (!$needsUpdate) {
580                return [
581                    'status' => 'skipped',
582                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
583                ];
584            }
585            // Read file content
586            $content = file_get_contents($filePath);
587            // Split document into chunks (paragraphs separated by two newlines)
588            $paragraphs = preg_split('/\n\s*\n/', $content);
589            $chunks = [];
590            $chunkMetadata = [];
591            // Parse the DokuWiki ID to extract base metadata
592            $parts = explode(':', $id);
593            // Extract metadata from the last part of the ID
594            $lastPart = end($parts);
595            $baseMetadata = [];
596            // Add the document ID as metadata
597            $baseMetadata['document_id'] = $id;
598            // Add current timestamp
599            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
600            // Check if any part of the ID is 'templates' and set template metadata
601            $isTemplate = in_array('templates', $parts);
602            if ($isTemplate) {
603                $baseMetadata['type'] = 'template';
604            } else {
605                $baseMetadata['type'] = 'report';
606            }
607            // Extract modality from the second part
608            if (isset($parts[1])) {
609                $baseMetadata['modality'] = $parts[1];
610            }
611            // Handle different ID formats based on the third part: word (institution) or numeric (year)
612            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
613            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
614            // For templates, don't set institution, date or year
615            if (isset($parts[2]) && !$isTemplate) {
616                // Check if third part is numeric (year) or word (institution)
617                if (is_numeric($parts[2])) {
618                    // Format: reports:mri:2024:g287-name-surname (year format)
619                    // Extract year from the third part
620                    $baseMetadata['year'] = $parts[2];
621                    // Set default institution from config
622                    global $conf;
623                    $baseMetadata['institution'] = isset($conf['plugin']['dokullm']['default_institution']) ? $conf['plugin']['dokullm']['default_institution'] : 'default';
624                    // Extract registration and name from the last part
625                    // Registration should start with one letter or number and contain numbers before the '-' character
626                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
627                        // Check if the first part contains at least one digit to be considered a registration
628                        if (preg_match('/[0-9]/', $matches[1])) {
629                            $baseMetadata['registration'] = $matches[1];
630                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
631                        } else {
632                            // If no registration pattern found, treat entire part as patient name
633                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
634                        }
635                    } else {
636                        // If no match, treat entire part as patient name
637                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
638                    }
639                } else {
640                    // Format: reports:mri:institution:250620-name-surname (institution format)
641                    // Extract institution from the third part
642                    $baseMetadata['institution'] = $parts[2];
643                    // Extract date and name from the last part
644                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
645                        $dateStr = $matches[1];
646                        $name = $matches[2];
647                        // Convert date format (250620 -> 2025-06-20)
648                        $day = substr($dateStr, 0, 2);
649                        $month = substr($dateStr, 2, 2);
650                        $year = substr($dateStr, 4, 2);
651                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
652                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
653                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
654                        $baseMetadata['date'] = $formattedDate;
655                        $baseMetadata['name'] = str_replace('-', ' ', $name);
656                    }
657                }
658            }
659            // For templates, always extract name from the last part
660            if ($isTemplate && isset($lastPart)) {
661                // Extract name from the last part (everything after the last colon)
662                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
663                    // Check if the first part contains at least one digit to be considered a registration
664                    if (preg_match('/[0-9]/', $matches[1])) {
665                        $baseMetadata['registration'] = $matches[1];
666                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
667                    } else {
668                        // If no registration pattern found, treat entire part as template name
669                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
670                    }
671                } else {
672                    // If no match, treat entire part as template name
673                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
674                }
675            }
676            // Process each paragraph as a chunk with intelligent metadata handling
677            $chunkIds = [];
678            $chunkContents = [];
679            $chunkMetadatas = [];
680            $chunkEmbeddings = [];
681            $currentTags = [];
682            foreach ($paragraphs as $index => $paragraph) {
683                // Skip empty paragraphs to avoid processing whitespace-only content
684                $paragraph = trim($paragraph);
685                if (empty($paragraph)) {
686                    continue;
687                }
688                // Check if this is a DokuWiki title (starts and ends with =)
689                // Titles are converted to tags for better searchability but not stored as content chunks
690                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
691                    // Extract title content and clean it
692                    $titleContent = trim($matches[1]);
693                    // Split into words and create searchable tags
694                    $words = preg_split('/\s+/', $titleContent);
695                    $tags = [];
696                    foreach ($words as $word) {
697                        // Only use words longer than 3 characters to reduce noise
698                        if (strlen($word) >= 3) {
699                            $tags[] = strtolower($word);
700                        }
701                    }
702                    // Remove duplicate tags and store for use in subsequent chunks
703                    $currentTags = array_unique($tags);
704                    continue; // Skip storing title chunks as content
705                }
706                // Create chunk ID
707                $chunkId = $id . '@' . ($index + 1);
708                // Generate embeddings for the chunk
709                $embeddings = $this->generateEmbeddings($paragraph);
710                // Add chunk-specific metadata
711                $metadata = $baseMetadata;
712                $metadata['chunk_id'] = $chunkId;
713                $metadata['chunk_number'] = $index + 1;
714                $metadata['total_chunks'] = count($paragraphs);
715                // Add current tags to metadata if any exist
716                if (!empty($currentTags)) {
717                    $metadata['tags'] = implode(',', $currentTags);
718                }
719                // Store chunk data
720                $chunkIds[] = $chunkId;
721                $chunkContents[] = $paragraph;
722                $chunkMetadatas[] = $metadata;
723                $chunkEmbeddings[] = $embeddings;
724            }
725            // If no chunks were created, skip this file
726            if (empty($chunkIds)) {
727                return [
728                    'status' => 'skipped',
729                    'message' => "No valid chunks found in file '$id'. Skipping..."
730                ];
731            }
732            // Send all chunks to ChromaDB
733            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
734            return [
735                'status' => 'success',
736                'message' => "Successfully sent file to ChromaDB",
737                'details' => [
738                    'document_id' => $id,
739                    'chunks' => count($chunkIds),
740                    'collection' => $collectionName
741                ],
742                'collection_status' => $collectionStatus
743            ];
744        } catch (\Exception $e) {
745            return [
746                'status' => 'error',
747                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
748            ];
749        }
750    }
751
752    /**
753     * Process all DokuWiki files in a directory and send them to ChromaDB
754     *
755     * This function recursively processes all .txt files in a directory and its subdirectories.
756     * It first checks if the appropriate collection exists and creates it if needed.
757     * Then it processes each file individually.
758     *
759     * @param string $dirPath The directory path to process
760     * @return array Result with status and details
761     */
762    public function processDirectory($dirPath) {
763        // Check if directory exists
764        if (!is_dir($dirPath)) {
765            return [
766                'status' => 'error',
767                'message' => "Directory does not exist: $dirPath"
768            ];
769        }
770        // Create RecursiveIteratorIterator to process directories recursively
771        $iterator = new RecursiveIteratorIterator(
772            new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS),
773            RecursiveIteratorIterator::LEAVES_ONLY
774        );
775        $files = [];
776        foreach ($iterator as $file) {
777            // Process only .txt files that don't start with underscore
778            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
779                $files[] = $file->getPathname();
780            }
781        }
782        // Skip if no files
783        if (empty($files)) {
784            return [
785                'status' => 'skipped',
786                'message' => "No .txt files found in directory: $dirPath"
787            ];
788        }
789        // Use the first part of the document ID as collection name, fallback to 'documents'
790        $sampleFile = $files[0];
791        $id = parseFilePath($sampleFile);
792        $idParts = explode(':', $id);
793        $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents';
794        try {
795            $this->ensureCollectionExists($collectionName);
796            $collectionChecked = true;
797        } catch (Exception $e) {
798            $collectionChecked = true;
799        }
800        // Send each file
801        $results = [];
802        foreach ($files as $file) {
803            $result = $this->processSingleFile($file, $collectionName, $collectionChecked);
804            $results[] = [
805                'file' => $file,
806                'result' => $result
807            ];
808        }
809        // Return the result
810        return [
811            'status' => 'success',
812            'message' => "Finished processing directory.",
813            'files_count' => count($files),
814            'results' => $results
815        ];
816    }
817}
818
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the base path prefix (DOKU_INC . 'data/pages/' when DOKU_INC is
 *    defined, otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons
 *
 * The base path is only stripped when the file path actually starts with it;
 * a path outside the pages directory is converted as a whole. Empty path
 * segments (leading or doubled slashes) are dropped.
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // On platforms with a different separator (Windows), normalise both paths to
    // forward slashes so the prefix check and the split below still work.
    if (DIRECTORY_SEPARATOR !== '/') {
        $filePath = str_replace(DIRECTORY_SEPARATOR, '/', $filePath);
        $pagesDir = str_replace(DIRECTORY_SEPARATOR, '/', $pagesDir);
    }

    // Remove the base path only when it is a real prefix. A plain str_replace()
    // would also cut the same string out of the middle of an unrelated path.
    $prefixLength = strlen($pagesDir);
    if (strncmp($filePath, $pagesDir, $prefixLength) === 0) {
        $relativePath = (string) substr($filePath, $prefixLength);
    } else {
        $relativePath = $filePath;
    }

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Split the path and keep every non-empty segment. The comparison is against
    // '' on purpose: empty() / array_filter() would also discard a segment named "0".
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    // Return the ID
    return implode(':', $idParts);
}
857