xref: /plugin/dokullm/ChromaDBClient.php (revision 8db00449eb1c0733e8e1bc395a7b872220d00a16)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5use RecursiveIteratorIterator;
6use RecursiveDirectoryIterator;
7
8class ChromaDBClient {
9    private $baseUrl;
10    private $client;
11    private $ollamaClient;
12    private $tenant;
13    private $database;
14    private $ollamaHost;
15    private $ollamaPort;
16    private $ollamaModel;
17
18    /**
19     * Get configuration value for the dokullm plugin
20     *
21     * @param string $key Configuration key
22     * @param mixed $default Default value if key not found
23     * @return mixed Configuration value
24     */
25    /**
26     * Initialize the ChromaDB client
27     *
28     * Creates a new ChromaDB client instance with the specified connection parameters.
29     * Also ensures that the specified tenant and database exist.
30     *
31     * @param string $host ChromaDB server host
32     * @param int $port ChromaDB server port
33     * @param string $tenant ChromaDB tenant name
34     * @param string $database ChromaDB database name
35     * @param string $defaultCollection Default collection name
36     * @param string $ollamaHost Ollama server host
37     * @param int $ollamaPort Ollama server port
38     * @param string $ollamaModel Ollama embeddings model
39     */
40    public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
41        // Use provided parameters (no fallback since they're mandatory)
42        $chromaHost = $host;
43        $chromaPort = $port;
44        $this->tenant = $tenant;
45        $this->database = $database;
46        $this->defaultCollection = $defaultCollection;
47        $this->ollamaHost = $ollamaHost;
48        $this->ollamaPort = $ollamaPort;
49        $this->ollamaModel = $ollamaModel;
50        $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";
51        $this->client = curl_init();
52        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
53        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
54            'Content-Type: application/json',
55            'Accept: application/json'
56        ]);
57        // Initialize Ollama client
58        $this->ollamaClient = curl_init();
59        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
60        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
61            'Content-Type: application/json'
62        ]);
63        // Check if tenant and database exist, create them if they don't
64        $this->ensureTenantAndDatabase();
65    }
66
67    /**
68     * Clean up the cURL client when the object is destroyed
69     *
70     * @return void
71     */
72    public function __destruct() {
73        curl_close($this->client);
74        curl_close($this->ollamaClient);
75    }
76
77    /**
78     * Make an HTTP request to the ChromaDB API
79     *
80     * This is a helper function that handles making HTTP requests to the ChromaDB API,
81     * including setting the appropriate headers for tenant and database.
82     *
83     * @param string $endpoint The API endpoint to call
84     * @param string $method The HTTP method to use (default: 'GET')
85     * @param array|null $data The data to send with the request (default: null)
86     * @return array The JSON response decoded as an array
87     * @throws Exception If there's a cURL error or HTTP error
88     */
89    private function makeRequest($endpoint, $method = 'GET', $data = null) {
90        // Add tenant and database as headers instead of query parameters for v2 API
91        $headers = [
92            'Content-Type: application/json',
93            'Accept: application/json'
94        ];
95        // Version 2
96        $url = $this->baseUrl . '/api/v2' . $endpoint;
97        curl_setopt($this->client, CURLOPT_URL, $url);
98        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
99        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
100        // POST JSON data
101        if ($data) {
102            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
103        } else {
104            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
105        }
106        // Call
107        $response = curl_exec($this->client);
108        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
109        // Check the result
110        if (curl_error($this->client)) {
111            throw new \Exception('Curl error: ' . curl_error($this->client));
112        }
113        if ($httpCode >= 400) {
114            throw new \Exception("HTTP Error: $httpCode, Response: $response");
115        }
116        // Return the decoded response
117        return json_decode($response, true);
118    }
119
120    /**
121     * Generate embeddings for text using Ollama
122     *
123     * @param string $text The text to generate embeddings for
124     * @return array The embeddings vector
125     */
126    public function generateEmbeddings($text) {
127        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
128        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
129
130        // Ensure model is a string
131        $model = $this->ollamaModel;
132        if (!is_string($model)) {
133            throw new \Exception("Ollama model must be a string, got: " . gettype($model));
134        }
135
136        $data = [
137            'model' => $model,
138            'prompt' => $text,
139            'keep_alive' => '30m'
140        ];
141        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
142        $response = curl_exec($this->ollamaClient);
143        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
144        if (curl_error($this->ollamaClient)) {
145            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
146        }
147        if ($httpCode >= 400) {
148            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
149        }
150        $result = json_decode($response, true);
151        if (!isset($result['embedding'])) {
152            throw new \Exception("Ollama response missing embedding: " . $response);
153        }
154        return $result['embedding'];
155    }
156
157    /**
158     * List all collections in the database
159     *
160     * Retrieves a list of all collections in the specified tenant and database.
161     *
162     * @return array List of collections
163     */
164    public function listCollections() {
165        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
166        return $this->makeRequest($endpoint);
167    }
168
169    /**
170     * Get a collection by name
171     *
172     * Retrieves information about a specific collection by its name.
173     *
174     * @param string $name The name of the collection to retrieve
175     * @return array The collection information
176     * @throws Exception If the collection is not found
177     */
178    public function getCollection($name) {
179        // Use provided name, fallback to 'documents' if empty
180        if (empty($name)) {
181            $name = 'documents';
182        }
183        // First try to get collection by name
184        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
185        $collections = $this->makeRequest($endpoint);
186        // Find collection by name
187        foreach ($collections as $collection) {
188            if (isset($collection['name']) && $collection['name'] === $name) {
189                return $collection;
190            }
191        }
192        // If not found, throw exception
193        throw new \Exception("Collection '{$name}' not found");
194    }
195
196    /**
197     * Create a new collection
198     *
199     * Creates a new collection with the specified name and optional metadata.
200     *
201     * @param string $name The name of the collection to create
202     * @param array|null $metadata Optional metadata for the collection
203     * @return array The response from the API
204     */
205    public function createCollection($name, $metadata = null) {
206        // Use provided name, fallback to 'documents' if empty
207        if (empty($name)) {
208            $name = 'documents';
209        }
210        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
211        $data = ['name' => $name];
212        if ($metadata) {
213            $data['metadata'] = $metadata;
214        }
215        return $this->makeRequest($endpoint, 'POST', $data);
216    }
217
218    /**
219     * Delete a collection by name
220     *
221     * Deletes a collection with the specified name.
222     *
223     * @param string $name The name of the collection to delete
224     * @return array The response from the API
225     * @throws Exception If the collection ID is not found
226     */
227    public function deleteCollection($name) {
228        // Use provided name, fallback to 'documents' if empty
229        if (empty($name)) {
230            $name = 'documents';
231        }
232        // First get the collection to find its ID
233        $collection = $this->getCollection($name);
234        if (!isset($collection['id'])) {
235            throw new \Exception("Collection ID not found for '{$name}'");
236        }
237        $collectionId = $collection['id'];
238        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
239        return $this->makeRequest($endpoint, 'DELETE');
240    }
241
242    /**
243     * Get a document by its ID from a collection
244     *
245     * Retrieves a document from the specified collection using its ID.
246     *
247     * @param string $collectionName The name of the collection to get the document from
248     * @param string $documentId The document ID to retrieve
249     * @param array $include What to include in the response (default: ["metadatas", "documents"])
250     * @return array The retrieved document
251     * @throws Exception If the collection ID is not found
252     */
253    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
254        // Use provided name, fallback to 'documents' if empty
255        if (empty($collectionName)) {
256            $collectionName = 'documents';
257        }
258        // First get the collection to find its ID
259        $collection = $this->getCollection($collectionName);
260        if (!isset($collection['id'])) {
261            throw new \Exception("Collection ID not found for '{$collectionName}'");
262        }
263        $collectionId = $collection['id'];
264        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
265        $data = [
266            'ids' => [$documentId],
267            'include' => $include
268        ];
269        // Return the document
270        return $this->makeRequest($endpoint, 'POST', $data);
271    }
272
273    /**
274     * Add documents to a collection
275     *
276     * Adds documents to the specified collection. Each document must have a corresponding ID.
277     * Optional metadata and pre-computed embeddings can also be provided.
278     *
279     * @param string $collectionName The name of the collection to add documents to
280     * @param array $documents The document contents
281     * @param array $ids The document IDs
282     * @param array|null $metadatas Optional metadata for each document
283     * @param array|null $embeddings Optional pre-computed embeddings for each document
284     * @return array The response from the API
285     * @throws Exception If the collection ID is not found
286     */
287    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
288        // Use provided name, fallback to 'documents' if empty
289        if (empty($collectionName)) {
290            $collectionName = 'documents';
291        }
292        // First get the collection to find its ID
293        $collection = $this->getCollection($collectionName);
294        if (!isset($collection['id'])) {
295            throw new \Exception("Collection ID not found for '{$collectionName}'");
296        }
297        $collectionId = $collection['id'];
298        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
299        $data = [
300            'ids' => $ids,
301            'documents' => $documents
302        ];
303        // Get also the metadata
304        if ($metadatas) {
305            $data['metadatas'] = $metadatas;
306        }
307        // Get the embeddings
308        if ($embeddings) {
309            $data['embeddings'] = $embeddings;
310        }
311        // Return the respnse
312        return $this->makeRequest($endpoint, 'POST', $data);
313    }
314
315    /**
316     * Check if a document needs to be updated based on timestamp comparison
317     *
318     * Determines whether a document should be reprocessed by comparing the file's last modification
319     * time with the processed_at timestamp stored in the document's metadata. The function checks
320     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
321     * not included in the database.
322     *
323     * @param string $collectionId The ID of the collection to check documents in
324     * @param string $documentId The base document ID to check (without chunk suffixes)
325     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
326     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
327     * @throws Exception If there's an error checking the document
328     */
329    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
330        try {
331            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
332            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
333            $chunkIdsToCheck = [
334                $documentId . '@1',
335                $documentId . '@2',
336                $documentId . '@3'
337            ];
338            $data = [
339                'ids' => $chunkIdsToCheck,
340                'include' => [
341                    "metadatas"
342                ],
343                'limit' => 1
344            ];
345            // Check if document exists
346            $result = $this->makeRequest($endpoint, 'POST', $data);
347            // If no documents found, return true (needs to be added)
348            if (empty($result['ids'])) {
349                return true;
350            }
351            // Check if any document has a processed_at timestamp
352            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
353                // Check the first metadata entry directly
354                $metadata = $result['metadatas'][0];
355                // If processed_at is not set, return true (needs update)
356                if (!isset($metadata['processed_at'])) {
357                    return true;
358                }
359                // Parse the processed_at timestamp
360                $processedTimestamp = strtotime($metadata['processed_at']);
361                // If file is newer than processed time, return true (needs update)
362                if ($fileModifiedTime > $processedTimestamp) {
363                    return true;
364                }
365            }
366            // Document exists and is up to date
367            return false;
368        } catch (\Exception $e) {
369            // If there's an error checking the document, assume it needs to be updated
370            return true;
371        }
372    }
373
374    /**
375     * Query a collection for similar documents
376     *
377     * Queries the specified collection for documents similar to the provided query texts.
378     * The function generates embeddings for the query texts and sends them to ChromaDB.
379     * Supports filtering results by metadata using the where parameter.
380     *
381     * @param string $collectionName The name of the collection to query
382     * @param array $queryTexts The query texts to search for
383     * @param int $nResults The number of results to return (default: 5)
384     * @param array|null $where Optional filter conditions for metadata
385     * @return array The query results
386     * @throws Exception If the collection ID is not found
387     */
388    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
389        // Use provided name, fallback to 'documents' if empty
390        if (empty($collectionName)) {
391            $collectionName = 'documents';
392        }
393        // First get the collection to find its ID
394        $collection = $this->getCollection($collectionName);
395        if (!isset($collection['id'])) {
396            throw new \Exception("Collection ID not found for '{$collectionName}'");
397        }
398        $collectionId = $collection['id'];
399        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
400        // Generate embeddings for query texts
401        $queryEmbeddings = [];
402        foreach ($queryTexts as $text) {
403            $queryEmbeddings[] = $this->generateEmbeddings($text);
404        }
405        $data = [
406            'query_embeddings' => $queryEmbeddings,
407            'n_results' => $nResults
408        ];
409        // Add where clause for metadata filtering if provided
410        if ($where && is_array($where)) {
411            $data['where'] = $where;
412        }
413        // Return the response
414        return $this->makeRequest($endpoint, 'POST', $data);
415    }
416
417    /**
418     * Check if the ChromaDB server is alive
419     *
420     * Sends a heartbeat request to verify that the ChromaDB server is running.
421     *
422     * @return array The response from the heartbeat endpoint
423     */
424    public function heartbeat() {
425        $endpoint = "/heartbeat";
426        return $this->makeRequest($endpoint, 'GET');
427    }
428
429    /**
430     * Get authentication and identity information
431     *
432     * Retrieves authentication and identity information from the ChromaDB server.
433     *
434     * @return array The response from the auth/identity endpoint
435     */
436    public function getIdentity() {
437        $endpoint = "/identity";
438        return $this->makeRequest($endpoint, 'GET');
439    }
440
441    /**
442     * Ensure that the specified tenant and database exist
443     *
444     * Checks if the specified tenant and database exist, and creates them if they don't.
445     *
446     * @return void
447     */
448    private function ensureTenantAndDatabase() {
449        // Check if tenant exists, create if it doesn't
450        try {
451            $this->getTenant($this->tenant);
452        } catch (\Exception $e) {
453            // Tenant doesn't exist, create it
454            $this->createTenant($this->tenant);
455        }
456        // Check if database exists, create if it doesn't
457        try {
458            $this->getDatabase($this->database, $this->tenant);
459        } catch (\Exception $e) {
460            // Database doesn't exist, create it
461            $this->createDatabase($this->database, $this->tenant);
462        }
463    }
464
465    /**
466     * Get tenant information
467     *
468     * Retrieves information about the specified tenant.
469     *
470     * @param string $tenantName The tenant name
471     * @return array The tenant information
472     */
473    public function getTenant($tenantName) {
474        $endpoint = "/tenants/{$tenantName}";
475        return $this->makeRequest($endpoint, 'GET');
476    }
477
478    /**
479     * Create a new tenant
480     *
481     * Creates a new tenant with the specified name.
482     *
483     * @param string $tenantName The tenant name
484     * @return array The response from the API
485     */
486    public function createTenant($tenantName) {
487        $endpoint = "/tenants";
488        $data = ['name' => $tenantName];
489        return $this->makeRequest($endpoint, 'POST', $data);
490    }
491
492    /**
493     * Get database information
494     *
495     * Retrieves information about the specified database within a tenant.
496     *
497     * @param string $databaseName The database name
498     * @param string $tenantName The tenant name
499     * @return array The database information
500     */
501    public function getDatabase($databaseName, $tenantName) {
502        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
503        return $this->makeRequest($endpoint, 'GET');
504    }
505
506    /**
507     * Create a new database
508     *
509     * Creates a new database with the specified name within a tenant.
510     *
511     * @param string $databaseName The database name
512     * @param string $tenantName The tenant name
513     * @return array The response from the API
514     */
515    public function createDatabase($databaseName, $tenantName) {
516        $endpoint = "/tenants/{$tenantName}/databases";
517        $data = ['name' => $databaseName];
518        return $this->makeRequest($endpoint, 'POST', $data);
519    }
520
521    /**
522     * Ensure a collection exists, creating it if necessary
523     *
524     * This helper function checks if a collection exists and creates it if it doesn't.
525     *
526     * @param string $collectionName The name of the collection to check/create
527     * @return string Status message indicating what happened
528     */
529    public function ensureCollectionExists($collectionName) {
530        try {
531            $collection = $this->getCollection($collectionName);
532            return "Collection '$collectionName' already exists.";
533        } catch (\Exception $e) {
534            // Collection doesn't exist, create it
535            $created = $this->createCollection($collectionName);
536            return "Collection '$collectionName' created.";
537        }
538    }
539
540    /**
541     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
542     *
543     * This function handles the complete processing of a single DokuWiki file:
544     * 1. Parses the file path to extract metadata and document ID
545     * 2. Determines the appropriate collection based on document ID
546     * 3. Checks if the document needs updating using timestamp comparison
547     * 4. Reads and processes file content only if update is needed
548     * 5. Splits the document into chunks (paragraphs)
549     * 6. Extracts rich metadata from the DokuWiki ID format
550     * 7. Generates embeddings for each chunk
551     * 8. Sends all chunks to ChromaDB with metadata
552     *
553     * Supported ID formats:
554     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
555     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
556     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
557     *
558     * The function implements smart update checking by comparing file modification time
559     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
560     *
561     * @param string $filePath The path to the file to process
562     * @param string $collectionName The name of the collection to use
563     * @param bool $collectionChecked Whether the collection has already been checked/created
564     * @return array Result with status and details
565     */
566    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
567        // Parse file path to extract metadata
568        $id = parseFilePath($filePath);
569        try {
570            // Create collection if it doesn't exist (only if not already checked)
571            $collectionStatus = '';
572            if (!$collectionChecked) {
573                $collectionStatus = $this->ensureCollectionExists($collectionName);
574            }
575            // Get collection ID
576            $collection = $this->getCollection($collectionName);
577            if (!isset($collection['id'])) {
578                return [
579                    'status' => 'error',
580                    'message' => "Collection ID not found for '{$collectionName}'"
581                ];
582            }
583            $collectionId = $collection['id'];
584            // Get file modification time
585            $fileModifiedTime = filemtime($filePath);
586            // Check if document needs update
587            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
588            // If document is up to date, skip processing
589            if (!$needsUpdate) {
590                return [
591                    'status' => 'skipped',
592                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
593                ];
594            }
595            // Read file content
596            $content = file_get_contents($filePath);
597            // Split document into chunks (paragraphs separated by two newlines)
598            $paragraphs = preg_split('/\n\s*\n/', $content);
599            $chunks = [];
600            $chunkMetadata = [];
601            // Parse the DokuWiki ID to extract base metadata
602            $parts = explode(':', $id);
603            // Extract metadata from the last part of the ID
604            $lastPart = end($parts);
605            $baseMetadata = [];
606            // Add the document ID as metadata
607            $baseMetadata['document_id'] = $id;
608            // Add current timestamp
609            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
610            // Check if any part of the ID is 'templates' and set template metadata
611            $isTemplate = in_array('templates', $parts);
612            if ($isTemplate) {
613                $baseMetadata['type'] = 'template';
614            } else {
615                $baseMetadata['type'] = 'report';
616            }
617            // Extract modality from the second part
618            if (isset($parts[1])) {
619                $baseMetadata['modality'] = $parts[1];
620            }
621            // Handle different ID formats based on the third part: word (institution) or numeric (year)
622            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
623            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
624            // For templates, don't set institution, date or year
625            if (isset($parts[2]) && !$isTemplate) {
626                // Check if third part is numeric (year) or word (institution)
627                if (is_numeric($parts[2])) {
628                    // Format: reports:mri:2024:g287-name-surname (year format)
629                    // Extract year from the third part
630                    $baseMetadata['year'] = $parts[2];
631                    // Set default institution from config
632                    global $conf;
633                    $baseMetadata['institution'] = isset($conf['plugin']['dokullm']['default_institution']) ? $conf['plugin']['dokullm']['default_institution'] : 'default';
634                    // Extract registration and name from the last part
635                    // Registration should start with one letter or number and contain numbers before the '-' character
636                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
637                        // Check if the first part contains at least one digit to be considered a registration
638                        if (preg_match('/[0-9]/', $matches[1])) {
639                            $baseMetadata['registration'] = $matches[1];
640                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
641                        } else {
642                            // If no registration pattern found, treat entire part as patient name
643                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
644                        }
645                    } else {
646                        // If no match, treat entire part as patient name
647                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
648                    }
649                } else {
650                    // Format: reports:mri:institution:250620-name-surname (institution format)
651                    // Extract institution from the third part
652                    $baseMetadata['institution'] = $parts[2];
653                    // Extract date and name from the last part
654                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
655                        $dateStr = $matches[1];
656                        $name = $matches[2];
657                        // Convert date format (250620 -> 2025-06-20)
658                        $day = substr($dateStr, 0, 2);
659                        $month = substr($dateStr, 2, 2);
660                        $year = substr($dateStr, 4, 2);
661                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
662                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
663                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
664                        $baseMetadata['date'] = $formattedDate;
665                        $baseMetadata['name'] = str_replace('-', ' ', $name);
666                    }
667                }
668            }
669            // For templates, always extract name from the last part
670            if ($isTemplate && isset($lastPart)) {
671                // Extract name from the last part (everything after the last colon)
672                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
673                    // Check if the first part contains at least one digit to be considered a registration
674                    if (preg_match('/[0-9]/', $matches[1])) {
675                        $baseMetadata['registration'] = $matches[1];
676                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
677                    } else {
678                        // If no registration pattern found, treat entire part as template name
679                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
680                    }
681                } else {
682                    // If no match, treat entire part as template name
683                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
684                }
685            }
686            // Process each paragraph as a chunk with intelligent metadata handling
687            $chunkIds = [];
688            $chunkContents = [];
689            $chunkMetadatas = [];
690            $chunkEmbeddings = [];
691            $currentTags = [];
692            foreach ($paragraphs as $index => $paragraph) {
693                // Skip empty paragraphs to avoid processing whitespace-only content
694                $paragraph = trim($paragraph);
695                if (empty($paragraph)) {
696                    continue;
697                }
698                // Check if this is a DokuWiki title (starts and ends with =)
699                // Titles are converted to tags for better searchability but not stored as content chunks
700                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
701                    // Extract title content and clean it
702                    $titleContent = trim($matches[1]);
703                    // Split into words and create searchable tags
704                    $words = preg_split('/\s+/', $titleContent);
705                    $tags = [];
706                    foreach ($words as $word) {
                        // Only use words of at least 3 characters to reduce noise
708                        if (strlen($word) >= 3) {
709                            $tags[] = strtolower($word);
710                        }
711                    }
712                    // Remove duplicate tags and store for use in subsequent chunks
713                    $currentTags = array_unique($tags);
714                    continue; // Skip storing title chunks as content
715                }
716                // Create chunk ID
717                $chunkId = $id . '@' . ($index + 1);
718                // Generate embeddings for the chunk
719                $embeddings = $this->generateEmbeddings($paragraph);
720                // Add chunk-specific metadata
721                $metadata = $baseMetadata;
722                $metadata['chunk_id'] = $chunkId;
723                $metadata['chunk_number'] = $index + 1;
724                $metadata['total_chunks'] = count($paragraphs);
725                // Add current tags to metadata if any exist
726                if (!empty($currentTags)) {
727                    $metadata['tags'] = implode(',', $currentTags);
728                }
729                // Store chunk data
730                $chunkIds[] = $chunkId;
731                $chunkContents[] = $paragraph;
732                $chunkMetadatas[] = $metadata;
733                $chunkEmbeddings[] = $embeddings;
734            }
735            // If no chunks were created, skip this file
736            if (empty($chunkIds)) {
737                return [
738                    'status' => 'skipped',
739                    'message' => "No valid chunks found in file '$id'. Skipping..."
740                ];
741            }
742            // Send all chunks to ChromaDB
743            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
744            return [
745                'status' => 'success',
746                'message' => "Successfully sent file to ChromaDB",
747                'details' => [
748                    'document_id' => $id,
749                    'chunks' => count($chunkIds),
750                    'collection' => $collectionName
751                ],
752                'collection_status' => $collectionStatus
753            ];
754        } catch (\Exception $e) {
755            return [
756                'status' => 'error',
757                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
758            ];
759        }
760    }
761
762}
763
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory when it is the leading prefix of the path
 *    (DOKU_INC . 'data/pages/' if DOKU_INC is defined, otherwise the
 *    fallback '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing a trailing .txt extension
 * 3. Splitting on '/' and joining the non-empty segments with colons
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * Segments are only discarded when they are the empty string, so a namespace
 * or page literally named "0" is preserved (reports/0/page.txt => reports:0:page).
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID (empty string when the path has no segments)
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    $relativePath = (string) $filePath;

    // Strip the base path only when it really is a prefix. A blanket
    // str_replace() would also delete later occurrences of the same
    // directory string and silently merge the neighbouring segments.
    $prefixLength = strlen($pagesDir);
    if (strncmp($relativePath, $pagesDir, $prefixLength) === 0) {
        // Cast guards against substr() returning false on older PHP versions
        // when the path is exactly the pages directory.
        $relativePath = (string) substr($relativePath, $prefixLength);
    }

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Keep every non-empty segment. The comparison is against '' on purpose:
    // empty()/array_filter() treat the string "0" as empty and would drop it.
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    // Return the ID
    return implode(':', $idParts);
}
802