xref: /plugin/dokullm/ChromaDBClient.php (revision e3574e2b3d062cb617b2e508cd017b0ef074d3a5)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5use RecursiveIteratorIterator;
6use RecursiveDirectoryIterator;
7
8class ChromaDBClient {
9    private $baseUrl;
10    private $client;
11    private $ollamaClient;
12    private $tenant;
13    private $database;
14    private $ollamaHost;
15    private $ollamaPort;
16    private $ollamaModel;
17
18    /**
19     * Get configuration value for the dokullm plugin
20     *
21     * @param string $key Configuration key
22     * @param mixed $default Default value if key not found
23     * @return mixed Configuration value
24     */
25    /**
26     * Initialize the ChromaDB client
27     *
28     * Creates a new ChromaDB client instance with the specified connection parameters.
29     * Also ensures that the specified tenant and database exist.
30     *
31     * @param string $host ChromaDB server host
32     * @param int $port ChromaDB server port
33     * @param string $tenant ChromaDB tenant name
34     * @param string $database ChromaDB database name
35     * @param string $defaultCollection Default collection name
36     * @param string $ollamaHost Ollama server host
37     * @param int $ollamaPort Ollama server port
38     * @param string $ollamaModel Ollama embeddings model
39     */
40    public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
41        // Use provided parameters (no fallback since they're mandatory)
42        $chromaHost = $host;
43        $chromaPort = $port;
44        $this->tenant = $tenant;
45        $this->database = $database;
46        $this->defaultCollection = $defaultCollection;
47        $this->ollamaHost = $ollamaHost;
48        $this->ollamaPort = $ollamaPort;
49
50        // Ensure ollamaModel is a string with a default fallback
51        if (!is_string($ollamaModel) || empty($ollamaModel)) {
52            $this->ollamaModel = 'nomic-embed-text'; // Default embedding model
53        } else {
54            $this->ollamaModel = $ollamaModel;
55        }
56
57        $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";
58        $this->client = curl_init();
59        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
60        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
61            'Content-Type: application/json',
62            'Accept: application/json'
63        ]);
64        // Initialize Ollama client
65        $this->ollamaClient = curl_init();
66        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
67        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
68            'Content-Type: application/json'
69        ]);
70        // Check if tenant and database exist, create them if they don't
71        $this->ensureTenantAndDatabase();
72    }
73
74    /**
75     * Clean up the cURL client when the object is destroyed
76     *
77     * @return void
78     */
79    public function __destruct() {
80        curl_close($this->client);
81        curl_close($this->ollamaClient);
82    }
83
84    /**
85     * Make an HTTP request to the ChromaDB API
86     *
87     * This is a helper function that handles making HTTP requests to the ChromaDB API,
88     * including setting the appropriate headers for tenant and database.
89     *
90     * @param string $endpoint The API endpoint to call
91     * @param string $method The HTTP method to use (default: 'GET')
92     * @param array|null $data The data to send with the request (default: null)
93     * @return array The JSON response decoded as an array
94     * @throws Exception If there's a cURL error or HTTP error
95     */
96    private function makeRequest($endpoint, $method = 'GET', $data = null) {
97        // Add tenant and database as headers instead of query parameters for v2 API
98        $headers = [
99            'Content-Type: application/json',
100            'Accept: application/json'
101        ];
102        // Version 2
103        $url = $this->baseUrl . '/api/v2' . $endpoint;
104        curl_setopt($this->client, CURLOPT_URL, $url);
105        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
106        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
107        // POST JSON data
108        if ($data) {
109            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
110        } else {
111            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
112        }
113        // Call
114        $response = curl_exec($this->client);
115        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
116        // Check the result
117        if (curl_error($this->client)) {
118            throw new \Exception('Curl error: ' . curl_error($this->client));
119        }
120        if ($httpCode >= 400) {
121            throw new \Exception("HTTP Error: $httpCode, Response: $response");
122        }
123        // Return the decoded response
124        return json_decode($response, true);
125    }
126
127    /**
128     * Generate embeddings for text using Ollama
129     *
130     * @param string $text The text to generate embeddings for
131     * @return array The embeddings vector
132     */
133    public function generateEmbeddings($text) {
134        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
135        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
136
137        // Ensure model is a string
138        $model = $this->ollamaModel;
139        if (!is_string($model)) {
140            throw new \Exception("Ollama model must be a string, got: " . gettype($model));
141        }
142
143        $data = [
144            'model' => $model,
145            'prompt' => $text,
146            'keep_alive' => '30m'
147        ];
148        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
149        $response = curl_exec($this->ollamaClient);
150        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
151        if (curl_error($this->ollamaClient)) {
152            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
153        }
154        if ($httpCode >= 400) {
155            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
156        }
157        $result = json_decode($response, true);
158        if (!isset($result['embedding'])) {
159            throw new \Exception("Ollama response missing embedding: " . $response);
160        }
161        return $result['embedding'];
162    }
163
164    /**
165     * List all collections in the database
166     *
167     * Retrieves a list of all collections in the specified tenant and database.
168     *
169     * @return array List of collections
170     */
171    public function listCollections() {
172        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
173        return $this->makeRequest($endpoint);
174    }
175
176    /**
177     * Get a collection by name
178     *
179     * Retrieves information about a specific collection by its name.
180     *
181     * @param string $name The name of the collection to retrieve
182     * @return array The collection information
183     * @throws Exception If the collection is not found
184     */
185    public function getCollection($name) {
186        // Use provided name, fallback to 'documents' if empty
187        if (empty($name)) {
188            $name = 'documents';
189        }
190        // First try to get collection by name
191        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
192        $collections = $this->makeRequest($endpoint);
193        // Find collection by name
194        foreach ($collections as $collection) {
195            if (isset($collection['name']) && $collection['name'] === $name) {
196                return $collection;
197            }
198        }
199        // If not found, throw exception
200        throw new \Exception("Collection '{$name}' not found");
201    }
202
203    /**
204     * Create a new collection
205     *
206     * Creates a new collection with the specified name and optional metadata.
207     *
208     * @param string $name The name of the collection to create
209     * @param array|null $metadata Optional metadata for the collection
210     * @return array The response from the API
211     */
212    public function createCollection($name, $metadata = null) {
213        // Use provided name, fallback to 'documents' if empty
214        if (empty($name)) {
215            $name = 'documents';
216        }
217        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
218        $data = ['name' => $name];
219        if ($metadata) {
220            $data['metadata'] = $metadata;
221        }
222        return $this->makeRequest($endpoint, 'POST', $data);
223    }
224
225    /**
226     * Delete a collection by name
227     *
228     * Deletes a collection with the specified name.
229     *
230     * @param string $name The name of the collection to delete
231     * @return array The response from the API
232     * @throws Exception If the collection ID is not found
233     */
234    public function deleteCollection($name) {
235        // Use provided name, fallback to 'documents' if empty
236        if (empty($name)) {
237            $name = 'documents';
238        }
239        // First get the collection to find its ID
240        $collection = $this->getCollection($name);
241        if (!isset($collection['id'])) {
242            throw new \Exception("Collection ID not found for '{$name}'");
243        }
244        $collectionId = $collection['id'];
245        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
246        return $this->makeRequest($endpoint, 'DELETE');
247    }
248
249    /**
250     * Get a document by its ID from a collection
251     *
252     * Retrieves a document from the specified collection using its ID.
253     *
254     * @param string $collectionName The name of the collection to get the document from
255     * @param string $documentId The document ID to retrieve
256     * @param array $include What to include in the response (default: ["metadatas", "documents"])
257     * @return array The retrieved document
258     * @throws Exception If the collection ID is not found
259     */
260    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
261        // Use provided name, fallback to 'documents' if empty
262        if (empty($collectionName)) {
263            $collectionName = 'documents';
264        }
265        // First get the collection to find its ID
266        $collection = $this->getCollection($collectionName);
267        if (!isset($collection['id'])) {
268            throw new \Exception("Collection ID not found for '{$collectionName}'");
269        }
270        $collectionId = $collection['id'];
271        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
272        $data = [
273            'ids' => [$documentId],
274            'include' => $include
275        ];
276        // Return the document
277        return $this->makeRequest($endpoint, 'POST', $data);
278    }
279
280    /**
281     * Add documents to a collection
282     *
283     * Adds documents to the specified collection. Each document must have a corresponding ID.
284     * Optional metadata and pre-computed embeddings can also be provided.
285     *
286     * @param string $collectionName The name of the collection to add documents to
287     * @param array $documents The document contents
288     * @param array $ids The document IDs
289     * @param array|null $metadatas Optional metadata for each document
290     * @param array|null $embeddings Optional pre-computed embeddings for each document
291     * @return array The response from the API
292     * @throws Exception If the collection ID is not found
293     */
294    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
295        // Use provided name, fallback to 'documents' if empty
296        if (empty($collectionName)) {
297            $collectionName = 'documents';
298        }
299        // First get the collection to find its ID
300        $collection = $this->getCollection($collectionName);
301        if (!isset($collection['id'])) {
302            throw new \Exception("Collection ID not found for '{$collectionName}'");
303        }
304        $collectionId = $collection['id'];
305        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
306        $data = [
307            'ids' => $ids,
308            'documents' => $documents
309        ];
310        // Get also the metadata
311        if ($metadatas) {
312            $data['metadatas'] = $metadatas;
313        }
314        // Get the embeddings
315        if ($embeddings) {
316            $data['embeddings'] = $embeddings;
317        }
318        // Return the respnse
319        return $this->makeRequest($endpoint, 'POST', $data);
320    }
321
322    /**
323     * Check if a document needs to be updated based on timestamp comparison
324     *
325     * Determines whether a document should be reprocessed by comparing the file's last modification
326     * time with the processed_at timestamp stored in the document's metadata. The function checks
327     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
328     * not included in the database.
329     *
330     * @param string $collectionId The ID of the collection to check documents in
331     * @param string $documentId The base document ID to check (without chunk suffixes)
332     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
333     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
334     * @throws Exception If there's an error checking the document
335     */
336    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
337        try {
338            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
339            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
340            $chunkIdsToCheck = [
341                $documentId . '@1',
342                $documentId . '@2',
343                $documentId . '@3'
344            ];
345            $data = [
346                'ids' => $chunkIdsToCheck,
347                'include' => [
348                    "metadatas"
349                ],
350                'limit' => 1
351            ];
352            // Check if document exists
353            $result = $this->makeRequest($endpoint, 'POST', $data);
354            // If no documents found, return true (needs to be added)
355            if (empty($result['ids'])) {
356                return true;
357            }
358            // Check if any document has a processed_at timestamp
359            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
360                // Check the first metadata entry directly
361                $metadata = $result['metadatas'][0];
362                // If processed_at is not set, return true (needs update)
363                if (!isset($metadata['processed_at'])) {
364                    return true;
365                }
366                // Parse the processed_at timestamp
367                $processedTimestamp = strtotime($metadata['processed_at']);
368                // If file is newer than processed time, return true (needs update)
369                if ($fileModifiedTime > $processedTimestamp) {
370                    return true;
371                }
372            }
373            // Document exists and is up to date
374            return false;
375        } catch (\Exception $e) {
376            // If there's an error checking the document, assume it needs to be updated
377            return true;
378        }
379    }
380
381    /**
382     * Query a collection for similar documents
383     *
384     * Queries the specified collection for documents similar to the provided query texts.
385     * The function generates embeddings for the query texts and sends them to ChromaDB.
386     * Supports filtering results by metadata using the where parameter.
387     *
388     * @param string $collectionName The name of the collection to query
389     * @param array $queryTexts The query texts to search for
390     * @param int $nResults The number of results to return (default: 5)
391     * @param array|null $where Optional filter conditions for metadata
392     * @return array The query results
393     * @throws Exception If the collection ID is not found
394     */
395    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
396        // Use provided name, fallback to 'documents' if empty
397        if (empty($collectionName)) {
398            $collectionName = 'documents';
399        }
400        // First get the collection to find its ID
401        $collection = $this->getCollection($collectionName);
402        if (!isset($collection['id'])) {
403            throw new \Exception("Collection ID not found for '{$collectionName}'");
404        }
405        $collectionId = $collection['id'];
406        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
407        // Generate embeddings for query texts
408        $queryEmbeddings = [];
409        foreach ($queryTexts as $text) {
410            $queryEmbeddings[] = $this->generateEmbeddings($text);
411        }
412        $data = [
413            'query_embeddings' => $queryEmbeddings,
414            'n_results' => $nResults
415        ];
416        // Add where clause for metadata filtering if provided
417        if ($where && is_array($where)) {
418            $data['where'] = $where;
419        }
420        // Return the response
421        return $this->makeRequest($endpoint, 'POST', $data);
422    }
423
424    /**
425     * Check if the ChromaDB server is alive
426     *
427     * Sends a heartbeat request to verify that the ChromaDB server is running.
428     *
429     * @return array The response from the heartbeat endpoint
430     */
431    public function heartbeat() {
432        $endpoint = "/heartbeat";
433        return $this->makeRequest($endpoint, 'GET');
434    }
435
436    /**
437     * Get authentication and identity information
438     *
439     * Retrieves authentication and identity information from the ChromaDB server.
440     *
441     * @return array The response from the auth/identity endpoint
442     */
443    public function getIdentity() {
444        $endpoint = "/identity";
445        return $this->makeRequest($endpoint, 'GET');
446    }
447
448    /**
449     * Ensure that the specified tenant and database exist
450     *
451     * Checks if the specified tenant and database exist, and creates them if they don't.
452     *
453     * @return void
454     */
455    private function ensureTenantAndDatabase() {
456        // Check if tenant exists, create if it doesn't
457        try {
458            $this->getTenant($this->tenant);
459        } catch (\Exception $e) {
460            // Tenant doesn't exist, create it
461            $this->createTenant($this->tenant);
462        }
463        // Check if database exists, create if it doesn't
464        try {
465            $this->getDatabase($this->database, $this->tenant);
466        } catch (\Exception $e) {
467            // Database doesn't exist, create it
468            $this->createDatabase($this->database, $this->tenant);
469        }
470    }
471
472    /**
473     * Get tenant information
474     *
475     * Retrieves information about the specified tenant.
476     *
477     * @param string $tenantName The tenant name
478     * @return array The tenant information
479     */
480    public function getTenant($tenantName) {
481        $endpoint = "/tenants/{$tenantName}";
482        return $this->makeRequest($endpoint, 'GET');
483    }
484
485    /**
486     * Create a new tenant
487     *
488     * Creates a new tenant with the specified name.
489     *
490     * @param string $tenantName The tenant name
491     * @return array The response from the API
492     */
493    public function createTenant($tenantName) {
494        $endpoint = "/tenants";
495        $data = ['name' => $tenantName];
496        return $this->makeRequest($endpoint, 'POST', $data);
497    }
498
499    /**
500     * Get database information
501     *
502     * Retrieves information about the specified database within a tenant.
503     *
504     * @param string $databaseName The database name
505     * @param string $tenantName The tenant name
506     * @return array The database information
507     */
508    public function getDatabase($databaseName, $tenantName) {
509        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
510        return $this->makeRequest($endpoint, 'GET');
511    }
512
513    /**
514     * Create a new database
515     *
516     * Creates a new database with the specified name within a tenant.
517     *
518     * @param string $databaseName The database name
519     * @param string $tenantName The tenant name
520     * @return array The response from the API
521     */
522    public function createDatabase($databaseName, $tenantName) {
523        $endpoint = "/tenants/{$tenantName}/databases";
524        $data = ['name' => $databaseName];
525        return $this->makeRequest($endpoint, 'POST', $data);
526    }
527
528    /**
529     * Ensure a collection exists, creating it if necessary
530     *
531     * This helper function checks if a collection exists and creates it if it doesn't.
532     *
533     * @param string $collectionName The name of the collection to check/create
534     * @return string Status message indicating what happened
535     */
536    public function ensureCollectionExists($collectionName) {
537        try {
538            $collection = $this->getCollection($collectionName);
539            return "Collection '$collectionName' already exists.";
540        } catch (\Exception $e) {
541            // Collection doesn't exist, create it
542            $created = $this->createCollection($collectionName);
543            return "Collection '$collectionName' created.";
544        }
545    }
546
547    /**
548     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
549     *
550     * This function handles the complete processing of a single DokuWiki file:
551     * 1. Parses the file path to extract metadata and document ID
552     * 2. Determines the appropriate collection based on document ID
553     * 3. Checks if the document needs updating using timestamp comparison
554     * 4. Reads and processes file content only if update is needed
555     * 5. Splits the document into chunks (paragraphs)
556     * 6. Extracts rich metadata from the DokuWiki ID format
557     * 7. Generates embeddings for each chunk
558     * 8. Sends all chunks to ChromaDB with metadata
559     *
560     * Supported ID formats:
561     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
562     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
563     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
564     *
565     * The function implements smart update checking by comparing file modification time
566     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
567     *
568     * @param string $filePath The path to the file to process
569     * @param string $collectionName The name of the collection to use
570     * @param bool $collectionChecked Whether the collection has already been checked/created
571     * @return array Result with status and details
572     */
573    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
574        // Parse file path to extract metadata
575        $id = parseFilePath($filePath);
576        try {
577            // Create collection if it doesn't exist (only if not already checked)
578            $collectionStatus = '';
579            if (!$collectionChecked) {
580                $collectionStatus = $this->ensureCollectionExists($collectionName);
581            }
582            // Get collection ID
583            $collection = $this->getCollection($collectionName);
584            if (!isset($collection['id'])) {
585                return [
586                    'status' => 'error',
587                    'message' => "Collection ID not found for '{$collectionName}'"
588                ];
589            }
590            $collectionId = $collection['id'];
591            // Get file modification time
592            $fileModifiedTime = filemtime($filePath);
593            // Check if document needs update
594            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
595            // If document is up to date, skip processing
596            if (!$needsUpdate) {
597                return [
598                    'status' => 'skipped',
599                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
600                ];
601            }
602            // Read file content
603            $content = file_get_contents($filePath);
604            // Split document into chunks (paragraphs separated by two newlines)
605            $paragraphs = preg_split('/\n\s*\n/', $content);
606            $chunks = [];
607            $chunkMetadata = [];
608            // Parse the DokuWiki ID to extract base metadata
609            $parts = explode(':', $id);
610            // Extract metadata from the last part of the ID
611            $lastPart = end($parts);
612            $baseMetadata = [];
613            // Add the document ID as metadata
614            $baseMetadata['document_id'] = $id;
615            // Add current timestamp
616            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
617            // Check if any part of the ID is 'templates' and set template metadata
618            $isTemplate = in_array('templates', $parts);
619            if ($isTemplate) {
620                $baseMetadata['type'] = 'template';
621            } else {
622                $baseMetadata['type'] = 'report';
623            }
624            // Extract modality from the second part
625            if (isset($parts[1])) {
626                $baseMetadata['modality'] = $parts[1];
627            }
628            // Handle different ID formats based on the third part: word (institution) or numeric (year)
629            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
630            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
631            // For templates, don't set institution, date or year
632            if (isset($parts[2]) && !$isTemplate) {
633                // Check if third part is numeric (year) or word (institution)
634                if (is_numeric($parts[2])) {
635                    // Format: reports:mri:2024:g287-name-surname (year format)
636                    // Extract year from the third part
637                    $baseMetadata['year'] = $parts[2];
638                    // Set default institution from config
639                    global $conf;
640                    $baseMetadata['institution'] = isset($conf['plugin']['dokullm']['default_institution']) ? $conf['plugin']['dokullm']['default_institution'] : 'default';
641                    // Extract registration and name from the last part
642                    // Registration should start with one letter or number and contain numbers before the '-' character
643                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
644                        // Check if the first part contains at least one digit to be considered a registration
645                        if (preg_match('/[0-9]/', $matches[1])) {
646                            $baseMetadata['registration'] = $matches[1];
647                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
648                        } else {
649                            // If no registration pattern found, treat entire part as patient name
650                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
651                        }
652                    } else {
653                        // If no match, treat entire part as patient name
654                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
655                    }
656                } else {
657                    // Format: reports:mri:institution:250620-name-surname (institution format)
658                    // Extract institution from the third part
659                    $baseMetadata['institution'] = $parts[2];
660                    // Extract date and name from the last part
661                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
662                        $dateStr = $matches[1];
663                        $name = $matches[2];
664                        // Convert date format (250620 -> 2025-06-20)
665                        $day = substr($dateStr, 0, 2);
666                        $month = substr($dateStr, 2, 2);
667                        $year = substr($dateStr, 4, 2);
668                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
669                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
670                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
671                        $baseMetadata['date'] = $formattedDate;
672                        $baseMetadata['name'] = str_replace('-', ' ', $name);
673                    }
674                }
675            }
676            // For templates, always extract name from the last part
677            if ($isTemplate && isset($lastPart)) {
678                // Extract name from the last part (everything after the last colon)
679                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
680                    // Check if the first part contains at least one digit to be considered a registration
681                    if (preg_match('/[0-9]/', $matches[1])) {
682                        $baseMetadata['registration'] = $matches[1];
683                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
684                    } else {
685                        // If no registration pattern found, treat entire part as template name
686                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
687                    }
688                } else {
689                    // If no match, treat entire part as template name
690                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
691                }
692            }
693            // Process each paragraph as a chunk with intelligent metadata handling
694            $chunkIds = [];
695            $chunkContents = [];
696            $chunkMetadatas = [];
697            $chunkEmbeddings = [];
698            $currentTags = [];
699            foreach ($paragraphs as $index => $paragraph) {
700                // Skip empty paragraphs to avoid processing whitespace-only content
701                $paragraph = trim($paragraph);
702                if (empty($paragraph)) {
703                    continue;
704                }
705                // Check if this is a DokuWiki title (starts and ends with =)
706                // Titles are converted to tags for better searchability but not stored as content chunks
707                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
708                    // Extract title content and clean it
709                    $titleContent = trim($matches[1]);
710                    // Split into words and create searchable tags
711                    $words = preg_split('/\s+/', $titleContent);
712                    $tags = [];
713                    foreach ($words as $word) {
714                        // Only use words longer than 3 characters to reduce noise
715                        if (strlen($word) >= 3) {
716                            $tags[] = strtolower($word);
717                        }
718                    }
719                    // Remove duplicate tags and store for use in subsequent chunks
720                    $currentTags = array_unique($tags);
721                    continue; // Skip storing title chunks as content
722                }
723                // Create chunk ID
724                $chunkId = $id . '@' . ($index + 1);
725                // Generate embeddings for the chunk
726                $embeddings = $this->generateEmbeddings($paragraph);
727                // Add chunk-specific metadata
728                $metadata = $baseMetadata;
729                $metadata['chunk_id'] = $chunkId;
730                $metadata['chunk_number'] = $index + 1;
731                $metadata['total_chunks'] = count($paragraphs);
732                // Add current tags to metadata if any exist
733                if (!empty($currentTags)) {
734                    $metadata['tags'] = implode(',', $currentTags);
735                }
736                // Store chunk data
737                $chunkIds[] = $chunkId;
738                $chunkContents[] = $paragraph;
739                $chunkMetadatas[] = $metadata;
740                $chunkEmbeddings[] = $embeddings;
741            }
742            // If no chunks were created, skip this file
743            if (empty($chunkIds)) {
744                return [
745                    'status' => 'skipped',
746                    'message' => "No valid chunks found in file '$id'. Skipping..."
747                ];
748            }
749            // Send all chunks to ChromaDB
750            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
751            return [
752                'status' => 'success',
753                'message' => "Successfully sent file to ChromaDB",
754                'details' => [
755                    'document_id' => $id,
756                    'chunks' => count($chunkIds),
757                    'collection' => $collectionName
758                ],
759                'collection_status' => $collectionStatus
760            ];
761        } catch (\Exception $e) {
762            return [
763                'status' => 'error',
764                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
765            ];
766        }
767    }
768
769}
770
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory when the path starts with it
 *    (DOKU_INC . 'data/pages/' if DOKU_INC is defined, otherwise a common
 *    default installation path)
 * 2. Removing a trailing .txt extension
 * 3. Joining the remaining non-empty path segments with colons
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * Segments are only dropped when they are the empty string (e.g. produced by
 * a doubled or leading slash); a segment named "0" is kept.
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID (empty string when no segments remain)
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Strip the base path only when it is a genuine prefix. A plain
    // str_replace() would also delete later occurrences of the same string
    // and silently glue unrelated segments together.
    $relativePath = $filePath;
    $prefixLength = strlen($pagesDir);
    if (strncmp($filePath, $pagesDir, $prefixLength) === 0) {
        // Cast guards against substr() returning false on PHP < 8 when the
        // path is exactly the pages directory.
        $relativePath = (string) substr($filePath, $prefixLength);
    }

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Keep every non-empty segment. Comparing against '' instead of using
    // empty()/array_filter() preserves a namespace or page literally named "0".
    $idParts = [];
    foreach (explode('/', $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    // Return the ID
    return implode(':', $idParts);
}
809