xref: /plugin/dokullm/ChromaDBClient.php (revision 0fca79f351f99c857ddc498011af073af22a734f)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5use RecursiveIteratorIterator;
6use RecursiveDirectoryIterator;
7
8class ChromaDBClient {
9    private $baseUrl;
10    private $client;
11    private $ollamaClient;
12    private $tenant;
13    private $database;
14    private $ollamaHost;
15    private $ollamaPort;
16    private $ollamaModel;
17
18    /**
19     * Get configuration value for the dokullm plugin
20     *
21     * @param string $key Configuration key
22     * @param mixed $default Default value if key not found
23     * @return mixed Configuration value
24     */
25    /**
26     * Initialize the ChromaDB client
27     *
28     * Creates a new ChromaDB client instance with the specified connection parameters.
29     * Also ensures that the specified tenant and database exist.
30     *
31     * @param string $host ChromaDB server host
32     * @param int $port ChromaDB server port
33     * @param string $tenant ChromaDB tenant name
34     * @param string $database ChromaDB database name
35     * @param string $defaultCollection Default collection name
36     * @param string $ollamaHost Ollama server host
37     * @param int $ollamaPort Ollama server port
38     * @param string $ollamaModel Ollama embeddings model
39     */
40    public function __construct($host, $port, $tenant, $database, $defaultCollection, $ollamaHost, $ollamaPort, $ollamaModel) {
41        // Use provided parameters (no fallback since they're mandatory)
42        $chromaHost = $host;
43        $chromaPort = $port;
44        $this->tenant = $tenant;
45        $this->database = $database;
46        $this->defaultCollection = $defaultCollection;
47        $this->ollamaHost = $ollamaHost;
48        $this->ollamaPort = $ollamaPort;
49        $this->ollamaModel = $ollamaModel;
50        $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";
51        $this->client = curl_init();
52        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
53        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
54            'Content-Type: application/json',
55            'Accept: application/json'
56        ]);
57        // Initialize Ollama client
58        $this->ollamaClient = curl_init();
59        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
60        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
61            'Content-Type: application/json'
62        ]);
63        // Check if tenant and database exist, create them if they don't
64        $this->ensureTenantAndDatabase();
65    }
66
67    /**
68     * Clean up the cURL client when the object is destroyed
69     *
70     * @return void
71     */
72    public function __destruct() {
73        curl_close($this->client);
74        curl_close($this->ollamaClient);
75    }
76
77    /**
78     * Make an HTTP request to the ChromaDB API
79     *
80     * This is a helper function that handles making HTTP requests to the ChromaDB API,
81     * including setting the appropriate headers for tenant and database.
82     *
83     * @param string $endpoint The API endpoint to call
84     * @param string $method The HTTP method to use (default: 'GET')
85     * @param array|null $data The data to send with the request (default: null)
86     * @return array The JSON response decoded as an array
87     * @throws Exception If there's a cURL error or HTTP error
88     */
89    private function makeRequest($endpoint, $method = 'GET', $data = null) {
90        // Add tenant and database as headers instead of query parameters for v2 API
91        $headers = [
92            'Content-Type: application/json',
93            'Accept: application/json'
94        ];
95        // Version 2
96        $url = $this->baseUrl . '/api/v2' . $endpoint;
97        curl_setopt($this->client, CURLOPT_URL, $url);
98        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
99        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
100        // POST JSON data
101        if ($data) {
102            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
103        } else {
104            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
105        }
106        // Call
107        $response = curl_exec($this->client);
108        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
109        // Check the result
110        if (curl_error($this->client)) {
111            throw new \Exception('Curl error: ' . curl_error($this->client));
112        }
113        if ($httpCode >= 400) {
114            throw new \Exception("HTTP Error: $httpCode, Response: $response");
115        }
116        // Return the decoded response
117        return json_decode($response, true);
118    }
119
120    /**
121     * Generate embeddings for text using Ollama
122     *
123     * @param string $text The text to generate embeddings for
124     * @return array The embeddings vector
125     */
126    public function generateEmbeddings($text) {
127        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
128        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
129        $data = [
130            'model' => $this->ollamaModel,
131            'prompt' => $text,
132            'keep_alive' => '30m'
133        ];
134        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
135        $response = curl_exec($this->ollamaClient);
136        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
137        if (curl_error($this->ollamaClient)) {
138            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
139        }
140        if ($httpCode >= 400) {
141            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
142        }
143        $result = json_decode($response, true);
144        if (!isset($result['embedding'])) {
145            throw new \Exception("Ollama response missing embedding: " . $response);
146        }
147        return $result['embedding'];
148    }
149
150    /**
151     * List all collections in the database
152     *
153     * Retrieves a list of all collections in the specified tenant and database.
154     *
155     * @return array List of collections
156     */
157    public function listCollections() {
158        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
159        return $this->makeRequest($endpoint);
160    }
161
162    /**
163     * Get a collection by name
164     *
165     * Retrieves information about a specific collection by its name.
166     *
167     * @param string $name The name of the collection to retrieve
168     * @return array The collection information
169     * @throws Exception If the collection is not found
170     */
171    public function getCollection($name) {
172        // Use provided name, fallback to 'documents' if empty
173        if (empty($name)) {
174            $name = 'documents';
175        }
176        // First try to get collection by name
177        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
178        $collections = $this->makeRequest($endpoint);
179        // Find collection by name
180        foreach ($collections as $collection) {
181            if (isset($collection['name']) && $collection['name'] === $name) {
182                return $collection;
183            }
184        }
185        // If not found, throw exception
186        throw new \Exception("Collection '{$name}' not found");
187    }
188
189    /**
190     * Create a new collection
191     *
192     * Creates a new collection with the specified name and optional metadata.
193     *
194     * @param string $name The name of the collection to create
195     * @param array|null $metadata Optional metadata for the collection
196     * @return array The response from the API
197     */
198    public function createCollection($name, $metadata = null) {
199        // Use provided name, fallback to 'documents' if empty
200        if (empty($name)) {
201            $name = 'documents';
202        }
203        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
204        $data = ['name' => $name];
205        if ($metadata) {
206            $data['metadata'] = $metadata;
207        }
208        return $this->makeRequest($endpoint, 'POST', $data);
209    }
210
211    /**
212     * Delete a collection by name
213     *
214     * Deletes a collection with the specified name.
215     *
216     * @param string $name The name of the collection to delete
217     * @return array The response from the API
218     * @throws Exception If the collection ID is not found
219     */
220    public function deleteCollection($name) {
221        // Use provided name, fallback to 'documents' if empty
222        if (empty($name)) {
223            $name = 'documents';
224        }
225        // First get the collection to find its ID
226        $collection = $this->getCollection($name);
227        if (!isset($collection['id'])) {
228            throw new \Exception("Collection ID not found for '{$name}'");
229        }
230        $collectionId = $collection['id'];
231        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
232        return $this->makeRequest($endpoint, 'DELETE');
233    }
234
235    /**
236     * Get a document by its ID from a collection
237     *
238     * Retrieves a document from the specified collection using its ID.
239     *
240     * @param string $collectionName The name of the collection to get the document from
241     * @param string $documentId The document ID to retrieve
242     * @param array $include What to include in the response (default: ["metadatas", "documents"])
243     * @return array The retrieved document
244     * @throws Exception If the collection ID is not found
245     */
246    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
247        // Use provided name, fallback to 'documents' if empty
248        if (empty($collectionName)) {
249            $collectionName = 'documents';
250        }
251        // First get the collection to find its ID
252        $collection = $this->getCollection($collectionName);
253        if (!isset($collection['id'])) {
254            throw new \Exception("Collection ID not found for '{$collectionName}'");
255        }
256        $collectionId = $collection['id'];
257        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
258        $data = [
259            'ids' => [$documentId],
260            'include' => $include
261        ];
262        // Return the document
263        return $this->makeRequest($endpoint, 'POST', $data);
264    }
265
266    /**
267     * Add documents to a collection
268     *
269     * Adds documents to the specified collection. Each document must have a corresponding ID.
270     * Optional metadata and pre-computed embeddings can also be provided.
271     *
272     * @param string $collectionName The name of the collection to add documents to
273     * @param array $documents The document contents
274     * @param array $ids The document IDs
275     * @param array|null $metadatas Optional metadata for each document
276     * @param array|null $embeddings Optional pre-computed embeddings for each document
277     * @return array The response from the API
278     * @throws Exception If the collection ID is not found
279     */
280    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
281        // Use provided name, fallback to 'documents' if empty
282        if (empty($collectionName)) {
283            $collectionName = 'documents';
284        }
285        // First get the collection to find its ID
286        $collection = $this->getCollection($collectionName);
287        if (!isset($collection['id'])) {
288            throw new \Exception("Collection ID not found for '{$collectionName}'");
289        }
290        $collectionId = $collection['id'];
291        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
292        $data = [
293            'ids' => $ids,
294            'documents' => $documents
295        ];
296        // Get also the metadata
297        if ($metadatas) {
298            $data['metadatas'] = $metadatas;
299        }
300        // Get the embeddings
301        if ($embeddings) {
302            $data['embeddings'] = $embeddings;
303        }
304        // Return the respnse
305        return $this->makeRequest($endpoint, 'POST', $data);
306    }
307
308    /**
309     * Check if a document needs to be updated based on timestamp comparison
310     *
311     * Determines whether a document should be reprocessed by comparing the file's last modification
312     * time with the processed_at timestamp stored in the document's metadata. The function checks
313     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
314     * not included in the database.
315     *
316     * @param string $collectionId The ID of the collection to check documents in
317     * @param string $documentId The base document ID to check (without chunk suffixes)
318     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
319     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
320     * @throws Exception If there's an error checking the document
321     */
322    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
323        try {
324            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
325            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
326            $chunkIdsToCheck = [
327                $documentId . '@1',
328                $documentId . '@2',
329                $documentId . '@3'
330            ];
331            $data = [
332                'ids' => $chunkIdsToCheck,
333                'include' => [
334                    "metadatas"
335                ],
336                'limit' => 1
337            ];
338            // Check if document exists
339            $result = $this->makeRequest($endpoint, 'POST', $data);
340            // If no documents found, return true (needs to be added)
341            if (empty($result['ids'])) {
342                return true;
343            }
344            // Check if any document has a processed_at timestamp
345            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
346                // Check the first metadata entry directly
347                $metadata = $result['metadatas'][0];
348                // If processed_at is not set, return true (needs update)
349                if (!isset($metadata['processed_at'])) {
350                    return true;
351                }
352                // Parse the processed_at timestamp
353                $processedTimestamp = strtotime($metadata['processed_at']);
354                // If file is newer than processed time, return true (needs update)
355                if ($fileModifiedTime > $processedTimestamp) {
356                    return true;
357                }
358            }
359            // Document exists and is up to date
360            return false;
361        } catch (\Exception $e) {
362            // If there's an error checking the document, assume it needs to be updated
363            return true;
364        }
365    }
366
367    /**
368     * Query a collection for similar documents
369     *
370     * Queries the specified collection for documents similar to the provided query texts.
371     * The function generates embeddings for the query texts and sends them to ChromaDB.
372     * Supports filtering results by metadata using the where parameter.
373     *
374     * @param string $collectionName The name of the collection to query
375     * @param array $queryTexts The query texts to search for
376     * @param int $nResults The number of results to return (default: 5)
377     * @param array|null $where Optional filter conditions for metadata
378     * @return array The query results
379     * @throws Exception If the collection ID is not found
380     */
381    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
382        // Use provided name, fallback to 'documents' if empty
383        if (empty($collectionName)) {
384            $collectionName = 'documents';
385        }
386        // First get the collection to find its ID
387        $collection = $this->getCollection($collectionName);
388        if (!isset($collection['id'])) {
389            throw new \Exception("Collection ID not found for '{$collectionName}'");
390        }
391        $collectionId = $collection['id'];
392        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
393        // Generate embeddings for query texts
394        $queryEmbeddings = [];
395        foreach ($queryTexts as $text) {
396            $queryEmbeddings[] = $this->generateEmbeddings($text);
397        }
398        $data = [
399            'query_embeddings' => $queryEmbeddings,
400            'n_results' => $nResults
401        ];
402        // Add where clause for metadata filtering if provided
403        if ($where && is_array($where)) {
404            $data['where'] = $where;
405        }
406        // Return the response
407        return $this->makeRequest($endpoint, 'POST', $data);
408    }
409
410    /**
411     * Check if the ChromaDB server is alive
412     *
413     * Sends a heartbeat request to verify that the ChromaDB server is running.
414     *
415     * @return array The response from the heartbeat endpoint
416     */
417    public function heartbeat() {
418        $endpoint = "/heartbeat";
419        return $this->makeRequest($endpoint, 'GET');
420    }
421
422    /**
423     * Get authentication and identity information
424     *
425     * Retrieves authentication and identity information from the ChromaDB server.
426     *
427     * @return array The response from the auth/identity endpoint
428     */
429    public function getIdentity() {
430        $endpoint = "/identity";
431        return $this->makeRequest($endpoint, 'GET');
432    }
433
434    /**
435     * Ensure that the specified tenant and database exist
436     *
437     * Checks if the specified tenant and database exist, and creates them if they don't.
438     *
439     * @return void
440     */
441    private function ensureTenantAndDatabase() {
442        // Check if tenant exists, create if it doesn't
443        try {
444            $this->getTenant($this->tenant);
445        } catch (\Exception $e) {
446            // Tenant doesn't exist, create it
447            $this->createTenant($this->tenant);
448        }
449        // Check if database exists, create if it doesn't
450        try {
451            $this->getDatabase($this->database, $this->tenant);
452        } catch (\Exception $e) {
453            // Database doesn't exist, create it
454            $this->createDatabase($this->database, $this->tenant);
455        }
456    }
457
458    /**
459     * Get tenant information
460     *
461     * Retrieves information about the specified tenant.
462     *
463     * @param string $tenantName The tenant name
464     * @return array The tenant information
465     */
466    public function getTenant($tenantName) {
467        $endpoint = "/tenants/{$tenantName}";
468        return $this->makeRequest($endpoint, 'GET');
469    }
470
471    /**
472     * Create a new tenant
473     *
474     * Creates a new tenant with the specified name.
475     *
476     * @param string $tenantName The tenant name
477     * @return array The response from the API
478     */
479    public function createTenant($tenantName) {
480        $endpoint = "/tenants";
481        $data = ['name' => $tenantName];
482        return $this->makeRequest($endpoint, 'POST', $data);
483    }
484
485    /**
486     * Get database information
487     *
488     * Retrieves information about the specified database within a tenant.
489     *
490     * @param string $databaseName The database name
491     * @param string $tenantName The tenant name
492     * @return array The database information
493     */
494    public function getDatabase($databaseName, $tenantName) {
495        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
496        return $this->makeRequest($endpoint, 'GET');
497    }
498
499    /**
500     * Create a new database
501     *
502     * Creates a new database with the specified name within a tenant.
503     *
504     * @param string $databaseName The database name
505     * @param string $tenantName The tenant name
506     * @return array The response from the API
507     */
508    public function createDatabase($databaseName, $tenantName) {
509        $endpoint = "/tenants/{$tenantName}/databases";
510        $data = ['name' => $databaseName];
511        return $this->makeRequest($endpoint, 'POST', $data);
512    }
513
514    /**
515     * Ensure a collection exists, creating it if necessary
516     *
517     * This helper function checks if a collection exists and creates it if it doesn't.
518     *
519     * @param string $collectionName The name of the collection to check/create
520     * @return string Status message indicating what happened
521     */
522    public function ensureCollectionExists($collectionName) {
523        try {
524            $collection = $this->getCollection($collectionName);
525            return "Collection '$collectionName' already exists.";
526        } catch (\Exception $e) {
527            // Collection doesn't exist, create it
528            $created = $this->createCollection($collectionName);
529            return "Collection '$collectionName' created.";
530        }
531    }
532
533    /**
534     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
535     *
536     * This function handles the complete processing of a single DokuWiki file:
537     * 1. Parses the file path to extract metadata and document ID
538     * 2. Determines the appropriate collection based on document ID
539     * 3. Checks if the document needs updating using timestamp comparison
540     * 4. Reads and processes file content only if update is needed
541     * 5. Splits the document into chunks (paragraphs)
542     * 6. Extracts rich metadata from the DokuWiki ID format
543     * 7. Generates embeddings for each chunk
544     * 8. Sends all chunks to ChromaDB with metadata
545     *
546     * Supported ID formats:
547     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
548     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
549     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
550     *
551     * The function implements smart update checking by comparing file modification time
552     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
553     *
554     * @param string $filePath The path to the file to process
555     * @param string $collectionName The name of the collection to use
556     * @param bool $collectionChecked Whether the collection has already been checked/created
557     * @return array Result with status and details
558     */
559    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
560        // Parse file path to extract metadata
561        $id = parseFilePath($filePath);
562        try {
563            // Create collection if it doesn't exist (only if not already checked)
564            $collectionStatus = '';
565            if (!$collectionChecked) {
566                $collectionStatus = $this->ensureCollectionExists($collectionName);
567            }
568            // Get collection ID
569            $collection = $this->getCollection($collectionName);
570            if (!isset($collection['id'])) {
571                return [
572                    'status' => 'error',
573                    'message' => "Collection ID not found for '{$collectionName}'"
574                ];
575            }
576            $collectionId = $collection['id'];
577            // Get file modification time
578            $fileModifiedTime = filemtime($filePath);
579            // Check if document needs update
580            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
581            // If document is up to date, skip processing
582            if (!$needsUpdate) {
583                return [
584                    'status' => 'skipped',
585                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
586                ];
587            }
588            // Read file content
589            $content = file_get_contents($filePath);
590            // Split document into chunks (paragraphs separated by two newlines)
591            $paragraphs = preg_split('/\n\s*\n/', $content);
592            $chunks = [];
593            $chunkMetadata = [];
594            // Parse the DokuWiki ID to extract base metadata
595            $parts = explode(':', $id);
596            // Extract metadata from the last part of the ID
597            $lastPart = end($parts);
598            $baseMetadata = [];
599            // Add the document ID as metadata
600            $baseMetadata['document_id'] = $id;
601            // Add current timestamp
602            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
603            // Check if any part of the ID is 'templates' and set template metadata
604            $isTemplate = in_array('templates', $parts);
605            if ($isTemplate) {
606                $baseMetadata['type'] = 'template';
607            } else {
608                $baseMetadata['type'] = 'report';
609            }
610            // Extract modality from the second part
611            if (isset($parts[1])) {
612                $baseMetadata['modality'] = $parts[1];
613            }
614            // Handle different ID formats based on the third part: word (institution) or numeric (year)
615            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
616            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
617            // For templates, don't set institution, date or year
618            if (isset($parts[2]) && !$isTemplate) {
619                // Check if third part is numeric (year) or word (institution)
620                if (is_numeric($parts[2])) {
621                    // Format: reports:mri:2024:g287-name-surname (year format)
622                    // Extract year from the third part
623                    $baseMetadata['year'] = $parts[2];
624                    // Set default institution from config
625                    global $conf;
626                    $baseMetadata['institution'] = isset($conf['plugin']['dokullm']['default_institution']) ? $conf['plugin']['dokullm']['default_institution'] : 'default';
627                    // Extract registration and name from the last part
628                    // Registration should start with one letter or number and contain numbers before the '-' character
629                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
630                        // Check if the first part contains at least one digit to be considered a registration
631                        if (preg_match('/[0-9]/', $matches[1])) {
632                            $baseMetadata['registration'] = $matches[1];
633                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
634                        } else {
635                            // If no registration pattern found, treat entire part as patient name
636                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
637                        }
638                    } else {
639                        // If no match, treat entire part as patient name
640                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
641                    }
642                } else {
643                    // Format: reports:mri:institution:250620-name-surname (institution format)
644                    // Extract institution from the third part
645                    $baseMetadata['institution'] = $parts[2];
646                    // Extract date and name from the last part
647                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
648                        $dateStr = $matches[1];
649                        $name = $matches[2];
650                        // Convert date format (250620 -> 2025-06-20)
651                        $day = substr($dateStr, 0, 2);
652                        $month = substr($dateStr, 2, 2);
653                        $year = substr($dateStr, 4, 2);
654                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
655                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
656                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
657                        $baseMetadata['date'] = $formattedDate;
658                        $baseMetadata['name'] = str_replace('-', ' ', $name);
659                    }
660                }
661            }
662            // For templates, always extract name from the last part
663            if ($isTemplate && isset($lastPart)) {
664                // Extract name from the last part (everything after the last colon)
665                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
666                    // Check if the first part contains at least one digit to be considered a registration
667                    if (preg_match('/[0-9]/', $matches[1])) {
668                        $baseMetadata['registration'] = $matches[1];
669                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
670                    } else {
671                        // If no registration pattern found, treat entire part as template name
672                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
673                    }
674                } else {
675                    // If no match, treat entire part as template name
676                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
677                }
678            }
679            // Process each paragraph as a chunk with intelligent metadata handling
680            $chunkIds = [];
681            $chunkContents = [];
682            $chunkMetadatas = [];
683            $chunkEmbeddings = [];
684            $currentTags = [];
685            foreach ($paragraphs as $index => $paragraph) {
686                // Skip empty paragraphs to avoid processing whitespace-only content
687                $paragraph = trim($paragraph);
688                if (empty($paragraph)) {
689                    continue;
690                }
691                // Check if this is a DokuWiki title (starts and ends with =)
692                // Titles are converted to tags for better searchability but not stored as content chunks
693                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
694                    // Extract title content and clean it
695                    $titleContent = trim($matches[1]);
696                    // Split into words and create searchable tags
697                    $words = preg_split('/\s+/', $titleContent);
698                    $tags = [];
699                    foreach ($words as $word) {
700                        // Only use words longer than 3 characters to reduce noise
701                        if (strlen($word) >= 3) {
702                            $tags[] = strtolower($word);
703                        }
704                    }
705                    // Remove duplicate tags and store for use in subsequent chunks
706                    $currentTags = array_unique($tags);
707                    continue; // Skip storing title chunks as content
708                }
709                // Create chunk ID
710                $chunkId = $id . '@' . ($index + 1);
711                // Generate embeddings for the chunk
712                $embeddings = $this->generateEmbeddings($paragraph);
713                // Add chunk-specific metadata
714                $metadata = $baseMetadata;
715                $metadata['chunk_id'] = $chunkId;
716                $metadata['chunk_number'] = $index + 1;
717                $metadata['total_chunks'] = count($paragraphs);
718                // Add current tags to metadata if any exist
719                if (!empty($currentTags)) {
720                    $metadata['tags'] = implode(',', $currentTags);
721                }
722                // Store chunk data
723                $chunkIds[] = $chunkId;
724                $chunkContents[] = $paragraph;
725                $chunkMetadatas[] = $metadata;
726                $chunkEmbeddings[] = $embeddings;
727            }
728            // If no chunks were created, skip this file
729            if (empty($chunkIds)) {
730                return [
731                    'status' => 'skipped',
732                    'message' => "No valid chunks found in file '$id'. Skipping..."
733                ];
734            }
735            // Send all chunks to ChromaDB
736            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
737            return [
738                'status' => 'success',
739                'message' => "Successfully sent file to ChromaDB",
740                'details' => [
741                    'document_id' => $id,
742                    'chunks' => count($chunkIds),
743                    'collection' => $collectionName
744                ],
745                'collection_status' => $collectionStatus
746            ];
747        } catch (\Exception $e) {
748            return [
749                'status' => 'error',
750                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
751            ];
752        }
753    }
754
755}
756
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the leading pages directory (DOKU_INC . 'data/pages/' when
 *    DOKU_INC is defined, otherwise a common default installation path)
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons, ignoring empty segments
 *
 * Only truly empty segments (e.g. produced by leading or doubled slashes)
 * are discarded; a segment such as "0" is a valid part of the ID and is kept.
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Strip the pages directory only when it is the leading prefix, so that a
    // namespace path that happens to contain the same text is left untouched
    $relativePath = (string) $filePath;
    $prefixLength = strlen($pagesDir);
    if (strncmp($relativePath, $pagesDir, $prefixLength) === 0) {
        // Cast guards against substr() returning false on older PHP versions
        $relativePath = (string) substr($relativePath, $prefixLength);
    }

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Collect non-empty path segments; compare against '' explicitly because
    // empty()/array_filter() would also discard a segment named "0"
    $idParts = [];
    foreach (explode('/', (string) $relativePath) as $part) {
        if ($part !== '') {
            $idParts[] = $part;
        }
    }

    // Return the ID
    return implode(':', $idParts);
}
795