xref: /plugin/dokullm/ChromaDBClient.php (revision 3eb8beceb9dd9a07565dafac270a93f89c7c0200)
1<?php
2
3namespace dokuwiki\plugin\dokullm;
4
5class ChromaDBClient {
6    private $baseUrl;
7    private $client;
8    private $ollamaClient;
9    private $tenant;
10    private $database;
11    private $ollamaHost;
12    private $ollamaPort;
13    /**
14     * Initialize the ChromaDB client
15     *
16     * Creates a new ChromaDB client instance with the specified connection parameters.
17     * Also ensures that the specified tenant and database exist.
18     *
19     * @param string $host ChromaDB server host
20     * @param int $port ChromaDB server port
21     * @param string $tenant ChromaDB tenant name
22     * @param string $database ChromaDB database name
23     * @param string $ollamaHost Ollama server host
24     * @param int $ollamaPort Ollama server port
25     * @param string $ollamaModel Ollama embeddings model
26     */
27    public function __construct($host = null, $port = null, $tenant = null, $database = null, $ollamaHost = null, $ollamaPort = null, $ollamaModel = null) {
28        // Load DokuWiki plugin configuration
29        global $conf;
30
31        // Use provided parameters or fall back to configuration values
32        $host = $host ?? ($conf['plugin']['dokullm']['chroma_host'] ?? '127.0.0.1');
33        $port = $port ?? ($conf['plugin']['dokullm']['chroma_port'] ?? 8000);
34        $this->tenant = $tenant ?? ($conf['plugin']['dokullm']['chroma_tenant'] ?? 'dokullm');
35        $this->database = $database ?? ($conf['plugin']['dokullm']['chroma_database'] ?? 'dokullm');
36        $this->ollamaHost = $ollamaHost ?? ($conf['plugin']['dokullm']['ollama_host'] ?? '127.0.0.1');
37        $this->ollamaPort = $ollamaPort ?? ($conf['plugin']['dokullm']['ollama_port'] ?? 11434);
38        $this->ollamaModel = $ollamaModel ?? ($conf['plugin']['dokullm']['ollama_embeddings_model'] ?? 'nomic-embed-text');
39
40        $this->baseUrl = "http://{$host}:{$port}";
41        $this->client = curl_init();
42        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
43        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
44            'Content-Type: application/json',
45            'Accept: application/json'
46        ]);
47
48        // Initialize Ollama client
49        $this->ollamaClient = curl_init();
50        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
51        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
52            'Content-Type: application/json'
53        ]);
54
55        // Check if tenant and database exist, create them if they don't
56        $this->ensureTenantAndDatabase();
57    }
58
59    /**
60     * Clean up the cURL client when the object is destroyed
61     *
62     * @return void
63     */
64    public function __destruct() {
65        curl_close($this->client);
66        curl_close($this->ollamaClient);
67    }
68
69    /**
70     * Make an HTTP request to the ChromaDB API
71     *
72     * This is a helper function that handles making HTTP requests to the ChromaDB API,
73     * including setting the appropriate headers for tenant and database.
74     *
75     * @param string $endpoint The API endpoint to call
76     * @param string $method The HTTP method to use (default: 'GET')
77     * @param array|null $data The data to send with the request (default: null)
78     * @return array The JSON response decoded as an array
79     * @throws Exception If there's a cURL error or HTTP error
80     */
81    private function makeRequest($endpoint, $method = 'GET', $data = null) {
82        // Add tenant and database as headers instead of query parameters for v2 API
83        $headers = [
84            'Content-Type: application/json',
85            'Accept: application/json'
86        ];
87
88        $url = $this->baseUrl . '/api/v2' . $endpoint;
89
90        curl_setopt($this->client, CURLOPT_URL, $url);
91        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
92        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
93
94        if ($data) {
95            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
96        } else {
97            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
98        }
99
100        $response = curl_exec($this->client);
101        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
102
103        if (curl_error($this->client)) {
104            throw new \Exception('Curl error: ' . curl_error($this->client));
105        }
106
107        if ($httpCode >= 400) {
108            throw new \Exception("HTTP Error: $httpCode, Response: $response");
109        }
110
111        return json_decode($response, true);
112    }
113
114    /**
115     * Generate embeddings for text using Ollama
116     *
117     * @param string $text The text to generate embeddings for
118     * @return array The embeddings vector
119     */
120    public function generateEmbeddings($text) {
121        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
122
123        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
124
125        $data = [
126            'model' => $this->ollamaModel,
127            'prompt' => $text,
128            'keep_alive' => '30m'
129        ];
130
131        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
132
133        $response = curl_exec($this->ollamaClient);
134        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
135
136        if (curl_error($this->ollamaClient)) {
137            throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
138        }
139
140        if ($httpCode >= 400) {
141            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
142        }
143
144        $result = json_decode($response, true);
145
146        if (!isset($result['embedding'])) {
147            throw new \Exception("Ollama response missing embedding: " . $response);
148        }
149
150        return $result['embedding'];
151    }
152
153    /**
154     * List all collections in the database
155     *
156     * Retrieves a list of all collections in the specified tenant and database.
157     *
158     * @return array List of collections
159     */
160    public function listCollections() {
161        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
162        return $this->makeRequest($endpoint);
163    }
164
165    /**
166     * Get a collection by name
167     *
168     * Retrieves information about a specific collection by its name.
169     *
170     * @param string $name The name of the collection to retrieve
171     * @return array The collection information
172     * @throws Exception If the collection is not found
173     */
174    public function getCollection($name) {
175        // Use provided name, fallback to 'documents' if empty
176        if (empty($name)) {
177            $name = 'documents';
178        }
179
180        // First try to get collection by name
181        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
182        $collections = $this->makeRequest($endpoint);
183
184        // Find collection by name
185        foreach ($collections as $collection) {
186            if (isset($collection['name']) && $collection['name'] === $name) {
187                return $collection;
188            }
189        }
190
191        // If not found, throw exception
192        throw new \Exception("Collection '{$name}' not found");
193    }
194
195    /**
196     * Create a new collection
197     *
198     * Creates a new collection with the specified name and optional metadata.
199     *
200     * @param string $name The name of the collection to create
201     * @param array|null $metadata Optional metadata for the collection
202     * @return array The response from the API
203     */
204    public function createCollection($name, $metadata = null) {
205        // Use provided name, fallback to 'documents' if empty
206        if (empty($name)) {
207            $name = 'documents';
208        }
209
210        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
211        $data = ['name' => $name];
212        if ($metadata) {
213            $data['metadata'] = $metadata;
214        }
215        return $this->makeRequest($endpoint, 'POST', $data);
216    }
217
218    /**
219     * Delete a collection by name
220     *
221     * Deletes a collection with the specified name.
222     *
223     * @param string $name The name of the collection to delete
224     * @return array The response from the API
225     * @throws Exception If the collection ID is not found
226     */
227    public function deleteCollection($name) {
228        // Use provided name, fallback to 'documents' if empty
229        if (empty($name)) {
230            $name = 'documents';
231        }
232
233        // First get the collection to find its ID
234        $collection = $this->getCollection($name);
235        if (!isset($collection['id'])) {
236            throw new \Exception("Collection ID not found for '{$name}'");
237        }
238
239        $collectionId = $collection['id'];
240        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
241        return $this->makeRequest($endpoint, 'DELETE');
242    }
243
244    /**
245     * Get a document by its ID from a collection
246     *
247     * Retrieves a document from the specified collection using its ID.
248     *
249     * @param string $collectionName The name of the collection to get the document from
250     * @param string $documentId The document ID to retrieve
251     * @param array $include What to include in the response (default: ["metadatas", "documents"])
252     * @return array The retrieved document
253     * @throws Exception If the collection ID is not found
254     */
255    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
256        // Use provided name, fallback to 'documents' if empty
257        if (empty($collectionName)) {
258            $collectionName = 'documents';
259        }
260
261        // First get the collection to find its ID
262        $collection = $this->getCollection($collectionName);
263        if (!isset($collection['id'])) {
264            throw new \Exception("Collection ID not found for '{$collectionName}'");
265        }
266
267        $collectionId = $collection['id'];
268        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
269        $data = [
270            'ids' => [$documentId],
271            'include' => $include
272        ];
273
274        return $this->makeRequest($endpoint, 'POST', $data);
275    }
276
277    /**
278     * Add documents to a collection
279     *
280     * Adds documents to the specified collection. Each document must have a corresponding ID.
281     * Optional metadata and pre-computed embeddings can also be provided.
282     *
283     * @param string $collectionName The name of the collection to add documents to
284     * @param array $documents The document contents
285     * @param array $ids The document IDs
286     * @param array|null $metadatas Optional metadata for each document
287     * @param array|null $embeddings Optional pre-computed embeddings for each document
288     * @return array The response from the API
289     * @throws Exception If the collection ID is not found
290     */
291    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
292        // Use provided name, fallback to 'documents' if empty
293        if (empty($collectionName)) {
294            $collectionName = 'documents';
295        }
296
297        // First get the collection to find its ID
298        $collection = $this->getCollection($collectionName);
299        if (!isset($collection['id'])) {
300            throw new \Exception("Collection ID not found for '{$collectionName}'");
301        }
302
303        $collectionId = $collection['id'];
304        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
305        $data = [
306            'ids' => $ids,
307            'documents' => $documents
308        ];
309
310        if ($metadatas) {
311            $data['metadatas'] = $metadatas;
312        }
313
314        if ($embeddings) {
315            $data['embeddings'] = $embeddings;
316        }
317
318        return $this->makeRequest($endpoint, 'POST', $data);
319    }
320
321    /**
322     * Check if a document needs to be updated based on timestamp comparison
323     *
324     * Determines whether a document should be reprocessed by comparing the file's last modification
325     * time with the processed_at timestamp stored in the document's metadata. The function checks
326     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
327     * not included in the database.
328     *
329     * @param string $collectionId The ID of the collection to check documents in
330     * @param string $documentId The base document ID to check (without chunk suffixes)
331     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
332     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
333     * @throws Exception If there's an error checking the document
334     */
335    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
336        try {
337            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
338
339            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
340            $chunkIdsToCheck = [
341                $documentId . '@1',
342                $documentId . '@2',
343                $documentId . '@3'
344            ];
345
346            $data = [
347                'ids' => $chunkIdsToCheck,
348                'include' => [
349                    "metadatas"
350                ],
351                'limit' => 1
352            ];
353
354            // Check if document exists
355            $result = $this->makeRequest($endpoint, 'POST', $data);
356
357            // If no documents found, return true (needs to be added)
358            if (empty($result['ids'])) {
359                return true;
360            }
361
362            // Check if any document has a processed_at timestamp
363            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
364                // Check the first metadata entry directly
365                $metadata = $result['metadatas'][0];
366
367                // If processed_at is not set, return true (needs update)
368                if (!isset($metadata['processed_at'])) {
369                    return true;
370                }
371
372                // Parse the processed_at timestamp
373                $processedTimestamp = strtotime($metadata['processed_at']);
374
375                // If file is newer than processed time, return true (needs update)
376                if ($fileModifiedTime > $processedTimestamp) {
377                    return true;
378                }
379            }
380
381            // Document exists and is up to date
382            return false;
383        } catch (Exception $e) {
384            // If there's an error checking the document, assume it needs to be updated
385            return true;
386        }
387    }
388
389    /**
390     * Query a collection for similar documents
391     *
392     * Queries the specified collection for documents similar to the provided query texts.
393     * The function generates embeddings for the query texts and sends them to ChromaDB.
394     * Supports filtering results by metadata using the where parameter.
395     *
396     * @param string $collectionName The name of the collection to query
397     * @param array $queryTexts The query texts to search for
398     * @param int $nResults The number of results to return (default: 5)
399     * @param array|null $where Optional filter conditions for metadata
400     * @return array The query results
401     * @throws Exception If the collection ID is not found
402     */
403    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
404        // Use provided name, fallback to 'documents' if empty
405        if (empty($collectionName)) {
406            $collectionName = 'documents';
407        }
408
409        // First get the collection to find its ID
410        $collection = $this->getCollection($collectionName);
411        if (!isset($collection['id'])) {
412            throw new \Exception("Collection ID not found for '{$collectionName}'");
413        }
414
415        $collectionId = $collection['id'];
416        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
417
418        // Generate embeddings for query texts
419        $queryEmbeddings = [];
420        foreach ($queryTexts as $text) {
421            $queryEmbeddings[] = $this->generateEmbeddings($text);
422        }
423
424        $data = [
425            'query_embeddings' => $queryEmbeddings,
426            'n_results' => $nResults
427        ];
428
429        // Add where clause for metadata filtering if provided
430        if ($where && is_array($where)) {
431            $data['where'] = $where;
432        }
433
434        return $this->makeRequest($endpoint, 'POST', $data);
435    }
436
437    /**
438     * Check if the ChromaDB server is alive
439     *
440     * Sends a heartbeat request to verify that the ChromaDB server is running.
441     *
442     * @return array The response from the heartbeat endpoint
443     */
444    public function heartbeat() {
445        $endpoint = "/heartbeat";
446        return $this->makeRequest($endpoint, 'GET');
447    }
448
449    /**
450     * Get authentication and identity information
451     *
452     * Retrieves authentication and identity information from the ChromaDB server.
453     *
454     * @return array The response from the auth/identity endpoint
455     */
456    public function getIdentity() {
457        $endpoint = "/identity";
458        return $this->makeRequest($endpoint, 'GET');
459    }
460
461    /**
462     * Ensure that the specified tenant and database exist
463     *
464     * Checks if the specified tenant and database exist, and creates them if they don't.
465     *
466     * @return void
467     */
468    private function ensureTenantAndDatabase() {
469        // Check if tenant exists, create if it doesn't
470        try {
471            $this->getTenant($this->tenant);
472        } catch (\Exception $e) {
473            // Tenant doesn't exist, create it
474            $this->createTenant($this->tenant);
475        }
476
477        // Check if database exists, create if it doesn't
478        try {
479            $this->getDatabase($this->database, $this->tenant);
480        } catch (\Exception $e) {
481            // Database doesn't exist, create it
482            $this->createDatabase($this->database, $this->tenant);
483        }
484    }
485
486    /**
487     * Get tenant information
488     *
489     * Retrieves information about the specified tenant.
490     *
491     * @param string $tenantName The tenant name
492     * @return array The tenant information
493     */
494    public function getTenant($tenantName) {
495        $endpoint = "/tenants/{$tenantName}";
496        return $this->makeRequest($endpoint, 'GET');
497    }
498
499    /**
500     * Create a new tenant
501     *
502     * Creates a new tenant with the specified name.
503     *
504     * @param string $tenantName The tenant name
505     * @return array The response from the API
506     */
507    public function createTenant($tenantName) {
508        $endpoint = "/tenants";
509        $data = ['name' => $tenantName];
510        return $this->makeRequest($endpoint, 'POST', $data);
511    }
512
513    /**
514     * Get database information
515     *
516     * Retrieves information about the specified database within a tenant.
517     *
518     * @param string $databaseName The database name
519     * @param string $tenantName The tenant name
520     * @return array The database information
521     */
522    public function getDatabase($databaseName, $tenantName) {
523        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
524        return $this->makeRequest($endpoint, 'GET');
525    }
526
527    /**
528     * Create a new database
529     *
530     * Creates a new database with the specified name within a tenant.
531     *
532     * @param string $databaseName The database name
533     * @param string $tenantName The tenant name
534     * @return array The response from the API
535     */
536    public function createDatabase($databaseName, $tenantName) {
537        $endpoint = "/tenants/{$tenantName}/databases";
538        $data = ['name' => $databaseName];
539        return $this->makeRequest($endpoint, 'POST', $data);
540    }
541
542    /**
543     * Ensure a collection exists, creating it if necessary
544     *
545     * This helper function checks if a collection exists and creates it if it doesn't.
546     *
547     * @param string $collectionName The name of the collection to check/create
548     * @return string Status message indicating what happened
549     */
550    public function ensureCollectionExists($collectionName) {
551        try {
552            $collection = $this->getCollection($collectionName);
553            return "Collection '$collectionName' already exists.";
554        } catch (\Exception $e) {
555            // Collection doesn't exist, create it
556            $created = $this->createCollection($collectionName);
557            return "Collection '$collectionName' created.";
558        }
559    }
560
561    /**
562     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
563     *
564     * This function handles the complete processing of a single DokuWiki file:
565     * 1. Parses the file path to extract metadata and document ID
566     * 2. Determines the appropriate collection based on document ID
567     * 3. Checks if the document needs updating using timestamp comparison
568     * 4. Reads and processes file content only if update is needed
569     * 5. Splits the document into chunks (paragraphs)
570     * 6. Extracts rich metadata from the DokuWiki ID format
571     * 7. Generates embeddings for each chunk
572     * 8. Sends all chunks to ChromaDB with metadata
573     *
574     * Supported ID formats:
575     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
576     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
577     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
578     *
579     * The function implements smart update checking by comparing file modification time
580     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
581     *
582     * @param string $filePath The path to the file to process
583     * @param string $collectionName The name of the collection to use
584     * @param bool $collectionChecked Whether the collection has already been checked/created
585     * @return array Result with status and details
586     */
587    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
588        // Parse file path to extract metadata
589        $id = parseFilePath($filePath);
590
591        try {
592            // Create collection if it doesn't exist (only if not already checked)
593            $collectionStatus = '';
594            if (!$collectionChecked) {
595                $collectionStatus = $this->ensureCollectionExists($collectionName);
596            }
597
598            // Get collection ID
599            $collection = $this->getCollection($collectionName);
600            if (!isset($collection['id'])) {
601                return [
602                    'status' => 'error',
603                    'message' => "Collection ID not found for '{$collectionName}'"
604                ];
605            }
606            $collectionId = $collection['id'];
607
608            // Get file modification time
609            $fileModifiedTime = filemtime($filePath);
610
611            // Check if document needs update
612            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
613
614            // If document is up to date, skip processing
615            if (!$needsUpdate) {
616                return [
617                    'status' => 'skipped',
618                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
619                ];
620            }
621
622            // Read file content
623            $content = file_get_contents($filePath);
624
625            // Split document into chunks (paragraphs separated by two newlines)
626            $paragraphs = preg_split('/\n\s*\n/', $content);
627            $chunks = [];
628            $chunkMetadata = [];
629
630            // Parse the DokuWiki ID to extract base metadata
631            $parts = explode(':', $id);
632
633            // Extract metadata from the last part of the ID
634            $lastPart = end($parts);
635            $baseMetadata = [];
636
637            // Add the document ID as metadata
638            $baseMetadata['document_id'] = $id;
639
640            // Add current timestamp
641            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
642
643            // Check if any part of the ID is 'templates' and set template metadata
644            $isTemplate = in_array('templates', $parts);
645            if ($isTemplate) {
646                $baseMetadata['type'] = 'template';
647            } else {
648                $baseMetadata['type'] = 'report';
649            }
650
651            // Extract modality from the second part
652            if (isset($parts[1])) {
653                $baseMetadata['modality'] = $parts[1];
654            }
655
656            // Handle different ID formats based on the third part: word (institution) or numeric (year)
657            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
658            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
659            // For templates, don't set institution, date or year
660            if (isset($parts[2]) && !$isTemplate) {
661                // Check if third part is numeric (year) or word (institution)
662                if (is_numeric($parts[2])) {
663                    // Format: reports:mri:2024:g287-name-surname (year format)
664                    // Extract year from the third part
665                    $baseMetadata['year'] = $parts[2];
666
667                    // Set default institution from config
668                    global $conf;
669                    $baseMetadata['institution'] = $conf['plugin']['dokullm']['default_institution'] ?? 'default';
670
671                    // Extract registration and name from the last part
672                    // Registration should start with one letter or number and contain numbers before the '-' character
673                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
674                        // Check if the first part contains at least one digit to be considered a registration
675                        if (preg_match('/[0-9]/', $matches[1])) {
676                            $baseMetadata['registration'] = $matches[1];
677                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
678                        } else {
679                            // If no registration pattern found, treat entire part as patient name
680                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
681                        }
682                    } else {
683                        // If no match, treat entire part as patient name
684                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
685                    }
686                } else {
687                    // Format: reports:mri:institution:250620-name-surname (institution format)
688                    // Extract institution from the third part
689                    $baseMetadata['institution'] = $parts[2];
690
691                    // Extract date and name from the last part
692                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
693                        $dateStr = $matches[1];
694                        $name = $matches[2];
695
696                        // Convert date format (250620 -> 2025-06-20)
697                        $day = substr($dateStr, 0, 2);
698                        $month = substr($dateStr, 2, 2);
699                        $year = substr($dateStr, 4, 2);
700                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
701                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
702                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
703
704                        $baseMetadata['date'] = $formattedDate;
705                        $baseMetadata['name'] = str_replace('-', ' ', $name);
706                    }
707                }
708            }
709
710            // For templates, always extract name from the last part
711            if ($isTemplate && isset($lastPart)) {
712                // Extract name from the last part (everything after the last colon)
713                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
714                    // Check if the first part contains at least one digit to be considered a registration
715                    if (preg_match('/[0-9]/', $matches[1])) {
716                        $baseMetadata['registration'] = $matches[1];
717                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
718                    } else {
719                        // If no registration pattern found, treat entire part as template name
720                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
721                    }
722                } else {
723                    // If no match, treat entire part as template name
724                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
725                }
726            }
727
728            // Process each paragraph as a chunk with intelligent metadata handling
729            $chunkIds = [];
730            $chunkContents = [];
731            $chunkMetadatas = [];
732            $chunkEmbeddings = [];
733            $currentTags = [];
734
735            foreach ($paragraphs as $index => $paragraph) {
736                // Skip empty paragraphs to avoid processing whitespace-only content
737                $paragraph = trim($paragraph);
738                if (empty($paragraph)) {
739                    continue;
740                }
741
742                // Check if this is a DokuWiki title (starts and ends with =)
743                // Titles are converted to tags for better searchability but not stored as content chunks
744                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
745                    // Extract title content and clean it
746                    $titleContent = trim($matches[1]);
747
748                    // Split into words and create searchable tags
749                    $words = preg_split('/\s+/', $titleContent);
750                    $tags = [];
751
752                    foreach ($words as $word) {
753                        // Only use words longer than 3 characters to reduce noise
754                        if (strlen($word) >= 3) {
755                            $tags[] = strtolower($word);
756                        }
757                    }
758
759                    // Remove duplicate tags and store for use in subsequent chunks
760                    $currentTags = array_unique($tags);
761                    continue; // Skip storing title chunks as content
762                }
763
764                // Create chunk ID
765                $chunkId = $id . '@' . ($index + 1);
766
767                // Generate embeddings for the chunk
768                $embeddings = $this->generateEmbeddings($paragraph);
769
770                // Add chunk-specific metadata
771                $metadata = $baseMetadata;
772                $metadata['chunk_id'] = $chunkId;
773                $metadata['chunk_number'] = $index + 1;
774                $metadata['total_chunks'] = count($paragraphs);
775
776                // Add current tags to metadata if any exist
777                if (!empty($currentTags)) {
778                    $metadata['tags'] = implode(',', $currentTags);
779                }
780
781                // Store chunk data
782                $chunkIds[] = $chunkId;
783                $chunkContents[] = $paragraph;
784                $chunkMetadatas[] = $metadata;
785                $chunkEmbeddings[] = $embeddings;
786            }
787
788            // If no chunks were created, skip this file
789            if (empty($chunkIds)) {
790                return [
791                    'status' => 'skipped',
792                    'message' => "No valid chunks found in file '$id'. Skipping..."
793                ];
794            }
795
796            // Send all chunks to ChromaDB
797            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
798
799            return [
800                'status' => 'success',
801                'message' => "Successfully sent file to ChromaDB",
802                'details' => [
803                    'document_id' => $id,
804                    'chunks' => count($chunkIds),
805                    'collection' => $collectionName
806                ],
807                'collection_status' => $collectionStatus
808            ];
809        } catch (Exception $e) {
810            return [
811                'status' => 'error',
812                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
813            ];
814        }
815    }
816
817    /**
818     * Process all DokuWiki files in a directory and send them to ChromaDB
819     *
820     * This function recursively processes all .txt files in a directory and its subdirectories.
821     * It first checks if the appropriate collection exists and creates it if needed.
822     * Then it processes each file individually.
823     *
824     * @param string $dirPath The directory path to process
825     * @return array Result with status and details
826     */
827    public function processDirectory($dirPath) {
828        // Check if directory exists
829        if (!is_dir($dirPath)) {
830            return [
831                'status' => 'error',
832                'message' => "Directory does not exist: $dirPath"
833            ];
834        }
835
836        // Create RecursiveIteratorIterator to process directories recursively
837        $iterator = new RecursiveIteratorIterator(
838            new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS),
839            RecursiveIteratorIterator::LEAVES_ONLY
840        );
841
842        $files = [];
843        foreach ($iterator as $file) {
844            // Process only .txt files that don't start with underscore
845            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
846                $files[] = $file->getPathname();
847            }
848        }
849
850        if (empty($files)) {
851            return [
852                'status' => 'skipped',
853                'message' => "No .txt files found in directory: $dirPath"
854            ];
855        }
856
857        // Use the first part of the document ID as collection name, fallback to 'documents'
858        $sampleFile = $files[0];
859        $id = parseFilePath($sampleFile);
860        $idParts = explode(':', $id);
861        $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents';
862
863        try {
864            $this->ensureCollectionExists($collectionName);
865            $collectionChecked = true;
866        } catch (Exception $e) {
867            $collectionChecked = true;
868        }
869
870        $results = [];
871        foreach ($files as $file) {
872            $result = $this->processSingleFile($file, $collectionName, $collectionChecked);
873            $results[] = [
874                'file' => $file,
875                'result' => $result
876            ];
877        }
878
879        return [
880            'status' => 'success',
881            'message' => "Finished processing directory.",
882            'files_count' => count($files),
883            'results' => $results
884        ];
885    }
886}
887
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the leading pages directory (DOKU_INC . 'data/pages/' when DOKU_INC
 *    is defined, otherwise '/var/www/html/dokuwiki/data/pages/')
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons
 *
 * Empty path segments (leading, trailing or doubled slashes) are dropped. A
 * segment consisting of the single character "0" is a valid name and is kept.
 * A path that does not start with the pages directory is converted as a whole.
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available,
    // otherwise fall back to a common DokuWiki installation path
    $pagesDir = defined('DOKU_INC')
        ? DOKU_INC . 'data/pages/'
        : '/var/www/html/dokuwiki/data/pages/';

    // On Windows, paths may contain backslashes; normalise both sides to '/'
    // so that prefix matching and splitting work. Other platforms are left
    // untouched because a backslash is an ordinary filename character there.
    if (DIRECTORY_SEPARATOR === '\\') {
        $filePath = str_replace('\\', '/', $filePath);
        $pagesDir = str_replace('\\', '/', $pagesDir);
    }

    // Strip the pages directory only when it is the leading prefix, so an
    // identical substring deeper in the path is never removed by accident
    $relativePath = (string) $filePath;
    $prefixLength = strlen($pagesDir);
    if (strncmp($relativePath, $pagesDir, $prefixLength) === 0) {
        $relativePath = (string) substr($relativePath, $prefixLength);
    }

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Drop empty segments only. A strict comparison is required here: the
    // default array_filter()/empty() test would also discard a segment "0".
    $segments = array_filter(
        explode('/', $relativePath),
        static function ($segment) {
            return $segment !== '';
        }
    );

    return implode(':', $segments);
}
930
931