xref: /plugin/dokullm/ChromaDBClient.php (revision 590368144294a28ecf0e0e39feb976bf79fefb1e)
1*59036814SCostin Stroie<?php
2*59036814SCostin Stroie
3*59036814SCostin Stroienamespace dokuwiki\plugin\dokullm;
4*59036814SCostin Stroie
5*59036814SCostin Stroieclass ChromaDBClient {
6*59036814SCostin Stroie    private $baseUrl;
7*59036814SCostin Stroie    private $client;
8*59036814SCostin Stroie    private $ollamaClient;
9*59036814SCostin Stroie    private $tenant;
10*59036814SCostin Stroie    private $database;
11*59036814SCostin Stroie    private $ollamaHost;
12*59036814SCostin Stroie    private $ollamaPort;
13*59036814SCostin Stroie    /**
14*59036814SCostin Stroie     * Initialize the ChromaDB client
15*59036814SCostin Stroie     *
16*59036814SCostin Stroie     * Creates a new ChromaDB client instance with the specified connection parameters.
17*59036814SCostin Stroie     * Also ensures that the specified tenant and database exist.
18*59036814SCostin Stroie     *
19*59036814SCostin Stroie     * @param string $host ChromaDB server host (default: CHROMA_HOST)
20*59036814SCostin Stroie     * @param int $port ChromaDB server port (default: CHROMA_PORT)
21*59036814SCostin Stroie     * @param string $tenant ChromaDB tenant name (default: CHROMA_TENANT)
22*59036814SCostin Stroie     * @param string $database ChromaDB database name (default: CHROMA_DATABASE)
23*59036814SCostin Stroie     * @param string $ollamaHost Ollama server host (default: OLLAMA_HOST)
24*59036814SCostin Stroie     * @param int $ollamaPort Ollama server port (default: OLLAMA_PORT)
25*59036814SCostin Stroie     * @param string $ollamaModel Ollama embeddings model (default: OLLAMA_EMBEDDINGS_MODEL)
26*59036814SCostin Stroie     */
27*59036814SCostin Stroie    public function __construct($host = CHROMA_HOST, $port = CHROMA_PORT, $tenant = CHROMA_TENANT, $database = CHROMA_DATABASE, $ollamaHost = OLLAMA_HOST, $ollamaPort = OLLAMA_PORT, $ollamaModel = OLLAMA_EMBEDDINGS_MODEL) {
28*59036814SCostin Stroie        $this->baseUrl = "http://{$host}:{$port}";
29*59036814SCostin Stroie        $this->tenant = $tenant;
30*59036814SCostin Stroie        $this->database = $database;
31*59036814SCostin Stroie        $this->ollamaHost = $ollamaHost;
32*59036814SCostin Stroie        $this->ollamaPort = $ollamaPort;
33*59036814SCostin Stroie        $this->ollamaModel = $ollamaModel;
34*59036814SCostin Stroie        $this->client = curl_init();
35*59036814SCostin Stroie        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
36*59036814SCostin Stroie        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
37*59036814SCostin Stroie            'Content-Type: application/json',
38*59036814SCostin Stroie            'Accept: application/json'
39*59036814SCostin Stroie        ]);
40*59036814SCostin Stroie
41*59036814SCostin Stroie        // Initialize Ollama client
42*59036814SCostin Stroie        $this->ollamaClient = curl_init();
43*59036814SCostin Stroie        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
44*59036814SCostin Stroie        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
45*59036814SCostin Stroie            'Content-Type: application/json'
46*59036814SCostin Stroie        ]);
47*59036814SCostin Stroie
48*59036814SCostin Stroie        // Check if tenant and database exist, create them if they don't
49*59036814SCostin Stroie        $this->ensureTenantAndDatabase();
50*59036814SCostin Stroie    }
51*59036814SCostin Stroie
52*59036814SCostin Stroie    /**
53*59036814SCostin Stroie     * Clean up the cURL client when the object is destroyed
54*59036814SCostin Stroie     *
55*59036814SCostin Stroie     * @return void
56*59036814SCostin Stroie     */
57*59036814SCostin Stroie    public function __destruct() {
58*59036814SCostin Stroie        curl_close($this->client);
59*59036814SCostin Stroie        curl_close($this->ollamaClient);
60*59036814SCostin Stroie    }
61*59036814SCostin Stroie
62*59036814SCostin Stroie    /**
63*59036814SCostin Stroie     * Make an HTTP request to the ChromaDB API
64*59036814SCostin Stroie     *
65*59036814SCostin Stroie     * This is a helper function that handles making HTTP requests to the ChromaDB API,
66*59036814SCostin Stroie     * including setting the appropriate headers for tenant and database.
67*59036814SCostin Stroie     *
68*59036814SCostin Stroie     * @param string $endpoint The API endpoint to call
69*59036814SCostin Stroie     * @param string $method The HTTP method to use (default: 'GET')
70*59036814SCostin Stroie     * @param array|null $data The data to send with the request (default: null)
71*59036814SCostin Stroie     * @return array The JSON response decoded as an array
72*59036814SCostin Stroie     * @throws Exception If there's a cURL error or HTTP error
73*59036814SCostin Stroie     */
74*59036814SCostin Stroie    private function makeRequest($endpoint, $method = 'GET', $data = null) {
75*59036814SCostin Stroie        // Add tenant and database as headers instead of query parameters for v2 API
76*59036814SCostin Stroie        $headers = [
77*59036814SCostin Stroie            'Content-Type: application/json',
78*59036814SCostin Stroie            'Accept: application/json'
79*59036814SCostin Stroie        ];
80*59036814SCostin Stroie
81*59036814SCostin Stroie        $url = $this->baseUrl . '/api/v2' . $endpoint;
82*59036814SCostin Stroie
83*59036814SCostin Stroie        curl_setopt($this->client, CURLOPT_URL, $url);
84*59036814SCostin Stroie        curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
85*59036814SCostin Stroie        curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
86*59036814SCostin Stroie
87*59036814SCostin Stroie        if ($data) {
88*59036814SCostin Stroie            curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data));
89*59036814SCostin Stroie        } else {
90*59036814SCostin Stroie            curl_setopt($this->client, CURLOPT_POSTFIELDS, null);
91*59036814SCostin Stroie        }
92*59036814SCostin Stroie
93*59036814SCostin Stroie        $response = curl_exec($this->client);
94*59036814SCostin Stroie        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
95*59036814SCostin Stroie
96*59036814SCostin Stroie        if (curl_error($this->client)) {
97*59036814SCostin Stroie            throw new Exception('Curl error: ' . curl_error($this->client));
98*59036814SCostin Stroie        }
99*59036814SCostin Stroie
100*59036814SCostin Stroie        if ($httpCode >= 400) {
101*59036814SCostin Stroie            throw new Exception("HTTP Error: $httpCode, Response: $response");
102*59036814SCostin Stroie        }
103*59036814SCostin Stroie
104*59036814SCostin Stroie        return json_decode($response, true);
105*59036814SCostin Stroie    }
106*59036814SCostin Stroie
107*59036814SCostin Stroie    /**
108*59036814SCostin Stroie     * Generate embeddings for text using Ollama
109*59036814SCostin Stroie     *
110*59036814SCostin Stroie     * @param string $text The text to generate embeddings for
111*59036814SCostin Stroie     * @return array The embeddings vector
112*59036814SCostin Stroie     */
113*59036814SCostin Stroie    public function generateEmbeddings($text) {
114*59036814SCostin Stroie        $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";
115*59036814SCostin Stroie
116*59036814SCostin Stroie        curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
117*59036814SCostin Stroie
118*59036814SCostin Stroie        $data = [
119*59036814SCostin Stroie            'model' => $this->ollamaModel,
120*59036814SCostin Stroie            'prompt' => $text,
121*59036814SCostin Stroie            'keep_alive' => '30m'
122*59036814SCostin Stroie        ];
123*59036814SCostin Stroie
124*59036814SCostin Stroie        curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data));
125*59036814SCostin Stroie
126*59036814SCostin Stroie        $response = curl_exec($this->ollamaClient);
127*59036814SCostin Stroie        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
128*59036814SCostin Stroie
129*59036814SCostin Stroie        if (curl_error($this->ollamaClient)) {
130*59036814SCostin Stroie            throw new Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
131*59036814SCostin Stroie        }
132*59036814SCostin Stroie
133*59036814SCostin Stroie        if ($httpCode >= 400) {
134*59036814SCostin Stroie            throw new Exception("Ollama HTTP Error: $httpCode, Response: $response");
135*59036814SCostin Stroie        }
136*59036814SCostin Stroie
137*59036814SCostin Stroie        $result = json_decode($response, true);
138*59036814SCostin Stroie
139*59036814SCostin Stroie        if (!isset($result['embedding'])) {
140*59036814SCostin Stroie            throw new Exception("Ollama response missing embedding: " . $response);
141*59036814SCostin Stroie        }
142*59036814SCostin Stroie
143*59036814SCostin Stroie        return $result['embedding'];
144*59036814SCostin Stroie    }
145*59036814SCostin Stroie
146*59036814SCostin Stroie    /**
147*59036814SCostin Stroie     * List all collections in the database
148*59036814SCostin Stroie     *
149*59036814SCostin Stroie     * Retrieves a list of all collections in the specified tenant and database.
150*59036814SCostin Stroie     *
151*59036814SCostin Stroie     * @return array List of collections
152*59036814SCostin Stroie     */
153*59036814SCostin Stroie    public function listCollections() {
154*59036814SCostin Stroie        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
155*59036814SCostin Stroie        return $this->makeRequest($endpoint);
156*59036814SCostin Stroie    }
157*59036814SCostin Stroie
158*59036814SCostin Stroie    /**
159*59036814SCostin Stroie     * Get a collection by name
160*59036814SCostin Stroie     *
161*59036814SCostin Stroie     * Retrieves information about a specific collection by its name.
162*59036814SCostin Stroie     *
163*59036814SCostin Stroie     * @param string $name The name of the collection to retrieve
164*59036814SCostin Stroie     * @return array The collection information
165*59036814SCostin Stroie     * @throws Exception If the collection is not found
166*59036814SCostin Stroie     */
167*59036814SCostin Stroie    public function getCollection($name) {
168*59036814SCostin Stroie        // Use provided name, fallback to 'documents' if empty
169*59036814SCostin Stroie        if (empty($name)) {
170*59036814SCostin Stroie            $name = 'documents';
171*59036814SCostin Stroie        }
172*59036814SCostin Stroie
173*59036814SCostin Stroie        // First try to get collection by name
174*59036814SCostin Stroie        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
175*59036814SCostin Stroie        $collections = $this->makeRequest($endpoint);
176*59036814SCostin Stroie
177*59036814SCostin Stroie        // Find collection by name
178*59036814SCostin Stroie        foreach ($collections as $collection) {
179*59036814SCostin Stroie            if (isset($collection['name']) && $collection['name'] === $name) {
180*59036814SCostin Stroie                return $collection;
181*59036814SCostin Stroie            }
182*59036814SCostin Stroie        }
183*59036814SCostin Stroie
184*59036814SCostin Stroie        // If not found, throw exception
185*59036814SCostin Stroie        throw new Exception("Collection '{$name}' not found");
186*59036814SCostin Stroie    }
187*59036814SCostin Stroie
188*59036814SCostin Stroie    /**
189*59036814SCostin Stroie     * Create a new collection
190*59036814SCostin Stroie     *
191*59036814SCostin Stroie     * Creates a new collection with the specified name and optional metadata.
192*59036814SCostin Stroie     *
193*59036814SCostin Stroie     * @param string $name The name of the collection to create
194*59036814SCostin Stroie     * @param array|null $metadata Optional metadata for the collection
195*59036814SCostin Stroie     * @return array The response from the API
196*59036814SCostin Stroie     */
197*59036814SCostin Stroie    public function createCollection($name, $metadata = null) {
198*59036814SCostin Stroie        // Use provided name, fallback to 'documents' if empty
199*59036814SCostin Stroie        if (empty($name)) {
200*59036814SCostin Stroie            $name = 'documents';
201*59036814SCostin Stroie        }
202*59036814SCostin Stroie
203*59036814SCostin Stroie        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections";
204*59036814SCostin Stroie        $data = ['name' => $name];
205*59036814SCostin Stroie        if ($metadata) {
206*59036814SCostin Stroie            $data['metadata'] = $metadata;
207*59036814SCostin Stroie        }
208*59036814SCostin Stroie        return $this->makeRequest($endpoint, 'POST', $data);
209*59036814SCostin Stroie    }
210*59036814SCostin Stroie
211*59036814SCostin Stroie    /**
212*59036814SCostin Stroie     * Delete a collection by name
213*59036814SCostin Stroie     *
214*59036814SCostin Stroie     * Deletes a collection with the specified name.
215*59036814SCostin Stroie     *
216*59036814SCostin Stroie     * @param string $name The name of the collection to delete
217*59036814SCostin Stroie     * @return array The response from the API
218*59036814SCostin Stroie     * @throws Exception If the collection ID is not found
219*59036814SCostin Stroie     */
220*59036814SCostin Stroie    public function deleteCollection($name) {
221*59036814SCostin Stroie        // Use provided name, fallback to 'documents' if empty
222*59036814SCostin Stroie        if (empty($name)) {
223*59036814SCostin Stroie            $name = 'documents';
224*59036814SCostin Stroie        }
225*59036814SCostin Stroie
226*59036814SCostin Stroie        // First get the collection to find its ID
227*59036814SCostin Stroie        $collection = $this->getCollection($name);
228*59036814SCostin Stroie        if (!isset($collection['id'])) {
229*59036814SCostin Stroie            throw new Exception("Collection ID not found for '{$name}'");
230*59036814SCostin Stroie        }
231*59036814SCostin Stroie
232*59036814SCostin Stroie        $collectionId = $collection['id'];
233*59036814SCostin Stroie        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}";
234*59036814SCostin Stroie        return $this->makeRequest($endpoint, 'DELETE');
235*59036814SCostin Stroie    }
236*59036814SCostin Stroie
237*59036814SCostin Stroie    /**
238*59036814SCostin Stroie     * Get a document by its ID from a collection
239*59036814SCostin Stroie     *
240*59036814SCostin Stroie     * Retrieves a document from the specified collection using its ID.
241*59036814SCostin Stroie     *
242*59036814SCostin Stroie     * @param string $collectionName The name of the collection to get the document from
243*59036814SCostin Stroie     * @param string $documentId The document ID to retrieve
244*59036814SCostin Stroie     * @param array $include What to include in the response (default: ["metadatas", "documents"])
245*59036814SCostin Stroie     * @return array The retrieved document
246*59036814SCostin Stroie     * @throws Exception If the collection ID is not found
247*59036814SCostin Stroie     */
248*59036814SCostin Stroie    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
249*59036814SCostin Stroie        // Use provided name, fallback to 'documents' if empty
250*59036814SCostin Stroie        if (empty($collectionName)) {
251*59036814SCostin Stroie            $collectionName = 'documents';
252*59036814SCostin Stroie        }
253*59036814SCostin Stroie
254*59036814SCostin Stroie        // First get the collection to find its ID
255*59036814SCostin Stroie        $collection = $this->getCollection($collectionName);
256*59036814SCostin Stroie        if (!isset($collection['id'])) {
257*59036814SCostin Stroie            throw new Exception("Collection ID not found for '{$collectionName}'");
258*59036814SCostin Stroie        }
259*59036814SCostin Stroie
260*59036814SCostin Stroie        $collectionId = $collection['id'];
261*59036814SCostin Stroie        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
262*59036814SCostin Stroie        $data = [
263*59036814SCostin Stroie            'ids' => [$documentId],
264*59036814SCostin Stroie            'include' => $include
265*59036814SCostin Stroie        ];
266*59036814SCostin Stroie
267*59036814SCostin Stroie        return $this->makeRequest($endpoint, 'POST', $data);
268*59036814SCostin Stroie    }
269*59036814SCostin Stroie
270*59036814SCostin Stroie    /**
271*59036814SCostin Stroie     * Add documents to a collection
272*59036814SCostin Stroie     *
273*59036814SCostin Stroie     * Adds documents to the specified collection. Each document must have a corresponding ID.
274*59036814SCostin Stroie     * Optional metadata and pre-computed embeddings can also be provided.
275*59036814SCostin Stroie     *
276*59036814SCostin Stroie     * @param string $collectionName The name of the collection to add documents to
277*59036814SCostin Stroie     * @param array $documents The document contents
278*59036814SCostin Stroie     * @param array $ids The document IDs
279*59036814SCostin Stroie     * @param array|null $metadatas Optional metadata for each document
280*59036814SCostin Stroie     * @param array|null $embeddings Optional pre-computed embeddings for each document
281*59036814SCostin Stroie     * @return array The response from the API
282*59036814SCostin Stroie     * @throws Exception If the collection ID is not found
283*59036814SCostin Stroie     */
284*59036814SCostin Stroie    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
285*59036814SCostin Stroie        // Use provided name, fallback to 'documents' if empty
286*59036814SCostin Stroie        if (empty($collectionName)) {
287*59036814SCostin Stroie            $collectionName = 'documents';
288*59036814SCostin Stroie        }
289*59036814SCostin Stroie
290*59036814SCostin Stroie        // First get the collection to find its ID
291*59036814SCostin Stroie        $collection = $this->getCollection($collectionName);
292*59036814SCostin Stroie        if (!isset($collection['id'])) {
293*59036814SCostin Stroie            throw new Exception("Collection ID not found for '{$collectionName}'");
294*59036814SCostin Stroie        }
295*59036814SCostin Stroie
296*59036814SCostin Stroie        $collectionId = $collection['id'];
297*59036814SCostin Stroie        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
298*59036814SCostin Stroie        $data = [
299*59036814SCostin Stroie            'ids' => $ids,
300*59036814SCostin Stroie            'documents' => $documents
301*59036814SCostin Stroie        ];
302*59036814SCostin Stroie
303*59036814SCostin Stroie        if ($metadatas) {
304*59036814SCostin Stroie            $data['metadatas'] = $metadatas;
305*59036814SCostin Stroie        }
306*59036814SCostin Stroie
307*59036814SCostin Stroie        if ($embeddings) {
308*59036814SCostin Stroie            $data['embeddings'] = $embeddings;
309*59036814SCostin Stroie        }
310*59036814SCostin Stroie
311*59036814SCostin Stroie        return $this->makeRequest($endpoint, 'POST', $data);
312*59036814SCostin Stroie    }
313*59036814SCostin Stroie
314*59036814SCostin Stroie    /**
315*59036814SCostin Stroie     * Check if a document needs to be updated based on timestamp comparison
316*59036814SCostin Stroie     *
317*59036814SCostin Stroie     * Determines whether a document should be reprocessed by comparing the file's last modification
318*59036814SCostin Stroie     * time with the processed_at timestamp stored in the document's metadata. The function checks
319*59036814SCostin Stroie     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
320*59036814SCostin Stroie     * not included in the database.
321*59036814SCostin Stroie     *
322*59036814SCostin Stroie     * @param string $collectionId The ID of the collection to check documents in
323*59036814SCostin Stroie     * @param string $documentId The base document ID to check (without chunk suffixes)
324*59036814SCostin Stroie     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
325*59036814SCostin Stroie     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
326*59036814SCostin Stroie     * @throws Exception If there's an error checking the document
327*59036814SCostin Stroie     */
328*59036814SCostin Stroie    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
329*59036814SCostin Stroie        try {
330*59036814SCostin Stroie            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";
331*59036814SCostin Stroie
332*59036814SCostin Stroie            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
333*59036814SCostin Stroie            $chunkIdsToCheck = [
334*59036814SCostin Stroie                $documentId . '@1',
335*59036814SCostin Stroie                $documentId . '@2',
336*59036814SCostin Stroie                $documentId . '@3'
337*59036814SCostin Stroie            ];
338*59036814SCostin Stroie
339*59036814SCostin Stroie            $data = [
340*59036814SCostin Stroie                'ids' => $chunkIdsToCheck,
341*59036814SCostin Stroie                'include' => [
342*59036814SCostin Stroie                    "metadatas"
343*59036814SCostin Stroie                ],
344*59036814SCostin Stroie                'limit' => 1
345*59036814SCostin Stroie            ];
346*59036814SCostin Stroie
347*59036814SCostin Stroie            // Check if document exists
348*59036814SCostin Stroie            $result = $this->makeRequest($endpoint, 'POST', $data);
349*59036814SCostin Stroie
350*59036814SCostin Stroie            // If no documents found, return true (needs to be added)
351*59036814SCostin Stroie            if (empty($result['ids'])) {
352*59036814SCostin Stroie                return true;
353*59036814SCostin Stroie            }
354*59036814SCostin Stroie
355*59036814SCostin Stroie            // Check if any document has a processed_at timestamp
356*59036814SCostin Stroie            if (!empty($result['metadatas']) && is_array($result['metadatas'])) {
357*59036814SCostin Stroie                // Check the first metadata entry directly
358*59036814SCostin Stroie                $metadata = $result['metadatas'][0];
359*59036814SCostin Stroie
360*59036814SCostin Stroie                // If processed_at is not set, return true (needs update)
361*59036814SCostin Stroie                if (!isset($metadata['processed_at'])) {
362*59036814SCostin Stroie                    return true;
363*59036814SCostin Stroie                }
364*59036814SCostin Stroie
365*59036814SCostin Stroie                // Parse the processed_at timestamp
366*59036814SCostin Stroie                $processedTimestamp = strtotime($metadata['processed_at']);
367*59036814SCostin Stroie
368*59036814SCostin Stroie                // If file is newer than processed time, return true (needs update)
369*59036814SCostin Stroie                if ($fileModifiedTime > $processedTimestamp) {
370*59036814SCostin Stroie                    return true;
371*59036814SCostin Stroie                }
372*59036814SCostin Stroie            }
373*59036814SCostin Stroie
374*59036814SCostin Stroie            // Document exists and is up to date
375*59036814SCostin Stroie            return false;
376*59036814SCostin Stroie        } catch (Exception $e) {
377*59036814SCostin Stroie            // If there's an error checking the document, assume it needs to be updated
378*59036814SCostin Stroie            return true;
379*59036814SCostin Stroie        }
380*59036814SCostin Stroie    }
381*59036814SCostin Stroie
382*59036814SCostin Stroie    /**
383*59036814SCostin Stroie     * Query a collection for similar documents
384*59036814SCostin Stroie     *
385*59036814SCostin Stroie     * Queries the specified collection for documents similar to the provided query texts.
386*59036814SCostin Stroie     * The function generates embeddings for the query texts and sends them to ChromaDB.
387*59036814SCostin Stroie     * Supports filtering results by metadata using the where parameter.
388*59036814SCostin Stroie     *
389*59036814SCostin Stroie     * @param string $collectionName The name of the collection to query
390*59036814SCostin Stroie     * @param array $queryTexts The query texts to search for
391*59036814SCostin Stroie     * @param int $nResults The number of results to return (default: 5)
392*59036814SCostin Stroie     * @param array|null $where Optional filter conditions for metadata
393*59036814SCostin Stroie     * @return array The query results
394*59036814SCostin Stroie     * @throws Exception If the collection ID is not found
395*59036814SCostin Stroie     */
396*59036814SCostin Stroie    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
397*59036814SCostin Stroie        // Use provided name, fallback to 'documents' if empty
398*59036814SCostin Stroie        if (empty($collectionName)) {
399*59036814SCostin Stroie            $collectionName = 'documents';
400*59036814SCostin Stroie        }
401*59036814SCostin Stroie
402*59036814SCostin Stroie        // First get the collection to find its ID
403*59036814SCostin Stroie        $collection = $this->getCollection($collectionName);
404*59036814SCostin Stroie        if (!isset($collection['id'])) {
405*59036814SCostin Stroie            throw new Exception("Collection ID not found for '{$collectionName}'");
406*59036814SCostin Stroie        }
407*59036814SCostin Stroie
408*59036814SCostin Stroie        $collectionId = $collection['id'];
409*59036814SCostin Stroie        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";
410*59036814SCostin Stroie
411*59036814SCostin Stroie        // Generate embeddings for query texts
412*59036814SCostin Stroie        $queryEmbeddings = [];
413*59036814SCostin Stroie        foreach ($queryTexts as $text) {
414*59036814SCostin Stroie            $queryEmbeddings[] = $this->generateEmbeddings($text);
415*59036814SCostin Stroie        }
416*59036814SCostin Stroie
417*59036814SCostin Stroie        $data = [
418*59036814SCostin Stroie            'query_embeddings' => $queryEmbeddings,
419*59036814SCostin Stroie            'n_results' => $nResults
420*59036814SCostin Stroie        ];
421*59036814SCostin Stroie
422*59036814SCostin Stroie        // Add where clause for metadata filtering if provided
423*59036814SCostin Stroie        if ($where && is_array($where)) {
424*59036814SCostin Stroie            $data['where'] = $where;
425*59036814SCostin Stroie        }
426*59036814SCostin Stroie
427*59036814SCostin Stroie        return $this->makeRequest($endpoint, 'POST', $data);
428*59036814SCostin Stroie    }
429*59036814SCostin Stroie
430*59036814SCostin Stroie    /**
431*59036814SCostin Stroie     * Check if the ChromaDB server is alive
432*59036814SCostin Stroie     *
433*59036814SCostin Stroie     * Sends a heartbeat request to verify that the ChromaDB server is running.
434*59036814SCostin Stroie     *
435*59036814SCostin Stroie     * @return array The response from the heartbeat endpoint
436*59036814SCostin Stroie     */
437*59036814SCostin Stroie    public function heartbeat() {
438*59036814SCostin Stroie        $endpoint = "/heartbeat";
439*59036814SCostin Stroie        return $this->makeRequest($endpoint, 'GET');
440*59036814SCostin Stroie    }
441*59036814SCostin Stroie
442*59036814SCostin Stroie    /**
443*59036814SCostin Stroie     * Get authentication and identity information
444*59036814SCostin Stroie     *
445*59036814SCostin Stroie     * Retrieves authentication and identity information from the ChromaDB server.
446*59036814SCostin Stroie     *
447*59036814SCostin Stroie     * @return array The response from the auth/identity endpoint
448*59036814SCostin Stroie     */
449*59036814SCostin Stroie    public function getIdentity() {
450*59036814SCostin Stroie        $endpoint = "/identity";
451*59036814SCostin Stroie        return $this->makeRequest($endpoint, 'GET');
452*59036814SCostin Stroie    }
453*59036814SCostin Stroie
454*59036814SCostin Stroie    /**
455*59036814SCostin Stroie     * Ensure that the specified tenant and database exist
456*59036814SCostin Stroie     *
457*59036814SCostin Stroie     * Checks if the specified tenant and database exist, and creates them if they don't.
458*59036814SCostin Stroie     *
459*59036814SCostin Stroie     * @return void
460*59036814SCostin Stroie     */
461*59036814SCostin Stroie    private function ensureTenantAndDatabase() {
462*59036814SCostin Stroie        // Check if tenant exists, create if it doesn't
463*59036814SCostin Stroie        try {
464*59036814SCostin Stroie            $this->getTenant($this->tenant);
465*59036814SCostin Stroie        } catch (Exception $e) {
466*59036814SCostin Stroie            // Tenant doesn't exist, create it
467*59036814SCostin Stroie            $this->createTenant($this->tenant);
468*59036814SCostin Stroie        }
469*59036814SCostin Stroie
470*59036814SCostin Stroie        // Check if database exists, create if it doesn't
471*59036814SCostin Stroie        try {
472*59036814SCostin Stroie            $this->getDatabase($this->database, $this->tenant);
473*59036814SCostin Stroie        } catch (Exception $e) {
474*59036814SCostin Stroie            // Database doesn't exist, create it
475*59036814SCostin Stroie            $this->createDatabase($this->database, $this->tenant);
476*59036814SCostin Stroie        }
477*59036814SCostin Stroie    }
478*59036814SCostin Stroie
479*59036814SCostin Stroie    /**
480*59036814SCostin Stroie     * Get tenant information
481*59036814SCostin Stroie     *
482*59036814SCostin Stroie     * Retrieves information about the specified tenant.
483*59036814SCostin Stroie     *
484*59036814SCostin Stroie     * @param string $tenantName The tenant name
485*59036814SCostin Stroie     * @return array The tenant information
486*59036814SCostin Stroie     */
487*59036814SCostin Stroie    public function getTenant($tenantName) {
488*59036814SCostin Stroie        $endpoint = "/tenants/{$tenantName}";
489*59036814SCostin Stroie        return $this->makeRequest($endpoint, 'GET');
490*59036814SCostin Stroie    }
491*59036814SCostin Stroie
492*59036814SCostin Stroie    /**
493*59036814SCostin Stroie     * Create a new tenant
494*59036814SCostin Stroie     *
495*59036814SCostin Stroie     * Creates a new tenant with the specified name.
496*59036814SCostin Stroie     *
497*59036814SCostin Stroie     * @param string $tenantName The tenant name
498*59036814SCostin Stroie     * @return array The response from the API
499*59036814SCostin Stroie     */
500*59036814SCostin Stroie    public function createTenant($tenantName) {
501*59036814SCostin Stroie        $endpoint = "/tenants";
502*59036814SCostin Stroie        $data = ['name' => $tenantName];
503*59036814SCostin Stroie        return $this->makeRequest($endpoint, 'POST', $data);
504*59036814SCostin Stroie    }
505*59036814SCostin Stroie
506*59036814SCostin Stroie    /**
507*59036814SCostin Stroie     * Get database information
508*59036814SCostin Stroie     *
509*59036814SCostin Stroie     * Retrieves information about the specified database within a tenant.
510*59036814SCostin Stroie     *
511*59036814SCostin Stroie     * @param string $databaseName The database name
512*59036814SCostin Stroie     * @param string $tenantName The tenant name
513*59036814SCostin Stroie     * @return array The database information
514*59036814SCostin Stroie     */
515*59036814SCostin Stroie    public function getDatabase($databaseName, $tenantName) {
516*59036814SCostin Stroie        $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}";
517*59036814SCostin Stroie        return $this->makeRequest($endpoint, 'GET');
518*59036814SCostin Stroie    }
519*59036814SCostin Stroie
520*59036814SCostin Stroie    /**
521*59036814SCostin Stroie     * Create a new database
522*59036814SCostin Stroie     *
523*59036814SCostin Stroie     * Creates a new database with the specified name within a tenant.
524*59036814SCostin Stroie     *
525*59036814SCostin Stroie     * @param string $databaseName The database name
526*59036814SCostin Stroie     * @param string $tenantName The tenant name
527*59036814SCostin Stroie     * @return array The response from the API
528*59036814SCostin Stroie     */
529*59036814SCostin Stroie    public function createDatabase($databaseName, $tenantName) {
530*59036814SCostin Stroie        $endpoint = "/tenants/{$tenantName}/databases";
531*59036814SCostin Stroie        $data = ['name' => $databaseName];
532*59036814SCostin Stroie        return $this->makeRequest($endpoint, 'POST', $data);
533*59036814SCostin Stroie    }
534*59036814SCostin Stroie
535*59036814SCostin Stroie    /**
536*59036814SCostin Stroie     * Ensure a collection exists, creating it if necessary
537*59036814SCostin Stroie     *
538*59036814SCostin Stroie     * This helper function checks if a collection exists and creates it if it doesn't.
539*59036814SCostin Stroie     *
540*59036814SCostin Stroie     * @param string $collectionName The name of the collection to check/create
541*59036814SCostin Stroie     * @return string Status message indicating what happened
542*59036814SCostin Stroie     */
543*59036814SCostin Stroie    public function ensureCollectionExists($collectionName) {
544*59036814SCostin Stroie        try {
545*59036814SCostin Stroie            $collection = $this->getCollection($collectionName);
546*59036814SCostin Stroie            return "Collection '$collectionName' already exists.";
547*59036814SCostin Stroie        } catch (Exception $e) {
548*59036814SCostin Stroie            // Collection doesn't exist, create it
549*59036814SCostin Stroie            $created = $this->createCollection($collectionName);
550*59036814SCostin Stroie            return "Collection '$collectionName' created.";
551*59036814SCostin Stroie        }
552*59036814SCostin Stroie    }
553*59036814SCostin Stroie
554*59036814SCostin Stroie    /**
555*59036814SCostin Stroie     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
556*59036814SCostin Stroie     *
557*59036814SCostin Stroie     * This function handles the complete processing of a single DokuWiki file:
558*59036814SCostin Stroie     * 1. Parses the file path to extract metadata and document ID
559*59036814SCostin Stroie     * 2. Determines the appropriate collection based on document ID
560*59036814SCostin Stroie     * 3. Checks if the document needs updating using timestamp comparison
561*59036814SCostin Stroie     * 4. Reads and processes file content only if update is needed
562*59036814SCostin Stroie     * 5. Splits the document into chunks (paragraphs)
563*59036814SCostin Stroie     * 6. Extracts rich metadata from the DokuWiki ID format
564*59036814SCostin Stroie     * 7. Generates embeddings for each chunk
565*59036814SCostin Stroie     * 8. Sends all chunks to ChromaDB with metadata
566*59036814SCostin Stroie     *
567*59036814SCostin Stroie     * Supported ID formats:
568*59036814SCostin Stroie     * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
569*59036814SCostin Stroie     * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
570*59036814SCostin Stroie     * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
571*59036814SCostin Stroie     *
572*59036814SCostin Stroie     * The function implements smart update checking by comparing file modification time
573*59036814SCostin Stroie     * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files.
574*59036814SCostin Stroie     *
575*59036814SCostin Stroie     * @param string $filePath The path to the file to process
576*59036814SCostin Stroie     * @param string $collectionName The name of the collection to use
577*59036814SCostin Stroie     * @param bool $collectionChecked Whether the collection has already been checked/created
578*59036814SCostin Stroie     * @return array Result with status and details
579*59036814SCostin Stroie     */
580*59036814SCostin Stroie    public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
581*59036814SCostin Stroie        // Parse file path to extract metadata
582*59036814SCostin Stroie        $id = parseFilePath($filePath);
583*59036814SCostin Stroie
584*59036814SCostin Stroie        try {
585*59036814SCostin Stroie            // Create collection if it doesn't exist (only if not already checked)
586*59036814SCostin Stroie            $collectionStatus = '';
587*59036814SCostin Stroie            if (!$collectionChecked) {
588*59036814SCostin Stroie                $collectionStatus = $this->ensureCollectionExists($collectionName);
589*59036814SCostin Stroie            }
590*59036814SCostin Stroie
591*59036814SCostin Stroie            // Get collection ID
592*59036814SCostin Stroie            $collection = $this->getCollection($collectionName);
593*59036814SCostin Stroie            if (!isset($collection['id'])) {
594*59036814SCostin Stroie                return [
595*59036814SCostin Stroie                    'status' => 'error',
596*59036814SCostin Stroie                    'message' => "Collection ID not found for '{$collectionName}'"
597*59036814SCostin Stroie                ];
598*59036814SCostin Stroie            }
599*59036814SCostin Stroie            $collectionId = $collection['id'];
600*59036814SCostin Stroie
601*59036814SCostin Stroie            // Get file modification time
602*59036814SCostin Stroie            $fileModifiedTime = filemtime($filePath);
603*59036814SCostin Stroie
604*59036814SCostin Stroie            // Check if document needs update
605*59036814SCostin Stroie            $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime);
606*59036814SCostin Stroie
607*59036814SCostin Stroie            // If document is up to date, skip processing
608*59036814SCostin Stroie            if (!$needsUpdate) {
609*59036814SCostin Stroie                return [
610*59036814SCostin Stroie                    'status' => 'skipped',
611*59036814SCostin Stroie                    'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
612*59036814SCostin Stroie                ];
613*59036814SCostin Stroie            }
614*59036814SCostin Stroie
615*59036814SCostin Stroie            // Read file content
616*59036814SCostin Stroie            $content = file_get_contents($filePath);
617*59036814SCostin Stroie
618*59036814SCostin Stroie            // Split document into chunks (paragraphs separated by two newlines)
619*59036814SCostin Stroie            $paragraphs = preg_split('/\n\s*\n/', $content);
620*59036814SCostin Stroie            $chunks = [];
621*59036814SCostin Stroie            $chunkMetadata = [];
622*59036814SCostin Stroie
623*59036814SCostin Stroie            // Parse the DokuWiki ID to extract base metadata
624*59036814SCostin Stroie            $parts = explode(':', $id);
625*59036814SCostin Stroie
626*59036814SCostin Stroie            // Extract metadata from the last part of the ID
627*59036814SCostin Stroie            $lastPart = end($parts);
628*59036814SCostin Stroie            $baseMetadata = [];
629*59036814SCostin Stroie
630*59036814SCostin Stroie            // Add the document ID as metadata
631*59036814SCostin Stroie            $baseMetadata['document_id'] = $id;
632*59036814SCostin Stroie
633*59036814SCostin Stroie            // Add current timestamp
634*59036814SCostin Stroie            $baseMetadata['processed_at'] = date('Y-m-d H:i:s');
635*59036814SCostin Stroie
636*59036814SCostin Stroie            // Check if any part of the ID is 'templates' and set template metadata
637*59036814SCostin Stroie            $isTemplate = in_array('templates', $parts);
638*59036814SCostin Stroie            if ($isTemplate) {
639*59036814SCostin Stroie                $baseMetadata['type'] = 'template';
640*59036814SCostin Stroie            } else {
641*59036814SCostin Stroie                $baseMetadata['type'] = 'report';
642*59036814SCostin Stroie            }
643*59036814SCostin Stroie
644*59036814SCostin Stroie            // Extract modality from the second part
645*59036814SCostin Stroie            if (isset($parts[1])) {
646*59036814SCostin Stroie                $baseMetadata['modality'] = $parts[1];
647*59036814SCostin Stroie            }
648*59036814SCostin Stroie
649*59036814SCostin Stroie            // Handle different ID formats based on the third part: word (institution) or numeric (year)
650*59036814SCostin Stroie            // Format 1: reports:mri:institution:250620-name-surname (third part is institution name)
651*59036814SCostin Stroie            // Format 2: reports:mri:2024:g287-name-surname (third part is year)
652*59036814SCostin Stroie            // For templates, don't set institution, date or year
653*59036814SCostin Stroie            if (isset($parts[2]) && !$isTemplate) {
654*59036814SCostin Stroie                // Check if third part is numeric (year) or word (institution)
655*59036814SCostin Stroie                if (is_numeric($parts[2])) {
656*59036814SCostin Stroie                    // Format: reports:mri:2024:g287-name-surname (year format)
657*59036814SCostin Stroie                    // Extract year from the third part
658*59036814SCostin Stroie                    $baseMetadata['year'] = $parts[2];
659*59036814SCostin Stroie
660*59036814SCostin Stroie                    // Set default institution from config
661*59036814SCostin Stroie                    $baseMetadata['institution'] = DEFAULT_INSTITUTION;
662*59036814SCostin Stroie
663*59036814SCostin Stroie                    // Extract registration and name from the last part
664*59036814SCostin Stroie                    // Registration should start with one letter or number and contain numbers before the '-' character
665*59036814SCostin Stroie                    if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
666*59036814SCostin Stroie                        // Check if the first part contains at least one digit to be considered a registration
667*59036814SCostin Stroie                        if (preg_match('/[0-9]/', $matches[1])) {
668*59036814SCostin Stroie                            $baseMetadata['registration'] = $matches[1];
669*59036814SCostin Stroie                            $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
670*59036814SCostin Stroie                        } else {
671*59036814SCostin Stroie                            // If no registration pattern found, treat entire part as patient name
672*59036814SCostin Stroie                            $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
673*59036814SCostin Stroie                        }
674*59036814SCostin Stroie                    } else {
675*59036814SCostin Stroie                        // If no match, treat entire part as patient name
676*59036814SCostin Stroie                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
677*59036814SCostin Stroie                    }
678*59036814SCostin Stroie                } else {
679*59036814SCostin Stroie                    // Format: reports:mri:institution:250620-name-surname (institution format)
680*59036814SCostin Stroie                    // Extract institution from the third part
681*59036814SCostin Stroie                    $baseMetadata['institution'] = $parts[2];
682*59036814SCostin Stroie
683*59036814SCostin Stroie                    // Extract date and name from the last part
684*59036814SCostin Stroie                    if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
685*59036814SCostin Stroie                        $dateStr = $matches[1];
686*59036814SCostin Stroie                        $name = $matches[2];
687*59036814SCostin Stroie
688*59036814SCostin Stroie                        // Convert date format (250620 -> 2025-06-20)
689*59036814SCostin Stroie                        $day = substr($dateStr, 0, 2);
690*59036814SCostin Stroie                        $month = substr($dateStr, 2, 2);
691*59036814SCostin Stroie                        $year = substr($dateStr, 4, 2);
692*59036814SCostin Stroie                        // Assuming 20xx for years 00-69 and 19xx for years 70-99
693*59036814SCostin Stroie                        $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year;
694*59036814SCostin Stroie                        $formattedDate = $fullYear . '-' . $month . '-' . $day;
695*59036814SCostin Stroie
696*59036814SCostin Stroie                        $baseMetadata['date'] = $formattedDate;
697*59036814SCostin Stroie                        $baseMetadata['name'] = str_replace('-', ' ', $name);
698*59036814SCostin Stroie                    }
699*59036814SCostin Stroie                }
700*59036814SCostin Stroie            }
701*59036814SCostin Stroie
702*59036814SCostin Stroie            // For templates, always extract name from the last part
703*59036814SCostin Stroie            if ($isTemplate && isset($lastPart)) {
704*59036814SCostin Stroie                // Extract name from the last part (everything after the last colon)
705*59036814SCostin Stroie                if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) {
706*59036814SCostin Stroie                    // Check if the first part contains at least one digit to be considered a registration
707*59036814SCostin Stroie                    if (preg_match('/[0-9]/', $matches[1])) {
708*59036814SCostin Stroie                        $baseMetadata['registration'] = $matches[1];
709*59036814SCostin Stroie                        $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
710*59036814SCostin Stroie                    } else {
711*59036814SCostin Stroie                        // If no registration pattern found, treat entire part as template name
712*59036814SCostin Stroie                        $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
713*59036814SCostin Stroie                    }
714*59036814SCostin Stroie                } else {
715*59036814SCostin Stroie                    // If no match, treat entire part as template name
716*59036814SCostin Stroie                    $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
717*59036814SCostin Stroie                }
718*59036814SCostin Stroie            }
719*59036814SCostin Stroie
720*59036814SCostin Stroie            // Process each paragraph as a chunk with intelligent metadata handling
721*59036814SCostin Stroie            $chunkIds = [];
722*59036814SCostin Stroie            $chunkContents = [];
723*59036814SCostin Stroie            $chunkMetadatas = [];
724*59036814SCostin Stroie            $chunkEmbeddings = [];
725*59036814SCostin Stroie            $currentTags = [];
726*59036814SCostin Stroie
727*59036814SCostin Stroie            foreach ($paragraphs as $index => $paragraph) {
728*59036814SCostin Stroie                // Skip empty paragraphs to avoid processing whitespace-only content
729*59036814SCostin Stroie                $paragraph = trim($paragraph);
730*59036814SCostin Stroie                if (empty($paragraph)) {
731*59036814SCostin Stroie                    continue;
732*59036814SCostin Stroie                }
733*59036814SCostin Stroie
734*59036814SCostin Stroie                // Check if this is a DokuWiki title (starts and ends with =)
735*59036814SCostin Stroie                // Titles are converted to tags for better searchability but not stored as content chunks
736*59036814SCostin Stroie                if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
737*59036814SCostin Stroie                    // Extract title content and clean it
738*59036814SCostin Stroie                    $titleContent = trim($matches[1]);
739*59036814SCostin Stroie
740*59036814SCostin Stroie                    // Split into words and create searchable tags
741*59036814SCostin Stroie                    $words = preg_split('/\s+/', $titleContent);
742*59036814SCostin Stroie                    $tags = [];
743*59036814SCostin Stroie
744*59036814SCostin Stroie                    foreach ($words as $word) {
745*59036814SCostin Stroie                        // Only use words longer than 3 characters to reduce noise
746*59036814SCostin Stroie                        if (strlen($word) >= 3) {
747*59036814SCostin Stroie                            $tags[] = strtolower($word);
748*59036814SCostin Stroie                        }
749*59036814SCostin Stroie                    }
750*59036814SCostin Stroie
751*59036814SCostin Stroie                    // Remove duplicate tags and store for use in subsequent chunks
752*59036814SCostin Stroie                    $currentTags = array_unique($tags);
753*59036814SCostin Stroie                    continue; // Skip storing title chunks as content
754*59036814SCostin Stroie                }
755*59036814SCostin Stroie
756*59036814SCostin Stroie                // Create chunk ID
757*59036814SCostin Stroie                $chunkId = $id . '@' . ($index + 1);
758*59036814SCostin Stroie
759*59036814SCostin Stroie                // Generate embeddings for the chunk
760*59036814SCostin Stroie                $embeddings = $this->generateEmbeddings($paragraph);
761*59036814SCostin Stroie
762*59036814SCostin Stroie                // Add chunk-specific metadata
763*59036814SCostin Stroie                $metadata = $baseMetadata;
764*59036814SCostin Stroie                $metadata['chunk_id'] = $chunkId;
765*59036814SCostin Stroie                $metadata['chunk_number'] = $index + 1;
766*59036814SCostin Stroie                $metadata['total_chunks'] = count($paragraphs);
767*59036814SCostin Stroie
768*59036814SCostin Stroie                // Add current tags to metadata if any exist
769*59036814SCostin Stroie                if (!empty($currentTags)) {
770*59036814SCostin Stroie                    $metadata['tags'] = implode(',', $currentTags);
771*59036814SCostin Stroie                }
772*59036814SCostin Stroie
773*59036814SCostin Stroie                // Store chunk data
774*59036814SCostin Stroie                $chunkIds[] = $chunkId;
775*59036814SCostin Stroie                $chunkContents[] = $paragraph;
776*59036814SCostin Stroie                $chunkMetadatas[] = $metadata;
777*59036814SCostin Stroie                $chunkEmbeddings[] = $embeddings;
778*59036814SCostin Stroie            }
779*59036814SCostin Stroie
780*59036814SCostin Stroie            // If no chunks were created, skip this file
781*59036814SCostin Stroie            if (empty($chunkIds)) {
782*59036814SCostin Stroie                return [
783*59036814SCostin Stroie                    'status' => 'skipped',
784*59036814SCostin Stroie                    'message' => "No valid chunks found in file '$id'. Skipping..."
785*59036814SCostin Stroie                ];
786*59036814SCostin Stroie            }
787*59036814SCostin Stroie
788*59036814SCostin Stroie            // Send all chunks to ChromaDB
789*59036814SCostin Stroie            $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);
790*59036814SCostin Stroie
791*59036814SCostin Stroie            return [
792*59036814SCostin Stroie                'status' => 'success',
793*59036814SCostin Stroie                'message' => "Successfully sent file to ChromaDB",
794*59036814SCostin Stroie                'details' => [
795*59036814SCostin Stroie                    'document_id' => $id,
796*59036814SCostin Stroie                    'chunks' => count($chunkIds),
797*59036814SCostin Stroie                    'collection' => $collectionName
798*59036814SCostin Stroie                ],
799*59036814SCostin Stroie                'collection_status' => $collectionStatus
800*59036814SCostin Stroie            ];
801*59036814SCostin Stroie        } catch (Exception $e) {
802*59036814SCostin Stroie            return [
803*59036814SCostin Stroie                'status' => 'error',
804*59036814SCostin Stroie                'message' => "Error sending file to ChromaDB: " . $e->getMessage()
805*59036814SCostin Stroie            ];
806*59036814SCostin Stroie        }
807*59036814SCostin Stroie    }
808*59036814SCostin Stroie
809*59036814SCostin Stroie    /**
810*59036814SCostin Stroie     * Process all DokuWiki files in a directory and send them to ChromaDB
811*59036814SCostin Stroie     *
812*59036814SCostin Stroie     * This function recursively processes all .txt files in a directory and its subdirectories.
813*59036814SCostin Stroie     * It first checks if the appropriate collection exists and creates it if needed.
814*59036814SCostin Stroie     * Then it processes each file individually.
815*59036814SCostin Stroie     *
816*59036814SCostin Stroie     * @param string $dirPath The directory path to process
817*59036814SCostin Stroie     * @return array Result with status and details
818*59036814SCostin Stroie     */
819*59036814SCostin Stroie    public function processDirectory($dirPath) {
820*59036814SCostin Stroie        // Check if directory exists
821*59036814SCostin Stroie        if (!is_dir($dirPath)) {
822*59036814SCostin Stroie            return [
823*59036814SCostin Stroie                'status' => 'error',
824*59036814SCostin Stroie                'message' => "Directory does not exist: $dirPath"
825*59036814SCostin Stroie            ];
826*59036814SCostin Stroie        }
827*59036814SCostin Stroie
828*59036814SCostin Stroie        // Create RecursiveIteratorIterator to process directories recursively
829*59036814SCostin Stroie        $iterator = new RecursiveIteratorIterator(
830*59036814SCostin Stroie            new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS),
831*59036814SCostin Stroie            RecursiveIteratorIterator::LEAVES_ONLY
832*59036814SCostin Stroie        );
833*59036814SCostin Stroie
834*59036814SCostin Stroie        $files = [];
835*59036814SCostin Stroie        foreach ($iterator as $file) {
836*59036814SCostin Stroie            // Process only .txt files that don't start with underscore
837*59036814SCostin Stroie            if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
838*59036814SCostin Stroie                $files[] = $file->getPathname();
839*59036814SCostin Stroie            }
840*59036814SCostin Stroie        }
841*59036814SCostin Stroie
842*59036814SCostin Stroie        if (empty($files)) {
843*59036814SCostin Stroie            return [
844*59036814SCostin Stroie                'status' => 'skipped',
845*59036814SCostin Stroie                'message' => "No .txt files found in directory: $dirPath"
846*59036814SCostin Stroie            ];
847*59036814SCostin Stroie        }
848*59036814SCostin Stroie
849*59036814SCostin Stroie        // Use the first part of the document ID as collection name, fallback to 'documents'
850*59036814SCostin Stroie        $sampleFile = $files[0];
851*59036814SCostin Stroie        $id = parseFilePath($sampleFile);
852*59036814SCostin Stroie        $idParts = explode(':', $id);
853*59036814SCostin Stroie        $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents';
854*59036814SCostin Stroie
855*59036814SCostin Stroie        try {
856*59036814SCostin Stroie            $this->ensureCollectionExists($collectionName);
857*59036814SCostin Stroie            $collectionChecked = true;
858*59036814SCostin Stroie        } catch (Exception $e) {
859*59036814SCostin Stroie            $collectionChecked = true;
860*59036814SCostin Stroie        }
861*59036814SCostin Stroie
862*59036814SCostin Stroie        $results = [];
863*59036814SCostin Stroie        foreach ($files as $file) {
864*59036814SCostin Stroie            $result = $this->processSingleFile($file, $collectionName, $collectionChecked);
865*59036814SCostin Stroie            $results[] = [
866*59036814SCostin Stroie                'file' => $file,
867*59036814SCostin Stroie                'result' => $result
868*59036814SCostin Stroie            ];
869*59036814SCostin Stroie        }
870*59036814SCostin Stroie
871*59036814SCostin Stroie        return [
872*59036814SCostin Stroie            'status' => 'success',
873*59036814SCostin Stroie            'message' => "Finished processing directory.",
874*59036814SCostin Stroie            'files_count' => count($files),
875*59036814SCostin Stroie            'results' => $results
876*59036814SCostin Stroie        ];
877*59036814SCostin Stroie    }
878*59036814SCostin Stroie}
879*59036814SCostin Stroie
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Dropping everything up to and including the pages directory
 *    (DOKU_INC . 'data/pages/' when DOKU_INC is defined, otherwise a
 *    hard-coded default installation path); a path that does not contain
 *    the pages directory is used as is
 * 2. Removing the .txt extension
 * 3. Converting directory separators ('/') to colons, ignoring empty segments
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // Strip the base path once, as a leading portion of the path, instead of
    // deleting every occurrence of that string anywhere in it.
    $relativePath = (string)$filePath;
    $basePos = strpos($relativePath, $pagesDir);
    if ($basePos !== false) {
        $relativePath = (string)substr($relativePath, $basePos + strlen($pagesDir));
    }

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Drop empty segments only. A bare array_filter()/empty() would also
    // discard a legitimate segment named "0".
    $idParts = array_filter(
        explode('/', $relativePath),
        function ($part) {
            return $part !== '';
        }
    );

    return implode(':', $idParts);
}
922*59036814SCostin Stroie
923