xref: /plugin/dokullm/ChromaDBClient.php (revision a068a1ba363ef960eac31d32ced19151b291674d)
159036814SCostin Stroie<?php
259036814SCostin Stroie
359036814SCostin Stroienamespace dokuwiki\plugin\dokullm;
459036814SCostin Stroie
559036814SCostin Stroieclass ChromaDBClient {
// Each property is declared exactly once: PHP refuses to compile a class that
// declares the same property name twice ("Cannot redeclare").

/** @var string Base URL of the ChromaDB server, built as http://host:port */
private $baseUrl;

/** @var resource|\CurlHandle cURL handle used for ChromaDB requests */
private $client;

/** @var resource|\CurlHandle cURL handle used for Ollama requests */
private $ollamaClient;

/** @var string ChromaDB tenant name */
private $tenant;

/** @var string ChromaDB database name */
private $database;

/** @var string Ollama server host */
private $ollamaHost;

/** @var int Ollama server port */
private $ollamaPort;

/** @var string Name of the Ollama model used to generate embeddings */
private $ollamaModel;

/**
 * Get configuration value for the dokullm plugin
 *
 * Looks the key up in the global $conf array under ['plugin']['dokullm'].
 *
 * @param string $key Configuration key
 * @param mixed $default Value returned when the key is missing or null
 * @return mixed Configuration value
 */
private function getConf($key, $default = null) {
    global $conf;
    return $conf['plugin']['dokullm'][$key] ?? $default;
}
/**
 * Build a client for a ChromaDB server and an Ollama embeddings server
 *
 * Every argument is optional: a null argument is replaced by the matching plugin
 * configuration value, which in turn has a built-in default. Two cURL handles are
 * prepared (one per server) and the tenant and database are then checked and
 * created if needed.
 *
 * @param string|null $host ChromaDB server host (config 'chroma_host', default '127.0.0.1')
 * @param int|null $port ChromaDB server port (config 'chroma_port', default 8000)
 * @param string|null $tenant ChromaDB tenant name (config 'chroma_tenant', default 'dokullm')
 * @param string|null $database ChromaDB database name (config 'chroma_database', default 'dokullm')
 * @param string|null $ollamaHost Ollama server host (config 'ollama_host', default '127.0.0.1')
 * @param int|null $ollamaPort Ollama server port (config 'ollama_port', default 11434)
 * @param string|null $ollamaModel Ollama embeddings model (config 'ollama_embeddings_model', default 'nomic-embed-text')
 */
public function __construct($host = null, $port = null, $tenant = null, $database = null, $ollamaHost = null, $ollamaPort = null, $ollamaModel = null) {
    // ChromaDB location: only needed to assemble the base URL
    $chromaHost = $host ?? $this->getConf('chroma_host', '127.0.0.1');
    $chromaPort = $port ?? $this->getConf('chroma_port', 8000);
    $this->baseUrl = "http://{$chromaHost}:{$chromaPort}";

    // Settings kept on the instance for later requests
    $this->tenant = $tenant ?? $this->getConf('chroma_tenant', 'dokullm');
    $this->database = $database ?? $this->getConf('chroma_database', 'dokullm');
    $this->ollamaHost = $ollamaHost ?? $this->getConf('ollama_host', '127.0.0.1');
    $this->ollamaPort = $ollamaPort ?? $this->getConf('ollama_port', 11434);
    $this->ollamaModel = $ollamaModel ?? $this->getConf('ollama_embeddings_model', 'nomic-embed-text');

    // ChromaDB handle: responses are returned as strings, JSON in and out
    $chromaHeaders = [
        'Content-Type: application/json',
        'Accept: application/json'
    ];
    $this->client = curl_init();
    curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($this->client, CURLOPT_HTTPHEADER, $chromaHeaders);

    // Ollama handle: responses are returned as strings, JSON request bodies
    $ollamaHeaders = [
        'Content-Type: application/json'
    ];
    $this->ollamaClient = curl_init();
    curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, $ollamaHeaders);

    // Make sure the tenant and database are present before the client is used
    $this->ensureTenantAndDatabase();
}
7559036814SCostin Stroie
/**
 * Close both cURL handles when the object is destroyed
 *
 * The ChromaDB handle is closed first, then the Ollama handle.
 *
 * @return void
 */
public function __destruct() {
    foreach ([$this->client, $this->ollamaClient] as $handle) {
        curl_close($handle);
    }
}
8559036814SCostin Stroie
/**
 * Make an HTTP request to the ChromaDB v2 API
 *
 * Sends the request to {baseUrl}/api/v2{endpoint} on the shared ChromaDB cURL handle.
 * Tenant and database are not added here; callers that need them place them in the
 * endpoint path.
 *
 * @param string $endpoint The API endpoint to call, starting with '/'
 * @param string $method The HTTP method to use (default: 'GET')
 * @param array|null $data Payload sent as a JSON body; a null or empty value sends no body
 * @return array|null The JSON response decoded as an array, or null when the response
 *                    body is empty or is not valid JSON
 * @throws \Exception If the payload cannot be JSON-encoded, the transfer fails,
 *                    or the server answers with a status of 400 or above
 */
private function makeRequest($endpoint, $method = 'GET', $data = null) {
    $headers = [
        'Content-Type: application/json',
        'Accept: application/json'
    ];

    $url = $this->baseUrl . '/api/v2' . $endpoint;

    // Encode before touching the handle: json_encode() returns false on failure
    // (for example on malformed UTF-8), which would otherwise go out as an empty body.
    $body = null;
    if ($data) {
        $body = json_encode($data);
        if ($body === false) {
            throw new \Exception('Failed to encode request body as JSON: ' . json_last_error_msg());
        }
    }

    curl_setopt($this->client, CURLOPT_URL, $url);
    curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method);
    curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers);
    // The handle is reused between calls, so the body is reset even when there is none
    curl_setopt($this->client, CURLOPT_POSTFIELDS, $body);

    $response = curl_exec($this->client);

    // curl_exec() returns false when the transfer itself failed
    if ($response === false || curl_error($this->client)) {
        throw new \Exception('Curl error: ' . curl_error($this->client));
    }

    $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);
    if ($httpCode >= 400) {
        throw new \Exception("HTTP Error: $httpCode, Response: $response");
    }

    return json_decode($response, true);
}
13059036814SCostin Stroie
/**
 * Generate embeddings for text using Ollama
 *
 * Posts the text to the /api/embeddings endpoint of the configured Ollama server,
 * using the configured embeddings model and a keep_alive value of '30m', and returns
 * the 'embedding' field of the JSON response.
 *
 * @param string $text The text to generate embeddings for
 * @return array The embeddings vector
 * @throws \Exception If the request cannot be JSON-encoded, the transfer fails, the server
 *                    answers with a status of 400 or above, or the response has no 'embedding'
 */
public function generateEmbeddings($text) {
    $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings";

    $data = [
        'model' => $this->ollamaModel,
        'prompt' => $text,
        'keep_alive' => '30m'
    ];

    // json_encode() returns false on failure (for example malformed UTF-8 in $text);
    // report it instead of posting an empty body.
    $body = json_encode($data);
    if ($body === false) {
        throw new \Exception('Failed to encode Ollama request as JSON: ' . json_last_error_msg());
    }

    curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl);
    curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, $body);

    $response = curl_exec($this->ollamaClient);

    // curl_exec() returns false when the transfer itself failed
    if ($response === false || curl_error($this->ollamaClient)) {
        throw new \Exception('Ollama Curl error: ' . curl_error($this->ollamaClient));
    }

    $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);
    if ($httpCode >= 400) {
        throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
    }

    $result = json_decode($response, true);

    if (!isset($result['embedding'])) {
        throw new \Exception("Ollama response missing embedding: " . $response);
    }

    return $result['embedding'];
}
16959036814SCostin Stroie
/**
 * List the collections of the configured tenant and database
 *
 * Performs a GET on the collections endpoint built from the tenant and database
 * this client was constructed with.
 *
 * @return array Decoded API response
 */
public function listCollections() {
    return $this->makeRequest("/tenants/{$this->tenant}/databases/{$this->database}/collections");
}
18159036814SCostin Stroie
/**
 * Look up a collection by name
 *
 * Fetches the collection list of the configured tenant and database and returns
 * the first entry whose 'name' is identical to the requested one.
 *
 * @param string $name Collection name; an empty value means 'documents'
 * @return array The matching collection entry as returned by the API
 * @throws \Exception If no collection with that name is listed
 */
public function getCollection($name) {
    // An empty name selects the default collection
    $name = empty($name) ? 'documents' : $name;

    $listed = $this->makeRequest("/tenants/{$this->tenant}/databases/{$this->database}/collections");

    foreach ($listed as $candidate) {
        // Entries without a name can never match
        if (!isset($candidate['name'])) {
            continue;
        }
        if ($candidate['name'] === $name) {
            return $candidate;
        }
    }

    // Nothing in the list carried the requested name
    throw new \Exception("Collection '{$name}' not found");
}
21159036814SCostin Stroie
/**
 * Create a collection in the configured tenant and database
 *
 * Posts the collection name, plus the metadata when a non-empty value is given,
 * to the collections endpoint.
 *
 * @param string $name Collection name; an empty value means 'documents'
 * @param array|null $metadata Optional metadata for the collection
 * @return array The response from the API
 */
public function createCollection($name, $metadata = null) {
    // An empty name selects the default collection
    $payload = ['name' => empty($name) ? 'documents' : $name];

    // Metadata is only sent when there is something to send
    if ($metadata) {
        $payload['metadata'] = $metadata;
    }

    return $this->makeRequest(
        "/tenants/{$this->tenant}/databases/{$this->database}/collections",
        'POST',
        $payload
    );
}
23459036814SCostin Stroie
/**
 * Delete a collection identified by its name
 *
 * Resolves the name to a collection entry first, then issues a DELETE on the
 * collection's ID.
 *
 * @param string $name Collection name; an empty value means 'documents'
 * @return array The response from the API
 * @throws \Exception If the collection is not found or its entry has no ID
 */
public function deleteCollection($name) {
    // An empty name selects the default collection
    $name = empty($name) ? 'documents' : $name;

    // The delete endpoint is addressed by ID, so the entry has to be resolved first
    $entry = $this->getCollection($name);
    if (isset($entry['id'])) {
        $collectionId = $entry['id'];
        return $this->makeRequest(
            "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}",
            'DELETE'
        );
    }

    throw new \Exception("Collection ID not found for '{$name}'");
}
26059036814SCostin Stroie
/**
 * Fetch a single document from a collection by its ID
 *
 * Resolves the collection name to its ID and posts the document ID together with
 * the requested include list to the collection's get endpoint.
 *
 * @param string $collectionName Collection name; an empty value means 'documents'
 * @param string $documentId The document ID to retrieve
 * @param array $include Fields to request (default: ["metadatas", "documents"])
 * @return array The response from the API
 * @throws \Exception If the collection is not found or its entry has no ID
 */
public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
    // An empty name selects the default collection
    $collectionName = empty($collectionName) ? 'documents' : $collectionName;

    // The get endpoint is addressed by collection ID
    $entry = $this->getCollection($collectionName);
    if (!isset($entry['id'])) {
        throw new \Exception("Collection ID not found for '{$collectionName}'");
    }
    $collectionId = $entry['id'];

    $payload = [
        'ids' => [$documentId],
        'include' => $include
    ];

    return $this->makeRequest(
        "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get",
        'POST',
        $payload
    );
}
29359036814SCostin Stroie
/**
 * Add documents to a collection
 *
 * Resolves the collection name to its ID and posts the IDs and documents to the
 * collection's upsert endpoint. Metadata and pre-computed embeddings are included
 * only when a non-empty value is supplied.
 *
 * @param string $collectionName Collection name; an empty value means 'documents'
 * @param array $documents The document contents
 * @param array $ids The document IDs
 * @param array|null $metadatas Optional metadata for each document
 * @param array|null $embeddings Optional pre-computed embeddings for each document
 * @return array The response from the API
 * @throws \Exception If the collection is not found or its entry has no ID
 */
public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
    // An empty name selects the default collection
    $collectionName = empty($collectionName) ? 'documents' : $collectionName;

    // The upsert endpoint is addressed by collection ID
    $entry = $this->getCollection($collectionName);
    if (!isset($entry['id'])) {
        throw new \Exception("Collection ID not found for '{$collectionName}'");
    }
    $collectionId = $entry['id'];

    $payload = [
        'ids' => $ids,
        'documents' => $documents
    ];

    // Optional fields are appended in a fixed order and skipped when empty
    $optional = ['metadatas' => $metadatas, 'embeddings' => $embeddings];
    foreach ($optional as $field => $values) {
        if ($values) {
            $payload[$field] = $values;
        }
    }

    return $this->makeRequest(
        "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert",
        'POST',
        $payload
    );
}
33759036814SCostin Stroie
/**
 * Decide whether a document has to be (re)processed
 *
 * Requests the metadata of the chunk IDs "<documentId>@1", "@2" and "@3" (limited to
 * one result) and compares the 'processed_at' value of the first returned metadata
 * entry, parsed with strtotime(), against the file modification time. Several chunk
 * numbers are tried because the first chunks might be titles that were skipped.
 *
 * Any \Exception raised while checking is caught and treated as "needs update".
 *
 * @param string $collectionId The ID of the collection to check documents in
 * @param string $documentId The base document ID to check (without chunk suffixes)
 * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
 * @return bool True when no chunk is found, 'processed_at' is missing, the file is newer
 *              than 'processed_at', or the check failed; false otherwise
 */
public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
    try {
        $payload = [
            'ids' => [
                $documentId . '@1',
                $documentId . '@2',
                $documentId . '@3'
            ],
            'include' => [
                "metadatas"
            ],
            'limit' => 1
        ];

        $found = $this->makeRequest(
            "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get",
            'POST',
            $payload
        );

        // None of the probed chunks is stored: the document has to be added
        if (empty($found['ids'])) {
            return true;
        }

        // Chunks exist but no usable metadata list came back: treated as up to date
        if (empty($found['metadatas']) || !is_array($found['metadatas'])) {
            return false;
        }

        // Only the first metadata entry is inspected
        $firstMeta = $found['metadatas'][0];
        if (!isset($firstMeta['processed_at'])) {
            return true;
        }

        // Outdated when the file changed after it was last processed
        return $fileModifiedTime > strtotime($firstMeta['processed_at']);
    } catch (\Exception $checkFailed) {
        // The state could not be determined, so reprocess to be safe
        return true;
    }
}
40559036814SCostin Stroie
/**
 * Search a collection for documents similar to the given texts
 *
 * Resolves the collection name to its ID, generates one embedding per query text
 * through Ollama, and posts the embeddings to the collection's query endpoint.
 * A non-empty array passed as $where is forwarded as the metadata filter.
 *
 * @param string $collectionName Collection name; an empty value means 'documents'
 * @param array $queryTexts The query texts to search for
 * @param int $nResults The number of results to return (default: 5)
 * @param array|null $where Optional filter conditions for metadata
 * @return array The response from the API
 * @throws \Exception If the collection is not found or its entry has no ID
 */
public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
    // An empty name selects the default collection
    $collectionName = empty($collectionName) ? 'documents' : $collectionName;

    // The query endpoint is addressed by collection ID
    $entry = $this->getCollection($collectionName);
    if (!isset($entry['id'])) {
        throw new \Exception("Collection ID not found for '{$collectionName}'");
    }
    $collectionId = $entry['id'];

    // One embedding per query text, in the order the texts were given
    $vectors = [];
    foreach ($queryTexts as $queryText) {
        $vectors[] = $this->generateEmbeddings($queryText);
    }

    $payload = [
        'query_embeddings' => $vectors,
        'n_results' => $nResults
    ];

    // The filter is only sent when it is a non-empty array
    if ($where && is_array($where)) {
        $payload['where'] = $where;
    }

    return $this->makeRequest(
        "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query",
        'POST',
        $payload
    );
}
45359036814SCostin Stroie
/**
 * Ask the ChromaDB server for a heartbeat
 *
 * Performs a GET on the heartbeat endpoint; an unreachable server or an error
 * status surfaces as an exception from the request helper.
 *
 * @return array The response from the heartbeat endpoint
 */
public function heartbeat() {
    return $this->makeRequest("/heartbeat", 'GET');
}
46559036814SCostin Stroie
/**
 * Get authentication and identity information
 *
 * Retrieves authentication and identity information from the ChromaDB server
 * with a GET on the auth/identity endpoint.
 *
 * @return array The response from the auth/identity endpoint
 */
public function getIdentity() {
    // The identity resource lives under the /auth prefix of the v2 API
    $endpoint = "/auth/identity";
    return $this->makeRequest($endpoint, 'GET');
}
47759036814SCostin Stroie
/**
 * Make sure the configured tenant and database are present
 *
 * Each one is looked up first; when the lookup throws, a create request is sent
 * instead. An exception thrown by a create request is not caught here.
 *
 * @return void
 */
private function ensureTenantAndDatabase() {
    $tenantName = $this->tenant;
    $databaseName = $this->database;

    // Tenant first: the database is looked up inside it
    try {
        $this->getTenant($tenantName);
    } catch (\Exception $lookupFailed) {
        // Any lookup failure is taken to mean the tenant is missing
        $this->createTenant($tenantName);
    }

    // Then the database within that tenant
    try {
        $this->getDatabase($databaseName, $tenantName);
    } catch (\Exception $lookupFailed) {
        // Any lookup failure is taken to mean the database is missing
        $this->createDatabase($databaseName, $tenantName);
    }
}
50259036814SCostin Stroie
/**
 * Fetch a tenant by name
 *
 * Performs a GET on the tenant's endpoint.
 *
 * @param string $tenantName The tenant name
 * @return array The response from the API
 */
public function getTenant($tenantName) {
    return $this->makeRequest("/tenants/{$tenantName}", 'GET');
}
51559036814SCostin Stroie
/**
 * Create a tenant
 *
 * Sends a POST request to the tenants endpoint with the tenant name as payload.
 *
 * @param string $tenantName The tenant name
 * @return array The response from the API
 */
public function createTenant($tenantName) {
    $payload = ['name' => $tenantName];

    return $this->makeRequest('/tenants', 'POST', $payload);
}
52959036814SCostin Stroie
/**
 * Fetch a database
 *
 * Sends a GET request to the database endpoint nested under the given tenant.
 *
 * @param string $databaseName The database name
 * @param string $tenantName The tenant name
 * @return array The database information returned by makeRequest()
 */
public function getDatabase($databaseName, $tenantName) {
    $path = '/tenants/' . $tenantName . '/databases/' . $databaseName;

    return $this->makeRequest($path, 'GET');
}
54359036814SCostin Stroie
/**
 * Create a database
 *
 * Sends a POST request to the databases endpoint of the given tenant with the
 * database name as payload.
 *
 * @param string $databaseName The database name
 * @param string $tenantName The tenant name
 * @return array The response from the API
 */
public function createDatabase($databaseName, $tenantName) {
    $payload = ['name' => $databaseName];

    return $this->makeRequest('/tenants/' . $tenantName . '/databases', 'POST', $payload);
}
55859036814SCostin Stroie
/**
 * Make sure a collection is available
 *
 * Looks the collection up first; when the lookup throws, the collection is
 * created instead.
 *
 * @param string $collectionName The name of the collection to check/create
 * @return string Status message indicating what happened
 */
public function ensureCollectionExists($collectionName) {
    try {
        $this->getCollection($collectionName);
    } catch (\Exception $lookupError) {
        // Lookup failed: treat the collection as missing and create it
        $this->createCollection($collectionName);

        return "Collection '$collectionName' created.";
    }

    return "Collection '$collectionName' already exists.";
}
57759036814SCostin Stroie
/**
 * Process a single DokuWiki file and send it to ChromaDB, skipping unchanged documents
 *
 * Processing steps:
 * 1. Converts the file path into a DokuWiki ID with parseFilePath()
 * 2. Ensures the collection exists (unless $collectionChecked is true) and looks up its ID
 * 3. Asks needsUpdate() whether the document must be (re)processed, passing the file
 *    modification time; documents reported as up to date are skipped
 * 4. Reads the file and splits it into paragraphs on blank lines
 * 5. Builds base metadata from the DokuWiki ID
 * 6. Converts title paragraphs into tags attached to the chunks that follow them
 * 7. Generates embeddings for every remaining paragraph
 * 8. Sends all chunks with their metadata and embeddings via addDocuments()
 *
 * Supported ID formats:
 * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name,
 *   last part starts with a YYMMDD date)
 * - Format 2: reports:mri:2024:g287-name-surname (third part is year)
 * - Templates: reports:mri:templates:name-surname (contains 'templates' part)
 *
 * @param string $filePath The path to the file to process
 * @param string $collectionName The name of the collection to use
 * @param bool $collectionChecked Whether the collection has already been checked/created
 * @return array Result with 'status' ('success', 'skipped' or 'error') and 'message';
 *               successful runs also carry 'details' and 'collection_status'
 */
public function processSingleFile($filePath, $collectionName, $collectionChecked = false) {
    // Convert the file path into a DokuWiki ID
    $id = parseFilePath($filePath);

    try {
        // Create collection if it doesn't exist (only if not already checked)
        $collectionStatus = '';
        if (!$collectionChecked) {
            $collectionStatus = $this->ensureCollectionExists($collectionName);
        }

        // Get collection ID
        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            return [
                'status' => 'error',
                'message' => "Collection ID not found for '{$collectionName}'"
            ];
        }
        $collectionId = $collection['id'];

        // Get file modification time; report unreadable files instead of
        // handing a boolean to the update check
        $fileModifiedTime = filemtime($filePath);
        if ($fileModifiedTime === false) {
            return [
                'status' => 'error',
                'message' => "Unable to read modification time of file '{$filePath}'"
            ];
        }

        // If document is up to date, skip processing
        if (!$this->needsUpdate($collectionId, $id, $fileModifiedTime)) {
            return [
                'status' => 'skipped',
                'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..."
            ];
        }

        // Read file content; a failed read must not be reported as an empty document
        $content = file_get_contents($filePath);
        if ($content === false) {
            return [
                'status' => 'error',
                'message' => "Unable to read file '{$filePath}'"
            ];
        }

        // Split document into chunks (paragraphs separated by blank lines)
        $paragraphs = preg_split('/\n\s*\n/', $content);

        // Parse the DokuWiki ID to extract base metadata
        $parts = explode(':', $id);
        $lastPart = end($parts);

        // Any ID part equal to 'templates' marks the document as a template
        $isTemplate = in_array('templates', $parts, true);

        $baseMetadata = [
            'document_id' => $id,
            'processed_at' => date('Y-m-d H:i:s'),
            'type' => $isTemplate ? 'template' : 'report',
        ];

        // Extract modality from the second part
        if (isset($parts[1])) {
            $baseMetadata['modality'] = $parts[1];
        }

        // The third part selects the ID format: numeric means year, anything else
        // means institution. Templates get neither institution, date nor year.
        $hasThirdPart = isset($parts[2]);
        $isYearFormat = !$isTemplate && $hasThirdPart && is_numeric($parts[2]);
        $isInstitutionFormat = !$isTemplate && $hasThirdPart && !$isYearFormat;

        if ($isYearFormat) {
            // Format: reports:mri:2024:g287-name-surname
            $baseMetadata['year'] = $parts[2];

            // Set default institution from config
            $baseMetadata['institution'] = $this->getConf('default_institution', 'default');
        } elseif ($isInstitutionFormat) {
            // Format: reports:mri:institution:250620-name-surname
            $baseMetadata['institution'] = $parts[2];

            // Extract date and name from the last part
            if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) {
                // The prefix is YYMMDD (250620 -> 2025-06-20)
                $shortYear = substr($matches[1], 0, 2);
                $month = substr($matches[1], 2, 2);
                $day = substr($matches[1], 4, 2);

                // Assuming 20xx for years 00-69 and 19xx for years 70-99
                $fullYear = ((int)$shortYear < 70 ? '20' : '19') . $shortYear;

                $baseMetadata['date'] = $fullYear . '-' . $month . '-' . $day;
                $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
            }
        }

        // Year-format reports and templates share the same "registration-name" parsing
        // of the last part. The leading token only counts as a registration when it
        // contains at least one digit; otherwise the whole part is the name.
        if ($isTemplate || $isYearFormat) {
            if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)
                && preg_match('/[0-9]/', $matches[1])
            ) {
                $baseMetadata['registration'] = $matches[1];
                $baseMetadata['name'] = str_replace('-', ' ', $matches[2]);
            } else {
                $baseMetadata['name'] = str_replace('-', ' ', $lastPart);
            }
        }

        // Process each paragraph as a chunk
        $chunkIds = [];
        $chunkContents = [];
        $chunkMetadatas = [];
        $chunkEmbeddings = [];
        $currentTags = [];

        foreach ($paragraphs as $index => $paragraph) {
            // Skip whitespace-only paragraphs (a literal "0" is real content)
            $paragraph = trim($paragraph);
            if ($paragraph === '') {
                continue;
            }

            // Check if this is a DokuWiki title (starts and ends with =)
            // Titles are converted to tags for the following chunks and are not stored themselves
            if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) {
                $titleContent = trim($matches[1]);

                // Split into words and keep those with at least 3 characters to reduce noise
                $tags = [];
                foreach (preg_split('/\s+/', $titleContent) as $word) {
                    if (strlen($word) >= 3) {
                        $tags[] = strtolower($word);
                    }
                }

                // Remove duplicate tags and store for use in subsequent chunks
                $currentTags = array_unique($tags);
                continue;
            }

            // Chunk numbers follow the paragraph position in the file, so skipped
            // paragraphs (blank ones and titles) leave gaps in the numbering
            $chunkNumber = $index + 1;
            $chunkId = $id . '@' . $chunkNumber;

            // Generate embeddings for the chunk
            $embeddings = $this->generateEmbeddings($paragraph);

            // Add chunk-specific metadata
            $metadata = $baseMetadata;
            $metadata['chunk_id'] = $chunkId;
            $metadata['chunk_number'] = $chunkNumber;
            $metadata['total_chunks'] = count($paragraphs);

            // Add current tags to metadata if any exist
            if (!empty($currentTags)) {
                $metadata['tags'] = implode(',', $currentTags);
            }

            // Store chunk data
            $chunkIds[] = $chunkId;
            $chunkContents[] = $paragraph;
            $chunkMetadatas[] = $metadata;
            $chunkEmbeddings[] = $embeddings;
        }

        // If no chunks were created, skip this file
        if (empty($chunkIds)) {
            return [
                'status' => 'skipped',
                'message' => "No valid chunks found in file '$id'. Skipping..."
            ];
        }

        // Send all chunks to ChromaDB
        $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings);

        return [
            'status' => 'success',
            'message' => "Successfully sent file to ChromaDB",
            'details' => [
                'document_id' => $id,
                'chunks' => count($chunkIds),
                'collection' => $collectionName
            ],
            'collection_status' => $collectionStatus
        ];
    } catch (\Exception $e) {
        return [
            'status' => 'error',
            'message' => "Error sending file to ChromaDB: " . $e->getMessage()
        ];
    }
}
83259036814SCostin Stroie
/**
 * Process all DokuWiki files in a directory and send them to ChromaDB
 *
 * Recursively collects every .txt file below the directory whose name does not
 * start with an underscore, derives the collection name from the first part of
 * the first file's DokuWiki ID (falling back to 'documents'), makes sure that
 * collection exists and then hands each file to processSingleFile().
 *
 * @param string $dirPath The directory path to process
 * @return array Result with 'status' and 'message'; after processing it also
 *               carries 'files_count' and the per-file 'results'
 */
public function processDirectory($dirPath) {
    // Check if directory exists
    if (!is_dir($dirPath)) {
        return [
            'status' => 'error',
            'message' => "Directory does not exist: $dirPath"
        ];
    }

    // The SPL iterators are global classes; inside this namespace they must be
    // fully qualified or PHP looks for them in the plugin namespace
    $iterator = new \RecursiveIteratorIterator(
        new \RecursiveDirectoryIterator($dirPath, \RecursiveDirectoryIterator::SKIP_DOTS),
        \RecursiveIteratorIterator::LEAVES_ONLY
    );

    $files = [];
    foreach ($iterator as $file) {
        // Process only .txt files that don't start with underscore
        if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') {
            $files[] = $file->getPathname();
        }
    }

    if (empty($files)) {
        return [
            'status' => 'skipped',
            'message' => "No .txt files found in directory: $dirPath"
        ];
    }

    // Use the first part of the document ID as collection name, fallback to 'documents'
    $idParts = explode(':', parseFilePath($files[0]));
    $collectionName = $idParts[0] !== '' ? $idParts[0] : 'documents';

    try {
        $this->ensureCollectionExists($collectionName);
        $collectionChecked = true;
    } catch (\Exception $e) {
        // The check failed, so leave the flag unset: every file then retries the
        // check itself and reports the failure in its own result
        $collectionChecked = false;
    }

    $results = [];
    foreach ($files as $file) {
        $results[] = [
            'file' => $file,
            'result' => $this->processSingleFile($file, $collectionName, $collectionChecked)
        ];
    }

    return [
        'status' => 'success',
        'message' => "Finished processing directory.",
        'files_count' => count($files),
        'results' => $results
    ];
}
90259036814SCostin Stroie}
90359036814SCostin Stroie
/**
 * Parse a file path and convert it to a DokuWiki ID
 *
 * Takes a file system path and converts it to the DokuWiki ID format by:
 * 1. Removing the pages directory prefix (DOKU_INC . 'data/pages/' when DOKU_INC
 *    is defined, otherwise /var/www/html/dokuwiki/data/pages/)
 * 2. Removing the .txt extension
 * 3. Converting directory separators to colons, ignoring empty path segments
 *
 * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt
 * Becomes: reports:mri:2024:g287-name-surname
 *
 * @param string $filePath The full file path to parse
 * @return string The DokuWiki ID
 */
function parseFilePath($filePath) {
    // Use DokuWiki's constant to get the pages directory if available
    if (defined('DOKU_INC')) {
        $pagesDir = DOKU_INC . 'data/pages/';
    } else {
        // Fallback to common DokuWiki installation path
        $pagesDir = '/var/www/html/dokuwiki/data/pages/';
    }

    // On platforms that use backslashes, normalize both paths to forward slashes
    // so the prefix check and the split below see a single separator
    if (DIRECTORY_SEPARATOR === '\\') {
        $filePath = str_replace('\\', '/', $filePath);
        $pagesDir = str_replace('\\', '/', $pagesDir);
    }

    // Remove the base path, but only where it actually is a prefix
    $relativePath = $filePath;
    if (strpos($filePath, $pagesDir) === 0) {
        $relativePath = (string)substr($filePath, strlen($pagesDir));
    }

    // Remove .txt extension
    $relativePath = preg_replace('/\.txt$/', '', $relativePath);

    // Split into segments and drop only the truly empty ones; a segment named
    // "0" is a valid namespace and must be kept
    $idParts = array_filter(
        explode('/', $relativePath),
        function ($part) {
            return $part !== '';
        }
    );

    return implode(':', $idParts);
}
94659036814SCostin Stroie
947