<?php

namespace dokuwiki\plugin\dokullm;

/**
 * HTTP client for a ChromaDB server (v2 REST API) that uses an Ollama server
 * to generate embeddings.
 */
class ChromaDBClient {
    /** @var string Base URL of the ChromaDB server ("http://host:port") */
    private $baseUrl;
    /** @var resource|\CurlHandle cURL handle reused for every ChromaDB request */
    private $client;
    /** @var resource|\CurlHandle cURL handle reused for every Ollama request */
    private $ollamaClient;
    /** @var string ChromaDB tenant name */
    private $tenant;
    /** @var string ChromaDB database name */
    private $database;
    /** @var string Ollama server host */
    private $ollamaHost;
    /** @var int Ollama server port */
    private $ollamaPort;
    /** @var string Ollama embeddings model */
    private $ollamaModel;

    /**
     * Initialize the ChromaDB client
     *
     * Stores the connection parameters, prepares one cURL handle for ChromaDB and one
     * for Ollama, and then makes sure the configured tenant and database exist
     * (this last step talks to the ChromaDB server).
     *
     * @param string $host ChromaDB server host
     * @param int $port ChromaDB server port
     * @param string $tenant ChromaDB tenant name
     * @param string $database ChromaDB database name
     * @param string $ollamaHost Ollama server host
     * @param int $ollamaPort Ollama server port
     * @param string $ollamaModel Ollama embeddings model
     * @throws \Exception If cURL cannot be initialized or the tenant/database cannot be created
     */
    public function __construct($host, $port, $tenant, $database, $ollamaHost, $ollamaPort, $ollamaModel) {
        // All parameters are mandatory; no configuration fallback is applied here.
        $this->tenant = $tenant;
        $this->database = $database;
        $this->ollamaHost = $ollamaHost;
        $this->ollamaPort = $ollamaPort;
        $this->ollamaModel = $ollamaModel;

        $this->baseUrl = "http://{$host}:{$port}";

        // Initialize ChromaDB client
        $this->client = curl_init();
        if (!$this->client) {
            throw new \Exception('Curl error: unable to initialize the ChromaDB cURL handle');
        }
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // Initialize Ollama client
        $this->ollamaClient = curl_init();
        if (!$this->ollamaClient) {
            throw new \Exception('Ollama Curl error: unable to initialize the Ollama cURL handle');
        }
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up the cURL handles when the object is destroyed
     *
     * Each handle is closed only if it was actually created, so a constructor
     * that failed part-way does not trigger a second error here.
     *
     * @return void
     */
    public function __destruct() {
        if ($this->client) {
            curl_close($this->client);
        }
        if ($this->ollamaClient) {
            curl_close($this->ollamaClient);
        }
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * This is a helper function that handles making HTTP requests to the ChromaDB v2 API,
     * sending JSON request headers; tenant and database are part of the endpoint path.
*
     * @param string $endpoint The API endpoint to call (appended to "<baseUrl>/api/v2")
     * @param string $method The HTTP method to use (default: 'GET')
     * @param array|null $data The data to send with the request (default: null)
     * @return array The JSON response decoded as an array
     * @throws \Exception If there's a cURL error or HTTP error
     */
    private function makeRequest($endpoint, $method = 'GET', $data = null) {
        // The shared handle is reconfigured on every call; a falsy payload
        // clears any request body left over from a previous request.
        curl_setopt_array($this->client, [
            CURLOPT_URL => $this->baseUrl . '/api/v2' . $endpoint,
            CURLOPT_CUSTOMREQUEST => $method,
            CURLOPT_HTTPHEADER => [
                'Content-Type: application/json',
                'Accept: application/json'
            ],
            CURLOPT_POSTFIELDS => $data ? json_encode($data) : null
        ]);

        $response = curl_exec($this->client);
        $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE);

        // Transport-level failures are reported before HTTP status errors
        $transportError = curl_error($this->client);
        if ($transportError) {
            throw new \Exception('Curl error: ' . $transportError);
        }

        if ($httpCode >= 400) {
            throw new \Exception("HTTP Error: $httpCode, Response: $response");
        }

        return json_decode($response, true);
    }

    /**
     * Generate embeddings for text using Ollama
     *
     * Posts the text to the Ollama "/api/embeddings" endpoint with the configured model.
     *
     * @param string $text The text to generate embeddings for
     * @return array The embeddings vector
     * @throws \Exception On a cURL error, an HTTP error, or a response without an 'embedding' key
     */
    public function generateEmbeddings($text) {
        $payload = [
            'model' => $this->ollamaModel,
            'prompt' => $text,
            'keep_alive' => '30m'
        ];

        curl_setopt_array($this->ollamaClient, [
            CURLOPT_URL => "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings",
            CURLOPT_POSTFIELDS => json_encode($payload)
        ]);

        $response = curl_exec($this->ollamaClient);
        $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE);

        $transportError = curl_error($this->ollamaClient);
        if ($transportError) {
            throw new \Exception('Ollama Curl error: ' . $transportError);
        }

        if ($httpCode >= 400) {
            throw new \Exception("Ollama HTTP Error: $httpCode, Response: $response");
        }

        $decoded = json_decode($response, true);

        if (isset($decoded['embedding'])) {
            return $decoded['embedding'];
        }

        throw new \Exception("Ollama response missing embedding: " . $response);
    }

    /**
     * List all collections in the database
     *
     * Retrieves a list of all collections in the configured tenant and database.
     *
     * @return array List of collections
     * @throws \Exception If the request fails
     */
    public function listCollections() {
        return $this->makeRequest(
            "/tenants/{$this->tenant}/databases/{$this->database}/collections"
        );
    }

    /**
     * Get a collection by name
     *
     * Retrieves information about a specific collection by its name.
*
     * The collection is looked up by listing all collections of the configured
     * tenant/database and returning the first entry whose 'name' matches exactly.
     * An empty name falls back to 'documents'.
     *
     * @param string $name The name of the collection to retrieve
     * @return array The collection information
     * @throws \Exception If the collection is not found or the listing request fails
     */
    public function getCollection($name) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($name)) {
            $name = 'documents';
        }

        // Reuse the listing helper instead of rebuilding the same endpoint here
        $collections = $this->listCollections();

        // The decoded response may not be an array (e.g. empty or non-JSON body);
        // treat that the same as "no collections" instead of iterating over it.
        if (is_array($collections)) {
            foreach ($collections as $collection) {
                if (is_array($collection) && isset($collection['name']) && $collection['name'] === $name) {
                    return $collection;
                }
            }
        }

        // If not found, throw exception
        throw new \Exception("Collection '{$name}' not found");
    }

    /**
     * Create a new collection
     *
     * Creates a new collection with the specified name and optional metadata.
*
     * An empty name falls back to 'documents'. Metadata is only sent when it is non-empty.
     *
     * @param string $name The name of the collection to create
     * @param array|null $metadata Optional metadata for the collection
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createCollection($name, $metadata = null) {
        // Default collection name when none is given
        $payload = ['name' => empty($name) ? 'documents' : $name];

        if ($metadata) {
            $payload['metadata'] = $metadata;
        }

        return $this->makeRequest(
            "/tenants/{$this->tenant}/databases/{$this->database}/collections",
            'POST',
            $payload
        );
    }

    /**
     * Delete a collection by name
     *
     * Deletes a collection with the specified name.
*
     * The name is first resolved to a collection ID, which is what the delete
     * endpoint is called with. An empty name falls back to 'documents'.
     *
     * @param string $name The name of the collection to delete
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function deleteCollection($name) {
        // Default collection name when none is given
        $name = empty($name) ? 'documents' : $name;

        // Resolve the name to the collection record to obtain its ID
        $found = $this->getCollection($name);
        if (!isset($found['id'])) {
            throw new \Exception("Collection ID not found for '{$name}'");
        }

        $id = $found['id'];

        return $this->makeRequest(
            "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$id}",
            'DELETE'
        );
    }

    /**
     * Get a document by its ID from a collection
     *
     * Retrieves a document from the specified collection using its ID.
*
     * The collection name is resolved to its ID first; an empty name falls back to 'documents'.
     *
     * @param string $collectionName The name of the collection to get the document from
     * @param string $documentId The document ID to retrieve
     * @param array $include What to include in the response (default: ["metadatas", "documents"])
     * @return array The retrieved document
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) {
        // Default collection name when none is given
        $collectionName = empty($collectionName) ? 'documents' : $collectionName;

        // Resolve the name to the collection record to obtain its ID
        $found = $this->getCollection($collectionName);
        if (!isset($found['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }

        $id = $found['id'];

        return $this->makeRequest(
            "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$id}/get",
            'POST',
            [
                'ids' => [$documentId],
                'include' => $include
            ]
        );
    }

    /**
     * Add documents to a collection
     *
     * Adds documents to the specified collection. Each document must have a corresponding ID.
* Optional metadata and pre-computed embeddings can also be provided.
     * The request is sent to the collection's "upsert" endpoint.
     *
     * @param string $collectionName The name of the collection to add documents to
     * @param array $documents The document contents
     * @param array $ids The document IDs
     * @param array|null $metadatas Optional metadata for each document
     * @param array|null $embeddings Optional pre-computed embeddings for each document
     * @return array The response from the API
     * @throws \Exception If the collection or its ID is not found, or the request fails
     */
    public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($collectionName)) {
            $collectionName = 'documents';
        }

        // First get the collection to find its ID
        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }

        $collectionId = $collection['id'];
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert";
        $data = [
            'ids' => $ids,
            'documents' => $documents
        ];

        // Optional fields are only sent when non-empty
        if ($metadatas) {
            $data['metadatas'] = $metadatas;
        }

        if ($embeddings) {
            $data['embeddings'] = $embeddings;
        }

        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if a document needs to be updated based on timestamp comparison
     *
     * Determines whether a document should be reprocessed by comparing the file's last modification
     * time with the processed_at timestamp stored in the document's metadata. The function checks
     * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore
     * not included in the database.
     *
     * Any failure while checking (request error, missing metadata, unparsable timestamp)
     * is answered with true, so the caller re-processes the document rather than skipping it.
     *
     * @param string $collectionId The ID of the collection to check documents in
     * @param string $documentId The base document ID to check (without chunk suffixes)
     * @param int $fileModifiedTime The file's last modification timestamp (from filemtime)
     * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date
     */
    public function needsUpdate($collectionId, $documentId, $fileModifiedTime) {
        try {
            $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get";

            // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles and skipped
            $chunkIdsToCheck = [
                $documentId . '@1',
                $documentId . '@2',
                $documentId . '@3'
            ];

            $data = [
                'ids' => $chunkIdsToCheck,
                'include' => [
                    "metadatas"
                ],
                'limit' => 1
            ];

            // Check if document exists
            $result = $this->makeRequest($endpoint, 'POST', $data);

            // If no documents found, return true (needs to be added)
            if (empty($result['ids'])) {
                return true;
            }

            // A chunk without any metadata has no processed_at timestamp either,
            // so it must be treated as needing an update (not as up to date).
            if (empty($result['metadatas']) || !is_array($result['metadatas'])
                || !isset($result['metadatas'][0]) || !is_array($result['metadatas'][0])) {
                return true;
            }

            // Check the first metadata entry directly
            $metadata = $result['metadatas'][0];

            // If processed_at is not set, return true (needs update)
            if (!isset($metadata['processed_at'])) {
                return true;
            }

            // Parse the processed_at timestamp; strtotime() returns false when it cannot
            $processedTimestamp = strtotime($metadata['processed_at']);
            if ($processedTimestamp === false) {
                return true;
            }

            // Needs update only if the file is newer than the processed time
            return $fileModifiedTime > $processedTimestamp;
        } catch (\Exception $e) {
            // If there's an error checking the document, assume it needs to be updated
            return true;
        }
    }

    /**
     * Query a collection for similar documents
     *
     * Queries the specified collection for documents similar to the provided query texts.
     * The function generates embeddings for the query texts and sends them to ChromaDB.
     * Supports filtering results by metadata using the where parameter.
     *
     * @param string $collectionName The name of the collection to query
     * @param array $queryTexts The query texts to search for
     * @param int $nResults The number of results to return (default: 5)
     * @param array|null $where Optional filter conditions for metadata
     * @return array The query results
     * @throws \Exception If the collection or its ID is not found, or an embedding/query request fails
     */
    public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) {
        // Use provided name, fallback to 'documents' if empty
        if (empty($collectionName)) {
            $collectionName = 'documents';
        }

        // First get the collection to find its ID
        $collection = $this->getCollection($collectionName);
        if (!isset($collection['id'])) {
            throw new \Exception("Collection ID not found for '{$collectionName}'");
        }

        $collectionId = $collection['id'];
        $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query";

        // Generate embeddings for query texts (one Ollama request per text)
        $queryEmbeddings = [];
        foreach ($queryTexts as $text) {
            $queryEmbeddings[] = $this->generateEmbeddings($text);
        }

        $data = [
            'query_embeddings' => $queryEmbeddings,
            'n_results' => $nResults
        ];

        // Add where clause for metadata filtering if provided
        if ($where && is_array($where)) {
            $data['where'] = $where;
        }

        return $this->makeRequest($endpoint, 'POST', $data);
    }

    /**
     * Check if the ChromaDB server is alive
     *
     * Sends a heartbeat request to verify that the ChromaDB server is running.
     *
     * @return array The response from the heartbeat endpoint
     * @throws \Exception If the request fails
     */
    public function heartbeat() {
        $endpoint = "/heartbeat";
        return $this->makeRequest($endpoint, 'GET');
    }

    /**
     * Get authentication and identity information
     *
     * Retrieves authentication and identity information from the ChromaDB server.
*
     * @return array The response from the auth/identity endpoint
     * @throws \Exception If the request fails
     */
    public function getIdentity() {
        return $this->makeRequest("/identity", 'GET');
    }

    /**
     * Ensure that the specified tenant and database exist
     *
     * Looks up the configured tenant and database; whenever a lookup throws,
     * the corresponding create call is issued instead. An exception from a
     * create call is not caught here.
     *
     * @return void
     */
    private function ensureTenantAndDatabase() {
        // Tenant first: a failed lookup is taken to mean "does not exist yet"
        try {
            $this->getTenant($this->tenant);
        } catch (\Exception $lookupFailure) {
            $this->createTenant($this->tenant);
        }

        // Then the database inside that tenant, same strategy
        try {
            $this->getDatabase($this->database, $this->tenant);
        } catch (\Exception $lookupFailure) {
            $this->createDatabase($this->database, $this->tenant);
        }
    }

    /**
     * Get tenant information
     *
     * Retrieves information about the specified tenant.
*
     * @param string $tenantName The tenant name
     * @return array The tenant information
     * @throws \Exception If the request fails
     */
    public function getTenant($tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}", 'GET');
    }

    /**
     * Create a new tenant
     *
     * Creates a new tenant with the specified name.
     *
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createTenant($tenantName) {
        $payload = ['name' => $tenantName];

        return $this->makeRequest("/tenants", 'POST', $payload);
    }

    /**
     * Get database information
     *
     * Retrieves information about the specified database within a tenant.
*
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The database information
     * @throws \Exception If the request fails
     */
    public function getDatabase($databaseName, $tenantName) {
        return $this->makeRequest("/tenants/{$tenantName}/databases/{$databaseName}", 'GET');
    }

    /**
     * Create a new database
     *
     * Creates a new database with the specified name within a tenant.
     *
     * @param string $databaseName The database name
     * @param string $tenantName The tenant name
     * @return array The response from the API
     * @throws \Exception If the request fails
     */
    public function createDatabase($databaseName, $tenantName) {
        $payload = ['name' => $databaseName];

        return $this->makeRequest("/tenants/{$tenantName}/databases", 'POST', $payload);
    }

    /**
     * Ensure a collection exists, creating it if necessary
     *
     * This helper function checks if a collection exists and creates it if it doesn't.
*
     * Any exception raised by the lookup is treated as "collection missing" and
     * triggers the create call; an exception from the create call is not caught.
     *
     * @param string $collectionName The name of the collection to check/create
     * @return string Status message indicating what happened
     */
    public function ensureCollectionExists($collectionName) {
        try {
            // Only the success/failure of the lookup matters, not its result
            $this->getCollection($collectionName);
        } catch (\Exception $lookupFailure) {
            // Collection doesn't exist, create it
            $this->createCollection($collectionName);
            return "Collection '$collectionName' created.";
        }

        return "Collection '$collectionName' already exists.";
    }

    /**
     * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking
     *
     * This function handles the complete processing of a single DokuWiki file:
     * 1. Parses the file path to extract metadata and document ID
     * 2. Determines the appropriate collection based on document ID
     * 3. Checks if the document needs updating using timestamp comparison
     * 4. Reads and processes file content only if update is needed
     * 5. Splits the document into chunks (paragraphs)
     * 6. Extracts rich metadata from the DokuWiki ID format
     * 7. Generates embeddings for each chunk
     * 8.
Sends all chunks to ChromaDB with metadata 57959036814SCostin Stroie * 58059036814SCostin Stroie * Supported ID formats: 58159036814SCostin Stroie * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name) 58259036814SCostin Stroie * - Format 2: reports:mri:2024:g287-name-surname (third part is year) 58359036814SCostin Stroie * - Templates: reports:mri:templates:name-surname (contains 'templates' part) 58459036814SCostin Stroie * 58559036814SCostin Stroie * The function implements smart update checking by comparing file modification time 58659036814SCostin Stroie * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files. 58759036814SCostin Stroie * 58859036814SCostin Stroie * @param string $filePath The path to the file to process 58959036814SCostin Stroie * @param string $collectionName The name of the collection to use 59059036814SCostin Stroie * @param bool $collectionChecked Whether the collection has already been checked/created 59159036814SCostin Stroie * @return array Result with status and details 59259036814SCostin Stroie */ 59359036814SCostin Stroie public function processSingleFile($filePath, $collectionName, $collectionChecked = false) { 59459036814SCostin Stroie // Parse file path to extract metadata 59559036814SCostin Stroie $id = parseFilePath($filePath); 59659036814SCostin Stroie 59759036814SCostin Stroie try { 59859036814SCostin Stroie // Create collection if it doesn't exist (only if not already checked) 59959036814SCostin Stroie $collectionStatus = ''; 60059036814SCostin Stroie if (!$collectionChecked) { 60159036814SCostin Stroie $collectionStatus = $this->ensureCollectionExists($collectionName); 60259036814SCostin Stroie } 60359036814SCostin Stroie 60459036814SCostin Stroie // Get collection ID 60559036814SCostin Stroie $collection = $this->getCollection($collectionName); 60659036814SCostin Stroie if (!isset($collection['id'])) { 60759036814SCostin Stroie return [ 
60859036814SCostin Stroie 'status' => 'error', 60959036814SCostin Stroie 'message' => "Collection ID not found for '{$collectionName}'" 61059036814SCostin Stroie ]; 61159036814SCostin Stroie } 61259036814SCostin Stroie $collectionId = $collection['id']; 61359036814SCostin Stroie 61459036814SCostin Stroie // Get file modification time 61559036814SCostin Stroie $fileModifiedTime = filemtime($filePath); 61659036814SCostin Stroie 61759036814SCostin Stroie // Check if document needs update 61859036814SCostin Stroie $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime); 61959036814SCostin Stroie 62059036814SCostin Stroie // If document is up to date, skip processing 62159036814SCostin Stroie if (!$needsUpdate) { 62259036814SCostin Stroie return [ 62359036814SCostin Stroie 'status' => 'skipped', 62459036814SCostin Stroie 'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..." 62559036814SCostin Stroie ]; 62659036814SCostin Stroie } 62759036814SCostin Stroie 62859036814SCostin Stroie // Read file content 62959036814SCostin Stroie $content = file_get_contents($filePath); 63059036814SCostin Stroie 63159036814SCostin Stroie // Split document into chunks (paragraphs separated by two newlines) 63259036814SCostin Stroie $paragraphs = preg_split('/\n\s*\n/', $content); 63359036814SCostin Stroie $chunks = []; 63459036814SCostin Stroie $chunkMetadata = []; 63559036814SCostin Stroie 63659036814SCostin Stroie // Parse the DokuWiki ID to extract base metadata 63759036814SCostin Stroie $parts = explode(':', $id); 63859036814SCostin Stroie 63959036814SCostin Stroie // Extract metadata from the last part of the ID 64059036814SCostin Stroie $lastPart = end($parts); 64159036814SCostin Stroie $baseMetadata = []; 64259036814SCostin Stroie 64359036814SCostin Stroie // Add the document ID as metadata 64459036814SCostin Stroie $baseMetadata['document_id'] = $id; 64559036814SCostin Stroie 64659036814SCostin Stroie // Add current timestamp 
64759036814SCostin Stroie $baseMetadata['processed_at'] = date('Y-m-d H:i:s'); 64859036814SCostin Stroie 64959036814SCostin Stroie // Check if any part of the ID is 'templates' and set template metadata 65059036814SCostin Stroie $isTemplate = in_array('templates', $parts); 65159036814SCostin Stroie if ($isTemplate) { 65259036814SCostin Stroie $baseMetadata['type'] = 'template'; 65359036814SCostin Stroie } else { 65459036814SCostin Stroie $baseMetadata['type'] = 'report'; 65559036814SCostin Stroie } 65659036814SCostin Stroie 65759036814SCostin Stroie // Extract modality from the second part 65859036814SCostin Stroie if (isset($parts[1])) { 65959036814SCostin Stroie $baseMetadata['modality'] = $parts[1]; 66059036814SCostin Stroie } 66159036814SCostin Stroie 66259036814SCostin Stroie // Handle different ID formats based on the third part: word (institution) or numeric (year) 66359036814SCostin Stroie // Format 1: reports:mri:institution:250620-name-surname (third part is institution name) 66459036814SCostin Stroie // Format 2: reports:mri:2024:g287-name-surname (third part is year) 66559036814SCostin Stroie // For templates, don't set institution, date or year 66659036814SCostin Stroie if (isset($parts[2]) && !$isTemplate) { 66759036814SCostin Stroie // Check if third part is numeric (year) or word (institution) 66859036814SCostin Stroie if (is_numeric($parts[2])) { 66959036814SCostin Stroie // Format: reports:mri:2024:g287-name-surname (year format) 67059036814SCostin Stroie // Extract year from the third part 67159036814SCostin Stroie $baseMetadata['year'] = $parts[2]; 67259036814SCostin Stroie 67359036814SCostin Stroie // Set default institution from config 674*35d66f98SCostin Stroie (aider) global $conf; 675*35d66f98SCostin Stroie (aider) $baseMetadata['institution'] = isset($conf['plugin']['dokullm']['default_institution']) ? 
$conf['plugin']['dokullm']['default_institution'] : 'default'; 67659036814SCostin Stroie 67759036814SCostin Stroie // Extract registration and name from the last part 67859036814SCostin Stroie // Registration should start with one letter or number and contain numbers before the '-' character 67959036814SCostin Stroie if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) { 68059036814SCostin Stroie // Check if the first part contains at least one digit to be considered a registration 68159036814SCostin Stroie if (preg_match('/[0-9]/', $matches[1])) { 68259036814SCostin Stroie $baseMetadata['registration'] = $matches[1]; 68359036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $matches[2]); 68459036814SCostin Stroie } else { 68559036814SCostin Stroie // If no registration pattern found, treat entire part as patient name 68659036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $lastPart); 68759036814SCostin Stroie } 68859036814SCostin Stroie } else { 68959036814SCostin Stroie // If no match, treat entire part as patient name 69059036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $lastPart); 69159036814SCostin Stroie } 69259036814SCostin Stroie } else { 69359036814SCostin Stroie // Format: reports:mri:institution:250620-name-surname (institution format) 69459036814SCostin Stroie // Extract institution from the third part 69559036814SCostin Stroie $baseMetadata['institution'] = $parts[2]; 69659036814SCostin Stroie 69759036814SCostin Stroie // Extract date and name from the last part 69859036814SCostin Stroie if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) { 69959036814SCostin Stroie $dateStr = $matches[1]; 70059036814SCostin Stroie $name = $matches[2]; 70159036814SCostin Stroie 70259036814SCostin Stroie // Convert date format (250620 -> 2025-06-20) 70359036814SCostin Stroie $day = substr($dateStr, 0, 2); 70459036814SCostin Stroie $month = substr($dateStr, 2, 2); 70559036814SCostin Stroie $year = 
substr($dateStr, 4, 2); 70659036814SCostin Stroie // Assuming 20xx for years 00-69 and 19xx for years 70-99 70759036814SCostin Stroie $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year; 70859036814SCostin Stroie $formattedDate = $fullYear . '-' . $month . '-' . $day; 70959036814SCostin Stroie 71059036814SCostin Stroie $baseMetadata['date'] = $formattedDate; 71159036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $name); 71259036814SCostin Stroie } 71359036814SCostin Stroie } 71459036814SCostin Stroie } 71559036814SCostin Stroie 71659036814SCostin Stroie // For templates, always extract name from the last part 71759036814SCostin Stroie if ($isTemplate && isset($lastPart)) { 71859036814SCostin Stroie // Extract name from the last part (everything after the last colon) 71959036814SCostin Stroie if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) { 72059036814SCostin Stroie // Check if the first part contains at least one digit to be considered a registration 72159036814SCostin Stroie if (preg_match('/[0-9]/', $matches[1])) { 72259036814SCostin Stroie $baseMetadata['registration'] = $matches[1]; 72359036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $matches[2]); 72459036814SCostin Stroie } else { 72559036814SCostin Stroie // If no registration pattern found, treat entire part as template name 72659036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $lastPart); 72759036814SCostin Stroie } 72859036814SCostin Stroie } else { 72959036814SCostin Stroie // If no match, treat entire part as template name 73059036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $lastPart); 73159036814SCostin Stroie } 73259036814SCostin Stroie } 73359036814SCostin Stroie 73459036814SCostin Stroie // Process each paragraph as a chunk with intelligent metadata handling 73559036814SCostin Stroie $chunkIds = []; 73659036814SCostin Stroie $chunkContents = []; 73759036814SCostin Stroie $chunkMetadatas = []; 
73859036814SCostin Stroie $chunkEmbeddings = []; 73959036814SCostin Stroie $currentTags = []; 74059036814SCostin Stroie 74159036814SCostin Stroie foreach ($paragraphs as $index => $paragraph) { 74259036814SCostin Stroie // Skip empty paragraphs to avoid processing whitespace-only content 74359036814SCostin Stroie $paragraph = trim($paragraph); 74459036814SCostin Stroie if (empty($paragraph)) { 74559036814SCostin Stroie continue; 74659036814SCostin Stroie } 74759036814SCostin Stroie 74859036814SCostin Stroie // Check if this is a DokuWiki title (starts and ends with =) 74959036814SCostin Stroie // Titles are converted to tags for better searchability but not stored as content chunks 75059036814SCostin Stroie if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) { 75159036814SCostin Stroie // Extract title content and clean it 75259036814SCostin Stroie $titleContent = trim($matches[1]); 75359036814SCostin Stroie 75459036814SCostin Stroie // Split into words and create searchable tags 75559036814SCostin Stroie $words = preg_split('/\s+/', $titleContent); 75659036814SCostin Stroie $tags = []; 75759036814SCostin Stroie 75859036814SCostin Stroie foreach ($words as $word) { 75959036814SCostin Stroie // Only use words longer than 3 characters to reduce noise 76059036814SCostin Stroie if (strlen($word) >= 3) { 76159036814SCostin Stroie $tags[] = strtolower($word); 76259036814SCostin Stroie } 76359036814SCostin Stroie } 76459036814SCostin Stroie 76559036814SCostin Stroie // Remove duplicate tags and store for use in subsequent chunks 76659036814SCostin Stroie $currentTags = array_unique($tags); 76759036814SCostin Stroie continue; // Skip storing title chunks as content 76859036814SCostin Stroie } 76959036814SCostin Stroie 77059036814SCostin Stroie // Create chunk ID 77159036814SCostin Stroie $chunkId = $id . '@' . 
($index + 1); 77259036814SCostin Stroie 77359036814SCostin Stroie // Generate embeddings for the chunk 77459036814SCostin Stroie $embeddings = $this->generateEmbeddings($paragraph); 77559036814SCostin Stroie 77659036814SCostin Stroie // Add chunk-specific metadata 77759036814SCostin Stroie $metadata = $baseMetadata; 77859036814SCostin Stroie $metadata['chunk_id'] = $chunkId; 77959036814SCostin Stroie $metadata['chunk_number'] = $index + 1; 78059036814SCostin Stroie $metadata['total_chunks'] = count($paragraphs); 78159036814SCostin Stroie 78259036814SCostin Stroie // Add current tags to metadata if any exist 78359036814SCostin Stroie if (!empty($currentTags)) { 78459036814SCostin Stroie $metadata['tags'] = implode(',', $currentTags); 78559036814SCostin Stroie } 78659036814SCostin Stroie 78759036814SCostin Stroie // Store chunk data 78859036814SCostin Stroie $chunkIds[] = $chunkId; 78959036814SCostin Stroie $chunkContents[] = $paragraph; 79059036814SCostin Stroie $chunkMetadatas[] = $metadata; 79159036814SCostin Stroie $chunkEmbeddings[] = $embeddings; 79259036814SCostin Stroie } 79359036814SCostin Stroie 79459036814SCostin Stroie // If no chunks were created, skip this file 79559036814SCostin Stroie if (empty($chunkIds)) { 79659036814SCostin Stroie return [ 79759036814SCostin Stroie 'status' => 'skipped', 79859036814SCostin Stroie 'message' => "No valid chunks found in file '$id'. Skipping..." 
79959036814SCostin Stroie ]; 80059036814SCostin Stroie } 80159036814SCostin Stroie 80259036814SCostin Stroie // Send all chunks to ChromaDB 80359036814SCostin Stroie $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings); 80459036814SCostin Stroie 80559036814SCostin Stroie return [ 80659036814SCostin Stroie 'status' => 'success', 80759036814SCostin Stroie 'message' => "Successfully sent file to ChromaDB", 80859036814SCostin Stroie 'details' => [ 80959036814SCostin Stroie 'document_id' => $id, 81059036814SCostin Stroie 'chunks' => count($chunkIds), 81159036814SCostin Stroie 'collection' => $collectionName 81259036814SCostin Stroie ], 81359036814SCostin Stroie 'collection_status' => $collectionStatus 81459036814SCostin Stroie ]; 8151f06f0c8SCostin Stroie (aider) } catch (\Exception $e) { 81659036814SCostin Stroie return [ 81759036814SCostin Stroie 'status' => 'error', 81859036814SCostin Stroie 'message' => "Error sending file to ChromaDB: " . $e->getMessage() 81959036814SCostin Stroie ]; 82059036814SCostin Stroie } 82159036814SCostin Stroie } 82259036814SCostin Stroie 82359036814SCostin Stroie /** 82459036814SCostin Stroie * Process all DokuWiki files in a directory and send them to ChromaDB 82559036814SCostin Stroie * 82659036814SCostin Stroie * This function recursively processes all .txt files in a directory and its subdirectories. 82759036814SCostin Stroie * It first checks if the appropriate collection exists and creates it if needed. 82859036814SCostin Stroie * Then it processes each file individually. 
82959036814SCostin Stroie * 83059036814SCostin Stroie * @param string $dirPath The directory path to process 83159036814SCostin Stroie * @return array Result with status and details 83259036814SCostin Stroie */ 83359036814SCostin Stroie public function processDirectory($dirPath) { 83459036814SCostin Stroie // Check if directory exists 83559036814SCostin Stroie if (!is_dir($dirPath)) { 83659036814SCostin Stroie return [ 83759036814SCostin Stroie 'status' => 'error', 83859036814SCostin Stroie 'message' => "Directory does not exist: $dirPath" 83959036814SCostin Stroie ]; 84059036814SCostin Stroie } 84159036814SCostin Stroie 84259036814SCostin Stroie // Create RecursiveIteratorIterator to process directories recursively 84359036814SCostin Stroie $iterator = new RecursiveIteratorIterator( 84459036814SCostin Stroie new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS), 84559036814SCostin Stroie RecursiveIteratorIterator::LEAVES_ONLY 84659036814SCostin Stroie ); 84759036814SCostin Stroie 84859036814SCostin Stroie $files = []; 84959036814SCostin Stroie foreach ($iterator as $file) { 85059036814SCostin Stroie // Process only .txt files that don't start with underscore 85159036814SCostin Stroie if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') { 85259036814SCostin Stroie $files[] = $file->getPathname(); 85359036814SCostin Stroie } 85459036814SCostin Stroie } 85559036814SCostin Stroie 85659036814SCostin Stroie if (empty($files)) { 85759036814SCostin Stroie return [ 85859036814SCostin Stroie 'status' => 'skipped', 85959036814SCostin Stroie 'message' => "No .txt files found in directory: $dirPath" 86059036814SCostin Stroie ]; 86159036814SCostin Stroie } 86259036814SCostin Stroie 86359036814SCostin Stroie // Use the first part of the document ID as collection name, fallback to 'documents' 86459036814SCostin Stroie $sampleFile = $files[0]; 86559036814SCostin Stroie $id = parseFilePath($sampleFile); 
86659036814SCostin Stroie $idParts = explode(':', $id); 86759036814SCostin Stroie $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents'; 86859036814SCostin Stroie 86959036814SCostin Stroie try { 87059036814SCostin Stroie $this->ensureCollectionExists($collectionName); 87159036814SCostin Stroie $collectionChecked = true; 87259036814SCostin Stroie } catch (Exception $e) { 87359036814SCostin Stroie $collectionChecked = true; 87459036814SCostin Stroie } 87559036814SCostin Stroie 87659036814SCostin Stroie $results = []; 87759036814SCostin Stroie foreach ($files as $file) { 87859036814SCostin Stroie $result = $this->processSingleFile($file, $collectionName, $collectionChecked); 87959036814SCostin Stroie $results[] = [ 88059036814SCostin Stroie 'file' => $file, 88159036814SCostin Stroie 'result' => $result 88259036814SCostin Stroie ]; 88359036814SCostin Stroie } 88459036814SCostin Stroie 88559036814SCostin Stroie return [ 88659036814SCostin Stroie 'status' => 'success', 88759036814SCostin Stroie 'message' => "Finished processing directory.", 88859036814SCostin Stroie 'files_count' => count($files), 88959036814SCostin Stroie 'results' => $results 89059036814SCostin Stroie ]; 89159036814SCostin Stroie } 89259036814SCostin Stroie} 89359036814SCostin Stroie 89459036814SCostin Stroie/** 89559036814SCostin Stroie * Parse a file path and convert it to a DokuWiki ID 89659036814SCostin Stroie * 89759036814SCostin Stroie * Takes a file system path and converts it to the DokuWiki ID format by: 89859036814SCostin Stroie * 1. Removing the base path prefix (using DokuWiki's pages directory) 89959036814SCostin Stroie * 2. Removing the .txt extension 90059036814SCostin Stroie * 3. 
Converting directory separators to colons 90159036814SCostin Stroie * 90259036814SCostin Stroie * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt 90359036814SCostin Stroie * Becomes: reports:mri:2024:g287-name-surname 90459036814SCostin Stroie * 90559036814SCostin Stroie * @param string $filePath The full file path to parse 90659036814SCostin Stroie * @return string The DokuWiki ID 90759036814SCostin Stroie */ 90859036814SCostin Stroiefunction parseFilePath($filePath) { 90959036814SCostin Stroie // Use DokuWiki's constant to get the pages directory if available 91059036814SCostin Stroie if (defined('DOKU_INC')) { 91159036814SCostin Stroie $pagesDir = DOKU_INC . 'data/pages/'; 91259036814SCostin Stroie } else { 91359036814SCostin Stroie // Fallback to common DokuWiki installation path 91459036814SCostin Stroie $pagesDir = '/var/www/html/dokuwiki/data/pages/'; 91559036814SCostin Stroie } 91659036814SCostin Stroie 91759036814SCostin Stroie // Remove the base path 91859036814SCostin Stroie $relativePath = str_replace($pagesDir, '', $filePath); 91959036814SCostin Stroie 92059036814SCostin Stroie // Remove .txt extension 92159036814SCostin Stroie $relativePath = preg_replace('/\.txt$/', '', $relativePath); 92259036814SCostin Stroie 92359036814SCostin Stroie // Split path into parts and filter out empty parts 92459036814SCostin Stroie $parts = array_filter(explode('/', $relativePath)); 92559036814SCostin Stroie 92659036814SCostin Stroie // Build DokuWiki ID (use first part as namespace) 92759036814SCostin Stroie $idParts = []; 92859036814SCostin Stroie foreach ($parts as $part) { 92959036814SCostin Stroie if (!empty($part)) { 93059036814SCostin Stroie $idParts[] = $part; 93159036814SCostin Stroie } 93259036814SCostin Stroie } 93359036814SCostin Stroie 93459036814SCostin Stroie return implode(':', $idParts); 93559036814SCostin Stroie} 93659036814SCostin Stroie 937