<?php

namespace dokuwiki\plugin\dokullm;

/**
 * HTTP client for the ChromaDB v2 REST API.
 *
 * Keeps two reusable cURL handles: one for the ChromaDB server and one for the
 * Ollama server that is used to generate embeddings.
 */
class ChromaDBClient {
    /** @var string Base URL of the ChromaDB server ("http://host:port") */
    private $baseUrl;
    /** @var mixed cURL handle reused for every ChromaDB request */
    private $client;
    /** @var mixed cURL handle reused for every Ollama request */
    private $ollamaClient;
    /** @var string ChromaDB tenant name */
    private $tenant;
    /** @var string ChromaDB database name */
    private $database;
    /** @var string Ollama server host */
    private $ollamaHost;
    /** @var int Ollama server port */
    private $ollamaPort;
    /** @var string Ollama model used to generate embeddings */
    private $ollamaModel;

    /**
     * Initialize the ChromaDB client
     *
     * Creates a new ChromaDB client instance with the specified connection parameters.
     * Also ensures that the specified tenant and database exist, which means the
     * constructor already talks to the ChromaDB server.
     *
     * @param string $host ChromaDB server host (default: CHROMA_HOST)
     * @param int $port ChromaDB server port (default: CHROMA_PORT)
     * @param string $tenant ChromaDB tenant name (default: CHROMA_TENANT)
     * @param string $database ChromaDB database name (default: CHROMA_DATABASE)
     * @param string $ollamaHost Ollama server host (default: OLLAMA_HOST)
     * @param int $ollamaPort Ollama server port (default: OLLAMA_PORT)
     * @param string $ollamaModel Ollama embeddings model (default: OLLAMA_EMBEDDINGS_MODEL)
     */
    public function __construct($host = CHROMA_HOST, $port = CHROMA_PORT, $tenant = CHROMA_TENANT, $database = CHROMA_DATABASE, $ollamaHost = OLLAMA_HOST, $ollamaPort = OLLAMA_PORT, $ollamaModel = OLLAMA_EMBEDDINGS_MODEL) {
        $this->baseUrl = "http://{$host}:{$port}";
        $this->tenant = $tenant;
        $this->database = $database;
        $this->ollamaHost = $ollamaHost;
        $this->ollamaPort = $ollamaPort;
        // Stored in a declared property; dynamic properties are deprecated since PHP 8.2
        $this->ollamaModel = $ollamaModel;

        // Initialize ChromaDB client
        $this->client = curl_init();
        curl_setopt($this->client, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->client, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json',
            'Accept: application/json'
        ]);

        // Initialize Ollama client
        $this->ollamaClient = curl_init();
        curl_setopt($this->ollamaClient, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($this->ollamaClient, CURLOPT_HTTPHEADER, [
            'Content-Type: application/json'
        ]);

        // Check if tenant and database exist, create them if they don't
        $this->ensureTenantAndDatabase();
    }

    /**
     * Clean up both cURL handles when the object is destroyed
     *
     * @return void
     */
    public function __destruct() {
        curl_close($this->client);
        curl_close($this->ollamaClient);
    }

    /**
     * Make an HTTP request to the ChromaDB API
     *
     * This is a helper function that handles making JSON HTTP requests to the
     * ChromaDB v2 API and decoding the response.
67*59036814SCostin Stroie * 68*59036814SCostin Stroie * @param string $endpoint The API endpoint to call 69*59036814SCostin Stroie * @param string $method The HTTP method to use (default: 'GET') 70*59036814SCostin Stroie * @param array|null $data The data to send with the request (default: null) 71*59036814SCostin Stroie * @return array The JSON response decoded as an array 72*59036814SCostin Stroie * @throws Exception If there's a cURL error or HTTP error 73*59036814SCostin Stroie */ 74*59036814SCostin Stroie private function makeRequest($endpoint, $method = 'GET', $data = null) { 75*59036814SCostin Stroie // Add tenant and database as headers instead of query parameters for v2 API 76*59036814SCostin Stroie $headers = [ 77*59036814SCostin Stroie 'Content-Type: application/json', 78*59036814SCostin Stroie 'Accept: application/json' 79*59036814SCostin Stroie ]; 80*59036814SCostin Stroie 81*59036814SCostin Stroie $url = $this->baseUrl . '/api/v2' . $endpoint; 82*59036814SCostin Stroie 83*59036814SCostin Stroie curl_setopt($this->client, CURLOPT_URL, $url); 84*59036814SCostin Stroie curl_setopt($this->client, CURLOPT_CUSTOMREQUEST, $method); 85*59036814SCostin Stroie curl_setopt($this->client, CURLOPT_HTTPHEADER, $headers); 86*59036814SCostin Stroie 87*59036814SCostin Stroie if ($data) { 88*59036814SCostin Stroie curl_setopt($this->client, CURLOPT_POSTFIELDS, json_encode($data)); 89*59036814SCostin Stroie } else { 90*59036814SCostin Stroie curl_setopt($this->client, CURLOPT_POSTFIELDS, null); 91*59036814SCostin Stroie } 92*59036814SCostin Stroie 93*59036814SCostin Stroie $response = curl_exec($this->client); 94*59036814SCostin Stroie $httpCode = curl_getinfo($this->client, CURLINFO_HTTP_CODE); 95*59036814SCostin Stroie 96*59036814SCostin Stroie if (curl_error($this->client)) { 97*59036814SCostin Stroie throw new Exception('Curl error: ' . 
curl_error($this->client)); 98*59036814SCostin Stroie } 99*59036814SCostin Stroie 100*59036814SCostin Stroie if ($httpCode >= 400) { 101*59036814SCostin Stroie throw new Exception("HTTP Error: $httpCode, Response: $response"); 102*59036814SCostin Stroie } 103*59036814SCostin Stroie 104*59036814SCostin Stroie return json_decode($response, true); 105*59036814SCostin Stroie } 106*59036814SCostin Stroie 107*59036814SCostin Stroie /** 108*59036814SCostin Stroie * Generate embeddings for text using Ollama 109*59036814SCostin Stroie * 110*59036814SCostin Stroie * @param string $text The text to generate embeddings for 111*59036814SCostin Stroie * @return array The embeddings vector 112*59036814SCostin Stroie */ 113*59036814SCostin Stroie public function generateEmbeddings($text) { 114*59036814SCostin Stroie $ollamaUrl = "http://{$this->ollamaHost}:{$this->ollamaPort}/api/embeddings"; 115*59036814SCostin Stroie 116*59036814SCostin Stroie curl_setopt($this->ollamaClient, CURLOPT_URL, $ollamaUrl); 117*59036814SCostin Stroie 118*59036814SCostin Stroie $data = [ 119*59036814SCostin Stroie 'model' => $this->ollamaModel, 120*59036814SCostin Stroie 'prompt' => $text, 121*59036814SCostin Stroie 'keep_alive' => '30m' 122*59036814SCostin Stroie ]; 123*59036814SCostin Stroie 124*59036814SCostin Stroie curl_setopt($this->ollamaClient, CURLOPT_POSTFIELDS, json_encode($data)); 125*59036814SCostin Stroie 126*59036814SCostin Stroie $response = curl_exec($this->ollamaClient); 127*59036814SCostin Stroie $httpCode = curl_getinfo($this->ollamaClient, CURLINFO_HTTP_CODE); 128*59036814SCostin Stroie 129*59036814SCostin Stroie if (curl_error($this->ollamaClient)) { 130*59036814SCostin Stroie throw new Exception('Ollama Curl error: ' . 
curl_error($this->ollamaClient)); 131*59036814SCostin Stroie } 132*59036814SCostin Stroie 133*59036814SCostin Stroie if ($httpCode >= 400) { 134*59036814SCostin Stroie throw new Exception("Ollama HTTP Error: $httpCode, Response: $response"); 135*59036814SCostin Stroie } 136*59036814SCostin Stroie 137*59036814SCostin Stroie $result = json_decode($response, true); 138*59036814SCostin Stroie 139*59036814SCostin Stroie if (!isset($result['embedding'])) { 140*59036814SCostin Stroie throw new Exception("Ollama response missing embedding: " . $response); 141*59036814SCostin Stroie } 142*59036814SCostin Stroie 143*59036814SCostin Stroie return $result['embedding']; 144*59036814SCostin Stroie } 145*59036814SCostin Stroie 146*59036814SCostin Stroie /** 147*59036814SCostin Stroie * List all collections in the database 148*59036814SCostin Stroie * 149*59036814SCostin Stroie * Retrieves a list of all collections in the specified tenant and database. 150*59036814SCostin Stroie * 151*59036814SCostin Stroie * @return array List of collections 152*59036814SCostin Stroie */ 153*59036814SCostin Stroie public function listCollections() { 154*59036814SCostin Stroie $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections"; 155*59036814SCostin Stroie return $this->makeRequest($endpoint); 156*59036814SCostin Stroie } 157*59036814SCostin Stroie 158*59036814SCostin Stroie /** 159*59036814SCostin Stroie * Get a collection by name 160*59036814SCostin Stroie * 161*59036814SCostin Stroie * Retrieves information about a specific collection by its name. 
162*59036814SCostin Stroie * 163*59036814SCostin Stroie * @param string $name The name of the collection to retrieve 164*59036814SCostin Stroie * @return array The collection information 165*59036814SCostin Stroie * @throws Exception If the collection is not found 166*59036814SCostin Stroie */ 167*59036814SCostin Stroie public function getCollection($name) { 168*59036814SCostin Stroie // Use provided name, fallback to 'documents' if empty 169*59036814SCostin Stroie if (empty($name)) { 170*59036814SCostin Stroie $name = 'documents'; 171*59036814SCostin Stroie } 172*59036814SCostin Stroie 173*59036814SCostin Stroie // First try to get collection by name 174*59036814SCostin Stroie $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections"; 175*59036814SCostin Stroie $collections = $this->makeRequest($endpoint); 176*59036814SCostin Stroie 177*59036814SCostin Stroie // Find collection by name 178*59036814SCostin Stroie foreach ($collections as $collection) { 179*59036814SCostin Stroie if (isset($collection['name']) && $collection['name'] === $name) { 180*59036814SCostin Stroie return $collection; 181*59036814SCostin Stroie } 182*59036814SCostin Stroie } 183*59036814SCostin Stroie 184*59036814SCostin Stroie // If not found, throw exception 185*59036814SCostin Stroie throw new Exception("Collection '{$name}' not found"); 186*59036814SCostin Stroie } 187*59036814SCostin Stroie 188*59036814SCostin Stroie /** 189*59036814SCostin Stroie * Create a new collection 190*59036814SCostin Stroie * 191*59036814SCostin Stroie * Creates a new collection with the specified name and optional metadata. 
192*59036814SCostin Stroie * 193*59036814SCostin Stroie * @param string $name The name of the collection to create 194*59036814SCostin Stroie * @param array|null $metadata Optional metadata for the collection 195*59036814SCostin Stroie * @return array The response from the API 196*59036814SCostin Stroie */ 197*59036814SCostin Stroie public function createCollection($name, $metadata = null) { 198*59036814SCostin Stroie // Use provided name, fallback to 'documents' if empty 199*59036814SCostin Stroie if (empty($name)) { 200*59036814SCostin Stroie $name = 'documents'; 201*59036814SCostin Stroie } 202*59036814SCostin Stroie 203*59036814SCostin Stroie $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections"; 204*59036814SCostin Stroie $data = ['name' => $name]; 205*59036814SCostin Stroie if ($metadata) { 206*59036814SCostin Stroie $data['metadata'] = $metadata; 207*59036814SCostin Stroie } 208*59036814SCostin Stroie return $this->makeRequest($endpoint, 'POST', $data); 209*59036814SCostin Stroie } 210*59036814SCostin Stroie 211*59036814SCostin Stroie /** 212*59036814SCostin Stroie * Delete a collection by name 213*59036814SCostin Stroie * 214*59036814SCostin Stroie * Deletes a collection with the specified name. 
215*59036814SCostin Stroie * 216*59036814SCostin Stroie * @param string $name The name of the collection to delete 217*59036814SCostin Stroie * @return array The response from the API 218*59036814SCostin Stroie * @throws Exception If the collection ID is not found 219*59036814SCostin Stroie */ 220*59036814SCostin Stroie public function deleteCollection($name) { 221*59036814SCostin Stroie // Use provided name, fallback to 'documents' if empty 222*59036814SCostin Stroie if (empty($name)) { 223*59036814SCostin Stroie $name = 'documents'; 224*59036814SCostin Stroie } 225*59036814SCostin Stroie 226*59036814SCostin Stroie // First get the collection to find its ID 227*59036814SCostin Stroie $collection = $this->getCollection($name); 228*59036814SCostin Stroie if (!isset($collection['id'])) { 229*59036814SCostin Stroie throw new Exception("Collection ID not found for '{$name}'"); 230*59036814SCostin Stroie } 231*59036814SCostin Stroie 232*59036814SCostin Stroie $collectionId = $collection['id']; 233*59036814SCostin Stroie $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}"; 234*59036814SCostin Stroie return $this->makeRequest($endpoint, 'DELETE'); 235*59036814SCostin Stroie } 236*59036814SCostin Stroie 237*59036814SCostin Stroie /** 238*59036814SCostin Stroie * Get a document by its ID from a collection 239*59036814SCostin Stroie * 240*59036814SCostin Stroie * Retrieves a document from the specified collection using its ID. 
241*59036814SCostin Stroie * 242*59036814SCostin Stroie * @param string $collectionName The name of the collection to get the document from 243*59036814SCostin Stroie * @param string $documentId The document ID to retrieve 244*59036814SCostin Stroie * @param array $include What to include in the response (default: ["metadatas", "documents"]) 245*59036814SCostin Stroie * @return array The retrieved document 246*59036814SCostin Stroie * @throws Exception If the collection ID is not found 247*59036814SCostin Stroie */ 248*59036814SCostin Stroie public function getDocument($collectionName, $documentId, $include = ["metadatas", "documents"]) { 249*59036814SCostin Stroie // Use provided name, fallback to 'documents' if empty 250*59036814SCostin Stroie if (empty($collectionName)) { 251*59036814SCostin Stroie $collectionName = 'documents'; 252*59036814SCostin Stroie } 253*59036814SCostin Stroie 254*59036814SCostin Stroie // First get the collection to find its ID 255*59036814SCostin Stroie $collection = $this->getCollection($collectionName); 256*59036814SCostin Stroie if (!isset($collection['id'])) { 257*59036814SCostin Stroie throw new Exception("Collection ID not found for '{$collectionName}'"); 258*59036814SCostin Stroie } 259*59036814SCostin Stroie 260*59036814SCostin Stroie $collectionId = $collection['id']; 261*59036814SCostin Stroie $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get"; 262*59036814SCostin Stroie $data = [ 263*59036814SCostin Stroie 'ids' => [$documentId], 264*59036814SCostin Stroie 'include' => $include 265*59036814SCostin Stroie ]; 266*59036814SCostin Stroie 267*59036814SCostin Stroie return $this->makeRequest($endpoint, 'POST', $data); 268*59036814SCostin Stroie } 269*59036814SCostin Stroie 270*59036814SCostin Stroie /** 271*59036814SCostin Stroie * Add documents to a collection 272*59036814SCostin Stroie * 273*59036814SCostin Stroie * Adds documents to the specified collection. 
Each document must have a corresponding ID. 274*59036814SCostin Stroie * Optional metadata and pre-computed embeddings can also be provided. 275*59036814SCostin Stroie * 276*59036814SCostin Stroie * @param string $collectionName The name of the collection to add documents to 277*59036814SCostin Stroie * @param array $documents The document contents 278*59036814SCostin Stroie * @param array $ids The document IDs 279*59036814SCostin Stroie * @param array|null $metadatas Optional metadata for each document 280*59036814SCostin Stroie * @param array|null $embeddings Optional pre-computed embeddings for each document 281*59036814SCostin Stroie * @return array The response from the API 282*59036814SCostin Stroie * @throws Exception If the collection ID is not found 283*59036814SCostin Stroie */ 284*59036814SCostin Stroie public function addDocuments($collectionName, $documents, $ids, $metadatas = null, $embeddings = null) { 285*59036814SCostin Stroie // Use provided name, fallback to 'documents' if empty 286*59036814SCostin Stroie if (empty($collectionName)) { 287*59036814SCostin Stroie $collectionName = 'documents'; 288*59036814SCostin Stroie } 289*59036814SCostin Stroie 290*59036814SCostin Stroie // First get the collection to find its ID 291*59036814SCostin Stroie $collection = $this->getCollection($collectionName); 292*59036814SCostin Stroie if (!isset($collection['id'])) { 293*59036814SCostin Stroie throw new Exception("Collection ID not found for '{$collectionName}'"); 294*59036814SCostin Stroie } 295*59036814SCostin Stroie 296*59036814SCostin Stroie $collectionId = $collection['id']; 297*59036814SCostin Stroie $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/upsert"; 298*59036814SCostin Stroie $data = [ 299*59036814SCostin Stroie 'ids' => $ids, 300*59036814SCostin Stroie 'documents' => $documents 301*59036814SCostin Stroie ]; 302*59036814SCostin Stroie 303*59036814SCostin Stroie if ($metadatas) { 304*59036814SCostin 
Stroie $data['metadatas'] = $metadatas; 305*59036814SCostin Stroie } 306*59036814SCostin Stroie 307*59036814SCostin Stroie if ($embeddings) { 308*59036814SCostin Stroie $data['embeddings'] = $embeddings; 309*59036814SCostin Stroie } 310*59036814SCostin Stroie 311*59036814SCostin Stroie return $this->makeRequest($endpoint, 'POST', $data); 312*59036814SCostin Stroie } 313*59036814SCostin Stroie 314*59036814SCostin Stroie /** 315*59036814SCostin Stroie * Check if a document needs to be updated based on timestamp comparison 316*59036814SCostin Stroie * 317*59036814SCostin Stroie * Determines whether a document should be reprocessed by comparing the file's last modification 318*59036814SCostin Stroie * time with the processed_at timestamp stored in the document's metadata. The function checks 319*59036814SCostin Stroie * the first 3 chunk IDs (@1, @2, @3) since the first chunks might be titles and therefore 320*59036814SCostin Stroie * not included in the database. 321*59036814SCostin Stroie * 322*59036814SCostin Stroie * @param string $collectionId The ID of the collection to check documents in 323*59036814SCostin Stroie * @param string $documentId The base document ID to check (without chunk suffixes) 324*59036814SCostin Stroie * @param int $fileModifiedTime The file's last modification timestamp (from filemtime) 325*59036814SCostin Stroie * @return bool True if document needs to be updated (doesn't exist, has no timestamp, or is outdated), false if up to date 326*59036814SCostin Stroie * @throws Exception If there's an error checking the document 327*59036814SCostin Stroie */ 328*59036814SCostin Stroie public function needsUpdate($collectionId, $documentId, $fileModifiedTime) { 329*59036814SCostin Stroie try { 330*59036814SCostin Stroie $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/get"; 331*59036814SCostin Stroie 332*59036814SCostin Stroie // Check first 3 chunk numbers (@1, @2, @3) since first chunks might be titles 
and skipped 333*59036814SCostin Stroie $chunkIdsToCheck = [ 334*59036814SCostin Stroie $documentId . '@1', 335*59036814SCostin Stroie $documentId . '@2', 336*59036814SCostin Stroie $documentId . '@3' 337*59036814SCostin Stroie ]; 338*59036814SCostin Stroie 339*59036814SCostin Stroie $data = [ 340*59036814SCostin Stroie 'ids' => $chunkIdsToCheck, 341*59036814SCostin Stroie 'include' => [ 342*59036814SCostin Stroie "metadatas" 343*59036814SCostin Stroie ], 344*59036814SCostin Stroie 'limit' => 1 345*59036814SCostin Stroie ]; 346*59036814SCostin Stroie 347*59036814SCostin Stroie // Check if document exists 348*59036814SCostin Stroie $result = $this->makeRequest($endpoint, 'POST', $data); 349*59036814SCostin Stroie 350*59036814SCostin Stroie // If no documents found, return true (needs to be added) 351*59036814SCostin Stroie if (empty($result['ids'])) { 352*59036814SCostin Stroie return true; 353*59036814SCostin Stroie } 354*59036814SCostin Stroie 355*59036814SCostin Stroie // Check if any document has a processed_at timestamp 356*59036814SCostin Stroie if (!empty($result['metadatas']) && is_array($result['metadatas'])) { 357*59036814SCostin Stroie // Check the first metadata entry directly 358*59036814SCostin Stroie $metadata = $result['metadatas'][0]; 359*59036814SCostin Stroie 360*59036814SCostin Stroie // If processed_at is not set, return true (needs update) 361*59036814SCostin Stroie if (!isset($metadata['processed_at'])) { 362*59036814SCostin Stroie return true; 363*59036814SCostin Stroie } 364*59036814SCostin Stroie 365*59036814SCostin Stroie // Parse the processed_at timestamp 366*59036814SCostin Stroie $processedTimestamp = strtotime($metadata['processed_at']); 367*59036814SCostin Stroie 368*59036814SCostin Stroie // If file is newer than processed time, return true (needs update) 369*59036814SCostin Stroie if ($fileModifiedTime > $processedTimestamp) { 370*59036814SCostin Stroie return true; 371*59036814SCostin Stroie } 372*59036814SCostin Stroie } 
373*59036814SCostin Stroie 374*59036814SCostin Stroie // Document exists and is up to date 375*59036814SCostin Stroie return false; 376*59036814SCostin Stroie } catch (Exception $e) { 377*59036814SCostin Stroie // If there's an error checking the document, assume it needs to be updated 378*59036814SCostin Stroie return true; 379*59036814SCostin Stroie } 380*59036814SCostin Stroie } 381*59036814SCostin Stroie 382*59036814SCostin Stroie /** 383*59036814SCostin Stroie * Query a collection for similar documents 384*59036814SCostin Stroie * 385*59036814SCostin Stroie * Queries the specified collection for documents similar to the provided query texts. 386*59036814SCostin Stroie * The function generates embeddings for the query texts and sends them to ChromaDB. 387*59036814SCostin Stroie * Supports filtering results by metadata using the where parameter. 388*59036814SCostin Stroie * 389*59036814SCostin Stroie * @param string $collectionName The name of the collection to query 390*59036814SCostin Stroie * @param array $queryTexts The query texts to search for 391*59036814SCostin Stroie * @param int $nResults The number of results to return (default: 5) 392*59036814SCostin Stroie * @param array|null $where Optional filter conditions for metadata 393*59036814SCostin Stroie * @return array The query results 394*59036814SCostin Stroie * @throws Exception If the collection ID is not found 395*59036814SCostin Stroie */ 396*59036814SCostin Stroie public function queryCollection($collectionName, $queryTexts, $nResults = 5, $where = null) { 397*59036814SCostin Stroie // Use provided name, fallback to 'documents' if empty 398*59036814SCostin Stroie if (empty($collectionName)) { 399*59036814SCostin Stroie $collectionName = 'documents'; 400*59036814SCostin Stroie } 401*59036814SCostin Stroie 402*59036814SCostin Stroie // First get the collection to find its ID 403*59036814SCostin Stroie $collection = $this->getCollection($collectionName); 404*59036814SCostin Stroie if 
(!isset($collection['id'])) { 405*59036814SCostin Stroie throw new Exception("Collection ID not found for '{$collectionName}'"); 406*59036814SCostin Stroie } 407*59036814SCostin Stroie 408*59036814SCostin Stroie $collectionId = $collection['id']; 409*59036814SCostin Stroie $endpoint = "/tenants/{$this->tenant}/databases/{$this->database}/collections/{$collectionId}/query"; 410*59036814SCostin Stroie 411*59036814SCostin Stroie // Generate embeddings for query texts 412*59036814SCostin Stroie $queryEmbeddings = []; 413*59036814SCostin Stroie foreach ($queryTexts as $text) { 414*59036814SCostin Stroie $queryEmbeddings[] = $this->generateEmbeddings($text); 415*59036814SCostin Stroie } 416*59036814SCostin Stroie 417*59036814SCostin Stroie $data = [ 418*59036814SCostin Stroie 'query_embeddings' => $queryEmbeddings, 419*59036814SCostin Stroie 'n_results' => $nResults 420*59036814SCostin Stroie ]; 421*59036814SCostin Stroie 422*59036814SCostin Stroie // Add where clause for metadata filtering if provided 423*59036814SCostin Stroie if ($where && is_array($where)) { 424*59036814SCostin Stroie $data['where'] = $where; 425*59036814SCostin Stroie } 426*59036814SCostin Stroie 427*59036814SCostin Stroie return $this->makeRequest($endpoint, 'POST', $data); 428*59036814SCostin Stroie } 429*59036814SCostin Stroie 430*59036814SCostin Stroie /** 431*59036814SCostin Stroie * Check if the ChromaDB server is alive 432*59036814SCostin Stroie * 433*59036814SCostin Stroie * Sends a heartbeat request to verify that the ChromaDB server is running. 
434*59036814SCostin Stroie * 435*59036814SCostin Stroie * @return array The response from the heartbeat endpoint 436*59036814SCostin Stroie */ 437*59036814SCostin Stroie public function heartbeat() { 438*59036814SCostin Stroie $endpoint = "/heartbeat"; 439*59036814SCostin Stroie return $this->makeRequest($endpoint, 'GET'); 440*59036814SCostin Stroie } 441*59036814SCostin Stroie 442*59036814SCostin Stroie /** 443*59036814SCostin Stroie * Get authentication and identity information 444*59036814SCostin Stroie * 445*59036814SCostin Stroie * Retrieves authentication and identity information from the ChromaDB server. 446*59036814SCostin Stroie * 447*59036814SCostin Stroie * @return array The response from the auth/identity endpoint 448*59036814SCostin Stroie */ 449*59036814SCostin Stroie public function getIdentity() { 450*59036814SCostin Stroie $endpoint = "/identity"; 451*59036814SCostin Stroie return $this->makeRequest($endpoint, 'GET'); 452*59036814SCostin Stroie } 453*59036814SCostin Stroie 454*59036814SCostin Stroie /** 455*59036814SCostin Stroie * Ensure that the specified tenant and database exist 456*59036814SCostin Stroie * 457*59036814SCostin Stroie * Checks if the specified tenant and database exist, and creates them if they don't. 
458*59036814SCostin Stroie * 459*59036814SCostin Stroie * @return void 460*59036814SCostin Stroie */ 461*59036814SCostin Stroie private function ensureTenantAndDatabase() { 462*59036814SCostin Stroie // Check if tenant exists, create if it doesn't 463*59036814SCostin Stroie try { 464*59036814SCostin Stroie $this->getTenant($this->tenant); 465*59036814SCostin Stroie } catch (Exception $e) { 466*59036814SCostin Stroie // Tenant doesn't exist, create it 467*59036814SCostin Stroie $this->createTenant($this->tenant); 468*59036814SCostin Stroie } 469*59036814SCostin Stroie 470*59036814SCostin Stroie // Check if database exists, create if it doesn't 471*59036814SCostin Stroie try { 472*59036814SCostin Stroie $this->getDatabase($this->database, $this->tenant); 473*59036814SCostin Stroie } catch (Exception $e) { 474*59036814SCostin Stroie // Database doesn't exist, create it 475*59036814SCostin Stroie $this->createDatabase($this->database, $this->tenant); 476*59036814SCostin Stroie } 477*59036814SCostin Stroie } 478*59036814SCostin Stroie 479*59036814SCostin Stroie /** 480*59036814SCostin Stroie * Get tenant information 481*59036814SCostin Stroie * 482*59036814SCostin Stroie * Retrieves information about the specified tenant. 483*59036814SCostin Stroie * 484*59036814SCostin Stroie * @param string $tenantName The tenant name 485*59036814SCostin Stroie * @return array The tenant information 486*59036814SCostin Stroie */ 487*59036814SCostin Stroie public function getTenant($tenantName) { 488*59036814SCostin Stroie $endpoint = "/tenants/{$tenantName}"; 489*59036814SCostin Stroie return $this->makeRequest($endpoint, 'GET'); 490*59036814SCostin Stroie } 491*59036814SCostin Stroie 492*59036814SCostin Stroie /** 493*59036814SCostin Stroie * Create a new tenant 494*59036814SCostin Stroie * 495*59036814SCostin Stroie * Creates a new tenant with the specified name. 
496*59036814SCostin Stroie * 497*59036814SCostin Stroie * @param string $tenantName The tenant name 498*59036814SCostin Stroie * @return array The response from the API 499*59036814SCostin Stroie */ 500*59036814SCostin Stroie public function createTenant($tenantName) { 501*59036814SCostin Stroie $endpoint = "/tenants"; 502*59036814SCostin Stroie $data = ['name' => $tenantName]; 503*59036814SCostin Stroie return $this->makeRequest($endpoint, 'POST', $data); 504*59036814SCostin Stroie } 505*59036814SCostin Stroie 506*59036814SCostin Stroie /** 507*59036814SCostin Stroie * Get database information 508*59036814SCostin Stroie * 509*59036814SCostin Stroie * Retrieves information about the specified database within a tenant. 510*59036814SCostin Stroie * 511*59036814SCostin Stroie * @param string $databaseName The database name 512*59036814SCostin Stroie * @param string $tenantName The tenant name 513*59036814SCostin Stroie * @return array The database information 514*59036814SCostin Stroie */ 515*59036814SCostin Stroie public function getDatabase($databaseName, $tenantName) { 516*59036814SCostin Stroie $endpoint = "/tenants/{$tenantName}/databases/{$databaseName}"; 517*59036814SCostin Stroie return $this->makeRequest($endpoint, 'GET'); 518*59036814SCostin Stroie } 519*59036814SCostin Stroie 520*59036814SCostin Stroie /** 521*59036814SCostin Stroie * Create a new database 522*59036814SCostin Stroie * 523*59036814SCostin Stroie * Creates a new database with the specified name within a tenant. 
524*59036814SCostin Stroie * 525*59036814SCostin Stroie * @param string $databaseName The database name 526*59036814SCostin Stroie * @param string $tenantName The tenant name 527*59036814SCostin Stroie * @return array The response from the API 528*59036814SCostin Stroie */ 529*59036814SCostin Stroie public function createDatabase($databaseName, $tenantName) { 530*59036814SCostin Stroie $endpoint = "/tenants/{$tenantName}/databases"; 531*59036814SCostin Stroie $data = ['name' => $databaseName]; 532*59036814SCostin Stroie return $this->makeRequest($endpoint, 'POST', $data); 533*59036814SCostin Stroie } 534*59036814SCostin Stroie 535*59036814SCostin Stroie /** 536*59036814SCostin Stroie * Ensure a collection exists, creating it if necessary 537*59036814SCostin Stroie * 538*59036814SCostin Stroie * This helper function checks if a collection exists and creates it if it doesn't. 539*59036814SCostin Stroie * 540*59036814SCostin Stroie * @param string $collectionName The name of the collection to check/create 541*59036814SCostin Stroie * @return string Status message indicating what happened 542*59036814SCostin Stroie */ 543*59036814SCostin Stroie public function ensureCollectionExists($collectionName) { 544*59036814SCostin Stroie try { 545*59036814SCostin Stroie $collection = $this->getCollection($collectionName); 546*59036814SCostin Stroie return "Collection '$collectionName' already exists."; 547*59036814SCostin Stroie } catch (Exception $e) { 548*59036814SCostin Stroie // Collection doesn't exist, create it 549*59036814SCostin Stroie $created = $this->createCollection($collectionName); 550*59036814SCostin Stroie return "Collection '$collectionName' created."; 551*59036814SCostin Stroie } 552*59036814SCostin Stroie } 553*59036814SCostin Stroie 554*59036814SCostin Stroie /** 555*59036814SCostin Stroie * Process a single DokuWiki file and send it to ChromaDB with intelligent update checking 556*59036814SCostin Stroie * 557*59036814SCostin Stroie * This function handles 
the complete processing of a single DokuWiki file: 558*59036814SCostin Stroie * 1. Parses the file path to extract metadata and document ID 559*59036814SCostin Stroie * 2. Determines the appropriate collection based on document ID 560*59036814SCostin Stroie * 3. Checks if the document needs updating using timestamp comparison 561*59036814SCostin Stroie * 4. Reads and processes file content only if update is needed 562*59036814SCostin Stroie * 5. Splits the document into chunks (paragraphs) 563*59036814SCostin Stroie * 6. Extracts rich metadata from the DokuWiki ID format 564*59036814SCostin Stroie * 7. Generates embeddings for each chunk 565*59036814SCostin Stroie * 8. Sends all chunks to ChromaDB with metadata 566*59036814SCostin Stroie * 567*59036814SCostin Stroie * Supported ID formats: 568*59036814SCostin Stroie * - Format 1: reports:mri:institution:250620-name-surname (third part is institution name) 569*59036814SCostin Stroie * - Format 2: reports:mri:2024:g287-name-surname (third part is year) 570*59036814SCostin Stroie * - Templates: reports:mri:templates:name-surname (contains 'templates' part) 571*59036814SCostin Stroie * 572*59036814SCostin Stroie * The function implements smart update checking by comparing file modification time 573*59036814SCostin Stroie * with the 'processed_at' timestamp in document metadata to avoid reprocessing unchanged files. 
574*59036814SCostin Stroie * 575*59036814SCostin Stroie * @param string $filePath The path to the file to process 576*59036814SCostin Stroie * @param string $collectionName The name of the collection to use 577*59036814SCostin Stroie * @param bool $collectionChecked Whether the collection has already been checked/created 578*59036814SCostin Stroie * @return array Result with status and details 579*59036814SCostin Stroie */ 580*59036814SCostin Stroie public function processSingleFile($filePath, $collectionName, $collectionChecked = false) { 581*59036814SCostin Stroie // Parse file path to extract metadata 582*59036814SCostin Stroie $id = parseFilePath($filePath); 583*59036814SCostin Stroie 584*59036814SCostin Stroie try { 585*59036814SCostin Stroie // Create collection if it doesn't exist (only if not already checked) 586*59036814SCostin Stroie $collectionStatus = ''; 587*59036814SCostin Stroie if (!$collectionChecked) { 588*59036814SCostin Stroie $collectionStatus = $this->ensureCollectionExists($collectionName); 589*59036814SCostin Stroie } 590*59036814SCostin Stroie 591*59036814SCostin Stroie // Get collection ID 592*59036814SCostin Stroie $collection = $this->getCollection($collectionName); 593*59036814SCostin Stroie if (!isset($collection['id'])) { 594*59036814SCostin Stroie return [ 595*59036814SCostin Stroie 'status' => 'error', 596*59036814SCostin Stroie 'message' => "Collection ID not found for '{$collectionName}'" 597*59036814SCostin Stroie ]; 598*59036814SCostin Stroie } 599*59036814SCostin Stroie $collectionId = $collection['id']; 600*59036814SCostin Stroie 601*59036814SCostin Stroie // Get file modification time 602*59036814SCostin Stroie $fileModifiedTime = filemtime($filePath); 603*59036814SCostin Stroie 604*59036814SCostin Stroie // Check if document needs update 605*59036814SCostin Stroie $needsUpdate = $this->needsUpdate($collectionId, $id, $fileModifiedTime); 606*59036814SCostin Stroie 607*59036814SCostin Stroie // If document is up to date, skip 
processing 608*59036814SCostin Stroie if (!$needsUpdate) { 609*59036814SCostin Stroie return [ 610*59036814SCostin Stroie 'status' => 'skipped', 611*59036814SCostin Stroie 'message' => "Document '$id' is up to date in collection '$collectionName'. Skipping..." 612*59036814SCostin Stroie ]; 613*59036814SCostin Stroie } 614*59036814SCostin Stroie 615*59036814SCostin Stroie // Read file content 616*59036814SCostin Stroie $content = file_get_contents($filePath); 617*59036814SCostin Stroie 618*59036814SCostin Stroie // Split document into chunks (paragraphs separated by two newlines) 619*59036814SCostin Stroie $paragraphs = preg_split('/\n\s*\n/', $content); 620*59036814SCostin Stroie $chunks = []; 621*59036814SCostin Stroie $chunkMetadata = []; 622*59036814SCostin Stroie 623*59036814SCostin Stroie // Parse the DokuWiki ID to extract base metadata 624*59036814SCostin Stroie $parts = explode(':', $id); 625*59036814SCostin Stroie 626*59036814SCostin Stroie // Extract metadata from the last part of the ID 627*59036814SCostin Stroie $lastPart = end($parts); 628*59036814SCostin Stroie $baseMetadata = []; 629*59036814SCostin Stroie 630*59036814SCostin Stroie // Add the document ID as metadata 631*59036814SCostin Stroie $baseMetadata['document_id'] = $id; 632*59036814SCostin Stroie 633*59036814SCostin Stroie // Add current timestamp 634*59036814SCostin Stroie $baseMetadata['processed_at'] = date('Y-m-d H:i:s'); 635*59036814SCostin Stroie 636*59036814SCostin Stroie // Check if any part of the ID is 'templates' and set template metadata 637*59036814SCostin Stroie $isTemplate = in_array('templates', $parts); 638*59036814SCostin Stroie if ($isTemplate) { 639*59036814SCostin Stroie $baseMetadata['type'] = 'template'; 640*59036814SCostin Stroie } else { 641*59036814SCostin Stroie $baseMetadata['type'] = 'report'; 642*59036814SCostin Stroie } 643*59036814SCostin Stroie 644*59036814SCostin Stroie // Extract modality from the second part 645*59036814SCostin Stroie if (isset($parts[1])) 
{ 646*59036814SCostin Stroie $baseMetadata['modality'] = $parts[1]; 647*59036814SCostin Stroie } 648*59036814SCostin Stroie 649*59036814SCostin Stroie // Handle different ID formats based on the third part: word (institution) or numeric (year) 650*59036814SCostin Stroie // Format 1: reports:mri:institution:250620-name-surname (third part is institution name) 651*59036814SCostin Stroie // Format 2: reports:mri:2024:g287-name-surname (third part is year) 652*59036814SCostin Stroie // For templates, don't set institution, date or year 653*59036814SCostin Stroie if (isset($parts[2]) && !$isTemplate) { 654*59036814SCostin Stroie // Check if third part is numeric (year) or word (institution) 655*59036814SCostin Stroie if (is_numeric($parts[2])) { 656*59036814SCostin Stroie // Format: reports:mri:2024:g287-name-surname (year format) 657*59036814SCostin Stroie // Extract year from the third part 658*59036814SCostin Stroie $baseMetadata['year'] = $parts[2]; 659*59036814SCostin Stroie 660*59036814SCostin Stroie // Set default institution from config 661*59036814SCostin Stroie $baseMetadata['institution'] = DEFAULT_INSTITUTION; 662*59036814SCostin Stroie 663*59036814SCostin Stroie // Extract registration and name from the last part 664*59036814SCostin Stroie // Registration should start with one letter or number and contain numbers before the '-' character 665*59036814SCostin Stroie if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) { 666*59036814SCostin Stroie // Check if the first part contains at least one digit to be considered a registration 667*59036814SCostin Stroie if (preg_match('/[0-9]/', $matches[1])) { 668*59036814SCostin Stroie $baseMetadata['registration'] = $matches[1]; 669*59036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $matches[2]); 670*59036814SCostin Stroie } else { 671*59036814SCostin Stroie // If no registration pattern found, treat entire part as patient name 672*59036814SCostin Stroie $baseMetadata['name'] = 
str_replace('-', ' ', $lastPart); 673*59036814SCostin Stroie } 674*59036814SCostin Stroie } else { 675*59036814SCostin Stroie // If no match, treat entire part as patient name 676*59036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $lastPart); 677*59036814SCostin Stroie } 678*59036814SCostin Stroie } else { 679*59036814SCostin Stroie // Format: reports:mri:institution:250620-name-surname (institution format) 680*59036814SCostin Stroie // Extract institution from the third part 681*59036814SCostin Stroie $baseMetadata['institution'] = $parts[2]; 682*59036814SCostin Stroie 683*59036814SCostin Stroie // Extract date and name from the last part 684*59036814SCostin Stroie if (preg_match('/^(\d{6})-(.+)$/', $lastPart, $matches)) { 685*59036814SCostin Stroie $dateStr = $matches[1]; 686*59036814SCostin Stroie $name = $matches[2]; 687*59036814SCostin Stroie 688*59036814SCostin Stroie // Convert date format (250620 -> 2025-06-20) 689*59036814SCostin Stroie $day = substr($dateStr, 0, 2); 690*59036814SCostin Stroie $month = substr($dateStr, 2, 2); 691*59036814SCostin Stroie $year = substr($dateStr, 4, 2); 692*59036814SCostin Stroie // Assuming 20xx for years 00-69 and 19xx for years 70-99 693*59036814SCostin Stroie $fullYear = (int)$year <= 70 ? '20' . $year : '19' . $year; 694*59036814SCostin Stroie $formattedDate = $fullYear . '-' . $month . '-' . 
$day; 695*59036814SCostin Stroie 696*59036814SCostin Stroie $baseMetadata['date'] = $formattedDate; 697*59036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $name); 698*59036814SCostin Stroie } 699*59036814SCostin Stroie } 700*59036814SCostin Stroie } 701*59036814SCostin Stroie 702*59036814SCostin Stroie // For templates, always extract name from the last part 703*59036814SCostin Stroie if ($isTemplate && isset($lastPart)) { 704*59036814SCostin Stroie // Extract name from the last part (everything after the last colon) 705*59036814SCostin Stroie if (preg_match('/^([a-zA-Z0-9]+[0-9]*)-(.+)$/', $lastPart, $matches)) { 706*59036814SCostin Stroie // Check if the first part contains at least one digit to be considered a registration 707*59036814SCostin Stroie if (preg_match('/[0-9]/', $matches[1])) { 708*59036814SCostin Stroie $baseMetadata['registration'] = $matches[1]; 709*59036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $matches[2]); 710*59036814SCostin Stroie } else { 711*59036814SCostin Stroie // If no registration pattern found, treat entire part as template name 712*59036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $lastPart); 713*59036814SCostin Stroie } 714*59036814SCostin Stroie } else { 715*59036814SCostin Stroie // If no match, treat entire part as template name 716*59036814SCostin Stroie $baseMetadata['name'] = str_replace('-', ' ', $lastPart); 717*59036814SCostin Stroie } 718*59036814SCostin Stroie } 719*59036814SCostin Stroie 720*59036814SCostin Stroie // Process each paragraph as a chunk with intelligent metadata handling 721*59036814SCostin Stroie $chunkIds = []; 722*59036814SCostin Stroie $chunkContents = []; 723*59036814SCostin Stroie $chunkMetadatas = []; 724*59036814SCostin Stroie $chunkEmbeddings = []; 725*59036814SCostin Stroie $currentTags = []; 726*59036814SCostin Stroie 727*59036814SCostin Stroie foreach ($paragraphs as $index => $paragraph) { 728*59036814SCostin Stroie // Skip empty 
paragraphs to avoid processing whitespace-only content 729*59036814SCostin Stroie $paragraph = trim($paragraph); 730*59036814SCostin Stroie if (empty($paragraph)) { 731*59036814SCostin Stroie continue; 732*59036814SCostin Stroie } 733*59036814SCostin Stroie 734*59036814SCostin Stroie // Check if this is a DokuWiki title (starts and ends with =) 735*59036814SCostin Stroie // Titles are converted to tags for better searchability but not stored as content chunks 736*59036814SCostin Stroie if (preg_match('/^=+(.*?)=+$/', $paragraph, $matches)) { 737*59036814SCostin Stroie // Extract title content and clean it 738*59036814SCostin Stroie $titleContent = trim($matches[1]); 739*59036814SCostin Stroie 740*59036814SCostin Stroie // Split into words and create searchable tags 741*59036814SCostin Stroie $words = preg_split('/\s+/', $titleContent); 742*59036814SCostin Stroie $tags = []; 743*59036814SCostin Stroie 744*59036814SCostin Stroie foreach ($words as $word) { 745*59036814SCostin Stroie // Only use words longer than 3 characters to reduce noise 746*59036814SCostin Stroie if (strlen($word) >= 3) { 747*59036814SCostin Stroie $tags[] = strtolower($word); 748*59036814SCostin Stroie } 749*59036814SCostin Stroie } 750*59036814SCostin Stroie 751*59036814SCostin Stroie // Remove duplicate tags and store for use in subsequent chunks 752*59036814SCostin Stroie $currentTags = array_unique($tags); 753*59036814SCostin Stroie continue; // Skip storing title chunks as content 754*59036814SCostin Stroie } 755*59036814SCostin Stroie 756*59036814SCostin Stroie // Create chunk ID 757*59036814SCostin Stroie $chunkId = $id . '@' . 
($index + 1); 758*59036814SCostin Stroie 759*59036814SCostin Stroie // Generate embeddings for the chunk 760*59036814SCostin Stroie $embeddings = $this->generateEmbeddings($paragraph); 761*59036814SCostin Stroie 762*59036814SCostin Stroie // Add chunk-specific metadata 763*59036814SCostin Stroie $metadata = $baseMetadata; 764*59036814SCostin Stroie $metadata['chunk_id'] = $chunkId; 765*59036814SCostin Stroie $metadata['chunk_number'] = $index + 1; 766*59036814SCostin Stroie $metadata['total_chunks'] = count($paragraphs); 767*59036814SCostin Stroie 768*59036814SCostin Stroie // Add current tags to metadata if any exist 769*59036814SCostin Stroie if (!empty($currentTags)) { 770*59036814SCostin Stroie $metadata['tags'] = implode(',', $currentTags); 771*59036814SCostin Stroie } 772*59036814SCostin Stroie 773*59036814SCostin Stroie // Store chunk data 774*59036814SCostin Stroie $chunkIds[] = $chunkId; 775*59036814SCostin Stroie $chunkContents[] = $paragraph; 776*59036814SCostin Stroie $chunkMetadatas[] = $metadata; 777*59036814SCostin Stroie $chunkEmbeddings[] = $embeddings; 778*59036814SCostin Stroie } 779*59036814SCostin Stroie 780*59036814SCostin Stroie // If no chunks were created, skip this file 781*59036814SCostin Stroie if (empty($chunkIds)) { 782*59036814SCostin Stroie return [ 783*59036814SCostin Stroie 'status' => 'skipped', 784*59036814SCostin Stroie 'message' => "No valid chunks found in file '$id'. Skipping..." 
785*59036814SCostin Stroie ]; 786*59036814SCostin Stroie } 787*59036814SCostin Stroie 788*59036814SCostin Stroie // Send all chunks to ChromaDB 789*59036814SCostin Stroie $result = $this->addDocuments($collectionName, $chunkContents, $chunkIds, $chunkMetadatas, $chunkEmbeddings); 790*59036814SCostin Stroie 791*59036814SCostin Stroie return [ 792*59036814SCostin Stroie 'status' => 'success', 793*59036814SCostin Stroie 'message' => "Successfully sent file to ChromaDB", 794*59036814SCostin Stroie 'details' => [ 795*59036814SCostin Stroie 'document_id' => $id, 796*59036814SCostin Stroie 'chunks' => count($chunkIds), 797*59036814SCostin Stroie 'collection' => $collectionName 798*59036814SCostin Stroie ], 799*59036814SCostin Stroie 'collection_status' => $collectionStatus 800*59036814SCostin Stroie ]; 801*59036814SCostin Stroie } catch (Exception $e) { 802*59036814SCostin Stroie return [ 803*59036814SCostin Stroie 'status' => 'error', 804*59036814SCostin Stroie 'message' => "Error sending file to ChromaDB: " . $e->getMessage() 805*59036814SCostin Stroie ]; 806*59036814SCostin Stroie } 807*59036814SCostin Stroie } 808*59036814SCostin Stroie 809*59036814SCostin Stroie /** 810*59036814SCostin Stroie * Process all DokuWiki files in a directory and send them to ChromaDB 811*59036814SCostin Stroie * 812*59036814SCostin Stroie * This function recursively processes all .txt files in a directory and its subdirectories. 813*59036814SCostin Stroie * It first checks if the appropriate collection exists and creates it if needed. 814*59036814SCostin Stroie * Then it processes each file individually. 
815*59036814SCostin Stroie * 816*59036814SCostin Stroie * @param string $dirPath The directory path to process 817*59036814SCostin Stroie * @return array Result with status and details 818*59036814SCostin Stroie */ 819*59036814SCostin Stroie public function processDirectory($dirPath) { 820*59036814SCostin Stroie // Check if directory exists 821*59036814SCostin Stroie if (!is_dir($dirPath)) { 822*59036814SCostin Stroie return [ 823*59036814SCostin Stroie 'status' => 'error', 824*59036814SCostin Stroie 'message' => "Directory does not exist: $dirPath" 825*59036814SCostin Stroie ]; 826*59036814SCostin Stroie } 827*59036814SCostin Stroie 828*59036814SCostin Stroie // Create RecursiveIteratorIterator to process directories recursively 829*59036814SCostin Stroie $iterator = new RecursiveIteratorIterator( 830*59036814SCostin Stroie new RecursiveDirectoryIterator($dirPath, RecursiveDirectoryIterator::SKIP_DOTS), 831*59036814SCostin Stroie RecursiveIteratorIterator::LEAVES_ONLY 832*59036814SCostin Stroie ); 833*59036814SCostin Stroie 834*59036814SCostin Stroie $files = []; 835*59036814SCostin Stroie foreach ($iterator as $file) { 836*59036814SCostin Stroie // Process only .txt files that don't start with underscore 837*59036814SCostin Stroie if ($file->isFile() && $file->getExtension() === 'txt' && $file->getFilename()[0] !== '_') { 838*59036814SCostin Stroie $files[] = $file->getPathname(); 839*59036814SCostin Stroie } 840*59036814SCostin Stroie } 841*59036814SCostin Stroie 842*59036814SCostin Stroie if (empty($files)) { 843*59036814SCostin Stroie return [ 844*59036814SCostin Stroie 'status' => 'skipped', 845*59036814SCostin Stroie 'message' => "No .txt files found in directory: $dirPath" 846*59036814SCostin Stroie ]; 847*59036814SCostin Stroie } 848*59036814SCostin Stroie 849*59036814SCostin Stroie // Use the first part of the document ID as collection name, fallback to 'documents' 850*59036814SCostin Stroie $sampleFile = $files[0]; 851*59036814SCostin Stroie $id = 
parseFilePath($sampleFile); 852*59036814SCostin Stroie $idParts = explode(':', $id); 853*59036814SCostin Stroie $collectionName = isset($idParts[0]) && !empty($idParts[0]) ? $idParts[0] : 'documents'; 854*59036814SCostin Stroie 855*59036814SCostin Stroie try { 856*59036814SCostin Stroie $this->ensureCollectionExists($collectionName); 857*59036814SCostin Stroie $collectionChecked = true; 858*59036814SCostin Stroie } catch (Exception $e) { 859*59036814SCostin Stroie $collectionChecked = true; 860*59036814SCostin Stroie } 861*59036814SCostin Stroie 862*59036814SCostin Stroie $results = []; 863*59036814SCostin Stroie foreach ($files as $file) { 864*59036814SCostin Stroie $result = $this->processSingleFile($file, $collectionName, $collectionChecked); 865*59036814SCostin Stroie $results[] = [ 866*59036814SCostin Stroie 'file' => $file, 867*59036814SCostin Stroie 'result' => $result 868*59036814SCostin Stroie ]; 869*59036814SCostin Stroie } 870*59036814SCostin Stroie 871*59036814SCostin Stroie return [ 872*59036814SCostin Stroie 'status' => 'success', 873*59036814SCostin Stroie 'message' => "Finished processing directory.", 874*59036814SCostin Stroie 'files_count' => count($files), 875*59036814SCostin Stroie 'results' => $results 876*59036814SCostin Stroie ]; 877*59036814SCostin Stroie } 878*59036814SCostin Stroie} 879*59036814SCostin Stroie 880*59036814SCostin Stroie/** 881*59036814SCostin Stroie * Parse a file path and convert it to a DokuWiki ID 882*59036814SCostin Stroie * 883*59036814SCostin Stroie * Takes a file system path and converts it to the DokuWiki ID format by: 884*59036814SCostin Stroie * 1. Removing the base path prefix (using DokuWiki's pages directory) 885*59036814SCostin Stroie * 2. Removing the .txt extension 886*59036814SCostin Stroie * 3. 
Converting directory separators to colons 887*59036814SCostin Stroie * 888*59036814SCostin Stroie * Example: /var/www/html/dokuwiki/data/pages/reports/mri/2024/g287-name-surname.txt 889*59036814SCostin Stroie * Becomes: reports:mri:2024:g287-name-surname 890*59036814SCostin Stroie * 891*59036814SCostin Stroie * @param string $filePath The full file path to parse 892*59036814SCostin Stroie * @return string The DokuWiki ID 893*59036814SCostin Stroie */ 894*59036814SCostin Stroiefunction parseFilePath($filePath) { 895*59036814SCostin Stroie // Use DokuWiki's constant to get the pages directory if available 896*59036814SCostin Stroie if (defined('DOKU_INC')) { 897*59036814SCostin Stroie $pagesDir = DOKU_INC . 'data/pages/'; 898*59036814SCostin Stroie } else { 899*59036814SCostin Stroie // Fallback to common DokuWiki installation path 900*59036814SCostin Stroie $pagesDir = '/var/www/html/dokuwiki/data/pages/'; 901*59036814SCostin Stroie } 902*59036814SCostin Stroie 903*59036814SCostin Stroie // Remove the base path 904*59036814SCostin Stroie $relativePath = str_replace($pagesDir, '', $filePath); 905*59036814SCostin Stroie 906*59036814SCostin Stroie // Remove .txt extension 907*59036814SCostin Stroie $relativePath = preg_replace('/\.txt$/', '', $relativePath); 908*59036814SCostin Stroie 909*59036814SCostin Stroie // Split path into parts and filter out empty parts 910*59036814SCostin Stroie $parts = array_filter(explode('/', $relativePath)); 911*59036814SCostin Stroie 912*59036814SCostin Stroie // Build DokuWiki ID (use first part as namespace) 913*59036814SCostin Stroie $idParts = []; 914*59036814SCostin Stroie foreach ($parts as $part) { 915*59036814SCostin Stroie if (!empty($part)) { 916*59036814SCostin Stroie $idParts[] = $part; 917*59036814SCostin Stroie } 918*59036814SCostin Stroie } 919*59036814SCostin Stroie 920*59036814SCostin Stroie return implode(':', $idParts); 921*59036814SCostin Stroie} 922*59036814SCostin Stroie 923