<?php

use dokuwiki\Extension\CLIPlugin;
use dokuwiki\plugin\aichat\AbstractCLI;
use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\plugin\aichat\ModelFactory;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\Colors;
use splitbrain\phpcli\Options;
use splitbrain\phpcli\TableFormatter;

/**
 * DokuWiki Plugin aichat (CLI Component)
 *
 * Command line front end for the aichat plugin: builds and maintains the
 * embedding index, runs similarity look-ups, asks single questions or runs an
 * interactive chat, and offers a couple of debugging commands.
 *
 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
 * @author  Andreas Gohr <gohr@cosmocode.de>
 */
class cli_plugin_aichat extends AbstractCLI
{
    /** @var helper_plugin_aichat */
    protected $helper;

    /**
     * Register all options, commands and arguments of this CLI
     *
     * @inheritDoc
     */
    protected function setup(Options $options)
    {
        parent::setup($options);

        $options->setHelp(
            'Manage and query the AI chatbot data. Please note that calls to your LLM provider will be made. ' .
            'This may incur costs.'
        );

        $options->registerOption(
            'model',
            'Overrides the chat and rephrasing model settings and uses this model instead',
            '',
            'model'
        );

        $options->registerCommand(
            'embed',
            'Create embeddings for all pages. This skips pages that already have embeddings'
        );
        $options->registerOption(
            'clear',
            'Clear all existing embeddings before creating new ones',
            'c',
            false,
            'embed'
        );

        $options->registerCommand('maintenance', 'Run storage maintenance. Refer to the documentation for details.');

        $options->registerCommand('similar', 'Search for similar pages');
        $options->registerArgument('query', 'Look up chunks similar to this query', true, 'similar');

        $options->registerCommand('ask', 'Ask a question');
        $options->registerArgument('question', 'The question to ask', true, 'ask');

        $options->registerCommand('chat', 'Start an interactive chat session');

        $options->registerCommand('models', 'List available models');

        $options->registerCommand('info', 'Get Info about the vector storage and other stats');

        $options->registerCommand('split', 'Split a page into chunks (for debugging)');
        $options->registerArgument('page', 'The page to split', true, 'split');

        $options->registerCommand('page', 'Check if chunks for a given page are available (for debugging)');
        $options->registerArgument('page', 'The page to check', true, 'page');
        $options->registerOption('dump', 'Dump the chunks', 'd', false, 'page');

        $options->registerCommand('tsv', 'Create TSV files for visualizing at http://projector.tensorflow.org/' .
            ' Not supported on all storages.');
        $options->registerArgument('vector.tsv', 'The vector file', false, 'tsv');
        $options->registerArgument('meta.tsv', 'The meta file', false, 'tsv');
    }

    /**
     * Dispatch the selected command
     *
     * Applies the optional --model override first, then hands over to the
     * method implementing the command. Prints the help when no command matches.
     *
     * @inheritDoc
     */
    protected function main(Options $options)
    {
        parent::main($options);
        auth_setup(); // make sure ACLs are initialized

        $model = $options->getOpt('model');
        if ($model) {
            // The original key was spelled 'rephasemodel'. The option is documented to
            // override the rephrasing model too, so the key is spelled out correctly here.
            // NOTE(review): confirm 'rephrasemodel' is the key used by the plugin config.
            $this->helper->updateConfig(
                ['chatmodel' => $model, 'rephrasemodel' => $model]
            );
        }

        switch ($options->getCmd()) {
            case 'embed':
                $this->createEmbeddings($options->getOpt('clear'));
                break;
            case 'maintenance':
                $this->runMaintenance();
                break;
            case 'similar':
                $this->similar($options->getArgs()[0]);
                break;
            case 'ask':
                $this->ask($options->getArgs()[0]);
                break;
            case 'chat':
                $this->chat();
                break;
            case 'models':
                $this->models();
                break;
            case 'split':
                $this->split($options->getArgs()[0]);
                break;
            case 'page':
                $this->page($options->getArgs()[0], $options->getOpt('dump'));
                break;
            case 'info':
                $this->showinfo();
                break;
            case 'tsv':
                // both file names are optional and fall back to fixed defaults
                $args = $options->getArgs();
                $vector = $args[0] ?? 'vector.tsv';
                $meta = $args[1] ?? 'meta.tsv';
                $this->tsv($vector, $meta);
                break;
            default:
                echo $options->help();
        }
    }

    /**
     * Print the configured models, the stored run data and the storage statistics
     *
     * @return void
     */
    protected function showinfo()
    {
        $stats = [
            'embed model' => (string) $this->helper->getEmbeddingModel(),
            'rephrase model' => (string) $this->helper->getRephraseModel(),
            'chat model' => (string) $this->helper->getChatModel(),
        ];
        $stats = array_merge(
            $stats,
            $this->helper->getRunData(),
            $this->helper->getStorage()->statistics()
        );
        $this->printTable($stats);
    }

    /**
     * Print key value data as tabular data
     *
     * Nested arrays are printed recursively; each nesting level is indented by
     * two more columns.
     *
     * @param array $data
     * @param int $level current nesting depth
     * @return void
     */
    protected function printTable($data, $level = 0)
    {
        $tf = new TableFormatter($this->colors);
        $widths = [$level * 2, 20, '*'];

        foreach ($data as $key => $value) {
            if (is_array($value)) {
                // heading row for the nested section, then its content one level deeper
                echo $tf->format(
                    $widths,
                    ['', $key, ''],
                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
                );
                $this->printTable($value, $level + 1);
                continue;
            }

            echo $tf->format(
                $widths,
                ['', $key, $value],
                [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTGRAY]
            );
        }
    }

    /**
     * Check chunk availability for a given page
     *
     * The page is looked up in the search index; its position there is used to
     * derive the first chunk ID (position * 100) passed to the storage.
     *
     * @param string $page
     * @param bool $dump print the found chunks as JSON
     * @return void
     */
    protected function page($page, $dump = false)
    {
        $id = cleanID($page);
        $indexer = new Indexer();
        // strict search: page IDs are strings and must not be compared numerically
        $pos = array_search($id, $indexer->getPages(), true);

        if ($pos === false) {
            $this->error('Page not found');
            return;
        }

        // query the storage with the same normalised ID that was matched in the index
        $storage = $this->helper->getStorage();
        $chunks = $storage->getPageChunks($id, $pos * 100);
        if (!$chunks) {
            $this->error('No chunks found');
            return;
        }

        $this->success('Found ' . count($chunks) . ' chunks');
        if ($dump) {
            echo json_encode($chunks, JSON_PRETTY_PRINT);
        }
    }

    /**
     * Split the given page into chunks and print them
     *
     * @param string $page
     * @return void
     * @throws Exception
     */
    protected function split($page)
    {
        $chunks = $this->helper->getEmbeddings()->createPageChunks($page, 0);
        foreach ($chunks as $chunk) {
            echo $chunk->getText();
            echo "\n";
            $this->colors->ptln('--------------------------------', Colors::C_LIGHTPURPLE);
        }
        $this->success('Split into ' . count($chunks) . ' chunks');
    }

    /**
     * Interactive Chat Session
     *
     * Keeps asking for questions until the input ends. Usage statistics are
     * reset before each question so the printed usage covers one question only.
     *
     * @return void
     * @throws Exception
     */
    protected function chat()
    {
        $history = [];
        // compare against '' explicitly: a truthiness check would end the chat on the input "0"
        while (($q = $this->readLine('Your Question')) !== '') {
            $this->helper->getChatModel()->resetUsageStats();
            $this->helper->getRephraseModel()->resetUsageStats();
            $this->helper->getEmbeddingModel()->resetUsageStats();
            $result = $this->helper->askChatQuestion($q, $history);
            $this->colors->ptln("Interpretation: {$result['question']}", Colors::C_LIGHTPURPLE);
            $history[] = [$result['question'], $result['answer']];
            $this->printAnswer($result);
        }
    }

    /**
     * Print information about the available models
     *
     * The model name is printed in green when the model info carries an
     * instance and in red otherwise.
     *
     * @return void
     */
    protected function models()
    {
        $result = (new ModelFactory($this->conf))->getModels();

        $td = new TableFormatter($this->colors);
        $cols = [30, 20, 20, '*'];
        echo "==== Chat Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['chat'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf(" In: %7d\nOut: %7d", $info['inputTokens'], $info['outputTokens']),
                    sprintf(" In: %.2f\nOut: %.2f", $info['inputTokenPrice'], $info['outputTokenPrice']),
                    $info['description'] . "\n"
                ],
                [
                    $info['instance'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $cols = [30, 10, 10, 10, '*'];
        echo "==== Embedding Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Dimensions', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['embedding'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf("%7d", $info['inputTokens']),
                    sprintf("%.2f", $info['inputTokenPrice']),
                    $info['dimensions'],
                    $info['description'] . "\n"
                ],
                [
                    $info['instance'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $this->colors->ptln('Current prices may differ', Colors::C_RED);
    }

    /**
     * Handle a single, standalone question
     *
     * @param string $query
     * @return void
     * @throws Exception
     */
    protected function ask($query)
    {
        $result = $this->helper->askQuestion($query);
        $this->printAnswer($result);
    }

    /**
     * Get the pages that are similar to the query
     *
     * @param string $query
     * @return void
     */
    protected function similar($query)
    {
        $langlimit = $this->helper->getLanguageLimit();
        if ($langlimit) {
            $this->info('Limiting results to {lang}', ['lang' => $langlimit]);
        }

        $sources = $this->helper->getEmbeddings()->getSimilarChunks($query, $langlimit);
        $this->printSources($sources);
    }

    /**
     * Run the maintenance tasks
     *
     * Records the time of the run in the helper's run data afterwards.
     *
     * @return void
     */
    protected function runMaintenance()
    {
        $start = time();
        $this->helper->getStorage()->runMaintenance();
        $this->reportResources($start);

        $data = $this->helper->getRunData();
        $data['maintenance ran at'] = dformat();
        $this->helper->setRunData($data);
    }

    /**
     * Recreate chunks and embeddings for all pages
     *
     * When the embedding model differs from the one recorded by the previous
     * run, a full rebuild is forced regardless of $clear.
     *
     * @param bool $clear clear all existing embeddings before creating new ones
     * @return void
     */
    protected function createEmbeddings($clear)
    {
        [$skipRE, $matchRE] = $this->getRegexps();

        $data = $this->helper->getRunData();
        $lastEmbedModel = (string) ($data['embed used'] ?? '');
        $currentEmbedModel = (string) $this->helper->getEmbeddingModel();

        if (!$clear && $lastEmbedModel !== '' && $lastEmbedModel !== $currentEmbedModel) {
            $this->warning('Embedding model has changed since last run. Forcing an index rebuild');
            $clear = true;
        }

        $data['embed ran at'] = dformat();
        $data['embed used'] = $currentEmbedModel;
        $this->helper->setRunData($data);

        $start = time();
        $this->helper->getEmbeddings()->createNewIndex($skipRE, $matchRE, $clear);
        $this->reportResources($start);
    }

    /**
     * Print peak memory usage and the time spent since the given start
     *
     * @param int $start unix timestamp taken before the work began
     * @return void
     */
    protected function reportResources($start)
    {
        $this->notice('Peak memory used: {memory}', ['memory' => filesize_h(memory_get_peak_usage(true))]);
        $this->notice('Spent time: {time}min', ['time' => round((time() - $start) / 60, 2)]);
    }

    /**
     * Dump TSV files for debugging
     *
     * @param string $vector path of the vector file to write
     * @param string $meta path of the meta file to write
     * @return void
     */
    protected function tsv($vector, $meta)
    {
        $storage = $this->helper->getStorage();
        $storage->dumpTSV($vector, $meta);
        $this->success('written to ' . $vector . ' and ' . $meta);
    }

    /**
     * Print the given detailed answer in a nice way
     *
     * Prints the answer text, the sources and finally the usage statistics.
     *
     * @param array $answer needs the keys 'answer' and 'sources'
     * @return void
     */
    protected function printAnswer($answer)
    {
        $this->colors->ptln($answer['answer'], Colors::C_LIGHTCYAN);
        echo "\n";
        $this->printSources($answer['sources']);
        echo "\n";
        $this->printUsage();
    }

    /**
     * Print the given sources
     *
     * @param Chunk[] $sources
     * @return void
     */
    protected function printSources($sources)
    {
        foreach ($sources as $source) {
            /** @var Chunk $source */
            $this->colors->ptln(
                "\t" . $source->getPage() . ' ' . $source->getId() . ' (' . $source->getScore() . ')',
                Colors::C_LIGHTBLUE
            );
        }
    }

    /**
     * Print the combined usage statistics of the chat, rephrase and embedding models
     *
     * @return void
     */
    protected function printUsage()
    {
        $chat = $this->helper->getChatModel()->getUsageStats();
        $rephrase = $this->helper->getRephraseModel()->getUsageStats();
        $embed = $this->helper->getEmbeddingModel()->getUsageStats();

        // every total sums all three models; tokens and cost used to count the
        // chat model twice and ignore the rephrase model
        $this->info(
            'Made {requests} requests in {time}s to models. Used {tokens} tokens for about ${cost}.',
            [
                'requests' => $chat['requests'] + $rephrase['requests'] + $embed['requests'],
                'time' => $chat['time'] + $rephrase['time'] + $embed['time'],
                'tokens' => $chat['tokens'] + $rephrase['tokens'] + $embed['tokens'],
                'cost' => $chat['cost'] + $rephrase['cost'] + $embed['cost'],
            ]
        );
    }

    /**
     * Interactively ask for a value from the user
     *
     * Repeats the prompt until a non-empty line was entered. An empty string is
     * returned only when standard input cannot be opened or has reached its end.
     *
     * @param string $prompt
     * @return string
     */
    protected function readLine($prompt)
    {
        $value = '';

        while ($value === '') {
            echo $prompt;
            echo ': ';

            $fh = fopen('php://stdin', 'r');
            if ($fh === false) {
                return '';
            }
            $line = fgets($fh);
            fclose($fh);

            // fgets() yields false on EOF (e.g. Ctrl-D); without this check the
            // loop would print the prompt forever
            if ($line === false) {
                return '';
            }
            $value = trim($line);
        }

        return $value;
    }

    /**
     * Read the skip and match regex from the config
     *
     * Ensures the regular expressions are valid
     *
     * @return string[] [$skipRE, $matchRE]
     */
    protected function getRegexps()
    {
        return [
            $this->loadRegexp('skipRegex', 'Skipping pages matching '),
            $this->loadRegexp('matchRegex', 'Only indexing pages matching '),
        ];
    }

    /**
     * Build a delimited regular expression from a config setting and validate it
     *
     * @param string $setting name of the config setting holding the bare pattern
     * @param string $notice text printed in front of the expression when it is valid
     * @return string the expression wrapped in '/' delimiters, '' when unset or invalid
     */
    protected function loadRegexp($setting, $notice)
    {
        $pattern = $this->getConf($setting);
        if (!$pattern) {
            return '';
        }

        $regexp = '/' . $pattern . '/';
        // a test match against the empty string returns false when the pattern does not compile
        if (@preg_match($regexp, '') === false) {
            $this->error(preg_last_error_msg());
            $this->error('Invalid regular expression in $conf[\'' . $setting . '\']. Ignored.');
            return '';
        }

        $this->success($notice . $regexp);
        return $regexp;
    }
}