<?php

use dokuwiki\Extension\CLIPlugin;
use dokuwiki\plugin\aichat\AbstractCLI;
use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\plugin\aichat\ModelFactory;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\Colors;
use splitbrain\phpcli\Options;
use splitbrain\phpcli\TableFormatter;

/**
 * DokuWiki Plugin aichat (CLI Component)
 *
 * Command line front end for the aichat plugin. It registers the sub commands
 * (embed, maintenance, similar, ask, chat, models, info, split, page, tsv)
 * and dispatches each of them to a dedicated method of this class.
 *
 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
 * @author  Andreas Gohr <gohr@cosmocode.de>
 */
class cli_plugin_aichat extends AbstractCLI
{
    /** @var helper_plugin_aichat */
    protected $helper;

    /** @inheritDoc */
    protected function setup(Options $options)
    {
        parent::setup($options);

        $options->setHelp(
            'Manage and query the AI chatbot data. Please note that calls to your LLM provider will be made. ' .
            'This may incur costs.'
        );

        // global option, valid for all commands
        $options->registerOption(
            'model',
            'Overrides the chat and rephrasing model settings and uses this model instead',
            '',
            'model'
        );

        $options->registerCommand(
            'embed',
            'Create embeddings for all pages. This skips pages that already have embeddings'
        );
        $options->registerOption(
            'clear',
            'Clear all existing embeddings before creating new ones',
            'c',
            false,
            'embed'
        );

        $options->registerCommand('maintenance', 'Run storage maintenance. Refer to the documentation for details.');

        $options->registerCommand('similar', 'Search for similar pages');
        $options->registerArgument('query', 'Look up chunks similar to this query', true, 'similar');

        $options->registerCommand('ask', 'Ask a question');
        $options->registerArgument('question', 'The question to ask', true, 'ask');

        $options->registerCommand('chat', 'Start an interactive chat session');

        $options->registerCommand('models', 'List available models');

        $options->registerCommand('info', 'Get Info about the vector storage and other stats');

        $options->registerCommand('split', 'Split a page into chunks (for debugging)');
        $options->registerArgument('page', 'The page to split', true, 'split');

        $options->registerCommand('page', 'Check if chunks for a given page are available (for debugging)');
        $options->registerArgument('page', 'The page to check', true, 'page');
        $options->registerOption('dump', 'Dump the chunks', 'd', false, 'page');

        $options->registerCommand('tsv', 'Create TSV files for visualizing at http://projector.tensorflow.org/' .
            ' Not supported on all storages.');
        $options->registerArgument('vector.tsv', 'The vector file', false, 'tsv');
        $options->registerArgument('meta.tsv', 'The meta file', false, 'tsv');
    }

    /** @inheritDoc */
    protected function main(Options $options)
    {
        parent::main($options);

        // the --model option replaces both the chat and the rephrasing model
        $model = $options->getOpt('model');
        if ($model) {
            $this->helper->updateConfig(
                ['chatmodel' => $model, 'rephrasemodel' => $model]
            );
        }

        switch ($options->getCmd()) {
            case 'embed':
                $this->createEmbeddings($options->getOpt('clear'));
                break;
            case 'maintenance':
                $this->runMaintenance();
                break;
            case 'similar':
                $this->similar($options->getArgs()[0]);
                break;
            case 'ask':
                $this->ask($options->getArgs()[0]);
                break;
            case 'chat':
                $this->chat();
                break;
            case 'models':
                $this->models();
                break;
            case 'split':
                $this->split($options->getArgs()[0]);
                break;
            case 'page':
                $this->page($options->getArgs()[0], $options->getOpt('dump'));
                break;
            case 'info':
                $this->showinfo();
                break;
            case 'tsv':
                // both file names are optional and fall back to defaults
                $args = $options->getArgs();
                $vector = $args[0] ?? 'vector.tsv';
                $meta = $args[1] ?? 'meta.tsv';
                $this->tsv($vector, $meta);
                break;
            default:
                echo $options->help();
        }
    }

    /**
     * Print the configured models, the stored run data and the storage statistics
     *
     * @return void
     */
    protected function showinfo()
    {
        $stats = [
            'embed model' => (string) $this->helper->getEmbeddingModel(),
            'rephrase model' => (string) $this->helper->getRephraseModel(),
            'chat model' => (string) $this->helper->getChatModel(),
        ];
        $stats = array_merge(
            $stats,
            $this->helper->getRunData(),
            $this->helper->getStorage()->statistics()
        );
        $this->printTable($stats);
    }

    /**
     * Print key value data as tabular data
     *
     * Array values are printed as a headline followed by their own entries,
     * indented by two more columns per nesting level.
     *
     * @param array $data
     * @param int $level current nesting level, used for the indentation
     * @return void
     */
    protected function printTable($data, $level = 0)
    {
        $tf = new TableFormatter($this->colors);
        foreach ($data as $key => $value) {
            if (is_array($value)) {
                echo $tf->format(
                    [$level * 2, 20, '*'],
                    ['', $key, ''],
                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
                );
                $this->printTable($value, $level + 1);
            } else {
                echo $tf->format(
                    [$level * 2, 20, '*'],
                    ['', $key, $value],
                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTGRAY]
                );
            }
        }
    }

    /**
     * Check chunk availability for a given page
     *
     * @param string $page
     * @param bool $dump print the found chunks as JSON
     * @return void
     */
    protected function page($page, $dump = false)
    {
        // use the normalized ID for the index lookup AND the storage query
        $page = cleanID($page);

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $pos = array_search($page, $pages, true);

        if ($pos === false) {
            $this->error('Page not found');
            return;
        }

        $storage = $this->helper->getStorage();
        // NOTE(review): presumably the index position times 100 is the first chunk ID
        // of the page - confirm against the storage implementation
        $chunks = $storage->getPageChunks($page, $pos * 100);
        if ($chunks) {
            $this->success('Found ' . count($chunks) . ' chunks');
            if ($dump) {
                echo json_encode($chunks, JSON_PRETTY_PRINT);
            }
        } else {
            $this->error('No chunks found');
        }
    }

    /**
     * Split the given page into chunks and print them
     *
     * @param string $page
     * @return void
     * @throws Exception
     */
    protected function split($page)
    {
        $chunks = $this->helper->getEmbeddings()->createPageChunks($page, 0);
        foreach ($chunks as $chunk) {
            echo $chunk->getText();
            echo "\n";
            $this->colors->ptln('--------------------------------', Colors::C_LIGHTPURPLE);
        }
        $this->success('Split into ' . count($chunks) . ' chunks');
    }

    /**
     * Interactive Chat Session
     *
     * Keeps asking for questions until the input ends. Each interpreted question
     * and its answer are appended to the history passed along with the next one.
     *
     * @return void
     * @throws Exception
     */
    protected function chat()
    {
        $history = [];
        // compare against '' explicitly: a question like "0" must not end the session
        while (($q = $this->readLine('Your Question')) !== '') {
            // usage statistics are reported per question
            $this->helper->getChatModel()->resetUsageStats();
            $this->helper->getRephraseModel()->resetUsageStats();
            $this->helper->getEmbeddingModel()->resetUsageStats();
            $result = $this->helper->askChatQuestion($q, $history);
            $this->colors->ptln("Interpretation: {$result['question']}", Colors::C_LIGHTPURPLE);
            $history[] = [$result['question'], $result['answer']];
            $this->printAnswer($result);
        }
    }

    /**
     * Print information about the available models
     *
     * The model name is printed green when the model info has a truthy 'instance'
     * entry and red otherwise.
     *
     * @return void
     */
    protected function models()
    {
        $result = (new ModelFactory($this->conf))->getModels();

        $td = new TableFormatter($this->colors);
        $cols = [30, 20, 20, '*'];
        echo "==== Chat Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['chat'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf(" In: %7d\nOut: %7d", $info['inputTokens'], $info['outputTokens']),
                    sprintf(" In: %.2f\nOut: %.2f", $info['inputTokenPrice'], $info['outputTokenPrice']),
                    $info['description'] . "\n"
                ],
                [
                    $info['instance'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $cols = [30, 10, 10, 10, '*'];
        echo "==== Embedding Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Dimensions', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['embedding'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf("%7d", $info['inputTokens']),
                    sprintf("%.2f", $info['inputTokenPrice']),
                    $info['dimensions'],
                    $info['description'] . "\n"
                ],
                [
                    $info['instance'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $this->colors->ptln('Current prices may differ', Colors::C_RED);
    }

    /**
     * Handle a single, standalone question
     *
     * @param string $query
     * @return void
     * @throws Exception
     */
    protected function ask($query)
    {
        $result = $this->helper->askQuestion($query);
        $this->printAnswer($result);
    }

    /**
     * Get the pages that are similar to the query
     *
     * @param string $query
     * @return void
     */
    protected function similar($query)
    {
        $langlimit = $this->helper->getLanguageLimit();
        if ($langlimit) {
            $this->info('Limiting results to {lang}', ['lang' => $langlimit]);
        }

        $sources = $this->helper->getEmbeddings()->getSimilarChunks($query, $langlimit);
        $this->printSources($sources);
    }

    /**
     * Run the maintenance tasks
     *
     * Records the time of the run in the helper's run data.
     *
     * @return void
     */
    protected function runMaintenance()
    {
        $start = time();
        $this->helper->getStorage()->runMaintenance();
        $this->printRunStats($start);

        $data = $this->helper->getRunData();
        $data['maintenance ran at'] = dformat();
        $this->helper->setRunData($data);
    }

    /**
     * Recreate chunks and embeddings for all pages
     *
     * When the embedding model differs from the one recorded by the previous run,
     * the existing index is cleared regardless of the passed flag.
     *
     * @param bool $clear clear all existing embeddings before creating new ones
     * @return void
     */
    protected function createEmbeddings($clear)
    {
        [$skipRE, $matchRE] = $this->getRegexps();

        $data = $this->helper->getRunData();
        $lastEmbedModel = (string) ($data['embed used'] ?? '');
        $currentEmbedModel = (string) $this->helper->getEmbeddingModel();

        if (!$clear && $lastEmbedModel !== '' && $lastEmbedModel !== $currentEmbedModel) {
            $this->warning('Embedding model has changed since last run. Forcing an index rebuild');
            $clear = true;
        }

        $start = time();
        $this->helper->getEmbeddings()->createNewIndex($skipRE, $matchRE, $clear);
        $this->printRunStats($start);

        $data['embed ran at'] = dformat();
        $data['embed used'] = $currentEmbedModel;
        $this->helper->setRunData($data);
    }

    /**
     * Print peak memory usage and the time spent since the given start
     *
     * @param int $start unix timestamp taken before the task was started
     * @return void
     */
    protected function printRunStats($start)
    {
        $this->notice('Peak memory used: {memory}', ['memory' => filesize_h(memory_get_peak_usage(true))]);
        $this->notice('Spent time: {time}min', ['time' => round((time() - $start) / 60, 2)]);
    }

    /**
     * Dump TSV files for debugging
     *
     * @param string $vector name of the vector file to write
     * @param string $meta name of the meta file to write
     * @return void
     */
    protected function tsv($vector, $meta)
    {
        $storage = $this->helper->getStorage();
        $storage->dumpTSV($vector, $meta);
        $this->success('written to ' . $vector . ' and ' . $meta);
    }

    /**
     * Print the given detailed answer in a nice way
     *
     * @param array $answer needs the keys 'answer' and 'sources'
     * @return void
     */
    protected function printAnswer($answer)
    {
        $this->colors->ptln($answer['answer'], Colors::C_LIGHTCYAN);
        echo "\n";
        $this->printSources($answer['sources']);
        echo "\n";
        $this->printUsage();
    }

    /**
     * Print the given sources
     *
     * @param Chunk[] $sources
     * @return void
     */
    protected function printSources($sources)
    {
        foreach ($sources as $source) {
            /** @var Chunk $source */
            $this->colors->ptln(
                "\t" . $source->getPage() . ' ' . $source->getId() . ' (' . $source->getScore() . ')',
                Colors::C_LIGHTBLUE
            );
        }
    }

    /**
     * Print the combined usage statistics of the chat, rephrase and embedding models
     *
     * @return void
     */
    protected function printUsage()
    {
        $chat = $this->helper->getChatModel()->getUsageStats();
        $rephrase = $this->helper->getRephraseModel()->getUsageStats();
        $embed = $this->helper->getEmbeddingModel()->getUsageStats();

        // every figure is the sum over all three models
        $this->info(
            'Made {requests} requests in {time}s to models. Used {tokens} tokens for about ${cost}.',
            [
                'requests' => $chat['requests'] + $rephrase['requests'] + $embed['requests'],
                'time' => $chat['time'] + $rephrase['time'] + $embed['time'],
                'tokens' => $chat['tokens'] + $rephrase['tokens'] + $embed['tokens'],
                'cost' => $chat['cost'] + $rephrase['cost'] + $embed['cost'],
            ]
        );
    }

    /**
     * Interactively ask for a value from the user
     *
     * Blank lines are ignored and the prompt is shown again.
     *
     * @param string $prompt
     * @return string the trimmed input, an empty string when the input has ended or cannot be read
     */
    protected function readLine($prompt)
    {
        $fh = fopen('php://stdin', 'r');
        if ($fh === false) {
            return '';
        }

        try {
            do {
                echo $prompt;
                echo ': ';

                $line = fgets($fh);
                if ($line === false) {
                    // EOF or read error: stop prompting instead of looping forever
                    return '';
                }
                $value = trim($line);
            } while ($value === '');
        } finally {
            fclose($fh);
        }

        return $value;
    }

    /**
     * Read the skip and match regex from the config
     *
     * Ensures the regular expressions are valid
     *
     * @return string[] [$skipRE, $matchRE]
     */
    protected function getRegexps()
    {
        $skip = $this->getConf('skipRegex');
        $match = $this->getConf('matchRegex');

        $skipRE = $this->compileConfigRegexp($skip, 'skipRegex', 'Skipping pages matching ');
        $matchRE = $this->compileConfigRegexp($match, 'matchRegex', 'Only indexing pages matching ');

        return [$skipRE, $matchRE];
    }

    /**
     * Wrap a configured pattern in delimiters and verify that it compiles
     *
     * @param string $pattern the pattern as configured, without delimiters
     * @param string $confKey name of the config setting, used in the error message
     * @param string $successPrefix message printed in front of the valid expression
     * @return string the delimited expression, an empty string when unset or invalid
     */
    protected function compileConfigRegexp($pattern, $confKey, $successPrefix)
    {
        if (!$pattern) {
            return '';
        }

        $regexp = '/' . $pattern . '/';
        // an invalid pattern makes preg_match() return false and emit a warning, which is silenced here
        if (@preg_match($regexp, '') === false) {
            $this->error(preg_last_error_msg());
            $this->error('Invalid regular expression in $conf[\'' . $confKey . '\']. Ignored.');
            return '';
        }

        $this->success($successPrefix . $regexp);
        return $regexp;
    }
}