<?php

use dokuwiki\Extension\CLIPlugin;
use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\Colors;
use splitbrain\phpcli\Options;
use splitbrain\phpcli\TableFormatter;

/**
 * DokuWiki Plugin aichat (CLI Component)
 *
 * Provides the command line interface to create embeddings, run storage
 * maintenance, query the storage and talk to the configured models.
 *
 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
 * @author  Andreas Gohr <gohr@cosmocode.de>
 */
class cli_plugin_aichat extends CLIPlugin
{
    /** @var helper_plugin_aichat */
    protected $helper;

    /**
     * Load the helper plugin and register this CLI as its logger
     *
     * @param bool $autocatch passed through to the parent constructor
     */
    public function __construct($autocatch = true)
    {
        parent::__construct($autocatch);
        $this->helper = plugin_load('helper', 'aichat');
        $this->helper->setLogger($this);
    }

    /** @inheritDoc */
    protected function setup(Options $options)
    {
        $options->useCompactHelp();

        $options->setHelp(
            'Manage and query the AI chatbot data. Please note that calls to your LLM provider will be made. ' .
            'This may incur costs.'
        );

        $options->registerCommand(
            'embed',
            'Create embeddings for all pages. This skips pages that already have embeddings'
        );
        $options->registerOption(
            'clear',
            'Clear all existing embeddings before creating new ones',
            'c',
            false,
            'embed'
        );

        $options->registerCommand('maintenance', 'Run storage maintenance. Refer to the documentation for details.');

        $options->registerCommand('similar', 'Search for similar pages');
        $options->registerArgument('query', 'Look up chunks similar to this query', true, 'similar');

        $options->registerCommand('ask', 'Ask a question');
        $options->registerArgument('question', 'The question to ask', true, 'ask');

        $options->registerCommand('chat', 'Start an interactive chat session');

        $options->registerCommand('models', 'List available models');

        $options->registerCommand('info', 'Get Info about the vector storage and other stats');

        $options->registerCommand('split', 'Split a page into chunks (for debugging)');
        $options->registerArgument('page', 'The page to split', true, 'split');

        $options->registerCommand('page', 'Check if chunks for a given page are available (for debugging)');
        $options->registerArgument('page', 'The page to check', true, 'page');
        $options->registerOption('dump', 'Dump the chunks', 'd', false, 'page');

        $options->registerCommand('tsv', 'Create TSV files for visualizing at http://projector.tensorflow.org/' .
            ' Not supported on all storages.');
        $options->registerArgument('vector.tsv', 'The vector file', false, 'tsv');
        $options->registerArgument('meta.tsv', 'The meta file', false, 'tsv');
    }

    /** @inheritDoc */
    protected function main(Options $options)
    {
        $this->loadConfig();
        // lift the memory limit for this CLI run; ini values are strings
        ini_set('memory_limit', '-1');
        switch ($options->getCmd()) {
            case 'embed':
                $this->createEmbeddings($options->getOpt('clear'));
                break;
            case 'maintenance':
                $this->runMaintenance();
                break;
            case 'similar':
                $this->similar($options->getArgs()[0]);
                break;
            case 'ask':
                $this->ask($options->getArgs()[0]);
                break;
            case 'chat':
                $this->chat();
                break;
            case 'models':
                $this->models();
                break;
            case 'split':
                $this->split($options->getArgs()[0]);
                break;
            case 'page':
                $this->page($options->getArgs()[0], $options->getOpt('dump'));
                break;
            case 'info':
                $this->showinfo();
                break;
            case 'tsv':
                // both file names are optional and fall back to defaults
                $args = $options->getArgs();
                $vector = $args[0] ?? 'vector.tsv';
                $meta = $args[1] ?? 'meta.tsv';
                $this->tsv($vector, $meta);
                break;
            default:
                echo $options->help();
        }
    }

    /**
     * Print the configured models, the recorded run data and the storage statistics
     *
     * @return void
     */
    protected function showinfo()
    {
        $stats = [
            'chat model' => $this->getConf('chatmodel'),
            'embed model' => $this->getConf('embedmodel'),
        ];
        $stats = array_merge(
            $stats,
            // every run data value is passed through dformat() for display
            array_map('dformat', $this->helper->getRunData()),
            $this->helper->getStorage()->statistics()
        );
        $this->printTable($stats);
    }

    /**
     * Print key value data as tabular data
     *
     * Nested arrays are printed recursively, indented by two columns per level.
     *
     * @param array $data
     * @param int $level
     * @return void
     */
    protected function printTable($data, $level = 0)
    {
        $tf = new TableFormatter($this->colors);
        foreach ($data as $key => $value) {
            if (is_array($value)) {
                // heading row for the nested section, then its content one level deeper
                echo $tf->format(
                    [$level * 2, 20, '*'],
                    ['', $key, ''],
                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
                );
                $this->printTable($value, $level + 1);
            } else {
                echo $tf->format(
                    [$level * 2, 20, '*'],
                    ['', $key, $value],
                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTGRAY]
                );
            }
        }
    }

    /**
     * Check chunk availability for a given page
     *
     * @param string $page
     * @param bool $dump print the found chunks as JSON
     * @return void
     */
    protected function page($page, $dump = false)
    {
        // use the same normalized ID for the index lookup and the storage lookup
        $id = cleanID($page);

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        // strict search: page IDs are strings and must not be compared loosely
        $pos = array_search($id, $pages, true);

        if ($pos === false) {
            $this->error('Page not found');
            return;
        }

        $storage = $this->helper->getStorage();
        // NOTE(review): the second argument is derived from the page's index position;
        // presumably the first chunk ID of the page - confirm against the storage API
        $chunks = $storage->getPageChunks($id, $pos * 100);
        if ($chunks) {
            $this->success('Found ' . count($chunks) . ' chunks');
            if ($dump) {
                echo json_encode($chunks, JSON_PRETTY_PRINT);
            }
        } else {
            $this->error('No chunks found');
        }
    }

    /**
     * Split the given page into chunks and print them
     *
     * @param string $page
     * @return void
     * @throws Exception
     */
    protected function split($page)
    {
        $text = rawWiki($page);
        $chunks = $this->helper->getEmbeddings()->splitIntoChunks($text);
        foreach ($chunks as $chunk) {
            echo $chunk;
            echo "\n";
            $this->colors->ptln('--------------------------------', Colors::C_LIGHTPURPLE);
        }
        $this->success('Split into ' . count($chunks) . ' chunks');
    }

    /**
     * Interactive Chat Session
     *
     * Keeps asking for questions until the input stream ends (e.g. Ctrl-D).
     *
     * @return void
     * @throws Exception
     */
    protected function chat()
    {
        $this->enableModelDebug();

        $history = [];
        // readLine() only returns an empty string when no more input is available
        while (($q = $this->readLine('Your Question')) !== '') {
            // usage is reported per question, so start each round from zero
            $this->helper->getChatModel()->resetUsageStats();
            $this->helper->getRephraseModel()->resetUsageStats();
            $this->helper->getEmbedModel()->resetUsageStats();
            $result = $this->helper->askChatQuestion($q, $history);
            $this->colors->ptln("Interpretation: {$result['question']}", Colors::C_LIGHTPURPLE);
            $history[] = [$result['question'], $result['answer']];
            $this->printAnswer($result);
        }
    }

    /**
     * List the models described in the bundled models.json files
     *
     * A model is printed in green when its class could be instantiated with the
     * current configuration and in red when the constructor threw an exception.
     *
     * @return void
     */
    protected function models()
    {
        $result = [
            'chat' => [],
            'embedding' => [],
        ];

        $jsons = glob(__DIR__ . '/Model/*/models.json');
        foreach ($jsons as $json) {
            $models = json_decode(file_get_contents($json), true);
            if (!is_array($models)) {
                // unreadable or invalid JSON: report it instead of iterating over null
                $this->error('Could not read model definitions from ' . $json);
                continue;
            }

            // the directory holding the models.json names the provider namespace
            $namespace = basename(dirname($json));
            foreach ($models as $type => $model) {
                foreach ($model as $name => $info) {
                    $class = '\\dokuwiki\\plugin\\aichat\\Model\\' . $namespace . '\\' . ucfirst($type) . 'Model';
                    try {
                        // instantiated only to check that the constructor accepts the configuration
                        new $class($name, $this->conf);
                        $info['confok'] = true;
                    } catch (Exception $e) {
                        $info['confok'] = false;
                    }

                    $result[$type]["$namespace $name"] = $info;
                }
            }
        }

        $td = new TableFormatter($this->colors);
        $cols = [30, 20, 20, '*'];
        echo "==== Chat Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['chat'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf(" In: %7d\nOut: %7d", $info['inputTokens'], $info['outputTokens']),
                    sprintf(" In: %.2f\nOut: %.2f", $info['inputTokenPrice'], $info['outputTokenPrice']),
                    $info['description'] . "\n"
                ],
                [
                    $info['confok'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $cols = [30, 10, 10, 10, '*'];
        echo "==== Embedding Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Dimensions', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['embedding'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf("%7d", $info['inputTokens']),
                    sprintf("%.2f", $info['inputTokenPrice']),
                    $info['dimensions'],
                    $info['description'] . "\n"
                ],
                [
                    $info['confok'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $this->colors->ptln('Current prices may differ', Colors::C_RED);
    }

    /**
     * Handle a single, standalone question
     *
     * @param string $query
     * @return void
     * @throws Exception
     */
    protected function ask($query)
    {
        $this->enableModelDebug();

        $result = $this->helper->askQuestion($query);
        $this->printAnswer($result);
    }

    /**
     * Get the pages that are similar to the query
     *
     * @param string $query
     * @return void
     */
    protected function similar($query)
    {
        $langlimit = $this->helper->getLanguageLimit();
        if ($langlimit) {
            $this->info('Limiting results to {lang}', ['lang' => $langlimit]);
        }

        $sources = $this->helper->getEmbeddings()->getSimilarChunks($query, $langlimit);
        $this->printSources($sources);
    }

    /**
     * Run the maintenance tasks
     *
     * Records the time of the run in the helper's run data.
     *
     * @return void
     */
    protected function runMaintenance()
    {
        $start = time();
        $this->helper->getStorage()->runMaintenance();
        $this->notice('Peak memory used: {memory}', ['memory' => filesize_h(memory_get_peak_usage(true))]);
        $this->notice('Spent time: {time}min', ['time' => round((time() - $start) / 60, 2)]);

        $data = $this->helper->getRunData();
        $data['maintenance ran at'] = time();
        $this->helper->setRunData($data);
    }

    /**
     * Recreate chunks and embeddings for all pages
     *
     * Records the time of the run in the helper's run data.
     *
     * @param bool $clear clear all existing embeddings before creating new ones
     * @return void
     */
    protected function createEmbeddings($clear)
    {
        [$skipRE, $matchRE] = $this->getRegexps();

        $start = time();
        $this->helper->getEmbeddings()->createNewIndex($skipRE, $matchRE, $clear);
        $this->notice('Peak memory used: {memory}', ['memory' => filesize_h(memory_get_peak_usage(true))]);
        $this->notice('Spent time: {time}min', ['time' => round((time() - $start) / 60, 2)]);

        $data = $this->helper->getRunData();
        $data['embed ran at'] = time();
        $this->helper->setRunData($data);
    }

    /**
     * Dump TSV files for debugging
     *
     * @param string $vector name of the vector file to write
     * @param string $meta name of the meta file to write
     * @return void
     */
    protected function tsv($vector, $meta)
    {
        $storage = $this->helper->getStorage();
        $storage->dumpTSV($vector, $meta);
        $this->success('written to ' . $vector . ' and ' . $meta);
    }

    /**
     * Print the given detailed answer in a nice way
     *
     * Prints the answer text, its sources and the accumulated model usage.
     *
     * @param array $answer needs the keys 'answer' and 'sources'
     * @return void
     */
    protected function printAnswer($answer)
    {
        $this->colors->ptln($answer['answer'], Colors::C_LIGHTCYAN);
        echo "\n";
        $this->printSources($answer['sources']);
        echo "\n";
        $this->printUsage();
    }

    /**
     * Print the given sources
     *
     * @param Chunk[] $sources
     * @return void
     */
    protected function printSources($sources)
    {
        foreach ($sources as $source) {
            /** @var Chunk $source */
            $this->colors->ptln(
                "\t" . $source->getPage() . ' ' . $source->getId() . ' (' . $source->getScore() . ')',
                Colors::C_LIGHTBLUE
            );
        }
    }

    /**
     * Print the combined usage statistics of the chat, rephrase and embedding models
     *
     * @return void
     */
    protected function printUsage()
    {
        $chat = $this->helper->getChatModel()->getUsageStats();
        $rephrase = $this->helper->getRephraseModel()->getUsageStats();
        $embed = $this->helper->getEmbedModel()->getUsageStats();

        // every figure is the sum over all three models
        $this->info(
            'Made {requests} requests in {time}s to models. Used {tokens} tokens for about ${cost}.',
            [
                'requests' => $chat['requests'] + $rephrase['requests'] + $embed['requests'],
                'time' => $chat['time'] + $rephrase['time'] + $embed['time'],
                'tokens' => $chat['tokens'] + $rephrase['tokens'] + $embed['tokens'],
                'cost' => $chat['cost'] + $rephrase['cost'] + $embed['cost'],
            ]
        );
    }

    /**
     * Interactively ask for a value from the user
     *
     * Repeats the prompt until a non-empty line has been entered.
     *
     * @param string $prompt
     * @return string the trimmed input, or an empty string when no more input is available
     */
    protected function readLine($prompt)
    {
        $fh = fopen('php://stdin', 'r');
        if ($fh === false) {
            return '';
        }

        try {
            $value = '';
            while ($value === '') {
                echo $prompt;
                echo ': ';

                $line = fgets($fh);
                if ($line === false) {
                    // end of input or read error: stop instead of prompting forever
                    echo "\n";
                    return '';
                }
                $value = trim($line);
            }

            return $value;
        } finally {
            fclose($fh);
        }
    }

    /**
     * Switch on debugging for all models when the CLI runs at debug log level
     *
     * @return void
     */
    protected function enableModelDebug()
    {
        if ($this->loglevel['debug']['enabled']) {
            $this->helper->getChatModel()->setDebug(true);
            $this->helper->getRephraseModel()->setDebug(true);
            $this->helper->getEmbedModel()->setDebug(true);
        }
    }

    /**
     * Read the skip and match regex from the config
     *
     * Ensures the regular expressions are valid
     *
     * @return string[] [$skipRE, $matchRE]
     */
    protected function getRegexps()
    {
        $skip = $this->getConf('skipRegex');
        $skipRE = '';
        $match = $this->getConf('matchRegex');
        $matchRE = '';

        if ($skip) {
            $skipRE = '/' . $skip . '/';
            // a test match against an empty string only checks that the pattern compiles
            if (@preg_match($skipRE, '') === false) {
                $this->error(preg_last_error_msg());
                $this->error('Invalid regular expression in $conf[\'skipRegex\']. Ignored.');
                $skipRE = '';
            } else {
                $this->success('Skipping pages matching ' . $skipRE);
            }
        }

        if ($match) {
            $matchRE = '/' . $match . '/';
            if (@preg_match($matchRE, '') === false) {
                $this->error(preg_last_error_msg());
                $this->error('Invalid regular expression in $conf[\'matchRegex\']. Ignored.');
                $matchRE = '';
            } else {
                $this->success('Only indexing pages matching ' . $matchRE);
            }
        }
        return [$skipRE, $matchRE];
    }
}