<?php

use dokuwiki\Extension\CLIPlugin;
use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\plugin\aichat\ModelFactory;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\Colors;
use splitbrain\phpcli\Options;
use splitbrain\phpcli\TableFormatter;

/**
 * DokuWiki Plugin aichat (CLI Component)
 *
 * Registers the command line sub commands of the plugin and dispatches them to the
 * aichat helper plugin.
 *
 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
 * @author  Andreas Gohr <gohr@cosmocode.de>
 */
class cli_plugin_aichat extends CLIPlugin
{
    /** @var helper_plugin_aichat */
    protected $helper;

    /** @inheritdoc */
    public function __construct($autocatch = true)
    {
        parent::__construct($autocatch);
        $this->helper = plugin_load('helper', 'aichat');
        // the helper logs through this CLI object
        $this->helper->setLogger($this);
        $this->loadConfig();
    }

    /** @inheritDoc */
    protected function setup(Options $options)
    {
        $options->useCompactHelp();

        $options->setHelp(
            'Manage and query the AI chatbot data. Please note that calls to your LLM provider will be made. ' .
            'This may incur costs.'
        );

        $options->registerCommand(
            'embed',
            'Create embeddings for all pages. This skips pages that already have embeddings'
        );
        $options->registerOption(
            'clear',
            'Clear all existing embeddings before creating new ones',
            'c',
            false,
            'embed'
        );

        $options->registerCommand('maintenance', 'Run storage maintenance. Refer to the documentation for details.');

        $options->registerCommand('similar', 'Search for similar pages');
        $options->registerArgument('query', 'Look up chunks similar to this query', true, 'similar');

        $options->registerCommand('ask', 'Ask a question');
        $options->registerArgument('question', 'The question to ask', true, 'ask');

        $options->registerCommand('chat', 'Start an interactive chat session');

        $options->registerCommand('models', 'List available models');

        $options->registerCommand('info', 'Get Info about the vector storage and other stats');

        $options->registerCommand('split', 'Split a page into chunks (for debugging)');
        $options->registerArgument('page', 'The page to split', true, 'split');

        $options->registerCommand('page', 'Check if chunks for a given page are available (for debugging)');
        $options->registerArgument('page', 'The page to check', true, 'page');
        $options->registerOption('dump', 'Dump the chunks', 'd', false, 'page');

        $options->registerCommand('tsv', 'Create TSV files for visualizing at http://projector.tensorflow.org/' .
            ' Not supported on all storages.');
        $options->registerArgument('vector.tsv', 'The vector file', false, 'tsv');
        $options->registerArgument('meta.tsv', 'The meta file', false, 'tsv');
    }

    /** @inheritDoc */
    protected function main(Options $options)
    {
        if ($this->loglevel['debug']['enabled']) {
            $this->helper->factory->setDebug(true);
        }

        // lift the memory limit for this CLI run; ini values are strings
        ini_set('memory_limit', '-1');

        switch ($options->getCmd()) {
            case 'embed':
                $this->createEmbeddings($options->getOpt('clear'));
                break;
            case 'maintenance':
                $this->runMaintenance();
                break;
            case 'similar':
                $this->similar($options->getArgs()[0]);
                break;
            case 'ask':
                $this->ask($options->getArgs()[0]);
                break;
            case 'chat':
                $this->chat();
                break;
            case 'models':
                $this->models();
                break;
            case 'split':
                $this->split($options->getArgs()[0]);
                break;
            case 'page':
                $this->page($options->getArgs()[0], $options->getOpt('dump'));
                break;
            case 'info':
                $this->showinfo();
                break;
            case 'tsv':
                // both file names are optional arguments
                $args = $options->getArgs();
                $vector = $args[0] ?? 'vector.tsv';
                $meta = $args[1] ?? 'meta.tsv';
                $this->tsv($vector, $meta);
                break;
            default:
                echo $options->help();
        }
    }

    /**
     * Print the configured models, the recorded run data and the storage statistics
     *
     * @return void
     */
    protected function showinfo()
    {
        $stats = [
            'chat model' => $this->getConf('chatmodel'),
            'embed model' => $this->getConf('embedmodel'),
        ];
        $stats = array_merge(
            $stats,
            // each run data value is passed through dformat() for display
            array_map('dformat', $this->helper->getRunData()),
            $this->helper->getStorage()->statistics()
        );
        $this->printTable($stats);
    }

    /**
     * Print key value data as tabular data
     *
     * Array values are printed as a header row followed by their own entries,
     * indented by two more columns per nesting level.
     *
     * @param array $data
     * @param int $level current nesting level, used for indentation
     * @return void
     */
    protected function printTable($data, $level = 0)
    {
        $tf = new TableFormatter($this->colors);
        foreach ($data as $key => $value) {
            if (is_array($value)) {
                echo $tf->format(
                    [$level * 2, 20, '*'],
                    ['', $key, ''],
                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
                );
                $this->printTable($value, $level + 1);
            } else {
                echo $tf->format(
                    [$level * 2, 20, '*'],
                    ['', $key, $value],
                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTGRAY]
                );
            }
        }
    }

    /**
     * Check chunk availability for a given page
     *
     * @param string $page
     * @param bool $dump print the found chunks as pretty printed JSON
     * @return void
     */
    protected function page($page, $dump = false)
    {
        // the same cleaned ID is used for the index lookup and the storage query
        $id = cleanID($page);

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $pos = array_search($id, $pages, true);

        if ($pos === false) {
            $this->error('Page not found');
            return;
        }

        $storage = $this->helper->getStorage();
        // the second argument is derived from the page's position in the index
        $chunks = $storage->getPageChunks($id, $pos * 100);
        if ($chunks) {
            $this->success('Found ' . count($chunks) . ' chunks');
            if ($dump) {
                echo json_encode($chunks, JSON_PRETTY_PRINT);
            }
        } else {
            $this->error('No chunks found');
        }
    }

    /**
     * Split the given page into chunks and print them
     *
     * @param string $page
     * @return void
     * @throws Exception
     */
    protected function split($page)
    {
        $text = rawWiki($page);
        $chunks = $this->helper->getEmbeddings()->splitIntoChunks($text);
        foreach ($chunks as $chunk) {
            echo $chunk;
            echo "\n";
            $this->colors->ptln('--------------------------------', Colors::C_LIGHTPURPLE);
        }
        $this->success('Split into ' . count($chunks) . ' chunks');
    }

    /**
     * Interactive Chat Session
     *
     * Keeps asking for questions until the input stream ends. Each interpreted
     * question and its answer are appended to the history passed to the next question.
     *
     * @return void
     * @throws Exception
     */
    protected function chat()
    {
        $history = [];
        // readLine() only returns an empty string on end of input; compare explicitly
        // so that an input like "0" is still treated as a question
        while (($q = $this->readLine('Your Question')) !== '') {
            // usage statistics are reported per question
            $this->helper->getChatModel()->resetUsageStats();
            $this->helper->getRephraseModel()->resetUsageStats();
            $this->helper->getEmbeddingModel()->resetUsageStats();
            $result = $this->helper->askChatQuestion($q, $history);
            $this->colors->ptln("Interpretation: {$result['question']}", Colors::C_LIGHTPURPLE);
            $history[] = [$result['question'], $result['answer']];
            $this->printAnswer($result);
        }
    }

    /**
     * Print information about the available models
     *
     * Model names are printed green when the model info has a truthy 'instance'
     * entry and red otherwise.
     *
     * @return void
     */
    protected function models()
    {
        $result = (new ModelFactory($this->conf))->getModels();

        $td = new TableFormatter($this->colors);
        $cols = [30, 20, 20, '*'];
        echo "==== Chat Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['chat'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf(" In: %7d\nOut: %7d", $info['inputTokens'], $info['outputTokens']),
                    sprintf(" In: %.2f\nOut: %.2f", $info['inputTokenPrice'], $info['outputTokenPrice']),
                    $info['description'] . "\n"
                ],
                [
                    $info['instance'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $cols = [30, 10, 10, 10, '*'];
        echo "==== Embedding Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Dimensions', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['embedding'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf("%7d", $info['inputTokens']),
                    sprintf("%.2f", $info['inputTokenPrice']),
                    $info['dimensions'],
                    $info['description'] . "\n"
                ],
                [
                    $info['instance'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $this->colors->ptln('Current prices may differ', Colors::C_RED);
    }

    /**
     * Handle a single, standalone question
     *
     * @param string $query
     * @return void
     * @throws Exception
     */
    protected function ask($query)
    {
        $result = $this->helper->askQuestion($query);
        $this->printAnswer($result);
    }

    /**
     * Get the pages that are similar to the query
     *
     * @param string $query
     * @return void
     */
    protected function similar($query)
    {
        $langlimit = $this->helper->getLanguageLimit();
        if ($langlimit) {
            $this->info('Limiting results to {lang}', ['lang' => $langlimit]);
        }

        $sources = $this->helper->getEmbeddings()->getSimilarChunks($query, $langlimit);
        $this->printSources($sources);
    }

    /**
     * Run the maintenance tasks
     *
     * @return void
     */
    protected function runMaintenance()
    {
        $start = time();
        $this->helper->getStorage()->runMaintenance();
        $this->finishRun($start, 'maintenance ran at');
    }

    /**
     * Recreate chunks and embeddings for all pages
     *
     * @param bool $clear passed on to the index creation; set by the --clear option
     * @return void
     */
    protected function createEmbeddings($clear)
    {
        [$skipRE, $matchRE] = $this->getRegexps();

        $start = time();
        $this->helper->getEmbeddings()->createNewIndex($skipRE, $matchRE, $clear);
        $this->finishRun($start, 'embed ran at');
    }

    /**
     * Report resource usage of a finished task and record when it ran
     *
     * @param int $start unix timestamp taken before the task started
     * @param string $runKey key in the helper's run data that receives the current time
     * @return void
     */
    protected function finishRun($start, $runKey)
    {
        $this->notice('Peak memory used: {memory}', ['memory' => filesize_h(memory_get_peak_usage(true))]);
        $this->notice('Spent time: {time}min', ['time' => round((time() - $start) / 60, 2)]);

        $data = $this->helper->getRunData();
        $data[$runKey] = time();
        $this->helper->setRunData($data);
    }

    /**
     * Dump TSV files for debugging
     *
     * @param string $vector file name for the vector data
     * @param string $meta file name for the meta data
     * @return void
     */
    protected function tsv($vector, $meta)
    {
        $storage = $this->helper->getStorage();
        $storage->dumpTSV($vector, $meta);
        $this->success('written to ' . $vector . ' and ' . $meta);
    }

    /**
     * Print the given detailed answer in a nice way
     *
     * Prints the answer text, the sources it is based on and the usage statistics.
     *
     * @param array $answer needs the keys 'answer' and 'sources'
     * @return void
     */
    protected function printAnswer($answer)
    {
        $this->colors->ptln($answer['answer'], Colors::C_LIGHTCYAN);
        echo "\n";
        $this->printSources($answer['sources']);
        echo "\n";
        $this->printUsage();
    }

    /**
     * Print the given sources
     *
     * @param Chunk[] $sources
     * @return void
     */
    protected function printSources($sources)
    {
        foreach ($sources as $source) {
            /** @var Chunk $source */
            $this->colors->ptln(
                "\t" . $source->getPage() . ' ' . $source->getId() . ' (' . $source->getScore() . ')',
                Colors::C_LIGHTBLUE
            );
        }
    }

    /**
     * Print the combined usage statistics of the chat, rephrase and embedding models
     *
     * @return void
     */
    protected function printUsage()
    {
        $chat = $this->helper->getChatModel()->getUsageStats();
        $rephrase = $this->helper->getRephraseModel()->getUsageStats();
        $embed = $this->helper->getEmbeddingModel()->getUsageStats();

        // every total has to include all three models
        $this->info(
            'Made {requests} requests in {time}s to models. Used {tokens} tokens for about ${cost}.',
            [
                'requests' => $chat['requests'] + $rephrase['requests'] + $embed['requests'],
                'time' => $chat['time'] + $rephrase['time'] + $embed['time'],
                'tokens' => $chat['tokens'] + $rephrase['tokens'] + $embed['tokens'],
                'cost' => $chat['cost'] + $rephrase['cost'] + $embed['cost'],
            ]
        );
    }

    /**
     * Interactively ask for a value from the user
     *
     * The prompt is repeated until a non-empty line has been entered.
     *
     * @param string $prompt
     * @return string the trimmed input, or an empty string when the input stream has ended
     */
    protected function readLine($prompt)
    {
        $fh = fopen('php://stdin', 'r');
        if ($fh === false) {
            return '';
        }

        try {
            do {
                echo $prompt;
                echo ': ';

                $line = fgets($fh);
                if ($line === false) {
                    // end of input (or read error): stop prompting instead of looping forever
                    return '';
                }
                $value = trim($line);
            } while ($value === '');

            return $value;
        } finally {
            fclose($fh);
        }
    }

    /**
     * Read the skip and match regex from the config
     *
     * Ensures the regular expressions are valid
     *
     * @return string[] [$skipRE, $matchRE]
     */
    protected function getRegexps()
    {
        return [
            $this->buildRegexp('skipRegex', 'Skipping pages matching '),
            $this->buildRegexp('matchRegex', 'Only indexing pages matching '),
        ];
    }

    /**
     * Build a delimited regular expression from a config setting and validate it
     *
     * @param string $confKey name of the config setting holding the pattern
     * @param string $successPrefix message printed in front of a valid expression
     * @return string the delimited expression, empty when the setting is empty or invalid
     */
    protected function buildRegexp($confKey, $successPrefix)
    {
        $pattern = $this->getConf($confKey);
        if (!$pattern) {
            return '';
        }

        $regexp = '/' . $pattern . '/';
        // preg_match() raises a warning for a broken pattern; suppress it and report it ourselves
        if (@preg_match($regexp, '') === false) {
            $this->error(preg_last_error_msg());
            $this->error('Invalid regular expression in $conf[\'' . $confKey . '\']. Ignored.');
            return '';
        }

        $this->success($successPrefix . $regexp);
        return $regexp;
    }
}