<?php

use dokuwiki\Extension\CLIPlugin;
use dokuwiki\plugin\aichat\Chunk;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\Colors;
use splitbrain\phpcli\Options;
use splitbrain\phpcli\TableFormatter;

/**
 * DokuWiki Plugin aichat (CLI Component)
 *
 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
 * @author  Andreas Gohr <gohr@cosmocode.de>
 */
class cli_plugin_aichat extends CLIPlugin
{
    /** @var helper_plugin_aichat */
    protected $helper;

    /**
     * Load the aichat helper and register this CLI as its logger
     *
     * @param bool $autocatch passed through to the parent constructor
     */
    public function __construct($autocatch = true)
    {
        parent::__construct($autocatch);
        $this->helper = plugin_load('helper', 'aichat');
        $this->helper->setLogger($this);
    }

    /** @inheritDoc */
    protected function setup(Options $options)
    {
        $options->useCompactHelp();

        $options->setHelp(
            'Manage and query the AI chatbot data. Please note that calls to your LLM provider will be made. ' .
            'This may incur costs.'
        );

        $options->registerCommand(
            'embed',
            'Create embeddings for all pages. This skips pages that already have embeddings'
        );
        $options->registerOption(
            'clear',
            'Clear all existing embeddings before creating new ones',
            'c',
            false,
            'embed'
        );

        $options->registerCommand('maintenance', 'Run storage maintenance. Refer to the documentation for details.');

        $options->registerCommand('similar', 'Search for similar pages');
        $options->registerArgument('query', 'Look up chunks similar to this query', true, 'similar');

        $options->registerCommand('ask', 'Ask a question');
        $options->registerArgument('question', 'The question to ask', true, 'ask');

        $options->registerCommand('chat', 'Start an interactive chat session');

        $options->registerCommand('models', 'List available models');

        $options->registerCommand('info', 'Get Info about the vector storage and other stats');

        $options->registerCommand('split', 'Split a page into chunks (for debugging)');
        $options->registerArgument('page', 'The page to split', true, 'split');

        $options->registerCommand('page', 'Check if chunks for a given page are available (for debugging)');
        $options->registerArgument('page', 'The page to check', true, 'page');
        $options->registerOption('dump', 'Dump the chunks', 'd', false, 'page');

        $options->registerCommand('tsv', 'Create TSV files for visualizing at http://projector.tensorflow.org/' .
            ' Not supported on all storages.');
        $options->registerArgument('vector.tsv', 'The vector file', false, 'tsv');
        $options->registerArgument('meta.tsv', 'The meta file', false, 'tsv');
    }

    /** @inheritDoc */
    protected function main(Options $options)
    {
        $this->loadConfig();
        // lift the memory limit for this CLI run; ini values are strings
        ini_set('memory_limit', '-1');
        switch ($options->getCmd()) {
            case 'embed':
                $this->createEmbeddings($options->getOpt('clear'));
                break;
            case 'maintenance':
                $this->runMaintenance();
                break;
            case 'similar':
                $this->similar($options->getArgs()[0]);
                break;
            case 'ask':
                $this->ask($options->getArgs()[0]);
                break;
            case 'chat':
                $this->chat();
                break;
            case 'models':
                $this->models();
                break;
            case 'split':
                $this->split($options->getArgs()[0]);
                break;
            case 'page':
                $this->page($options->getArgs()[0], $options->getOpt('dump'));
                break;
            case 'info':
                $this->showinfo();
                break;
            case 'tsv':
                // both file names are optional arguments
                $args = $options->getArgs();
                $vector = $args[0] ?? 'vector.tsv';
                $meta = $args[1] ?? 'meta.tsv';
                $this->tsv($vector, $meta);
                break;
            default:
                echo $options->help();
        }
    }

    /**
     * Print the configured model, the recorded run data and the storage statistics
     *
     * Run data values are passed through dformat() before printing.
     *
     * @return void
     */
    protected function showinfo()
    {
        $stats = [
            'model' => $this->getConf('model'),
        ];
        $stats = array_merge(
            $stats,
            array_map('dformat', $this->helper->getRunData()),
            $this->helper->getStorage()->statistics()
        );
        $this->printTable($stats);
    }

    /**
     * Print key value data as tabular data
     *
     * Nested arrays are printed recursively, indented by two columns per level.
     *
     * @param array $data
     * @param int $level current nesting depth
     * @return void
     */
    protected function printTable($data, $level = 0)
    {
        $tf = new TableFormatter($this->colors);
        foreach ($data as $key => $value) {
            if (is_array($value)) {
                // heading row for the nested section, followed by its content
                echo $tf->format(
                    [$level * 2, 20, '*'],
                    ['', $key, ''],
                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
                );
                $this->printTable($value, $level + 1);
            } else {
                echo $tf->format(
                    [$level * 2, 20, '*'],
                    ['', $key, $value],
                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTGRAY]
                );
            }
        }
    }

    /**
     * Check chunk availability for a given page
     *
     * @param string $page the page ID, it is normalised with cleanID() before use
     * @param bool $dump also print the found chunks as pretty printed JSON
     * @return void
     */
    protected function page($page, $dump = false)
    {
        // use the same normalised ID for the index lookup and the chunk lookup
        $page = cleanID($page);

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        $pos = array_search($page, $pages, true);

        if ($pos === false) {
            $this->error('Page not found');
            return;
        }

        $storage = $this->helper->getStorage();
        // NOTE(review): the second argument looks like the first chunk ID reserved for
        // this page (index position * 100) - confirm against the storage implementation
        $chunks = $storage->getPageChunks($page, $pos * 100);
        if ($chunks) {
            $this->success('Found ' . count($chunks) . ' chunks');
            if ($dump) {
                echo json_encode($chunks, JSON_PRETTY_PRINT);
            }
        } else {
            $this->error('No chunks found');
        }
    }

    /**
     * Split the given page into chunks and print them
     *
     * @param string $page
     * @return void
     * @throws Exception
     */
    protected function split($page)
    {
        $text = rawWiki($page);
        $chunks = $this->helper->getEmbeddings()->splitIntoChunks($text);
        foreach ($chunks as $chunk) {
            echo $chunk;
            echo "\n";
            $this->colors->ptln('--------------------------------', Colors::C_LIGHTPURPLE);
        }
        $this->success('Split into ' . count($chunks) . ' chunks');
    }

    /**
     * Interactive Chat Session
     *
     * Keeps asking for questions until the input stream ends. Each rephrased question
     * and its answer are appended to the history passed to the next question.
     *
     * @return void
     * @throws Exception
     */
    protected function chat()
    {
        if ($this->loglevel['debug']['enabled']) {
            $this->helper->getChatModel()->setDebug(true);
        }

        $history = [];
        // compare against '' explicitly: a plain truthiness check would end the session on "0"
        while (($q = $this->readLine('Your Question')) !== '') {
            $this->helper->getChatModel()->resetUsageStats();
            $result = $this->helper->askChatQuestion($q, $history);
            $this->colors->ptln("Interpretation: {$result['question']}", Colors::C_LIGHTPURPLE);
            $history[] = [$result['question'], $result['answer']];
            $this->printAnswer($result);
        }
    }

    /**
     * List the chat and embedding models described in the Model/<namespace>/models.json files
     *
     * Each model class is instantiated with the current configuration. Models whose
     * constructor throws an Exception are printed in red, all others in green.
     *
     * @return void
     */
    protected function models()
    {
        $result = [
            'chat' => [],
            'embedding' => [],
        ];

        // glob() may return false on error
        $jsons = glob(__DIR__ . '/Model/*/models.json') ?: [];
        foreach ($jsons as $json) {
            $raw = file_get_contents($json);
            $models = ($raw === false) ? null : json_decode($raw, true);
            if (!is_array($models)) {
                $this->error('Could not read model definitions from ' . $json);
                continue;
            }

            // the directory name doubles as the PHP namespace of the model classes
            $namespace = basename(dirname($json));
            foreach ($models as $type => $model) {
                $class = '\\dokuwiki\\plugin\\aichat\\Model\\' . $namespace . '\\' . ucfirst($type) . 'Model';
                foreach ($model as $name => $info) {
                    try {
                        new $class($name, $this->conf);
                        $info['confok'] = true;
                    } catch (Exception $e) {
                        $info['confok'] = false;
                    }

                    $result[$type]["$namespace $name"] = $info;
                }
            }
        }

        $td = new TableFormatter($this->colors);
        $cols = [30, 20, 20, '*'];
        echo "==== Chat Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['chat'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf(" In: %7d\nOut: %7d", $info['inputTokens'], $info['outputTokens']),
                    // the "Out" price has to come from the output token price
                    sprintf(" In: %.2f\nOut: %.2f", $info['inputTokenPrice'], $info['outputTokenPrice']),
                    $info['description'] . "\n"
                ],
                [
                    $info['confok'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $cols = [30, 10, 10, 10, '*'];
        echo "==== Embedding Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Dimensions', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['embedding'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf("%7d", $info['inputTokens']),
                    sprintf("%.2f", $info['inputTokenPrice']),
                    $info['dimensions'],
                    $info['description'] . "\n"
                ],
                [
                    $info['confok'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $this->colors->ptln('Current prices may differ', Colors::C_RED);
    }

    /**
     * Handle a single, standalone question
     *
     * @param string $query
     * @return void
     * @throws Exception
     */
    protected function ask($query)
    {
        if ($this->loglevel['debug']['enabled']) {
            $this->helper->getChatModel()->setDebug(true);
        }

        $result = $this->helper->askQuestion($query);
        $this->printAnswer($result);
    }

    /**
     * Get the pages that are similar to the query
     *
     * @param string $query
     * @return void
     */
    protected function similar($query)
    {
        $langlimit = $this->helper->getLanguageLimit();
        if ($langlimit) {
            $this->info('Limiting results to {lang}', ['lang' => $langlimit]);
        }

        $sources = $this->helper->getEmbeddings()->getSimilarChunks($query, $langlimit);
        $this->printSources($sources);
    }

    /**
     * Run the maintenance tasks
     *
     * @return void
     */
    protected function runMaintenance()
    {
        $start = time();
        $this->helper->getStorage()->runMaintenance();
        $this->finishRun($start, 'maintenance');
    }

    /**
     * Recreate chunks and embeddings for all pages
     *
     * @param bool $clear clear all existing embeddings before creating new ones
     * @return void
     */
    protected function createEmbeddings($clear)
    {
        [$skipRE, $matchRE] = $this->getRegexps();

        $start = time();
        $this->helper->getEmbeddings()->createNewIndex($skipRE, $matchRE, $clear);
        $this->finishRun($start, 'embed');
    }

    /**
     * Report memory and time used by a task and remember when it ran
     *
     * The current time is stored in the helper's run data under "<task> ran at".
     *
     * @param int $start unix timestamp of when the task started
     * @param string $task name of the task
     * @return void
     */
    protected function finishRun($start, $task)
    {
        $this->notice('Peak memory used: {memory}', ['memory' => filesize_h(memory_get_peak_usage(true))]);
        $this->notice('Spent time: {time}min', ['time' => round((time() - $start) / 60, 2)]);

        $data = $this->helper->getRunData();
        $data[$task . ' ran at'] = time();
        $this->helper->setRunData($data);
    }

    /**
     * Dump TSV files for debugging
     *
     * @param string $vector path of the vector file to write
     * @param string $meta path of the meta file to write
     * @return void
     */
    protected function tsv($vector, $meta)
    {
        $storage = $this->helper->getStorage();
        $storage->dumpTSV($vector, $meta);
        $this->success('written to ' . $vector . ' and ' . $meta);
    }

    /**
     * Print the given detailed answer in a nice way
     *
     * @param array $answer needs the keys 'answer' and 'sources'
     * @return void
     */
    protected function printAnswer($answer)
    {
        $this->colors->ptln($answer['answer'], Colors::C_LIGHTCYAN);
        echo "\n";
        $this->printSources($answer['sources']);
        echo "\n";
        $this->printUsage();
    }

    /**
     * Print the given sources
     *
     * @param Chunk[] $sources
     * @return void
     */
    protected function printSources($sources)
    {
        foreach ($sources as $source) {
            /** @var Chunk $source */
            $this->colors->ptln(
                "\t" . $source->getPage() . ' ' . $source->getId() . ' (' . $source->getScore() . ')',
                Colors::C_LIGHTBLUE
            );
        }
    }

    /**
     * Print the usage statistics of the chat model
     *
     * @return void
     */
    protected function printUsage()
    {
        $this->info(
            'Made {requests} requests in {time}s to Model. Used {tokens} tokens for about ${cost}.',
            $this->helper->getChatModel()->getUsageStats()
        );
    }

    /**
     * Interactively ask for a value from the user
     *
     * Repeats the prompt until a non-empty line was entered.
     *
     * @param string $prompt
     * @return string the trimmed input, an empty string when no more input can be read
     */
    protected function readLine($prompt)
    {
        $value = '';

        while ($value === '') {
            echo $prompt;
            echo ': ';

            $fh = fopen('php://stdin', 'r');
            if ($fh === false) {
                return '';
            }
            $line = fgets($fh);
            fclose($fh);

            // fgets() returns false at the end of input, prompting again would loop forever
            if ($line === false) {
                return '';
            }
            $value = trim($line);
        }

        return $value;
    }

    /**
     * Read the skip and match regex from the config
     *
     * Ensures the regular expressions are valid
     *
     * @return string[] [$skipRE, $matchRE]
     */
    protected function getRegexps()
    {
        $skipRE = $this->loadRegexp('skipRegex', 'Skipping pages matching ');
        $matchRE = $this->loadRegexp('matchRegex', 'Only indexing pages matching ');
        return [$skipRE, $matchRE];
    }

    /**
     * Wrap a configured pattern in delimiters and make sure it compiles
     *
     * @param string $confKey name of the config setting holding the bare pattern
     * @param string $notice message prefix to log when the expression is valid
     * @return string the delimited expression, empty when unset or invalid
     */
    protected function loadRegexp($confKey, $notice)
    {
        $pattern = $this->getConf($confKey);
        if (!$pattern) {
            return '';
        }

        $regexp = '/' . $pattern . '/';
        // the compile warning is suppressed here, the failure is reported below instead
        if (@preg_match($regexp, '') === false) {
            $this->error(preg_last_error_msg());
            $this->error('Invalid regular expression in $conf[\'' . $confKey . '\']. Ignored.');
            return '';
        }

        $this->success($notice . $regexp);
        return $regexp;
    }
}