xref: /plugin/aichat/cli.php (revision 51aa8517a15244890eb0132c8019c9857c046a12)
1<?php
2
3use dokuwiki\Extension\CLIPlugin;
4use dokuwiki\plugin\aichat\Chunk;
5use dokuwiki\Search\Indexer;
6use splitbrain\phpcli\Colors;
7use splitbrain\phpcli\Options;
8use splitbrain\phpcli\TableFormatter;
9
10/**
11 * DokuWiki Plugin aichat (CLI Component)
12 *
13 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
14 * @author  Andreas Gohr <gohr@cosmocode.de>
15 */
16class cli_plugin_aichat extends CLIPlugin
17{
18    /** @var helper_plugin_aichat */
19    protected $helper;
20
21    public function __construct($autocatch = true)
22    {
23        parent::__construct($autocatch);
24        $this->helper = plugin_load('helper', 'aichat');
25        $this->helper->setLogger($this);
26    }
27
28    /** @inheritDoc */
29    protected function setup(Options $options)
30    {
31        $options->useCompactHelp();
32
33        $options->setHelp(
34            'Manage and query the AI chatbot data. Please note that calls to your LLM provider will be made. ' .
35            'This may incur costs.'
36        );
37
38        $options->registerCommand(
39            'embed',
40            'Create embeddings for all pages. This skips pages that already have embeddings'
41        );
42        $options->registerOption(
43            'clear',
44            'Clear all existing embeddings before creating new ones',
45            'c',
46            false,
47            'embed'
48        );
49
50        $options->registerCommand('maintenance', 'Run storage maintenance. Refer to the documentation for details.');
51
52        $options->registerCommand('similar', 'Search for similar pages');
53        $options->registerArgument('query', 'Look up chunks similar to this query', true, 'similar');
54
55        $options->registerCommand('ask', 'Ask a question');
56        $options->registerArgument('question', 'The question to ask', true, 'ask');
57
58        $options->registerCommand('chat', 'Start an interactive chat session');
59
60        $options->registerCommand('models', 'List available models');
61
62        $options->registerCommand('info', 'Get Info about the vector storage and other stats');
63
64        $options->registerCommand('split', 'Split a page into chunks (for debugging)');
65        $options->registerArgument('page', 'The page to split', true, 'split');
66
67        $options->registerCommand('page', 'Check if chunks for a given page are available (for debugging)');
68        $options->registerArgument('page', 'The page to check', true, 'page');
69        $options->registerOption('dump', 'Dump the chunks', 'd', false, 'page');
70
71        $options->registerCommand('tsv', 'Create TSV files for visualizing at http://projector.tensorflow.org/' .
72            ' Not supported on all storages.');
73        $options->registerArgument('vector.tsv', 'The vector file', false, 'tsv');
74        $options->registerArgument('meta.tsv', 'The meta file', false, 'tsv');
75    }
76
77    /** @inheritDoc */
78    protected function main(Options $options)
79    {
80        $this->loadConfig();
81        ini_set('memory_limit', -1);
82        switch ($options->getCmd()) {
83            case 'embed':
84                $this->createEmbeddings($options->getOpt('clear'));
85                break;
86            case 'maintenance':
87                $this->runMaintenance();
88                break;
89            case 'similar':
90                $this->similar($options->getArgs()[0]);
91                break;
92            case 'ask':
93                $this->ask($options->getArgs()[0]);
94                break;
95            case 'chat':
96                $this->chat();
97                break;
98            case 'models':
99                $this->models();
100                break;
101            case 'split':
102                $this->split($options->getArgs()[0]);
103                break;
104            case 'page':
105                $this->page($options->getArgs()[0], $options->getOpt('dump'));
106                break;
107            case 'info':
108                $this->showinfo();
109                break;
110            case 'tsv':
111                $args = $options->getArgs();
112                $vector = $args[0] ?? 'vector.tsv';
113                $meta = $args[1] ?? 'meta.tsv';
114                $this->tsv($vector, $meta);
115                break;
116            default:
117                echo $options->help();
118        }
119    }
120
121    /**
122     * @return void
123     */
124    protected function showinfo()
125    {
126        $stats = [
127            'chat model' => $this->getConf('chatmodel'),
128            'embed model' => $this->getConf('embedmodel'),
129        ];
130        $stats = array_merge(
131            $stats,
132            array_map('dformat', $this->helper->getRunData()),
133            $this->helper->getStorage()->statistics()
134        );
135        $this->printTable($stats);
136    }
137
138    /**
139     * Print key value data as tabular data
140     *
141     * @param array $data
142     * @param int $level
143     * @return void
144     */
145    protected function printTable($data, $level = 0)
146    {
147        $tf = new TableFormatter($this->colors);
148        foreach ($data as $key => $value) {
149            if (is_array($value)) {
150                echo $tf->format(
151                    [$level * 2, 20, '*'],
152                    ['', $key, ''],
153                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
154                );
155                $this->printTable($value, $level + 1);
156            } else {
157                echo $tf->format(
158                    [$level * 2, 20, '*'],
159                    ['', $key, $value],
160                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTGRAY]
161                );
162            }
163        }
164    }
165
166    /**
167     * Check chunk availability for a given page
168     *
169     * @param string $page
170     * @return void
171     */
172    protected function page($page, $dump = false)
173    {
174        $indexer = new Indexer();
175        $pages = $indexer->getPages();
176        $pos = array_search(cleanID($page), $pages);
177
178        if ($pos === false) {
179            $this->error('Page not found');
180            return;
181        }
182
183        $storage = $this->helper->getStorage();
184        $chunks = $storage->getPageChunks($page, $pos * 100);
185        if ($chunks) {
186            $this->success('Found ' . count($chunks) . ' chunks');
187            if ($dump) {
188                echo json_encode($chunks, JSON_PRETTY_PRINT);
189            }
190        } else {
191            $this->error('No chunks found');
192        }
193    }
194
195    /**
196     * Split the given page into chunks and print them
197     *
198     * @param string $page
199     * @return void
200     * @throws Exception
201     */
202    protected function split($page)
203    {
204        $text = rawWiki($page);
205        $chunks = $this->helper->getEmbeddings()->splitIntoChunks($text);
206        foreach ($chunks as $chunk) {
207            echo $chunk;
208            echo "\n";
209            $this->colors->ptln('--------------------------------', Colors::C_LIGHTPURPLE);
210        }
211        $this->success('Split into ' . count($chunks) . ' chunks');
212    }
213
214    /**
215     * Interactive Chat Session
216     *
217     * @return void
218     * @throws Exception
219     */
220    protected function chat()
221    {
222        if ($this->loglevel['debug']['enabled']) {
223            $this->helper->getChatModel()->setDebug(true);
224            $this->helper->getRephraseModel()->setDebug(true);
225            $this->helper->getEmbedModel()->setDebug(true);
226        }
227
228        $history = [];
229        while ($q = $this->readLine('Your Question')) {
230            $this->helper->getChatModel()->resetUsageStats();
231            $this->helper->getRephraseModel()->resetUsageStats();
232            $this->helper->getEmbedModel()->resetUsageStats();
233            $result = $this->helper->askChatQuestion($q, $history);
234            $this->colors->ptln("Interpretation: {$result['question']}", Colors::C_LIGHTPURPLE);
235            $history[] = [$result['question'], $result['answer']];
236            $this->printAnswer($result);
237        }
238    }
239
240    protected function models()
241    {
242        $result = [
243            'chat' => [],
244            'embedding' => [],
245        ];
246
247
248        $jsons = glob(__DIR__ . '/Model/*/models.json');
249        foreach ($jsons as $json) {
250            $models = json_decode(file_get_contents($json), true);
251            foreach ($models as $type => $model) {
252                $namespace = basename(dirname($json));
253                foreach ($model as $name => $info) {
254
255
256                    $class = '\\dokuwiki\\plugin\\aichat\\Model\\' . $namespace . '\\' . ucfirst($type) . 'Model';
257                    try {
258                        new $class($name, $this->conf);
259                        $info['confok'] = true;
260                    } catch (Exception $e) {
261                        $info['confok'] = false;
262                    }
263
264                    $result[$type]["$namespace $name"] = $info;
265                }
266            }
267        }
268
269        $td = new TableFormatter($this->colors);
270        $cols = [30, 20, 20, '*'];
271        echo "==== Chat Models ====\n\n";
272        echo $td->format(
273            $cols,
274            ['Model', 'Token Limits', 'Price USD/M', 'Description'],
275            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
276        );
277        foreach ($result['chat'] as $name => $info) {
278            echo $td->format(
279                $cols,
280                [
281                    $name,
282                    sprintf(" In: %7d\nOut: %7d", $info['inputTokens'], $info['outputTokens']),
283                    sprintf(" In: %.2f\nOut: %.2f", $info['inputTokenPrice'], $info['outputTokenPrice']),
284                    $info['description'] . "\n"
285                ],
286                [
287                    $info['confok'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
288                ]
289            );
290        }
291
292        $cols = [30, 10, 10, 10, '*'];
293        echo "==== Embedding Models ====\n\n";
294        echo $td->format(
295            $cols,
296            ['Model', 'Token Limits', 'Price USD/M', 'Dimensions', 'Description'],
297            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
298        );
299        foreach ($result['embedding'] as $name => $info) {
300            echo $td->format(
301                $cols,
302                [
303                    $name,
304                    sprintf("%7d", $info['inputTokens']),
305                    sprintf("%.2f", $info['inputTokenPrice']),
306                    $info['dimensions'],
307                    $info['description'] . "\n"
308                ],
309                [
310                    $info['confok'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
311                ]
312            );
313        }
314
315        $this->colors->ptln('Current prices may differ', Colors::C_RED);
316    }
317
318    /**
319     * Handle a single, standalone question
320     *
321     * @param string $query
322     * @return void
323     * @throws Exception
324     */
325    protected function ask($query)
326    {
327        if ($this->loglevel['debug']['enabled']) {
328            $this->helper->getChatModel()->setDebug(true);
329            $this->helper->getRephraseModel()->setDebug(true);
330            $this->helper->getEmbedModel()->setDebug(true);
331        }
332
333        $result = $this->helper->askQuestion($query);
334        $this->printAnswer($result);
335    }
336
337    /**
338     * Get the pages that are similar to the query
339     *
340     * @param string $query
341     * @return void
342     */
343    protected function similar($query)
344    {
345        $langlimit = $this->helper->getLanguageLimit();
346        if ($langlimit) {
347            $this->info('Limiting results to {lang}', ['lang' => $langlimit]);
348        }
349
350        $sources = $this->helper->getEmbeddings()->getSimilarChunks($query, $langlimit);
351        $this->printSources($sources);
352    }
353
354    /**
355     * Run the maintenance tasks
356     *
357     * @return void
358     */
359    protected function runMaintenance()
360    {
361        $start = time();
362        $this->helper->getStorage()->runMaintenance();
363        $this->notice('Peak memory used: {memory}', ['memory' => filesize_h(memory_get_peak_usage(true))]);
364        $this->notice('Spent time: {time}min', ['time' => round((time() - $start) / 60, 2)]);
365
366        $data = $this->helper->getRunData();
367        $data['maintenance ran at'] = time();
368        $this->helper->setRunData($data);
369    }
370
371    /**
372     * Recreate chunks and embeddings for all pages
373     *
374     * @return void
375     */
376    protected function createEmbeddings($clear)
377    {
378        [$skipRE, $matchRE] = $this->getRegexps();
379
380        $start = time();
381        $this->helper->getEmbeddings()->createNewIndex($skipRE, $matchRE, $clear);
382        $this->notice('Peak memory used: {memory}', ['memory' => filesize_h(memory_get_peak_usage(true))]);
383        $this->notice('Spent time: {time}min', ['time' => round((time() - $start) / 60, 2)]);
384
385        $data = $this->helper->getRunData();
386        $data['embed ran at'] = time();
387        $this->helper->setRunData($data);
388    }
389
390    /**
391     * Dump TSV files for debugging
392     *
393     * @return void
394     */
395    protected function tsv($vector, $meta)
396    {
397
398        $storage = $this->helper->getStorage();
399        $storage->dumpTSV($vector, $meta);
400        $this->success('written to ' . $vector . ' and ' . $meta);
401    }
402
403    /**
404     * Print the given detailed answer in a nice way
405     *
406     * @param array $answer
407     * @return void
408     */
409    protected function printAnswer($answer)
410    {
411        $this->colors->ptln($answer['answer'], Colors::C_LIGHTCYAN);
412        echo "\n";
413        $this->printSources($answer['sources']);
414        echo "\n";
415        $this->printUsage();
416    }
417
418    /**
419     * Print the given sources
420     *
421     * @param Chunk[] $sources
422     * @return void
423     */
424    protected function printSources($sources)
425    {
426        foreach ($sources as $source) {
427            /** @var Chunk $source */
428            $this->colors->ptln(
429                "\t" . $source->getPage() . ' ' . $source->getId() . ' (' . $source->getScore() . ')',
430                Colors::C_LIGHTBLUE
431            );
432        }
433    }
434
435    /**
436     * Print the usage statistics for OpenAI
437     *
438     * @return void
439     */
440    protected function printUsage()
441    {
442        $chat = $this->helper->getChatModel()->getUsageStats();
443        $rephrase = $this->helper->getRephraseModel()->getUsageStats();
444        $embed = $this->helper->getEmbedModel()->getUsageStats();
445
446        $this->info(
447            'Made {requests} requests in {time}s to models. Used {tokens} tokens for about ${cost}.',
448            [
449                'requests' => $chat['requests'] + $rephrase['requests'] + $embed['requests'],
450                'time' => $chat['time'] + $rephrase['time'] + $embed['time'],
451                'tokens' => $chat['tokens'] + $chat['tokens'] + $embed['tokens'],
452                'cost' => $chat['cost'] + $chat['cost'] + $embed['cost'],
453            ]
454        );
455    }
456
457    /**
458     * Interactively ask for a value from the user
459     *
460     * @param string $prompt
461     * @return string
462     */
463    protected function readLine($prompt)
464    {
465        $value = '';
466
467        while ($value === '') {
468            echo $prompt;
469            echo ': ';
470
471            $fh = fopen('php://stdin', 'r');
472            $value = trim(fgets($fh));
473            fclose($fh);
474        }
475
476        return $value;
477    }
478
479    /**
480     * Read the skip and match regex from the config
481     *
482     * Ensures the regular expressions are valid
483     *
484     * @return string[] [$skipRE, $matchRE]
485     */
486    protected function getRegexps()
487    {
488        $skip = $this->getConf('skipRegex');
489        $skipRE = '';
490        $match = $this->getConf('matchRegex');
491        $matchRE = '';
492
493        if ($skip) {
494            $skipRE = '/' . $skip . '/';
495            if (@preg_match($skipRE, '') === false) {
496                $this->error(preg_last_error_msg());
497                $this->error('Invalid regular expression in $conf[\'skipRegex\']. Ignored.');
498                $skipRE = '';
499            } else {
500                $this->success('Skipping pages matching ' . $skipRE);
501            }
502        }
503
504        if ($match) {
505            $matchRE = '/' . $match . '/';
506            if (@preg_match($matchRE, '') === false) {
507                $this->error(preg_last_error_msg());
508                $this->error('Invalid regular expression in $conf[\'matchRegex\']. Ignored.');
509                $matchRE = '';
510            } else {
511                $this->success('Only indexing pages matching ' . $matchRE);
512            }
513        }
514        return [$skipRE, $matchRE];
515    }
516}
517