xref: /plugin/aichat/cli.php (revision e1251882372557ff3ce8f12d253df8d66390690e)
1<?php
2
3use dokuwiki\Extension\CLIPlugin;
4use dokuwiki\plugin\aichat\AbstractCLI;
5use dokuwiki\plugin\aichat\Chunk;
6use dokuwiki\plugin\aichat\ModelFactory;
7use dokuwiki\Search\Indexer;
8use splitbrain\phpcli\Colors;
9use splitbrain\phpcli\Options;
10use splitbrain\phpcli\TableFormatter;
11
12/**
13 * DokuWiki Plugin aichat (CLI Component)
14 *
15 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
16 * @author  Andreas Gohr <gohr@cosmocode.de>
17 */
class cli_plugin_aichat extends AbstractCLI
{
    /** @var helper_plugin_aichat */
    protected $helper;

    /** @inheritDoc */
    protected function setup(Options $options)
    {
        parent::setup($options);

        $options->setHelp(
            'Manage and query the AI chatbot data. Please note that calls to your LLM provider will be made. ' .
            'This may incur costs.'
        );

        $options->registerOption(
            'model',
            'Overrides the chat and rephrasing model settings and uses this model instead',
            '',
            'model'
        );

        $options->registerCommand(
            'embed',
            'Create embeddings for all pages. This skips pages that already have embeddings'
        );
        $options->registerOption(
            'clear',
            'Clear all existing embeddings before creating new ones',
            'c',
            false,
            'embed'
        );

        $options->registerCommand('maintenance', 'Run storage maintenance. Refer to the documentation for details.');

        $options->registerCommand('similar', 'Search for similar pages');
        $options->registerArgument('query', 'Look up chunks similar to this query', true, 'similar');

        $options->registerCommand('ask', 'Ask a question');
        $options->registerArgument('question', 'The question to ask', true, 'ask');

        $options->registerCommand('chat', 'Start an interactive chat session');

        $options->registerCommand('models', 'List available models');

        $options->registerCommand('info', 'Get Info about the vector storage and other stats');

        $options->registerCommand('split', 'Split a page into chunks (for debugging)');
        $options->registerArgument('page', 'The page to split', true, 'split');

        $options->registerCommand('page', 'Check if chunks for a given page are available (for debugging)');
        $options->registerArgument('page', 'The page to check', true, 'page');
        $options->registerOption('dump', 'Dump the chunks', 'd', false, 'page');

        $options->registerCommand('tsv', 'Create TSV files for visualizing at http://projector.tensorflow.org/' .
            ' Not supported on all storages.');
        $options->registerArgument('vector.tsv', 'The vector file', false, 'tsv');
        $options->registerArgument('meta.tsv', 'The meta file', false, 'tsv');
    }

    /** @inheritDoc */
    protected function main(Options $options)
    {
        parent::main($options);
        auth_setup(); // make sure ACLs are initialized

        $model = $options->getOpt('model');
        if ($model) {
            // NOTE(review): this key was previously spelled 'rephasemodel', which does not match the
            // "rephrase" naming used everywhere else in this class. Confirm against the plugin settings.
            $this->helper->updateConfig(
                ['chatmodel' => $model, 'rephrasemodel' => $model]
            );
        }

        switch ($options->getCmd()) {
            case 'embed':
                $this->createEmbeddings($options->getOpt('clear'));
                break;
            case 'maintenance':
                $this->runMaintenance();
                break;
            case 'similar':
                $this->similar($options->getArgs()[0]);
                break;
            case 'ask':
                $this->ask($options->getArgs()[0]);
                break;
            case 'chat':
                $this->chat();
                break;
            case 'models':
                $this->models();
                break;
            case 'split':
                $this->split($options->getArgs()[0]);
                break;
            case 'page':
                $this->page($options->getArgs()[0], $options->getOpt('dump'));
                break;
            case 'info':
                $this->showinfo();
                break;
            case 'tsv':
                // both file names are optional arguments, fall back to the documented defaults
                $args = $options->getArgs();
                $vector = $args[0] ?? 'vector.tsv';
                $meta = $args[1] ?? 'meta.tsv';
                $this->tsv($vector, $meta);
                break;
            default:
                echo $options->help();
        }
    }

    /**
     * Print the configured models, the stored run data and the storage statistics
     *
     * @return void
     */
    protected function showinfo()
    {
        $stats = [
            'embed model' => (string) $this->helper->getEmbeddingModel(),
            'rephrase model' => (string) $this->helper->getRephraseModel(),
            'chat model' => (string) $this->helper->getChatModel(),
        ];
        $stats = array_merge(
            $stats,
            $this->helper->getRunData(),
            $this->helper->getStorage()->statistics()
        );
        $this->printTable($stats);
    }

    /**
     * Print key value data as tabular data
     *
     * Nested arrays are printed recursively, indented by two columns per level.
     *
     * @param array $data
     * @param int $level current nesting depth, used for indentation
     * @return void
     */
    protected function printTable($data, $level = 0)
    {
        $tf = new TableFormatter($this->colors);
        $widths = [$level * 2, 20, '*'];

        foreach ($data as $key => $value) {
            if (is_array($value)) {
                // print the key as a heading, then the nested values one level deeper
                echo $tf->format(
                    $widths,
                    ['', $key, ''],
                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
                );
                $this->printTable($value, $level + 1);
            } else {
                echo $tf->format(
                    $widths,
                    ['', $key, $value],
                    [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTGRAY]
                );
            }
        }
    }

    /**
     * Check chunk availability for a given page
     *
     * @param string $page
     * @param bool $dump print the found chunks as JSON
     * @return void
     */
    protected function page($page, $dump = false)
    {
        $id = cleanID($page);

        $indexer = new Indexer();
        $pages = $indexer->getPages();
        // strict comparison: page IDs are strings and must not be matched loosely (eg. "1e3" vs "1000")
        $pos = array_search($id, $pages, true);

        if ($pos === false) {
            $this->error('Page not found');
            return;
        }

        // use the same cleaned ID for the storage lookup that was used for the index lookup
        $storage = $this->helper->getStorage();
        $chunks = $storage->getPageChunks($id, $pos * 100);
        if ($chunks) {
            $this->success('Found ' . count($chunks) . ' chunks');
            if ($dump) {
                echo json_encode($chunks, JSON_PRETTY_PRINT);
            }
        } else {
            $this->error('No chunks found');
        }
    }

    /**
     * Split the given page into chunks and print them
     *
     * @param string $page
     * @return void
     * @throws Exception
     */
    protected function split($page)
    {
        $chunks = $this->helper->getEmbeddings()->createPageChunks($page, 0);
        foreach ($chunks as $chunk) {
            echo $chunk->getText();
            echo "\n";
            $this->colors->ptln('--------------------------------', Colors::C_LIGHTPURPLE);
        }
        $this->success('Split into ' . count($chunks) . ' chunks');
    }

    /**
     * Interactive Chat Session
     *
     * Runs until the input stream ends. The usage statistics are reset before each question
     * so that the printed usage refers to that question only.
     *
     * @return void
     * @throws Exception
     */
    protected function chat()
    {
        $history = [];
        // compare against '' explicitly: a plain truthiness check would end the session on the input "0"
        while (($q = $this->readLine('Your Question')) !== '') {
            $this->helper->getChatModel()->resetUsageStats();
            $this->helper->getRephraseModel()->resetUsageStats();
            $this->helper->getEmbeddingModel()->resetUsageStats();
            $result = $this->helper->askChatQuestion($q, $history);
            $this->colors->ptln("Interpretation: {$result['question']}", Colors::C_LIGHTPURPLE);
            $history[] = [$result['question'], $result['answer']];
            $this->printAnswer($result);
        }
    }

    /**
     * Print information about the available models
     *
     * Rows are colored green when the model info has a truthy 'instance' entry, red otherwise.
     *
     * @return void
     */
    protected function models()
    {
        $result = (new ModelFactory($this->conf))->getModels();

        $td = new TableFormatter($this->colors);
        $cols = [30, 20, 20, '*'];
        echo "==== Chat Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['chat'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf(" In: %7d\nOut: %7d", $info['inputTokens'], $info['outputTokens']),
                    sprintf(" In: %.2f\nOut: %.2f", $info['inputTokenPrice'], $info['outputTokenPrice']),
                    $info['description'] . "\n"
                ],
                [
                    $info['instance'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $cols = [30, 10, 10, 10, '*'];
        echo "==== Embedding Models ====\n\n";
        echo $td->format(
            $cols,
            ['Model', 'Token Limits', 'Price USD/M', 'Dimensions', 'Description'],
            [Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE, Colors::C_LIGHTBLUE]
        );
        foreach ($result['embedding'] as $name => $info) {
            echo $td->format(
                $cols,
                [
                    $name,
                    sprintf("%7d", $info['inputTokens']),
                    sprintf("%.2f", $info['inputTokenPrice']),
                    $info['dimensions'],
                    $info['description'] . "\n"
                ],
                [
                    $info['instance'] ? Colors::C_LIGHTGREEN : Colors::C_LIGHTRED,
                ]
            );
        }

        $this->colors->ptln('Current prices may differ', Colors::C_RED);
    }

    /**
     * Handle a single, standalone question
     *
     * @param string $query
     * @return void
     * @throws Exception
     */
    protected function ask($query)
    {
        $result = $this->helper->askQuestion($query);
        $this->printAnswer($result);
    }

    /**
     * Get the pages that are similar to the query
     *
     * @param string $query
     * @return void
     */
    protected function similar($query)
    {
        $langlimit = $this->helper->getLanguageLimit();
        if ($langlimit) {
            $this->info('Limiting results to {lang}', ['lang' => $langlimit]);
        }

        $sources = $this->helper->getEmbeddings()->getSimilarChunks($query, $langlimit);
        $this->printSources($sources);
    }

    /**
     * Run the maintenance tasks
     *
     * Records the time of the run in the helper's run data.
     *
     * @return void
     */
    protected function runMaintenance()
    {
        $start = time();
        $this->helper->getStorage()->runMaintenance();
        $this->printRunStats($start);

        $data = $this->helper->getRunData();
        $data['maintenance ran at'] = dformat();
        $this->helper->setRunData($data);
    }

    /**
     * Recreate chunks and embeddings for all pages
     *
     * When the embedding model differs from the one recorded for the last run, the
     * existing index is cleared regardless of the $clear flag.
     *
     * @param bool $clear clear all existing embeddings before creating new ones
     * @return void
     */
    protected function createEmbeddings($clear)
    {
        [$skipRE, $matchRE] = $this->getRegexps();

        $data = $this->helper->getRunData();
        $lastEmbedModel = (string) ($data['embed used'] ?? '');
        $currentEmbedModel = (string) $this->helper->getEmbeddingModel();

        if (!$clear && $lastEmbedModel !== '' && $lastEmbedModel !== $currentEmbedModel) {
            $this->warning('Embedding model has changed since last run. Forcing an index rebuild');
            $clear = true;
        }

        $start = time();
        $this->helper->getEmbeddings()->createNewIndex($skipRE, $matchRE, $clear);
        $this->printRunStats($start);

        $data['embed ran at'] = dformat();
        $data['embed used'] = $currentEmbedModel;
        $this->helper->setRunData($data);
    }

    /**
     * Print peak memory usage and the time spent since the given start time
     *
     * @param int $start unix timestamp taken before the task started
     * @return void
     */
    protected function printRunStats($start)
    {
        $this->notice('Peak memory used: {memory}', ['memory' => filesize_h(memory_get_peak_usage(true))]);
        $this->notice('Spent time: {time}min', ['time' => round((time() - $start) / 60, 2)]);
    }

    /**
     * Dump TSV files for debugging
     *
     * @param string $vector file name to write the vectors to
     * @param string $meta file name to write the meta data to
     * @return void
     */
    protected function tsv($vector, $meta)
    {
        $storage = $this->helper->getStorage();
        $storage->dumpTSV($vector, $meta);
        $this->success('written to ' . $vector . ' and ' . $meta);
    }

    /**
     * Print the given detailed answer in a nice way
     *
     * @param array $answer needs the keys 'answer' and 'sources'
     * @return void
     */
    protected function printAnswer($answer)
    {
        $this->colors->ptln($answer['answer'], Colors::C_LIGHTCYAN);
        echo "\n";
        $this->printSources($answer['sources']);
        echo "\n";
        $this->printUsage();
    }

    /**
     * Print the given sources
     *
     * @param Chunk[] $sources
     * @return void
     */
    protected function printSources($sources)
    {
        foreach ($sources as $source) {
            /** @var Chunk $source */
            $this->colors->ptln(
                "\t" . $source->getPage() . ' ' . $source->getId() . ' (' . $source->getScore() . ')',
                Colors::C_LIGHTBLUE
            );
        }
    }

    /**
     * Print the combined usage statistics of the chat, rephrase and embedding models
     *
     * @return void
     */
    protected function printUsage()
    {
        $chat = $this->helper->getChatModel()->getUsageStats();
        $rephrase = $this->helper->getRephraseModel()->getUsageStats();
        $embed = $this->helper->getEmbeddingModel()->getUsageStats();

        // every figure is the sum over all three models
        $this->info(
            'Made {requests} requests in {time}s to models. Used {tokens} tokens for about ${cost}.',
            [
                'requests' => $chat['requests'] + $rephrase['requests'] + $embed['requests'],
                'time' => $chat['time'] + $rephrase['time'] + $embed['time'],
                'tokens' => $chat['tokens'] + $rephrase['tokens'] + $embed['tokens'],
                'cost' => $chat['cost'] + $rephrase['cost'] + $embed['cost'],
            ]
        );
    }

    /**
     * Interactively ask for a value from the user
     *
     * Repeats the prompt until a non-empty line was entered.
     *
     * @param string $prompt
     * @return string the trimmed input, an empty string when the input stream has ended
     */
    protected function readLine($prompt)
    {
        $fh = fopen('php://stdin', 'r');
        if ($fh === false) {
            return '';
        }

        $value = '';
        while ($value === '') {
            echo $prompt;
            echo ': ';

            $line = fgets($fh);
            if ($line === false) {
                // EOF (eg. Ctrl-D or an exhausted pipe) or read error: stop instead of prompting forever
                echo "\n";
                break;
            }
            $value = trim($line);
        }
        fclose($fh);

        return $value;
    }

    /**
     * Read the skip and match regex from the config
     *
     * Ensures the regular expressions are valid
     *
     * @return string[] [$skipRE, $matchRE]
     */
    protected function getRegexps()
    {
        $skipRE = $this->loadRegexp('skipRegex', 'Skipping pages matching ');
        $matchRE = $this->loadRegexp('matchRegex', 'Only indexing pages matching ');
        return [$skipRE, $matchRE];
    }

    /**
     * Load a pattern from the config, wrap it in delimiters and make sure it compiles
     *
     * @param string $confKey name of the config setting holding the pattern
     * @param string $successPrefix message printed in front of the expression when it is valid
     * @return string the delimited regular expression, empty when not configured or invalid
     */
    protected function loadRegexp($confKey, $successPrefix)
    {
        $pattern = $this->getConf($confKey);
        if (!$pattern) {
            return '';
        }

        $regexp = '/' . $pattern . '/';
        // the PHP warning is suppressed on purpose, the failure is reported through error() instead
        if (@preg_match($regexp, '') === false) {
            $this->error(preg_last_error_msg());
            $this->error('Invalid regular expression in $conf[\'' . $confKey . '\']. Ignored.');
            return '';
        }

        $this->success($successPrefix . $regexp);
        return $regexp;
    }
}
507