xref: /plugin/aichat/Embeddings.php (revision 661701ee9039525690adc9c537cc48f68a48903b)
18817535bSAndreas Gohr<?php
28817535bSAndreas Gohr
38817535bSAndreas Gohrnamespace dokuwiki\plugin\aichat;
48817535bSAndreas Gohr
5ab1f8ddeSAndreas Gohruse dokuwiki\Extension\Event;
67ebc7895Ssplitbrainuse dokuwiki\Extension\PluginInterface;
7*661701eeSAndreas Gohruse dokuwiki\File\PageResolver;
8294a9eafSAndreas Gohruse dokuwiki\plugin\aichat\Model\ChatInterface;
9294a9eafSAndreas Gohruse dokuwiki\plugin\aichat\Model\EmbeddingInterface;
10f6ef2e50SAndreas Gohruse dokuwiki\plugin\aichat\Storage\AbstractStorage;
118817535bSAndreas Gohruse dokuwiki\Search\Indexer;
122ecc089aSAndreas Gohruse splitbrain\phpcli\CLI;
138817535bSAndreas Gohruse TikToken\Encoder;
148817535bSAndreas Gohruse Vanderlee\Sentence\Sentence;
158817535bSAndreas Gohr
/**
 * Manage the embeddings index
 *
 * Pages are split into chunks sized according to getChunkSize(). For each chunk the embedding vector is
 * fetched from the configured embedding model and stored in the Storage backend.
 */
class Embeddings
{
    /** @var int maximum overlap between chunks in tokens */
    final public const MAX_OVERLAP_LEN = 200;

    /** @var ChatInterface the model used to answer questions, its input limit bounds the chunk size */
    protected $chatModel;

    /** @var EmbeddingInterface the model used to create the embedding vectors */
    protected $embedModel;

    /** @var CLI|null optional logger, nothing is logged when unset */
    protected $logger;

    /** @var Encoder lazily created by getTokenEncoder() */
    protected $tokenEncoder;

    /** @var AbstractStorage */
    protected $storage;

    /** @var array remember sentences when chunking */
    private $sentenceQueue = [];

    /** @var int the time spent for the last similar chunk retrieval */
    public $timeSpent = 0;

    /** @var int|string configured chunk size in tokens (the 'chunkSize' config value) */
    protected $configChunkSize;

    /** @var int|string configured number of chunks to fetch (the 'contextChunks' config value) */
    protected $configContextChunks;

    /** @var float minimum score a chunk needs to be returned (the 'similarityThreshold' config value / 100) */
    protected $similarityThreshold;

    /**
     * Embeddings constructor.
     *
     * @param ChatInterface $chatModel
     * @param EmbeddingInterface $embedModel
     * @param AbstractStorage $storage
     * @param array $config The plugin configuration, needs the keys chunkSize, contextChunks
     *                      and similarityThreshold
     */
    public function __construct(
        ChatInterface $chatModel,
        EmbeddingInterface $embedModel,
        AbstractStorage $storage,
        $config
    ) {
        $this->chatModel = $chatModel;
        $this->embedModel = $embedModel;
        $this->storage = $storage;
        $this->configChunkSize = $config['chunkSize'];
        $this->configContextChunks = $config['contextChunks'];
        // the threshold is configured in percent, scores are compared as fractions
        $this->similarityThreshold = $config['similarityThreshold'] / 100;
    }

    /**
     * Access storage
     *
     * @return AbstractStorage
     */
    public function getStorage()
    {
        return $this->storage;
    }

    /**
     * Add a logger instance
     *
     * @param CLI $logger
     * @return void
     */
    public function setLogger(CLI $logger)
    {
        $this->logger = $logger;
    }

    /**
     * Get the token encoder instance
     *
     * The encoder is created on first use and reused afterwards.
     *
     * @return Encoder
     */
    public function getTokenEncoder()
    {
        if (!$this->tokenEncoder instanceof Encoder) {
            $this->tokenEncoder = new Encoder();
        }
        return $this->tokenEncoder;
    }

    /**
     * Return the chunk size to use
     *
     * This is the smallest of: a quarter of the chat model's input limit, 90% of the embedding
     * model's input limit and the configured chunk size.
     *
     * @return int
     */
    public function getChunkSize()
    {
        // floor() returns a float and the config value may be a numeric string, so cast to honor
        // the documented integer return type
        return (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
            floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
            $this->configChunkSize, // this is usually the smallest
        );
    }

    /**
     * Update the embeddings storage
     *
     * @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
     * @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
     * @param bool $clear Should any existing storage be cleared before updating?
     * @return void
     * @throws \Exception
     */
    public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
    {
        $indexer = new Indexer();
        $pages = $indexer->getPages();

        $this->storage->startCreation($clear);
        foreach ($pages as $pid => $page) {
            // chunk IDs start at page ID * 100
            // NOTE(review): a page producing more than 100 chunks would run into the ID range of the
            // next page - confirm whether the storage backends need protection against that
            $chunkID = $pid * 100;

            if (
                !page_exists($page) ||
                isHiddenPage($page) ||
                filesize(wikiFN($page)) < 150 || // skip very small pages
                ($skipRE && preg_match($skipRE, (string)$page)) ||
                ($matchRE && !preg_match($matchRE, ":$page"))
            ) {
                // this page should not be in the index (anymore)
                $this->storage->deletePageChunks($page, $chunkID);
                continue;
            }

            $firstChunk = $this->storage->getChunk($chunkID);
            if ($firstChunk && @filemtime(wikiFN($page)) < $firstChunk->getCreated()) {
                // page is older than the chunks we have, reuse the existing chunks
                $this->storage->reusePageChunks($page, $chunkID);
                if ($this->logger instanceof CLI) $this->logger->info("Reusing chunks for $page");
            } else {
                // page is newer than the chunks we have, create new chunks
                $this->storage->deletePageChunks($page, $chunkID);
                $chunks = $this->createPageChunks($page, $chunkID);
                if ($chunks) $this->storage->addPageChunks($chunks);
            }
        }
        $this->storage->finalizeCreation();
    }

    /**
     * Split the given page, fetch embedding vectors and return Chunks
     *
     * The page is rendered in 'aichat' mode. If rendering fails, the raw wiki text is used instead.
     * Each chunk is prefixed with the breadcrumb trail of the page. Chunks for which no embedding
     * could be fetched are logged and skipped.
     *
     * @param string $page Name of the page to split
     * @param int $firstChunkID The ID of the first chunk of this page
     * @return Chunk[] A list of chunks created for this page
     * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
     * @throws \Exception
     */
    public function createPageChunks($page, $firstChunkID)
    {
        $chunkList = [];

        // the global page ID is set to the page being processed before rendering
        global $ID;
        $ID = $page;
        try {
            $text = p_cached_output(wikiFN($page), 'aichat', $page);
        } catch (\Throwable $e) {
            if ($this->logger instanceof CLI) {
                $this->logger->error(
                    'Failed to render page {page}. Using raw text instead. {msg}',
                    ['page' => $page, 'msg' => $e->getMessage()]
                );
            }
            $text = rawWiki($page);
        }

        $crumbs = $this->breadcrumbTrail($page);

        // allow plugins to modify the text before splitting
        $eventData = [
            'page' => $page,
            'body' => '',
            'metadata' => ['title' => $page, 'relation_references' => []],
        ];
        $event = new Event('INDEXER_PAGE_ADD', $eventData);
        if ($event->advise_before()) {
            // handlers may have added text to the body, it is put in front of the page text
            $text = $eventData['body'] . ' ' . $text;
        } else {
            // the default was prevented, only the text provided by the handlers is used
            $text = $eventData['body'];
        }

        $parts = $this->splitIntoChunks($text);
        foreach ($parts as $part) {
            if (trim((string)$part) == '') continue; // skip empty chunks

            $part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

            try {
                $embedding = $this->embedModel->getEmbedding($part);
            } catch (\Exception $e) {
                if ($this->logger instanceof CLI) {
                    $this->logger->error(
                        'Failed to get embedding for chunk of page {page}: {msg}',
                        ['page' => $page, 'msg' => $e->getMessage()]
                    );
                }
                continue;
            }
            $chunkList[] = new Chunk($page, $firstChunkID, $part, $embedding);
            $firstChunkID++;
        }
        if ($this->logger instanceof CLI) {
            if ($chunkList !== []) {
                $this->logger->success(
                    '{id} split into {count} chunks',
                    ['id' => $page, 'count' => count($chunkList)]
                );
            } else {
                $this->logger->warning('{id} could not be split into chunks', ['id' => $page]);
            }
        }
        return $chunkList;
    }

    /**
     * Do a nearest neighbor search for chunks similar to the given question
     *
     * Returns only chunks the current user is allowed to read, may return an empty result.
     * The number of returned chunks is limited by the configured number of context chunks, the
     * similarity threshold and the maximum input token length of the chat model.
     *
     * The time spent in the storage lookup is made available in $timeSpent.
     *
     * @param string $query The question
     * @param string $lang Limit results to this language
     * @return Chunk[]
     * @throws \Exception
     */
    public function getSimilarChunks($query, $lang = '')
    {
        global $auth;
        $vector = $this->embedModel->getEmbedding($query);

        // only whole chunks can be fetched, so round down and hand an integer to the storage.
        // max() guards against a division by zero when the chunk size evaluates to 0
        $fetch = (int) min(
            floor($this->chatModel->getMaxInputTokenLength() / max(1, $this->getChunkSize())),
            $this->configContextChunks
        );

        $time = microtime(true);
        $chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
        $this->timeSpent = round(microtime(true) - $time, 2);
        if ($this->logger instanceof CLI) {
            $this->logger->info(
                'Fetched {count} similar chunks from store in {time} seconds',
                ['count' => count($chunks), 'time' => $this->timeSpent]
            );
        }

        $size = 0;
        $result = [];
        foreach ($chunks as $chunk) {
            // filter out chunks the user is not allowed to read
            if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
            // filter out chunks that are not similar enough
            if ($chunk->getScore() < $this->similarityThreshold) continue;

            $chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
            if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

            $result[] = $chunk;
            $size += $chunkSize;
        }
        return $result;
    }

    /**
     * Create a breadcrumb trail for the given page
     *
     * Uses the first heading of each namespace and the page itself. This is added as a prefix to
     * each chunk to give the AI some context. Pages without a namespace only get the crumb of
     * the page itself.
     *
     * @param string $id
     * @return string
     */
    protected function breadcrumbTrail($id)
    {
        $crumbs = [];

        // all namespaces - skipped when the page has none, otherwise explode() would
        // produce a single empty namespace and thus an empty crumb
        $ns = getNS($id);
        if ($ns !== false && $ns !== '') {
            $resolver = new PageResolver($id);
            $check = '';
            foreach (explode(':', $ns) as $namespace) {
                $check .= $namespace . ':';
                $page = $resolver->resolveId($check);
                $title = p_get_first_heading($page);
                $crumbs[] = $title ? "$title ($namespace)" : $namespace;
            }
        }

        // the page itself
        $title = p_get_first_heading($id);
        $page = noNS($id);
        $crumbs[] = $title ? "$title ($page)" : $page;

        return implode(' » ', $crumbs);
    }

    /**
     * Split the given text into chunks
     *
     * The text is split into sentences which are collected until the chunk size is reached. Each
     * following chunk starts with the most recent sentences of the text before it (up to
     * MAX_OVERLAP_LEN tokens) to create an overlap. Empty chunks are not returned.
     *
     * Sentences longer than the chunk size are currently logged and dropped.
     *
     * @param string $text
     * @return string[]
     * @throws \Exception
     * @todo support splitting too long sentences
     */
    protected function splitIntoChunks($text)
    {
        $sentenceSplitter = new Sentence();
        $tiktok = $this->getTokenEncoder();
        $maxLen = $this->getChunkSize();

        // start with a fresh overlap queue, otherwise sentences remembered from a previously
        // split text could end up in the chunks of this one
        $this->sentenceQueue = [];

        $chunks = [];
        $sentences = $sentenceSplitter->split($text);

        $chunklen = 0;
        $chunk = '';
        // compare against null explicitly, a falsy sentence like "0" must not end the loop
        while (($sentence = array_shift($sentences)) !== null) {
            $slen = count($tiktok->encode($sentence));
            if ($slen > $maxLen) {
                // sentence is too long, we need to split it further
                if ($this->logger instanceof CLI) {
                    $this->logger->warning(
                        'Sentence too long, splitting not implemented yet'
                    );
                }
                continue;
            }

            if ($chunklen + $slen < $maxLen) {
                // add to current chunk
                $chunk .= $sentence;
                $chunklen += $slen;
            } else {
                // add current chunk to result
                $chunk = trim($chunk);
                if ($chunk !== '') $chunks[] = $chunk;

                // start new chunk with remembered sentences
                $chunk = implode(' ', $this->sentenceQueue);
                $chunk .= $sentence;
                $chunklen = count($tiktok->encode($chunk));
            }

            // remember every used sentence for the overlap, including the one that opened a new
            // chunk, so that the overlap of the next chunk has no gaps
            $this->rememberSentence($sentence);
        }

        // add the last chunk, treated the same as all the others
        $chunk = trim($chunk);
        if ($chunk !== '') $chunks[] = $chunk;

        return $chunks;
    }

    /**
     * Add a sentence to the queue of remembered sentences
     *
     * The queue is trimmed from the front until its joined content no longer exceeds
     * MAX_OVERLAP_LEN tokens. A single sentence exceeding that limit leaves the queue empty.
     *
     * @param string $sentence
     * @return void
     */
    protected function rememberSentence($sentence)
    {
        // add sentence to queue
        $this->sentenceQueue[] = $sentence;

        // remove oldest sentences from queue until we are below the max overlap
        $encoder = $this->getTokenEncoder();
        while (count($encoder->encode(implode(' ', $this->sentenceQueue))) > self::MAX_OVERLAP_LEN) {
            array_shift($this->sentenceQueue);
        }
    }
}
386