xref: /dokuwiki/inc/Search/Indexer.php (revision 21fbd01b3c3eea88b767376b7b158f31f0f63127)
<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\PageFulltextCollection;
use dokuwiki\Search\Collection\PageMetaCollection;
use dokuwiki\Search\Collection\PageTitleCollection;
use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexIntegrityException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;

// Version tag used to force a rebuild on upgrade.
// Indexer::getVersion() uses this number as the base of the version string
// that is stored in each page's ".indexed" tag file; a page whose stored tag
// differs from the current version string is indexed again.
const INDEXER_VERSION = 8;

/**
 * Class DokuWiki Indexer
 *
 * Manages the page search index by delegating to Collection classes: one
 * collection for page titles, one for the fulltext and one per metadata key.
 * This class decides whether a page needs indexing, gathers the data to
 * index, and maintains the registry of known metadata keys (metadata.idx).
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static the indexer itself, to allow chaining
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been set.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger) ($this->logger)($message);
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once per request and memoized in a static variable.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string the bare INDEXER_VERSION when no plugin contributed,
     *                    otherwise a string of the form "8+plugin=version+..."
     */
    public function getVersion()
    {
        static $indexer_version = null;
        // strict check: only an unset memo triggers the (event firing) computation
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first
            ksort($data); // stable order, independent of plugin load order
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty entries of the page index are always skipped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new Index\MemoryIndex('page');
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * A page needs indexing when indexing is forced, when it has no ".indexed"
     * tag file, when the tag was written by a different indexer version, or
     * when the tag file is not newer than the page file.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) return true;

        // getVersion() may return an int, so compare both sides as strings
        if (trim(io_readFile($idxtag)) !== (string) $this->getVersion()) return true;

        $last = @filemtime($idxtag);
        return $last <= @filemtime(wikiFN($page));
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally.
     *
     * Title, fulltext and every metadata key are written to their own
     * collection. Afterwards the metadata registry and the page's ".indexed"
     * tag file are updated.
     *
     * @triggers INDEXER_PAGE_ADD
     * Event data: 'page', 'body', 'metadata' and 'pid'. The raw wiki text is
     * appended to 'body' only when advise_before() returns true.
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): void
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for {$page} up to date");
            return;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            $data['body'] .= ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // index title
        $this->writeLocked(new PageTitleCollection($pageIndex), $data['page'], [$data['metadata']['title']]);
        unset($data['metadata']['title']);

        // index fulltext; an empty word list clears any previously stored fulltext data
        $words = [];
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
        }
        $this->writeLocked(new PageFulltextCollection($pageIndex), $data['page'], $words);
        unset($data['metadata']['internal_index']);

        // index metadata keys (title and internal_index were removed above)
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                // wrap scalars, treat null and the empty string as "no value"
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $data['page'], $values);
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     * Only metadata keys listed in the metadata registry are cleared.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): void
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: {$page}.indexed file does not exist, ignoring");
            return;
        }

        $pageIndex = new FileIndex('page', '', true);

        // storing an empty value list removes whatever was indexed for the page
        $this->writeLocked(new PageTitleCollection($pageIndex), $page, []);
        $this->writeLocked(new PageFulltextCollection($pageIndex), $page, []);

        foreach ($this->getMetadataRegistryKeys() as $key) {
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $page, []);
        }

        $this->log("Indexer: deleted {$page} from index");
        @unlink($idxtag);
    }

    /**
     * Rename a page in the search index
     *
     * The page must already have been moved on disk before calling this.
     * Clears the old page's data and re-indexes under the new name.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): void
    {
        $this->deletePage($oldpage, true);
        $this->addPage($newpage, true);
    }

    /**
     * Clear all page indexes
     *
     * Deletes the index files below $conf['indexdir'] while holding the 'page'
     * lock. The lock is released even when one of the steps throws.
     *
     * NOTE(review): Lock::acquire() presumably throws IndexLockException when
     * the lock cannot be obtained - confirm and document with @throws.
     */
    public function clear(): void
    {
        global $conf;
        $dir = $conf['indexdir'];

        Lock::acquire('page');
        try {
            // clear metadata indexes
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                foreach (['_w', '_i', '_p'] as $suffix) {
                    @unlink($dir . '/' . $clean . $suffix . '.idx');
                }
            }

            // clear fulltext indexes
            // NOTE(review): these patterns match every .idx file starting with
            // "i" or "w", not only the fulltext files - confirm this is intended
            foreach (['i', 'w'] as $prefix) {
                foreach (glob($dir . '/' . $prefix . '*.idx') ?: [] as $file) {
                    @unlink($file);
                }
            }

            // remaining fulltext files, then title, page and registry indexes;
            // metadata.idx goes last because the registry was needed above
            foreach (['pageword', 'lengths', 'title', 'page', 'metadata'] as $name) {
                @unlink($dir . '/' . $name . '.idx');
            }
        } finally {
            Lock::release('page');
        }
    }

    /**
     * Check the structural integrity of all search indexes
     *
     * Checks the fulltext and title collections and one metadata collection
     * per key found in the metadata registry.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     */
    public function checkIntegrity(): void
    {
        (new PageFulltextCollection())->checkIntegrity();
        (new PageTitleCollection())->checkIntegrity();

        foreach ($this->getMetadataRegistryKeys() as $key) {
            (new PageMetaCollection($key))->checkIntegrity();
        }
    }

    /**
     * Whether the search index is empty (no fulltext data indexed yet)
     *
     * @return bool
     */
    public function isIndexEmpty(): bool
    {
        return (new PageFulltextCollection())->getTokenIndexMaximum() === 0;
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * The registry is the file metadata.idx in $conf['indexdir'], one key per line.
     *
     * @return string[] list of metadata key names, empty when there is no registry
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) return [];
        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * Keys that are already registered are left alone; the registry file is
     * only rewritten when at least one key was added.
     *
     * @param string[] $keys metadata key names to ensure are registered
     */
    protected function updateMetadataRegistry(array $keys): void
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        $existing = file_exists($fn) ? file($fn, FILE_IGNORE_NEW_LINES) : [];
        if (!$existing) $existing = [];

        $added = false;
        foreach ($keys as $key) {
            // array keys may arrive as ints; the registry holds strings. Compare
            // strictly so numeric look-alikes ("1e1" vs "10") stay distinct.
            $key = (string) $key;
            if (!in_array($key, $existing, true)) {
                $existing[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($fn, implode("\n", $existing) . "\n");
        }
    }

    /**
     * Replace the values stored for a page in one collection
     *
     * Runs the collection's lock() / addEntity() / unlock() sequence and makes
     * sure unlock() is still called when addEntity() throws, so a failed write
     * does not leave the collection locked. A failure of lock() itself is
     * passed on without calling unlock().
     *
     * @param object $collection collection providing fluent lock(), addEntity() and unlock()
     * @param string $page the page whose data is replaced
     * @param array $values the new values, an empty list clears the page's data
     */
    protected function writeLocked($collection, string $page, $values): void
    {
        $locked = $collection->lock();
        try {
            $locked = $locked->addEntity($page, $values);
        } finally {
            $locked->unlock();
        }
    }
}
349