xref: /dokuwiki/inc/Search/Indexer.php (revision 83b3acccb42578eaa33f84e6b13612436320090b)
16225b270SMichael Große<?php
26225b270SMichael Große
36225b270SMichael Großenamespace dokuwiki\Search;
46225b270SMichael Große
56225b270SMichael Großeuse dokuwiki\Extension\Event;
6*83b3acccSAndreas Gohruse dokuwiki\Search\Collection\PageFulltextCollection;
7*83b3acccSAndreas Gohruse dokuwiki\Search\Collection\PageMetaCollection;
8*83b3acccSAndreas Gohruse dokuwiki\Search\Collection\PageTitleCollection;
915f699acSAndreas Gohruse dokuwiki\Search\Exception\IndexAccessException;
10a16bd548SSatoshi Saharause dokuwiki\Search\Exception\IndexLockException;
11a16bd548SSatoshi Saharause dokuwiki\Search\Exception\IndexWriteException;
12*83b3acccSAndreas Gohruse dokuwiki\Search\Index\FileIndex;
13*83b3acccSAndreas Gohruse dokuwiki\Search\Index\Lock;
144027a91aSSatoshi Sahara
154027a91aSSatoshi Sahara// Version tag used to force rebuild on upgrade
164027a91aSSatoshi Saharaconst INDEXER_VERSION = 8;
176225b270SMichael Große
/**
 * Class DokuWiki Indexer
 *
 * Manages the page search index by delegating to Collection classes
 * (title, fulltext and one metadata collection per metadata key) that all
 * share the 'page' index.
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static the indexer itself, to allow call chaining
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been registered.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger) {
            ($this->logger)($message);
        }
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once per request and cached in a static variable.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string the bare INDEXER_VERSION, or that version followed by
     *                    "+plugin=version" parts when plugins registered themselves
     */
    public function getVersion()
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first
            ksort($data); // stable order regardless of plugin load order
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty entries of the page index are always skipped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new Index\MemoryIndex('page');
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * Indexing is needed when it is forced, when the page has no '.indexed' tag
     * file, when the tag was written by a different indexer version, or when the
     * tag is not newer than the page file.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) {
            return true;
        }

        // compare as strings: the tag must match the current version exactly
        if (trim((string) io_readFile($idxtag)) !== (string) $this->getVersion()) {
            return true;
        }

        $last = @filemtime($idxtag);
        return $last <= @filemtime(wikiFN($page));
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally. Each collection is locked only while it
     * is written and is unlocked again even when the write fails.
     *
     * @triggers INDEXER_PAGE_ADD
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): void
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for {$page} up to date");
            return;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data; the raw page text is only appended
        // when the default action was not prevented
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // index title
        $this->writeLocked(
            new PageTitleCollection($pageIndex),
            $data['page'],
            [$data['metadata']['title']]
        );
        unset($data['metadata']['title']);

        // index fulltext
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
            // an empty word list clears any previously stored fulltext data
            $words = [];
        }
        $this->writeLocked(new PageFulltextCollection($pageIndex), $data['page'], $words);
        unset($data['metadata']['internal_index']);

        // index the remaining metadata keys; scalars become single-element
        // lists, null and empty string become an empty list
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $data['page'], $values);
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     * Metadata collections are taken from the metadata registry.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): void
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: {$page}.indexed file does not exist, ignoring");
            return;
        }

        $pageIndex = new FileIndex('page', '', true);

        // storing an empty value list clears the page's data in a collection
        $this->writeLocked(new PageTitleCollection($pageIndex), $page, []);
        $this->writeLocked(new PageFulltextCollection($pageIndex), $page, []);

        foreach ($this->getMetadataRegistryKeys() as $key) {
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $page, []);
        }

        $this->log("Indexer: deleted {$page} from index");
        @unlink($idxtag);
    }

    /**
     * Rename a page in the search index
     *
     * The page must already have been moved on disk before calling this.
     * Clears the old page's data and re-indexes under the new name.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): void
    {
        $this->deletePage($oldpage, true);
        $this->addPage($newpage, true);
    }

    /**
     * Clear all page indexes
     *
     * Deletes the index files of all registered metadata keys, the fulltext
     * index files, and the title, page and metadata registry files from the
     * index directory. The 'page' lock is held while the files are removed and
     * is released even if removing fails with an exception.
     */
    public function clear(): void
    {
        global $conf;

        Lock::acquire('page');
        try {
            $dir = $conf['indexdir'];

            // clear metadata indexes
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                foreach (['_w', '_i', '_p'] as $suffix) {
                    @unlink($dir . '/' . $clean . $suffix . '.idx');
                }
            }

            // clear fulltext indexes
            foreach (['/i*.idx', '/w*.idx'] as $pattern) {
                foreach (glob($dir . $pattern) ?: [] as $f) {
                    @unlink($f);
                }
            }

            // remaining fulltext files, then title and page indexes; the
            // metadata registry goes last because it was needed above
            foreach (['pageword', 'lengths', 'title', 'page', 'metadata'] as $name) {
                @unlink($dir . '/' . $name . '.idx');
            }
        } finally {
            Lock::release('page');
        }
    }

    /**
     * Store values for a page in a collection while holding the collection's lock
     *
     * The lock is released in a finally block, so a failing write does not
     * leave the collection locked.
     *
     * NOTE(review): assumes lock() returns the collection itself (fluent
     * interface), as the call chains this replaces implied - confirm.
     *
     * @param object $collection a collection offering lock(), addEntity() and unlock()
     * @param string $page the page (entity) to store the values for
     * @param array $values the values to store; an empty list clears the page's data
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function writeLocked($collection, string $page, array $values): void
    {
        $locked = $collection->lock();
        try {
            $locked->addEntity($page, $values);
        } finally {
            $locked->unlock();
        }
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * The registry is the file metadata.idx in the index directory, holding
     * one key per line.
     *
     * @return string[] list of metadata key names, empty when there is no registry
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) {
            return [];
        }
        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * Keys are compared as exact strings, so numeric-looking names such as
     * '10' and '1e1' are registered separately. The registry file is only
     * rewritten when at least one key was added.
     *
     * @param string[] $keys metadata key names to ensure are registered
     */
    protected function updateMetadataRegistry(array $keys): void
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        $existing = $this->getMetadataRegistryKeys();

        $added = false;
        foreach ($keys as $key) {
            // array keys may arrive as integers; the registry stores strings
            $key = (string) $key;
            if (!in_array($key, $existing, true)) {
                $existing[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($fn, implode("\n", $existing) . "\n");
        }
    }
}
323