xref: /dokuwiki/inc/Search/Indexer.php (revision 2cda016644e923dbda996c52bedee2113ba6d653)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Debug\DebugHelper;
6use dokuwiki\Extension\Event;
7use dokuwiki\Search\Collection\PageFulltextCollection;
8use dokuwiki\Search\Collection\PageMetaCollection;
9use dokuwiki\Search\Collection\PageTitleCollection;
10use dokuwiki\Search\Exception\IndexAccessException;
11use dokuwiki\Search\Exception\IndexIntegrityException;
12use dokuwiki\Search\Exception\IndexLockException;
13use dokuwiki\Search\Exception\IndexWriteException;
14use dokuwiki\Search\Index\FileIndex;
15use dokuwiki\Search\Index\Lock;
16use dokuwiki\Search\Index\MemoryIndex;
17
18// Version tag used to force rebuild on upgrade
19const INDEXER_VERSION = 9;
20
21/**
22 * Class DokuWiki Indexer
23 *
24 * Manages the page search index by delegating to Collection classes.
25 *
26 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
27 * @author     Andreas Gohr <andi@splitbrain.org>
28 * @author Tom N Harris <tnharris@whoopdedo.org>
29 */
30class Indexer
31{
32    /** @var callable|null Logging callback, receives a string message */
33    protected $logger;
34
35    /**
36     * Set a logging callback
37     *
38     * The callback receives a single string message. Use this to integrate
39     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
40     *
41     * @param callable $logger
42     * @return static
43     */
44    public function setLogger(callable $logger): static
45    {
46        $this->logger = $logger;
47        return $this;
48    }
49
50    /**
51     * Send a message to the registered logger
52     *
53     * @param string $message
54     */
55    protected function log(string $message): void
56    {
57        if ($this->logger)($this->logger)($message);
58    }
59
60    /**
61     * Version of the indexer taking into consideration the external tokenizer.
62     * The indexer is only compatible with data written by the same version.
63     *
64     * @triggers INDEXER_VERSION_GET
65     * Plugins that modify what gets indexed should hook this event and
66     * add their version info to the event data like so:
67     *     $data[$plugin_name] = $plugin_version;
68     *
69     * @return int|string
70     */
71    public function getVersion(): int|string
72    {
73        static $indexer_version = null;
74        if ($indexer_version == null) {
75            $version = INDEXER_VERSION;
76
77            $data = ['dokuwiki' => $version];
78            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
79            unset($data['dokuwiki']); // this needs to be first
80            ksort($data);
81            foreach ($data as $plugin => $vers) {
82                $version .= '+' . $plugin . '=' . $vers;
83            }
84            $indexer_version = $version;
85        }
86        return $indexer_version;
87    }
88
89    /**
90     * Return a list of all indexed pages
91     *
92     * @param bool $existsFilter only return pages that exist on disk
93     * @return string[] list of page names (keys are the RIDs in the page index)
94     */
95    public function getAllPages(bool $existsFilter = false): array
96    {
97        $pageIndex = new MemoryIndex('page');
98        return array_filter(
99            iterator_to_array($pageIndex),
100            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
101        );
102    }
103
104    /**
105     * Check if a page needs (re-)indexing
106     *
107     * @param string $page
108     * @param bool $force
109     * @return bool true if indexing is needed
110     */
111    public function needsIndexing(string $page, bool $force = false): bool
112    {
113        $idxtag = metaFN($page, '.indexed');
114        if ($force || !file_exists($idxtag)) return true;
115
116        if (trim(io_readFile($idxtag)) != $this->getVersion()) return true;
117
118        // the index tag is written when the page is indexed; the page only needs
119        // (re-)indexing if it was changed *after* that - an equal mtime means it was
120        // saved and indexed within the same second and is therefore up to date
121        $last = @filemtime($idxtag);
122        return $last < @filemtime(wikiFN($page));
123    }
124
125    /**
126     * Add/update the search index for a page
127     *
128     * Locking is handled internally.
129     *
130     * @param string $page The page to index
131     * @param bool $force force reindexing even when the index is up to date
132     *
133     * @return bool true if the page was indexed, false if there was nothing to do
134     * @throws IndexAccessException
135     * @throws IndexLockException
136     * @throws IndexWriteException
137     */
138    public function addPage(string $page, bool $force = false): bool
139    {
140        if (!$this->needsIndexing($page, $force)) {
141            $this->log("Indexer: index for $page up to date");
142            return false;
143        }
144
145        // create shared writable page index early so we can resolve the PID for plugins
146        $pageIndex = new FileIndex('page', '', true);
147
148        // prepare event data
149        $data = [
150            'page' => $page,
151            'body' => '',
152            'metadata' => [
153                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
154                'relation_references' => array_keys(
155                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
156                ),
157                'relation_media' => array_keys(
158                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
159                ),
160                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
161            ],
162            'pid' => $pageIndex->accessCachedValue($page),
163        ];
164
165        // let plugins modify the data
166        $event = new Event('INDEXER_PAGE_ADD', $data);
167        if ($event->advise_before()) {
168            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
169        }
170        $event->advise_after();
171        unset($event);
172
173        // index title
174        (new PageTitleCollection($pageIndex))->lock()
175            ->addEntity($data['page'], [$data['metadata']['title']])->unlock();
176        unset($data['metadata']['title']);
177
178        // index fulltext
179        if ($data['metadata']['internal_index']) {
180            $words = Tokenizer::getWords($data['body']);
181            (new PageFulltextCollection($pageIndex))->lock()->addEntity($data['page'], $words)->unlock();
182        } else {
183            $this->log("Indexer: full text indexing disabled for {$data['page']}");
184            // clear any previously stored fulltext data
185            (new PageFulltextCollection($pageIndex))->lock()->addEntity($data['page'], [])->unlock();
186        }
187        unset($data['metadata']['internal_index']);
188
189        // index metadata keys
190        foreach ($data['metadata'] as $key => $values) {
191            if (!is_array($values)) {
192                $values = ($values !== null && $values !== '') ? [$values] : [];
193            }
194            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($data['page'], $values)->unlock();
195        }
196
197        // update metadata registry
198        $this->updateMetadataRegistry(array_keys($data['metadata']));
199
200        // update index tag file
201        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
202        $this->log("Indexer: finished indexing {$data['page']}");
203        return true;
204    }
205
206    /**
207     * Remove a page from the index
208     *
209     * Clears the page's data from all collections. The entity persists in page.idx.
210     *
211     * @param string $page The page to remove
212     * @param bool $force force deletion even when no .indexed tag exists
213     *
214     * @return bool true if the page was removed, false if there was nothing to do
215     * @throws IndexAccessException
216     * @throws IndexLockException
217     * @throws IndexWriteException
218     */
219    public function deletePage(string $page, bool $force = false): bool
220    {
221        $idxtag = metaFN($page, '.indexed');
222        if (!$force && !file_exists($idxtag)) {
223            $this->log("Indexer: $page.indexed file does not exist, ignoring");
224            return false;
225        }
226
227        $pageIndex = new FileIndex('page', '', true);
228
229        (new PageTitleCollection($pageIndex))->lock()->addEntity($page, [])->unlock();
230        (new PageFulltextCollection($pageIndex))->lock()->addEntity($page, [])->unlock();
231
232        foreach ($this->getMetadataRegistryKeys() as $key) {
233            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($page, [])->unlock();
234        }
235
236        $this->log("Indexer: deleted $page from index");
237        @unlink($idxtag);
238        return true;
239    }
240
241    /**
242     * Rename a page in the search index
243     *
244     * This renames the page's entity entry in place: its entity ID (the row in the
245     * page index) is kept and only its name is changed. Because every collection
246     * (title, fulltext and all metadata keys such as relation_references) is keyed by
247     * that entity ID, all token, frequency and reverse associations are preserved and
248     * transparently belong to the new name afterwards.
249     *
250     * In particular this keeps the renamed page's *outgoing* references intact. That is
251     * essential during multi-step operations such as namespace moves: a page renamed
252     * early on must still be discoverable as a backlink source for pages that are moved
253     * later. Re-indexing from disk instead would lose this, because the destination page
254     * has usually not been written to disk yet when this method is called.
255     *
256     * @param string $oldpage The old page name
257     * @param string $newpage The new page name
258     *
259     * @return bool true if the page was renamed, false if there was nothing to do
260     * @throws IndexAccessException
261     * @throws IndexLockException
262     * @throws IndexWriteException
263     */
264    public function renamePage(string $oldpage, string $newpage): bool
265    {
266        if ($oldpage === $newpage) return false;
267
268        $pageIndex = new FileIndex('page', '', true);
269
270        // locate the existing entity rows; stop as soon as both are known
271        $oldId = null;
272        $newId = null;
273        foreach ($pageIndex as $rid => $value) {
274            if ($value === $oldpage) $oldId = $rid;
275            if ($value === $newpage) $newId = $rid;
276            if ($oldId !== null && $newId !== null) break;
277        }
278
279        // nothing to rename if the old page was never indexed
280        if ($oldId === null) {
281            $pageIndex->unlock();
282            $this->log("Indexer: $oldpage is not in the index, nothing to rename");
283            return false;
284        }
285
286        // If the new name already has its own entity, drop its indexed data first.
287        // deletePage() intentionally keeps the entity row in page.idx, so we additionally
288        // blank that row - an empty entry is the index's "removed" marker (see getAllPages()).
289        // Otherwise two rows would carry the new name and a lookup could resolve to the
290        // now-empty one instead of the renamed entity that holds the data.
291        if ($newId !== null) {
292            $this->deletePage($newpage, true);
293            $pageIndex->changeRow($newId, '');
294        }
295
296        // rename in place — keeps the entity ID and thus all index associations
297        $pageIndex->changeRow($oldId, $newpage);
298
299        $pageIndex->unlock();
300        $this->log("Indexer: renamed $oldpage to $newpage in index");
301        return true;
302    }
303
304    /**
305     * Clear all page indexes
306     */
307    public function clear(): void
308    {
309        global $conf;
310
311        Lock::acquire('page');
312
313        // clear metadata indexes
314        foreach ($this->getMetadataRegistryKeys() as $key) {
315            $clean = PageMetaCollection::cleanName($key);
316            @unlink($conf['indexdir'] . '/' . $clean . '_w.idx');
317            @unlink($conf['indexdir'] . '/' . $clean . '_i.idx');
318            @unlink($conf['indexdir'] . '/' . $clean . '_p.idx');
319        }
320
321        // clear fulltext indexes
322        $files = glob($conf['indexdir'] . '/i*.idx');
323        if ($files) foreach ($files as $f) @unlink($f);
324        $files = glob($conf['indexdir'] . '/w*.idx');
325        if ($files) foreach ($files as $f) @unlink($f);
326
327        @unlink($conf['indexdir'] . '/pageword.idx');
328        @unlink($conf['indexdir'] . '/lengths.idx');
329
330        // clear title and page indexes
331        @unlink($conf['indexdir'] . '/title.idx');
332        @unlink($conf['indexdir'] . '/page.idx');
333        @unlink($conf['indexdir'] . '/metadata.idx');
334
335        Lock::release('page');
336    }
337
338    /**
339     * Check the structural integrity of all search indexes
340     *
341     * @throws IndexIntegrityException when a structural inconsistency is found
342     */
343    public function checkIntegrity(): void
344    {
345        (new PageFulltextCollection())->checkIntegrity();
346        (new PageTitleCollection())->checkIntegrity();
347
348        foreach ($this->getMetadataRegistryKeys() as $key) {
349            (new PageMetaCollection($key))->checkIntegrity();
350        }
351    }
352
353    /**
354     * Whether the search index is empty (no fulltext data indexed yet)
355     *
356     * @return bool
357     */
358    public function isIndexEmpty(): bool
359    {
360        return (new PageFulltextCollection())->getTokenIndexMaximum() === 0;
361    }
362
363    /**
364     * Get the list of known metadata keys from the metadata registry
365     *
366     * @return string[] list of metadata key names
367     */
368    protected function getMetadataRegistryKeys(): array
369    {
370        global $conf;
371        $fn = $conf['indexdir'] . '/metadata.idx';
372        if (!file_exists($fn)) return [];
373        $keys = file($fn, FILE_IGNORE_NEW_LINES);
374        return $keys ?: [];
375    }
376
377    /**
378     * Update the metadata registry with new keys
379     *
380     * @param string[] $keys metadata key names to ensure are registered
381     *
382     * @internal Only marked public for access via LegacyIndexer
383     */
384    public function updateMetadataRegistry(array $keys): void
385    {
386        global $conf;
387        $fn = $conf['indexdir'] . '/metadata.idx';
388        $existing = file_exists($fn) ? file($fn, FILE_IGNORE_NEW_LINES) : [];
389        if (!$existing) $existing = [];
390
391        $added = false;
392        foreach ($keys as $key) {
393            if (!in_array($key, $existing)) {
394                $existing[] = $key;
395                $added = true;
396            }
397        }
398
399        if ($added) {
400            io_saveFile($fn, implode("\n", $existing) . "\n");
401        }
402    }
403
404    /**
405     * Return a list of all indexed pages, optionally filtered by metadata key
406     *
407     * Kept on Indexer (not just LegacyIndexer) because several plugins call it
408     * directly on `new Indexer()` instances rather than going through
409     * idx_get_indexer().
410     *
411     * @param string|null $key metadata key name
412     * @return string[]
413     *
414     * @deprecated 2026-04-07 use MetadataSearch::getPages() or Indexer::getAllPages() instead
415     */
416    public function getPages($key = null)
417    {
418        DebugHelper::dbgDeprecatedFunction(MetadataSearch::class . '::getPages()');
419        return (new MetadataSearch())->getPages($key);
420    }
421}
422