xref: /dokuwiki/inc/Search/Indexer.php (revision 79dae64d6746363b953e6c82844ef285c37c3310)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Debug\DebugHelper;
6use dokuwiki\Extension\Event;
7use dokuwiki\Search\Collection\PageFulltextCollection;
8use dokuwiki\Search\Collection\PageMetaCollection;
9use dokuwiki\Search\Collection\PageTitleCollection;
10use dokuwiki\Search\Exception\IndexAccessException;
11use dokuwiki\Search\Exception\IndexIntegrityException;
12use dokuwiki\Search\Exception\IndexLockException;
13use dokuwiki\Search\Exception\IndexWriteException;
14use dokuwiki\Search\Index\FileIndex;
15use dokuwiki\Search\Index\Lock;
16use dokuwiki\Search\Index\MemoryIndex;
17
// Version of the on-disk index format. Each indexed page stores the version it was
// indexed with; raising this number makes those stored tags mismatch, which forces
// every page to be re-indexed after an upgrade.
const INDEXER_VERSION = 9;
20
/**
 * Class DokuWiki Indexer
 *
 * Manages the page search index by delegating to Collection classes.
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static the indexer itself, to allow method chaining
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been registered.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger) {
            ($this->logger)($message);
        }
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once and then cached in a static variable.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string the plain INDEXER_VERSION, or "<version>+<plugin>=<vers>..." when plugins added info
     */
    public function getVersion(): int|string
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

            // the core version already leads the string, so only the plugin
            // entries are appended - sorted by name to get a stable result
            unset($data['dokuwiki']);
            ksort($data);
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty rows of the page index are skipped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new MemoryIndex('page');
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * A page needs indexing when this is forced, when it has no '.indexed' tag yet,
     * when the tag was written by a different indexer version, or when the page file
     * is not older than the tag.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) return true;

        // compare as strings - a loose comparison would treat numeric look-alikes
        // such as "9.0" or "09" as equal to version 9
        if (trim((string) io_readFile($idxtag)) !== (string) $this->getVersion()) return true;

        // without a readable page file there is nothing that could be newer than the tag
        $pageTime = @filemtime(wikiFN($page));
        if ($pageTime === false) return false;

        $tagTime = @filemtime($idxtag);
        if ($tagTime === false) return true;

        // Modification times only have a resolution of one second. When both times are
        // equal the page may have been changed after it was indexed within that same
        // second, so the index is only considered up to date when the tag is strictly
        // newer than the page.
        return $tagTime <= $pageTime;
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally.
     *
     * @triggers INDEXER_PAGE_ADD
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): void
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for $page up to date");
            return;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data; the raw wiki text is only appended to the
        // body when the default action was not prevented
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // index title
        (new PageTitleCollection($pageIndex))->lock()
            ->addEntity($data['page'], [$data['metadata']['title']])->unlock();
        unset($data['metadata']['title']);

        // index fulltext
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
            // an empty word list clears any previously stored fulltext data
            $words = [];
        }
        (new PageFulltextCollection($pageIndex))->lock()->addEntity($data['page'], $words)->unlock();
        unset($data['metadata']['internal_index']);

        // index the remaining metadata keys; scalar values are wrapped into a list,
        // null and empty string become an empty list
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($data['page'], $values)->unlock();
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), (string) $this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): void
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: $page.indexed file does not exist, ignoring");
            return;
        }

        $pageIndex = new FileIndex('page', '', true);

        // storing an empty value list clears the page's data in each collection
        (new PageTitleCollection($pageIndex))->lock()->addEntity($page, [])->unlock();
        (new PageFulltextCollection($pageIndex))->lock()->addEntity($page, [])->unlock();

        foreach ($this->getMetadataRegistryKeys() as $key) {
            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($page, [])->unlock();
        }

        $this->log("Indexer: deleted $page from index");
        @unlink($idxtag);
    }

    /**
     * Rename a page in the search index
     *
     * This renames the page's entity entry in place: its entity ID (the row in the
     * page index) is kept and only its name is changed. Because every collection
     * (title, fulltext and all metadata keys such as relation_references) is keyed by
     * that entity ID, all token, frequency and reverse associations are preserved and
     * transparently belong to the new name afterwards.
     *
     * In particular this keeps the renamed page's *outgoing* references intact. That is
     * essential during multi-step operations such as namespace moves: a page renamed
     * early on must still be discoverable as a backlink source for pages that are moved
     * later. Re-indexing from disk instead would lose this, because the destination page
     * has usually not been written to disk yet when this method is called.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): void
    {
        if ($oldpage === $newpage) return;

        $pageIndex = new FileIndex('page', '', true);

        $oldId = null;
        $newId = null;

        // the page index is always unlocked again, even when one of the steps throws
        try {
            // locate the existing entity rows; stop as soon as both are known
            foreach ($pageIndex as $rid => $value) {
                if ($value === $oldpage) $oldId = $rid;
                if ($value === $newpage) $newId = $rid;
                if ($oldId !== null && $newId !== null) break;
            }

            if ($oldId !== null) {
                // If the new name already has its own entity, drop its indexed data first.
                // deletePage() intentionally keeps the entity row in page.idx, so we
                // additionally blank that row - getAllPages() skips empty rows.
                // Otherwise two rows would carry the new name and a lookup could resolve
                // to the now-empty one instead of the renamed entity that holds the data.
                if ($newId !== null) {
                    $this->deletePage($newpage, true);
                    $pageIndex->changeRow($newId, '');
                }

                // rename in place - keeps the entity ID and thus all index associations
                $pageIndex->changeRow($oldId, $newpage);
            }
        } finally {
            $pageIndex->unlock();
        }

        // nothing was renamed if the old page was never indexed
        if ($oldId === null) {
            $this->log("Indexer: $oldpage is not in the index, nothing to rename");
            return;
        }

        $this->log("Indexer: renamed $oldpage to $newpage in index");
    }

    /**
     * Clear all page indexes
     *
     * Deletes the index files below $conf['indexdir'] while holding the 'page' lock.
     * Failures to delete individual files are ignored.
     */
    public function clear(): void
    {
        global $conf;

        Lock::acquire('page');

        try {
            $dir = $conf['indexdir'];

            // clear metadata indexes - has to happen before the registry is removed below
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                foreach (['_w.idx', '_i.idx', '_p.idx'] as $suffix) {
                    @unlink($dir . '/' . $clean . $suffix);
                }
            }

            // clear fulltext indexes
            foreach (['/i*.idx', '/w*.idx'] as $pattern) {
                $files = glob($dir . $pattern);
                if ($files) foreach ($files as $f) @unlink($f);
            }

            @unlink($dir . '/pageword.idx');
            @unlink($dir . '/lengths.idx');

            // clear title and page indexes and the metadata registry
            @unlink($dir . '/title.idx');
            @unlink($dir . '/page.idx');
            @unlink($dir . '/metadata.idx');
        } finally {
            // never leave the lock behind, even if something above throws
            Lock::release('page');
        }
    }

    /**
     * Check the structural integrity of all search indexes
     *
     * Checks the fulltext and title collections and the collection of every
     * registered metadata key.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     */
    public function checkIntegrity(): void
    {
        (new PageFulltextCollection())->checkIntegrity();
        (new PageTitleCollection())->checkIntegrity();

        foreach ($this->getMetadataRegistryKeys() as $key) {
            (new PageMetaCollection($key))->checkIntegrity();
        }
    }

    /**
     * Whether the search index is empty (no fulltext data indexed yet)
     *
     * @return bool
     */
    public function isIndexEmpty(): bool
    {
        return (new PageFulltextCollection())->getTokenIndexMaximum() === 0;
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * The registry is the file metadata.idx in the index directory, holding one
     * key per line.
     *
     * @return string[] list of metadata key names, empty when there is no registry yet
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) return [];
        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * Keys that are already registered are left alone; the registry file is only
     * rewritten when at least one new key was added.
     *
     * @param string[] $keys metadata key names to ensure are registered
     *
     * @internal Only marked public for access via LegacyIndexer
     */
    public function updateMetadataRegistry(array $keys): void
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        $existing = $this->getMetadataRegistryKeys();

        $added = false;
        foreach ($keys as $key) {
            // the registry stores plain strings, so compare as strings and strictly -
            // a loose comparison would treat numeric look-alikes ("1e1" and "10") as
            // the same key and silently skip registering the new one
            $key = (string) $key;
            if (!in_array($key, $existing, true)) {
                $existing[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($fn, implode("\n", $existing) . "\n");
        }
    }

    /**
     * Return a list of all indexed pages, optionally filtered by metadata key
     *
     * Kept on Indexer (not just LegacyIndexer) because several plugins call it
     * directly on `new Indexer()` instances rather than going through
     * idx_get_indexer().
     *
     * @param string|null $key metadata key name
     * @return string[]
     *
     * @deprecated 2026-04-07 use MetadataSearch::getPages() or Indexer::getAllPages() instead
     */
    public function getPages($key = null)
    {
        DebugHelper::dbgDeprecatedFunction(MetadataSearch::class . '::getPages()');
        return (new MetadataSearch())->getPages($key);
    }
}
416