xref: /dokuwiki/inc/Search/Indexer.php (revision 5d034a75ec636eaf8dd957fac678f8d04e5b23fc)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Debug\DebugHelper;
6use dokuwiki\Extension\Event;
7use dokuwiki\Search\Collection\CollectionSearch;
8use dokuwiki\Search\Collection\PageFulltextCollection;
9use dokuwiki\Search\Collection\PageMetaCollection;
10use dokuwiki\Search\Collection\PageTitleCollection;
11use dokuwiki\Search\Exception\IndexAccessException;
12use dokuwiki\Search\Exception\IndexIntegrityException;
13use dokuwiki\Search\Exception\IndexLockException;
14use dokuwiki\Search\Exception\IndexWriteException;
15use dokuwiki\Search\Exception\SearchException;
16use dokuwiki\Search\Index\FileIndex;
17use dokuwiki\Search\Index\Lock;
18use dokuwiki\Search\Index\MemoryIndex;
19use dokuwiki\Search\Index\TupleOps;
20
// Version tag used to force rebuild on upgrade.
// Indexer::getVersion() uses it as the leading part of the version string, and
// Indexer::needsIndexing() reports a page as outdated when its stored tag differs.
const INDEXER_VERSION = 9;
23
24/**
25 * Class DokuWiki Indexer
26 *
27 * Manages the page search index by delegating to Collection classes.
28 *
29 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
30 * @author     Andreas Gohr <andi@splitbrain.org>
31 * @author Tom N Harris <tnharris@whoopdedo.org>
32 */
33class Indexer
34{
    /**
     * @var callable|null Logging callback, receives a string message.
     *                    While unset, log() drops all messages.
     */
    protected $logger;
37
38    /**
39     * Set a logging callback
40     *
41     * The callback receives a single string message. Use this to integrate
42     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
43     *
44     * @param callable $logger
45     * @return static
46     */
47    public function setLogger(callable $logger): static
48    {
49        $this->logger = $logger;
50        return $this;
51    }
52
53    /**
54     * Send a message to the registered logger
55     *
56     * @param string $message
57     */
58    protected function log(string $message): void
59    {
60        if ($this->logger)($this->logger)($message);
61    }
62
63    /**
64     * Version of the indexer taking into consideration the external tokenizer.
65     * The indexer is only compatible with data written by the same version.
66     *
67     * @triggers INDEXER_VERSION_GET
68     * Plugins that modify what gets indexed should hook this event and
69     * add their version info to the event data like so:
70     *     $data[$plugin_name] = $plugin_version;
71     *
72     * @return int|string
73     */
74    public function getVersion(): int|string
75    {
76        static $indexer_version = null;
77        if ($indexer_version == null) {
78            $version = INDEXER_VERSION;
79
80            $data = ['dokuwiki' => $version];
81            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
82            unset($data['dokuwiki']); // this needs to be first
83            ksort($data);
84            foreach ($data as $plugin => $vers) {
85                $version .= '+' . $plugin . '=' . $vers;
86            }
87            $indexer_version = $version;
88        }
89        return $indexer_version;
90    }
91
92    /**
93     * Return a list of all indexed pages
94     *
95     * @param bool $existsFilter only return pages that exist on disk
96     * @return string[] list of page names (keys are the RIDs in the page index)
97     */
98    public function getAllPages(bool $existsFilter = false): array
99    {
100        $pageIndex = new MemoryIndex('page');
101        return array_filter(
102            iterator_to_array($pageIndex),
103            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
104        );
105    }
106
107    /**
108     * Check if a page needs (re-)indexing
109     *
110     * @param string $page
111     * @param bool $force
112     * @return bool true if indexing is needed
113     */
114    public function needsIndexing(string $page, bool $force = false): bool
115    {
116        $idxtag = metaFN($page, '.indexed');
117        if ($force || !file_exists($idxtag)) return true;
118
119        if (trim(io_readFile($idxtag)) != $this->getVersion()) return true;
120
121        $last = @filemtime($idxtag);
122        return $last <= @filemtime(wikiFN($page));
123    }
124
125    /**
126     * Add/update the search index for a page
127     *
128     * Locking is handled internally.
129     *
130     * @param string $page The page to index
131     * @param bool $force force reindexing even when the index is up to date
132     *
133     * @throws IndexAccessException
134     * @throws IndexLockException
135     * @throws IndexWriteException
136     */
137    public function addPage(string $page, bool $force = false): void
138    {
139        if (!$this->needsIndexing($page, $force)) {
140            $this->log("Indexer: index for $page up to date");
141            return;
142        }
143
144        // create shared writable page index early so we can resolve the PID for plugins
145        $pageIndex = new FileIndex('page', '', true);
146
147        // prepare event data
148        $data = [
149            'page' => $page,
150            'body' => '',
151            'metadata' => [
152                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
153                'relation_references' => array_keys(
154                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
155                ),
156                'relation_media' => array_keys(
157                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
158                ),
159                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
160            ],
161            'pid' => $pageIndex->accessCachedValue($page),
162        ];
163
164        // let plugins modify the data
165        $event = new Event('INDEXER_PAGE_ADD', $data);
166        if ($event->advise_before()) {
167            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
168        }
169        $event->advise_after();
170        unset($event);
171
172        // index title
173        (new PageTitleCollection($pageIndex))->lock()
174            ->addEntity($data['page'], [$data['metadata']['title']])->unlock();
175        unset($data['metadata']['title']);
176
177        // index fulltext
178        if ($data['metadata']['internal_index']) {
179            $words = Tokenizer::getWords($data['body']);
180            (new PageFulltextCollection($pageIndex))->lock()->addEntity($data['page'], $words)->unlock();
181        } else {
182            $this->log("Indexer: full text indexing disabled for {$data['page']}");
183            // clear any previously stored fulltext data
184            (new PageFulltextCollection($pageIndex))->lock()->addEntity($data['page'], [])->unlock();
185        }
186        unset($data['metadata']['internal_index']);
187
188        // index metadata keys
189        foreach ($data['metadata'] as $key => $values) {
190            if (!is_array($values)) {
191                $values = ($values !== null && $values !== '') ? [$values] : [];
192            }
193            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($data['page'], $values)->unlock();
194        }
195
196        // update metadata registry
197        $this->updateMetadataRegistry(array_keys($data['metadata']));
198
199        // update index tag file
200        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
201        $this->log("Indexer: finished indexing {$data['page']}");
202    }
203
204    /**
205     * Remove a page from the index
206     *
207     * Clears the page's data from all collections. The entity persists in page.idx.
208     *
209     * @param string $page The page to remove
210     * @param bool $force force deletion even when no .indexed tag exists
211     *
212     * @throws IndexAccessException
213     * @throws IndexLockException
214     * @throws IndexWriteException
215     */
216    public function deletePage(string $page, bool $force = false): void
217    {
218        $idxtag = metaFN($page, '.indexed');
219        if (!$force && !file_exists($idxtag)) {
220            $this->log("Indexer: $page.indexed file does not exist, ignoring");
221            return;
222        }
223
224        $pageIndex = new FileIndex('page', '', true);
225
226        (new PageTitleCollection($pageIndex))->lock()->addEntity($page, [])->unlock();
227        (new PageFulltextCollection($pageIndex))->lock()->addEntity($page, [])->unlock();
228
229        foreach ($this->getMetadataRegistryKeys() as $key) {
230            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($page, [])->unlock();
231        }
232
233        $this->log("Indexer: deleted $page from index");
234        @unlink($idxtag);
235    }
236
237    /**
238     * Rename a page in the search index
239     *
240     * The page must already have been moved on disk before calling this.
241     * Clears the old page's data and re-indexes under the new name.
242     *
243     * @param string $oldpage The old page name
244     * @param string $newpage The new page name
245     *
246     * @throws IndexAccessException
247     * @throws IndexLockException
248     * @throws IndexWriteException
249     */
250    public function renamePage(string $oldpage, string $newpage): void
251    {
252        $this->deletePage($oldpage, true);
253        $this->addPage($newpage, true);
254    }
255
256    /**
257     * Clear all page indexes
258     */
259    public function clear(): void
260    {
261        global $conf;
262
263        Lock::acquire('page');
264
265        // clear metadata indexes
266        foreach ($this->getMetadataRegistryKeys() as $key) {
267            $clean = PageMetaCollection::cleanName($key);
268            @unlink($conf['indexdir'] . '/' . $clean . '_w.idx');
269            @unlink($conf['indexdir'] . '/' . $clean . '_i.idx');
270            @unlink($conf['indexdir'] . '/' . $clean . '_p.idx');
271        }
272
273        // clear fulltext indexes
274        $files = glob($conf['indexdir'] . '/i*.idx');
275        if ($files) foreach ($files as $f) @unlink($f);
276        $files = glob($conf['indexdir'] . '/w*.idx');
277        if ($files) foreach ($files as $f) @unlink($f);
278
279        @unlink($conf['indexdir'] . '/pageword.idx');
280        @unlink($conf['indexdir'] . '/lengths.idx');
281
282        // clear title and page indexes
283        @unlink($conf['indexdir'] . '/title.idx');
284        @unlink($conf['indexdir'] . '/page.idx');
285        @unlink($conf['indexdir'] . '/metadata.idx');
286
287        Lock::release('page');
288    }
289
290    /**
291     * Check the structural integrity of all search indexes
292     *
293     * @throws IndexIntegrityException when a structural inconsistency is found
294     */
295    public function checkIntegrity(): void
296    {
297        (new PageFulltextCollection())->checkIntegrity();
298        (new PageTitleCollection())->checkIntegrity();
299
300        foreach ($this->getMetadataRegistryKeys() as $key) {
301            (new PageMetaCollection($key))->checkIntegrity();
302        }
303    }
304
305    /**
306     * Whether the search index is empty (no fulltext data indexed yet)
307     *
308     * @return bool
309     */
310    public function isIndexEmpty(): bool
311    {
312        return (new PageFulltextCollection())->getTokenIndexMaximum() === 0;
313    }
314
315    /**
316     * Get the list of known metadata keys from the metadata registry
317     *
318     * @return string[] list of metadata key names
319     */
320    protected function getMetadataRegistryKeys(): array
321    {
322        global $conf;
323        $fn = $conf['indexdir'] . '/metadata.idx';
324        if (!file_exists($fn)) return [];
325        $keys = file($fn, FILE_IGNORE_NEW_LINES);
326        return $keys ?: [];
327    }
328
329    /**
330     * Update the metadata registry with new keys
331     *
332     * @param string[] $keys metadata key names to ensure are registered
333     */
334    protected function updateMetadataRegistry(array $keys): void
335    {
336        global $conf;
337        $fn = $conf['indexdir'] . '/metadata.idx';
338        $existing = file_exists($fn) ? file($fn, FILE_IGNORE_NEW_LINES) : [];
339        if (!$existing) $existing = [];
340
341        $added = false;
342        foreach ($keys as $key) {
343            if (!in_array($key, $existing)) {
344                $existing[] = $key;
345                $added = true;
346            }
347        }
348
349        if ($added) {
350            io_saveFile($fn, implode("\n", $existing) . "\n");
351        }
352    }
353
354    // region Deprecated methods
355
356    /**
357     * Find pages containing a metadata value
358     *
359     * @param string $key metadata key name
360     * @param string|string[] $value search term(s)
361     * @param callable|null $func ignored, kept for backward compatibility
362     * @return array
363     *
364     * @deprecated 2026-04-07 use MetadataSearch::lookupKey() instead
365     */
366    public function lookupKey($key, &$value, $func = null)
367    {
368        DebugHelper::dbgDeprecatedFunction(MetadataSearch::class . '::lookupKey()');
369        return (new MetadataSearch())->lookupKey($key, $value);
370    }
371
372    /**
373     * Return a list of all indexed pages, optionally filtered by metadata key
374     *
375     * @param string|null $key metadata key name
376     * @return string[]
377     *
378     * @deprecated 2026-04-07 use MetadataSearch::getPages() or Indexer::getAllPages() instead
379     */
380    public function getPages($key = null)
381    {
382        DebugHelper::dbgDeprecatedFunction(MetadataSearch::class . '::getPages()');
383        return (new MetadataSearch())->getPages($key);
384    }
385
386    /**
387     * Add metadata values for a page
388     *
389     * @param string $page page name
390     * @param string $key metadata key name
391     * @param string|string[]|null $value value(s) to add
392     * @return bool
393     *
394     * @deprecated 2026-04-07 use Collection classes directly instead
395     */
396    public function addMetaKeys($page, $key, $value = null)
397    {
398        DebugHelper::dbgDeprecatedFunction('Collection classes');
399        try {
400            if ($key === 'title') {
401                $collection = new PageTitleCollection();
402            } else {
403                $collection = new PageMetaCollection($key);
404            }
405            $values = is_array($value) ? $value : ($value !== null && $value !== '' ? [$value] : []);
406            $collection->lock()->addEntity($page, $values)->unlock();
407            $this->updateMetadataRegistry([$key]);
408            return true;
409        } catch (SearchException) {
410            return false;
411        }
412    }
413
414    /**
415     * Rename a metadata value in the index
416     *
417     * @param string $key metadata key name
418     * @param string $oldvalue old value
419     * @param string $newvalue new value
420     * @return bool
421     *
422     * @deprecated 2026-04-07 use Collection classes directly instead
423     */
424    public function renameMetaValue($key, $oldvalue, $newvalue)
425    {
426        DebugHelper::dbgDeprecatedFunction('Collection classes');
427        try {
428            $collection = new PageMetaCollection($key);
429            $collection->lock();
430
431            $tokenIndex = $collection->getTokenIndex();
432
433            // find old value — search() is read-only, won't create entries
434            $matches = $tokenIndex->search('/^' . preg_quote($oldvalue, '/') . '$/');
435            if ($matches === []) {
436                $collection->unlock();
437                return true;
438            }
439            $oldid = array_key_first($matches);
440
441            // check if new value already exists (read-only lookup)
442            $newMatches = $tokenIndex->search('/^' . preg_quote($newvalue, '/') . '$/');
443
444            if ($newMatches !== []) {
445                // both values exist — merge frequency data from old to new
446                $newid = array_key_first($newMatches);
447                $freqIndex = $collection->getFrequencyIndex();
448                $reverseIndex = $collection->getReverseIndex();
449                $oldFreqLine = $freqIndex->retrieveRow($oldid);
450
451                if ($oldFreqLine !== '') {
452                    $newFreqLine = $freqIndex->retrieveRow($newid);
453                    foreach (TupleOps::parseTuples($oldFreqLine) as $entityId => $count) {
454                        $newFreqLine = TupleOps::updateTuple($newFreqLine, $entityId, $count);
455
456                        // update reverse index: remove old token, add new
457                        $reverseRow = $reverseIndex->retrieveRow((int)$entityId);
458                        $keyline = explode(':', $reverseRow);
459                        $keyline = array_diff($keyline, [(string)$oldid]);
460                        if (!in_array((string)$newid, $keyline)) {
461                            $keyline[] = $newid;
462                        }
463                        $reverseIndex->changeRow(
464                            (int)$entityId,
465                            implode(':', array_filter($keyline, fn($v) => $v !== ''))
466                        );
467                    }
468                    $freqIndex->changeRow($oldid, '');
469                    $freqIndex->changeRow($newid, $newFreqLine);
470                }
471            } else {
472                // new value doesn't exist — simple rename
473                $tokenIndex->changeRow($oldid, $newvalue);
474            }
475
476            $collection->unlock();
477            return true;
478        } catch (SearchException) {
479            return false;
480        }
481    }
482
483    /**
484     * Get the page ID for a page name
485     *
486     * @param string $page page name
487     * @return int|false
488     *
489     * @deprecated 2026-04-07 use FileIndex directly instead
490     */
491    public function getPID($page)
492    {
493        DebugHelper::dbgDeprecatedFunction(FileIndex::class);
494        try {
495            return (new FileIndex('page', '', true))->accessCachedValue($page);
496        } catch (SearchException) {
497            return false;
498        }
499    }
500
501    /**
502     * Find tokens in the fulltext index
503     *
504     * @param array $tokens list of words to search for
505     * @return array list of pages found [word => [page => count, ...]]
506     *
507     * @deprecated 2026-04-07 use CollectionSearch on PageFulltextCollection instead
508     */
509    public function lookup($tokens)
510    {
511        DebugHelper::dbgDeprecatedFunction(CollectionSearch::class);
512        $collection = new PageFulltextCollection();
513        $search = new CollectionSearch($collection);
514        $termMap = [];
515        foreach ($tokens as $token) {
516            if (!Tokenizer::isValidSearchTerm($token)) continue;
517            $term = $search->addTerm($token);
518            $termMap[$token] = $term;
519        }
520
521        if ($termMap === []) return [];
522        $search->execute();
523
524        $result = [];
525        foreach ($termMap as $word => $term) {
526            $freqs = $term->getEntityFrequencies();
527            // filter to only existing pages
528            $filtered = array_filter($freqs, fn($page) => page_exists($page, '', false), ARRAY_FILTER_USE_KEY);
529            $result[$word] = $filtered;
530        }
531        return $result;
532    }
533
534    // endregion
535}
536