xref: /dokuwiki/inc/Search/Indexer.php (revision 6e39b4e379a661a3abd765df49fa679d2119741c)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Debug\DebugHelper;
6use dokuwiki\Extension\Event;
7use dokuwiki\Search\Collection\PageFulltextCollection;
8use dokuwiki\Search\Collection\PageMetaCollection;
9use dokuwiki\Search\Collection\PageTitleCollection;
10use dokuwiki\Search\Exception\IndexAccessException;
11use dokuwiki\Search\Exception\IndexIntegrityException;
12use dokuwiki\Search\Exception\IndexLockException;
13use dokuwiki\Search\Exception\IndexWriteException;
14use dokuwiki\Search\Index\FileIndex;
15use dokuwiki\Search\Index\Lock;
16use dokuwiki\Search\Index\MemoryIndex;
17
// Index format version tag: increase it to force a rebuild of the page indexes after an upgrade
19const INDEXER_VERSION = 9;
20
21/**
22 * Class DokuWiki Indexer
23 *
24 * Manages the page search index by delegating to Collection classes.
25 *
26 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
27 * @author     Andreas Gohr <andi@splitbrain.org>
28 * @author Tom N Harris <tnharris@whoopdedo.org>
29 */
30class Indexer
31{
32    /** @var callable|null Logging callback, receives a string message */
33    protected $logger;
34
35    /**
36     * Set a logging callback
37     *
38     * The callback receives a single string message. Use this to integrate
39     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
40     *
41     * @param callable $logger
42     * @return static
43     */
44    public function setLogger(callable $logger): static
45    {
46        $this->logger = $logger;
47        return $this;
48    }
49
50    /**
51     * Send a message to the registered logger
52     *
53     * @param string $message
54     */
55    protected function log(string $message): void
56    {
57        if ($this->logger)($this->logger)($message);
58    }
59
60    /**
61     * Version of the indexer taking into consideration the external tokenizer.
62     * The indexer is only compatible with data written by the same version.
63     *
64     * @triggers INDEXER_VERSION_GET
65     * Plugins that modify what gets indexed should hook this event and
66     * add their version info to the event data like so:
67     *     $data[$plugin_name] = $plugin_version;
68     *
69     * @return int|string
70     */
71    public function getVersion(): int|string
72    {
73        static $indexer_version = null;
74        if ($indexer_version == null) {
75            $version = INDEXER_VERSION;
76
77            $data = ['dokuwiki' => $version];
78            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
79            unset($data['dokuwiki']); // this needs to be first
80            ksort($data);
81            foreach ($data as $plugin => $vers) {
82                $version .= '+' . $plugin . '=' . $vers;
83            }
84            $indexer_version = $version;
85        }
86        return $indexer_version;
87    }
88
89    /**
90     * Return a list of all indexed pages
91     *
92     * @param bool $existsFilter only return pages that exist on disk
93     * @return string[] list of page names (keys are the RIDs in the page index)
94     */
95    public function getAllPages(bool $existsFilter = false): array
96    {
97        $pageIndex = new MemoryIndex('page');
98        return array_filter(
99            iterator_to_array($pageIndex),
100            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
101        );
102    }
103
104    /**
105     * Check if a page needs (re-)indexing
106     *
107     * @param string $page
108     * @param bool $force
109     * @return bool true if indexing is needed
110     */
111    public function needsIndexing(string $page, bool $force = false): bool
112    {
113        $idxtag = metaFN($page, '.indexed');
114        if ($force || !file_exists($idxtag)) return true;
115
116        if (trim(io_readFile($idxtag)) != $this->getVersion()) return true;
117
118        $last = @filemtime($idxtag);
119        return $last <= @filemtime(wikiFN($page));
120    }
121
122    /**
123     * Add/update the search index for a page
124     *
125     * Locking is handled internally.
126     *
127     * @param string $page The page to index
128     * @param bool $force force reindexing even when the index is up to date
129     *
130     * @throws IndexAccessException
131     * @throws IndexLockException
132     * @throws IndexWriteException
133     */
134    public function addPage(string $page, bool $force = false): void
135    {
136        if (!$this->needsIndexing($page, $force)) {
137            $this->log("Indexer: index for $page up to date");
138            return;
139        }
140
141        // create shared writable page index early so we can resolve the PID for plugins
142        $pageIndex = new FileIndex('page', '', true);
143
144        // prepare event data
145        $data = [
146            'page' => $page,
147            'body' => '',
148            'metadata' => [
149                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
150                'relation_references' => array_keys(
151                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
152                ),
153                'relation_media' => array_keys(
154                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
155                ),
156                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
157            ],
158            'pid' => $pageIndex->accessCachedValue($page),
159        ];
160
161        // let plugins modify the data
162        $event = new Event('INDEXER_PAGE_ADD', $data);
163        if ($event->advise_before()) {
164            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
165        }
166        $event->advise_after();
167        unset($event);
168
169        // index title
170        (new PageTitleCollection($pageIndex))->lock()
171            ->addEntity($data['page'], [$data['metadata']['title']])->unlock();
172        unset($data['metadata']['title']);
173
174        // index fulltext
175        if ($data['metadata']['internal_index']) {
176            $words = Tokenizer::getWords($data['body']);
177            (new PageFulltextCollection($pageIndex))->lock()->addEntity($data['page'], $words)->unlock();
178        } else {
179            $this->log("Indexer: full text indexing disabled for {$data['page']}");
180            // clear any previously stored fulltext data
181            (new PageFulltextCollection($pageIndex))->lock()->addEntity($data['page'], [])->unlock();
182        }
183        unset($data['metadata']['internal_index']);
184
185        // index metadata keys
186        foreach ($data['metadata'] as $key => $values) {
187            if (!is_array($values)) {
188                $values = ($values !== null && $values !== '') ? [$values] : [];
189            }
190            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($data['page'], $values)->unlock();
191        }
192
193        // update metadata registry
194        $this->updateMetadataRegistry(array_keys($data['metadata']));
195
196        // update index tag file
197        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
198        $this->log("Indexer: finished indexing {$data['page']}");
199    }
200
201    /**
202     * Remove a page from the index
203     *
204     * Clears the page's data from all collections. The entity persists in page.idx.
205     *
206     * @param string $page The page to remove
207     * @param bool $force force deletion even when no .indexed tag exists
208     *
209     * @throws IndexAccessException
210     * @throws IndexLockException
211     * @throws IndexWriteException
212     */
213    public function deletePage(string $page, bool $force = false): void
214    {
215        $idxtag = metaFN($page, '.indexed');
216        if (!$force && !file_exists($idxtag)) {
217            $this->log("Indexer: $page.indexed file does not exist, ignoring");
218            return;
219        }
220
221        $pageIndex = new FileIndex('page', '', true);
222
223        (new PageTitleCollection($pageIndex))->lock()->addEntity($page, [])->unlock();
224        (new PageFulltextCollection($pageIndex))->lock()->addEntity($page, [])->unlock();
225
226        foreach ($this->getMetadataRegistryKeys() as $key) {
227            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($page, [])->unlock();
228        }
229
230        $this->log("Indexer: deleted $page from index");
231        @unlink($idxtag);
232    }
233
234    /**
235     * Rename a page in the search index
236     *
237     * The page must already have been moved on disk before calling this.
238     * Clears the old page's data and re-indexes under the new name.
239     *
240     * @param string $oldpage The old page name
241     * @param string $newpage The new page name
242     *
243     * @throws IndexAccessException
244     * @throws IndexLockException
245     * @throws IndexWriteException
246     */
247    public function renamePage(string $oldpage, string $newpage): void
248    {
249        $this->deletePage($oldpage, true);
250        $this->addPage($newpage, true);
251    }
252
253    /**
254     * Clear all page indexes
255     */
256    public function clear(): void
257    {
258        global $conf;
259
260        Lock::acquire('page');
261
262        // clear metadata indexes
263        foreach ($this->getMetadataRegistryKeys() as $key) {
264            $clean = PageMetaCollection::cleanName($key);
265            @unlink($conf['indexdir'] . '/' . $clean . '_w.idx');
266            @unlink($conf['indexdir'] . '/' . $clean . '_i.idx');
267            @unlink($conf['indexdir'] . '/' . $clean . '_p.idx');
268        }
269
270        // clear fulltext indexes
271        $files = glob($conf['indexdir'] . '/i*.idx');
272        if ($files) foreach ($files as $f) @unlink($f);
273        $files = glob($conf['indexdir'] . '/w*.idx');
274        if ($files) foreach ($files as $f) @unlink($f);
275
276        @unlink($conf['indexdir'] . '/pageword.idx');
277        @unlink($conf['indexdir'] . '/lengths.idx');
278
279        // clear title and page indexes
280        @unlink($conf['indexdir'] . '/title.idx');
281        @unlink($conf['indexdir'] . '/page.idx');
282        @unlink($conf['indexdir'] . '/metadata.idx');
283
284        Lock::release('page');
285    }
286
287    /**
288     * Check the structural integrity of all search indexes
289     *
290     * @throws IndexIntegrityException when a structural inconsistency is found
291     */
292    public function checkIntegrity(): void
293    {
294        (new PageFulltextCollection())->checkIntegrity();
295        (new PageTitleCollection())->checkIntegrity();
296
297        foreach ($this->getMetadataRegistryKeys() as $key) {
298            (new PageMetaCollection($key))->checkIntegrity();
299        }
300    }
301
302    /**
303     * Whether the search index is empty (no fulltext data indexed yet)
304     *
305     * @return bool
306     */
307    public function isIndexEmpty(): bool
308    {
309        return (new PageFulltextCollection())->getTokenIndexMaximum() === 0;
310    }
311
312    /**
313     * Get the list of known metadata keys from the metadata registry
314     *
315     * @return string[] list of metadata key names
316     */
317    protected function getMetadataRegistryKeys(): array
318    {
319        global $conf;
320        $fn = $conf['indexdir'] . '/metadata.idx';
321        if (!file_exists($fn)) return [];
322        $keys = file($fn, FILE_IGNORE_NEW_LINES);
323        return $keys ?: [];
324    }
325
326    /**
327     * Update the metadata registry with new keys
328     *
329     * @param string[] $keys metadata key names to ensure are registered
330     *
331     * @internal Only marked public for access via LegacyIndexer
332     */
333    public function updateMetadataRegistry(array $keys): void
334    {
335        global $conf;
336        $fn = $conf['indexdir'] . '/metadata.idx';
337        $existing = file_exists($fn) ? file($fn, FILE_IGNORE_NEW_LINES) : [];
338        if (!$existing) $existing = [];
339
340        $added = false;
341        foreach ($keys as $key) {
342            if (!in_array($key, $existing)) {
343                $existing[] = $key;
344                $added = true;
345            }
346        }
347
348        if ($added) {
349            io_saveFile($fn, implode("\n", $existing) . "\n");
350        }
351    }
352
353    /**
354     * Return a list of all indexed pages, optionally filtered by metadata key
355     *
356     * Kept on Indexer (not just LegacyIndexer) because several plugins call it
357     * directly on `new Indexer()` instances rather than going through
358     * idx_get_indexer().
359     *
360     * @param string|null $key metadata key name
361     * @return string[]
362     *
363     * @deprecated 2026-04-07 use MetadataSearch::getPages() or Indexer::getAllPages() instead
364     */
365    public function getPages($key = null)
366    {
367        DebugHelper::dbgDeprecatedFunction(MetadataSearch::class . '::getPages()');
368        return (new MetadataSearch())->getPages($key);
369    }
370}
371