xref: /dokuwiki/inc/Search/Indexer.php (revision 21fbd01b3c3eea88b767376b7b158f31f0f63127)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Search\Collection\PageFulltextCollection;
7use dokuwiki\Search\Collection\PageMetaCollection;
8use dokuwiki\Search\Collection\PageTitleCollection;
9use dokuwiki\Search\Exception\IndexAccessException;
10use dokuwiki\Search\Exception\IndexIntegrityException;
11use dokuwiki\Search\Exception\IndexLockException;
12use dokuwiki\Search\Exception\IndexWriteException;
13use dokuwiki\Search\Index\FileIndex;
14use dokuwiki\Search\Index\Lock;
15
// Version tag of the index data format; bump it to force a rebuild on upgrade.
// Indexer::getVersion() uses it as the base of the version string that
// Indexer::needsIndexing() compares against each page's `.indexed` tag file.
const INDEXER_VERSION = 8;
18
19/**
20 * Class DokuWiki Indexer
21 *
22 * Manages the page search index by delegating to Collection classes.
23 *
24 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
25 * @author     Andreas Gohr <andi@splitbrain.org>
26 * @author Tom N Harris <tnharris@whoopdedo.org>
27 */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static the indexer itself, to allow call chaining
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been registered.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger) {
            ($this->logger)($message);
        }
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once per request and cached in a static variable.
     * Without plugin contributions it is the bare INDEXER_VERSION integer;
     * otherwise it is a string of the form "<version>+<plugin>=<vers>+...",
     * with the plugin entries sorted by plugin name.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string
     */
    public function getVersion()
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first
            ksort($data); // stable order, independent of plugin load order
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty entries of the page index are always dropped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new Index\MemoryIndex('page');
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * Indexing is needed when forced, when the page has no `.indexed` tag file,
     * when the tag was written by a different indexer version, or when the tag
     * is not newer than the page file.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) {
            return true;
        }

        // Compare as exact strings: a loose comparison would treat numeric
        // look-alikes such as "8.0" or "08" as equal to version 8.
        if (trim(io_readFile($idxtag)) !== (string) $this->getVersion()) {
            return true;
        }

        $last = @filemtime($idxtag);
        return $last <= @filemtime(wikiFN($page));
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally.
     *
     * @triggers INDEXER_PAGE_ADD
     * Handlers receive the page name, body, metadata and page id and may modify
     * them. The raw wiki text is appended to the body unless the default action
     * is prevented.
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): void
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for {$page} up to date");
            return;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // index title
        $this->writeLocked(
            new PageTitleCollection($pageIndex),
            $data['page'],
            [$data['metadata']['title']]
        );
        unset($data['metadata']['title']);

        // index fulltext; an empty word list clears any previously stored fulltext data
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
            $words = [];
        }
        $this->writeLocked(new PageFulltextCollection($pageIndex), $data['page'], $words);
        unset($data['metadata']['internal_index']);

        // index metadata keys (whatever is left in the metadata array at this point)
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                // scalar values become a one-element list; null and '' mean "no values"
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $data['page'], $values);
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): void
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: {$page}.indexed file does not exist, ignoring");
            return;
        }

        $pageIndex = new FileIndex('page', '', true);

        // writing an empty value list clears the page's entries in each collection
        $this->writeLocked(new PageTitleCollection($pageIndex), $page, []);
        $this->writeLocked(new PageFulltextCollection($pageIndex), $page, []);

        foreach ($this->getMetadataRegistryKeys() as $key) {
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $page, []);
        }

        $this->log("Indexer: deleted {$page} from index");
        @unlink($idxtag);
    }

    /**
     * Rename a page in the search index
     *
     * The page must already have been moved on disk before calling this.
     * Clears the old page's data and re-indexes under the new name.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): void
    {
        $this->deletePage($oldpage, true);
        $this->addPage($newpage, true);
    }

    /**
     * Clear all page indexes
     *
     * Deletes the index files from the configured index directory while holding
     * the 'page' lock. Missing files are silently ignored.
     */
    public function clear(): void
    {
        global $conf;

        Lock::acquire('page');
        try {
            $dir = $conf['indexdir'];

            // clear metadata indexes (must happen before the registry itself is removed)
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                foreach (['_w.idx', '_i.idx', '_p.idx'] as $suffix) {
                    @unlink($dir . '/' . $clean . $suffix);
                }
            }

            // clear fulltext indexes
            foreach (['/i*.idx', '/w*.idx'] as $pattern) {
                $files = glob($dir . $pattern);
                if ($files) {
                    foreach ($files as $f) {
                        @unlink($f);
                    }
                }
            }

            @unlink($dir . '/pageword.idx');
            @unlink($dir . '/lengths.idx');

            // clear title and page indexes
            @unlink($dir . '/title.idx');
            @unlink($dir . '/page.idx');
            @unlink($dir . '/metadata.idx');
        } finally {
            // never leave the lock behind, even if something above fails unexpectedly
            Lock::release('page');
        }
    }

    /**
     * Check the structural integrity of all search indexes
     *
     * Checks the fulltext and title collections and one metadata collection
     * per key listed in the metadata registry.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     */
    public function checkIntegrity(): void
    {
        (new PageFulltextCollection())->checkIntegrity();
        (new PageTitleCollection())->checkIntegrity();

        foreach ($this->getMetadataRegistryKeys() as $key) {
            (new PageMetaCollection($key))->checkIntegrity();
        }
    }

    /**
     * Whether the search index is empty (no fulltext data indexed yet)
     *
     * @return bool
     */
    public function isIndexEmpty(): bool
    {
        return (new PageFulltextCollection())->getTokenIndexMaximum() === 0;
    }

    /**
     * Write values for a page into a collection while holding its lock
     *
     * The lock is released even when the write throws, so a failed write does
     * not leave the index locked. On success the call sequence is the same as
     * `$collection->lock()->addEntity($page, $values)->unlock()`.
     *
     * @param object $collection a collection offering lock(), addEntity() and unlock()
     * @param string $page The page (entity) to write
     * @param array $values The values to store; an empty list clears the entry
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function writeLocked(object $collection, string $page, array $values): void
    {
        // a failing lock() propagates directly: there is nothing to release then
        $locked = $collection->lock();
        $target = $locked;
        try {
            $target = $locked->addEntity($page, $values);
        } finally {
            $target->unlock();
        }
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * The registry is the file metadata.idx in the index directory, holding
     * one key per line.
     *
     * @return string[] list of metadata key names, empty if there is no registry
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) {
            return [];
        }
        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * Keys that are not registered yet are appended; the registry file is only
     * rewritten when at least one key was added.
     *
     * @param string[] $keys metadata key names to ensure are registered
     */
    protected function updateMetadataRegistry(array $keys): void
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        $registered = $this->getMetadataRegistryKeys();

        $added = false;
        foreach ($keys as $key) {
            // Registry entries are strings read from a file. Compare strictly as
            // strings, otherwise numeric look-alikes ("1e3" vs "1000") would be
            // mistaken for already registered keys.
            $key = (string) $key;
            if (!in_array($key, $registered, true)) {
                $registered[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($fn, implode("\n", $registered) . "\n");
        }
    }
}
349