xref: /dokuwiki/inc/Search/Indexer.php (revision 83b3acccb42578eaa33f84e6b13612436320090b)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Search\Collection\PageFulltextCollection;
7use dokuwiki\Search\Collection\PageMetaCollection;
8use dokuwiki\Search\Collection\PageTitleCollection;
9use dokuwiki\Search\Exception\IndexAccessException;
10use dokuwiki\Search\Exception\IndexLockException;
11use dokuwiki\Search\Exception\IndexWriteException;
12use dokuwiki\Search\Index\FileIndex;
13use dokuwiki\Search\Index\Lock;
14
15// Index format version; pages whose .indexed tag differs from the current version are re-indexed (see needsIndexing())
16const INDEXER_VERSION = 8;
17
/**
 * Class DokuWiki Indexer
 *
 * Manages the page search index by delegating to Collection classes.
 *
 * Each page is stored in a title collection, a fulltext collection and one
 * metadata collection per metadata key. The names of all metadata keys ever
 * indexed are tracked in a registry file (metadata.idx) so that they can be
 * cleared again when a page is deleted or the whole index is reset.
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been registered.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger) {
            ($this->logger)($message);
        }
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once per request and cached in a static variable.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string the plain INDEXER_VERSION, or "<version>+<plugin>=<version>..." when plugins contribute
     */
    public function getVersion()
    {
        static $indexer_version = null;

        // strict check: a falsy version value must not defeat the cache
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first
            ksort($data); // stable order, independent of plugin load order
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }

        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty entries of the page index are always skipped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new Index\MemoryIndex('page');

        // array_filter() keeps the original keys
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * A page needs indexing when it is forced, when it has no .indexed tag
     * file, when the tag was written by a different indexer version, or when
     * the tag is not newer than the page file.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) {
            return true;
        }

        // exact string comparison: a loose comparison would treat numeric
        // look-alikes such as "8.0" and 8 as the same version
        if (trim(io_readFile($idxtag)) !== (string) $this->getVersion()) {
            return true;
        }

        // "<=" so that a page changed in the same second the tag was written is picked up again
        $last = @filemtime($idxtag);
        return $last <= @filemtime(wikiFN($page));
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally.
     *
     * @triggers INDEXER_PAGE_ADD
     * Plugins may modify the body text and the metadata that gets indexed.
     * When the default action is prevented, the raw wiki text is not appended
     * to the body.
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): void
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for {$page} up to date");
            return;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // index title
        $this->writeLocked(
            new PageTitleCollection($pageIndex),
            $data['page'],
            [$data['metadata']['title']]
        );
        unset($data['metadata']['title']);

        // index fulltext; an empty word list clears any previously stored fulltext data
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
            $words = [];
        }
        $this->writeLocked(new PageFulltextCollection($pageIndex), $data['page'], $words);
        unset($data['metadata']['internal_index']);

        // index the remaining metadata keys, scalars are wrapped, null/'' means "no values"
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $data['page'], $values);
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): void
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: {$page}.indexed file does not exist, ignoring");
            return;
        }

        $pageIndex = new FileIndex('page', '', true);

        // storing an empty value list removes the page's data from a collection
        $this->writeLocked(new PageTitleCollection($pageIndex), $page, []);
        $this->writeLocked(new PageFulltextCollection($pageIndex), $page, []);

        foreach ($this->getMetadataRegistryKeys() as $key) {
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $page, []);
        }

        $this->log("Indexer: deleted {$page} from index");
        @unlink($idxtag);
    }

    /**
     * Rename a page in the search index
     *
     * The page must already have been moved on disk before calling this.
     * Clears the old page's data and re-indexes under the new name.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): void
    {
        $this->deletePage($oldpage, true);
        $this->addPage($newpage, true);
    }

    /**
     * Clear all page indexes
     *
     * Deletes the metadata, fulltext, title and page index files as well as
     * the metadata registry from the index directory while holding the
     * 'page' lock.
     */
    public function clear(): void
    {
        global $conf;
        $dir = $conf['indexdir'];

        Lock::acquire('page');
        try {
            // clear metadata indexes; the registry has to be read before it is deleted below
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                foreach (['_w', '_i', '_p'] as $suffix) {
                    @unlink($dir . '/' . $clean . $suffix . '.idx');
                }
            }

            // clear fulltext indexes
            foreach (['/i*.idx', '/w*.idx'] as $pattern) {
                $files = glob($dir . $pattern);
                if ($files) {
                    foreach ($files as $f) {
                        @unlink($f);
                    }
                }
            }
            @unlink($dir . '/pageword.idx');
            @unlink($dir . '/lengths.idx');

            // clear title and page indexes
            @unlink($dir . '/title.idx');
            @unlink($dir . '/page.idx');
            @unlink($dir . '/metadata.idx');
        } finally {
            // never leave the lock behind, even when something above throws
            Lock::release('page');
        }
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * @return string[] list of metadata key names, empty when the registry does not exist or is unreadable
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) {
            return [];
        }

        // one key per line; file() returns false on failure
        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * Keys that are not yet registered are appended. The registry file is
     * only rewritten when at least one key was added.
     *
     * @param string[] $keys metadata key names to ensure are registered
     */
    protected function updateMetadataRegistry(array $keys): void
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        $existing = $this->getMetadataRegistryKeys();

        $added = false;
        foreach ($keys as $key) {
            // Registry entries are always strings. Compare strictly so numeric
            // look-alikes ("1000" vs "1e3") are not mistaken for the same key,
            // and cast first because array keys may arrive as integers.
            $key = (string) $key;
            if (!in_array($key, $existing, true)) {
                $existing[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($fn, implode("\n", $existing) . "\n");
        }
    }

    /**
     * Store the values of a page in a collection while holding the collection's lock
     *
     * The lock is released even when writing fails, so a failed write does
     * not leave the index locked.
     *
     * NOTE(review): assumes lock() returns the collection itself (fluent
     * interface), as implied by the lock()->addEntity()->unlock() chains this
     * helper replaces - confirm against the Collection classes.
     *
     * @param object $collection the collection to write to
     * @param string $page the page (entity) the values belong to
     * @param array $values the values to store, an empty list clears the page's data
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    private function writeLocked(object $collection, string $page, array $values): void
    {
        // when lock() itself throws there is nothing to release
        $locked = $collection->lock();
        try {
            $locked->addEntity($page, $values);
        } finally {
            $locked->unlock();
        }
    }
}
323