xref: /dokuwiki/inc/Search/Indexer.php (revision 4d04b7bbfe9c97673b0f22586d88e161aca34f70)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Search\PagewordIndex;
7use dokuwiki\Search\MetadataIndex;
8
// Version tag of the index data. getVersion() uses it as the base of the full
// version string, and addPage() reindexes any page whose stored .indexed tag
// differs from that string, so raising this value forces a rebuild on upgrade.
const INDEXER_VERSION = 8;
11
12/**
13 * Class DokuWiki Indexer (Singleton)
14 *
15 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
16 * @author     Andreas Gohr <andi@splitbrain.org>
17 * @author Tom N Harris <tnharris@whoopdedo.org>
18 */
19class Indexer extends AbstractIndex
20{
21    /** @var Indexer $instance */
22    protected static $instance = null;
23
24    /**
25     * Get new or existing singleton instance of the Indexer
26     *
27     * @return Indexer
28     */
29    public static function getInstance()
30    {
31        if (is_null(static::$instance)) {
32            static::$instance = new static();
33        }
34        return static::$instance;
35    }
36
37    /**
38     * Dispatch Indexing request for the page, called by TaskRunner::runIndexer()
39     *
40     * @param string        $page   name of the page to index
41     * @param bool          $verbose    print status messages
42     * @param bool          $force  force reindexing even when the index is up to date
43     * @return bool  If the function completed successfully
44     *
45     * @author Tom N Harris <tnharris@whoopdedo.org>
46     * @author Satoshi Sahara <sahara.satoshi@gmail.com>
47     */
48    public function dispatch($page, $verbose = false, $force = false)
49    {
50        // check if page was deleted but is still in the index
51        if (!page_exists($page)) {
52            $result = $this->deletePage($page, $verbose, $force);
53            return $result;
54        } else {
55            // update search index
56            $result = $this->addPage($page, $verbose, $force);
57            return $result;
58        }
59    }
60
61    /**
62     * Version of the indexer taking into consideration the external tokenizer.
63     * The indexer is only compatible with data written by the same version.
64     *
65     * @triggers INDEXER_VERSION_GET
66     * Plugins that modify what gets indexed should hook this event and
67     * add their version info to the event data like so:
68     *     $data[$plugin_name] = $plugin_version;
69     *
70     * @author Tom N Harris <tnharris@whoopdedo.org>
71     * @author Michael Hamann <michael@content-space.de>
72     *
73     * @return int|string
74     */
75    public function getVersion()
76    {
77        static $indexer_version = null;
78        if ($indexer_version == null) {
79            $version = INDEXER_VERSION;
80
81            // DokuWiki version is included for the convenience of plugins
82            $data = array('dokuwiki' => $version);
83            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
84            unset($data['dokuwiki']); // this needs to be first
85            ksort($data);
86            foreach ($data as $plugin => $vers) {
87                $version .= '+'.$plugin.'='.$vers;
88            }
89            $indexer_version = $version;
90        }
91        return $indexer_version;
92    }
93
94    /**
95     * Adds/updates the search index for the given page
96     *
97     * Locking is handled internally.
98     *
99     * @param string        $page   name of the page to index
100     * @param bool          $verbose    print status messages
101     * @param bool          $force  force reindexing even when the index is up to date
102     * @return bool  If the function completed successfully
103     *
104     * @author Tom N Harris <tnharris@whoopdedo.org>
105     * @author Satoshi Sahara <sahara.satoshi@gmail.com>
106     */
107    public function addPage($page, $verbose = false, $force = false)
108    {
109        // check if indexing needed for the existing page (full text and/or metadata indexing)
110        $idxtag = metaFN($page,'.indexed');
111        if (!$force && file_exists($idxtag)) {
112            if (trim(io_readFile($idxtag)) == $this->getVersion()) {
113                $last = @filemtime($idxtag);
114                if ($last > @filemtime(wikiFN($page))) {
115                    if ($verbose) dbglog("Indexer: index for {$page} up to date");
116                    return true;
117                }
118            }
119        }
120
121        // register the page to the page.idx
122        $pid = $this->getPID($page);
123        if ($pid === false) {
124            if ($verbose) dbglog("Indexer: getting the PID failed for {$page}");
125            trigger_error("Failed to get PID for {$page}", E_USER_ERROR);
126            return false;
127        }
128
129        // prepare metadata indexing
130        $metadata = array();
131        $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);
132
133        $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
134        $metadata['relation_references'] = ($references !== null) ?
135                array_keys($references) : array();
136
137        $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
138        $metadata['relation_media'] = ($media !== null) ?
139                array_keys($media) : array();
140
141        // check if full text indexing allowed
142        $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
143        if ($indexenabled !== false) $indexenabled = true;
144        $metadata['internal_index'] = $indexenabled;
145
146        $body = '';
147        $data = compact('page', 'body', 'metadata', 'pid');
148        $event = new Event('INDEXER_PAGE_ADD', $data);
149        if ($event->advise_before()) $data['body'] = $data['body'].' '.rawWiki($page);
150        $event->advise_after();
151        unset($event);
152        extract($data);
153        $indexenabled = $metadata['internal_index'];
154        unset($metadata['internal_index']);
155
156        // Access to Metadata Index
157        $MetadataIndex = MetadataIndex::getInstance();
158        $result = $MetadataIndex->addMetaKeys($page, $metadata);
159        if ($verbose) dbglog("Indexer: addMetaKeys({$page}) ".($result ? 'done' : 'failed'));
160        if (!$result) {
161            return false;
162        }
163
164        // Access to Pageword Index
165        $PagewordIndex = PagewordIndex::getInstance();
166        if ($indexenabled) {
167            $result = $PagewordIndex->addPageWords($page, $body);
168            if ($verbose) dbglog("Indexer: addPageWords({$page}) ".($result ? 'done' : 'failed'));
169            if (!$result) {
170                return false;
171            }
172        } else {
173            if ($verbose) dbglog("Indexer: full text indexing disabled for {$page}");
174            // ensure the page content deleted from the pageword index
175            $result = $PagewordIndex->deletePageWords($page);
176            if ($verbose) dbglog("Indexer: deletePageWords({$page}) ".($result ? 'done' : 'failed'));
177            if (!$result) {
178                return false;
179            }
180        }
181
182        // update index tag file
183        io_saveFile($idxtag, $this->getVersion());
184        if ($verbose) dbglog("Indexer: finished");
185
186        return $result;
187    }
188
189    /**
190     * Remove a page from the index, erases entries in all known indexes
191     *
192     * Locking is handled internally.
193     *
194     * @param string        $page   name of the page to index
195     * @param bool          $verbose    print status messages
196     * @param bool          $force  force reindexing even when the index is up to date
197     * @return bool  If the function completed successfully
198     *
199     * @author Tom N Harris <tnharris@whoopdedo.org>
200     * @author Satoshi Sahara <sahara.satoshi@gmail.com>
201     */
202    public function deletePage($page, $verbose = false, $force = false)
203    {
204        $idxtag = metaFN($page,'.indexed');
205        if (!$force && !file_exists($idxtag)) {
206            if ($verbose) dbglog("Indexer: {$page}.indexed file does not exist, ignoring");
207            return true;
208        }
209
210        // remove obsoleted content from pageword index
211        $PagewordIndex = PagewordIndex::getInstance();
212        $result = $PagewordIndex->deletePageWords($page);
213        if ($verbose) dbglog("Indexer: deletePageWords({$page}) ".($result ? 'done' : 'failed'));
214        if (!$result) {
215            return false;
216        }
217
218        // delete all keys of the page from metadata index
219        $MetadataIndex = MetadataIndex::getInstance();
220        $result = $MetadataIndex->deleteMetaKeys($page);
221        if ($verbose) dbglog("Indexer: deleteMetaKeys({$page}) ".($result ? 'done' : 'failed'));
222        if (!$result) {
223            return false;
224        }
225
226        // mark the page as deleted in the page.idx
227        $pid = $this->getPID($page);
228        if ($pid !== false) {
229            if (!$this->lock()) return false;  // set $errors property
230            $result = $this->saveIndexKey('page', '', $pid, '#deleted:'.$page);
231            if ($verbose) dbglog("Indexer: update page.idx  ".($result ? 'done' : 'failed'));
232            $this->unlock();
233        } else {
234            if ($verbose) dbglog("Indexer: {$page} not found in the page.idx, ignoring");
235            $result = true;
236        }
237
238        unset(static::$pidCache[$pid]);
239        @unlink($idxtag);
240        return $result;
241    }
242
243    /**
244     * Rename a page in the search index without changing the indexed content.
245     * This function doesn't check if the old or new name exists in the filesystem.
246     * It returns an error if the old page isn't in the page list of the indexer
247     * and it deletes all previously indexed content of the new page.
248     *
249     * @param string $oldpage The old page name
250     * @param string $newpage The new page name
251     * @return bool           If the page was successfully renamed
252     */
253    public function renamePage($oldpage, $newpage)
254    {
255        $index = $this->getIndex('page', '');
256        // check if oldpage found in page.idx
257        $oldPid = array_search($oldpage, $index, true);
258        if ($oldPid === false) return false;
259
260        // check if newpage found in page.idx
261        $newPid = array_search($newpage, $index, true);
262        if ($newPid !== false) {
263            $result = $this->deletePage($newpage);
264            if (!$result) return false;
265            // Note: $index is no longer valid after deletePage()!
266            unset($index);
267        }
268
269        // update page.idx
270        if (!$this->lock()) return false;  // set $errors property
271        $result = $this->saveIndexKey('page', '', $oldPid, $newpage);
272        $this->unlock();
273
274        // reset the pid cache
275        $this->resetPIDCache();
276
277        return $result;
278    }
279
280    /**
281     * Clear the Page Index
282     *
283     * @param bool   $requireLock
284     * @return bool  If the index has been cleared successfully
285     */
286    public function clear($requireLock = true)
287    {
288        global $conf;
289
290        if ($requireLock && !$this->lock()) return false;
291
292        // clear Metadata Index
293        $MetadataIndex = MetadataIndex::getInstance();
294        $MetadataIndex->clear(false);
295
296        // clear Pageword Index
297        $PagewordIndex = PagewordIndex::getInstance();
298        $PagewordIndex->clear(false);
299
300        @unlink($conf['indexdir'].'/page.idx');
301
302        // clear the pid cache
303        $this->resetPIDCache();
304
305        if ($requireLock) $this->unlock();
306        return true;
307    }
308
309
310    /**
311     * Return a list of words sorted by number of times used
312     *
313     * @param int       $min    bottom frequency threshold
314     * @param int       $max    upper frequency limit. No limit if $max<$min
315     * @param int       $minlen minimum length of words to count
316     * @param string    $key    metadata key to list. Uses the fulltext index if not given
317     * @return array            list of words as the keys and frequency as values
318     *
319     * @author Tom N Harris <tnharris@whoopdedo.org>
320     */
321    public function histogram($key = null, $min = 1, $max = 0, $minlen = 3)
322    {
323        if ($min < 1)    $min = 1;
324        if ($max < $min) $max = 0;
325
326        $result = array();
327
328        if ($key == 'title') {
329            $index = $this->getIndex('title', '');
330            $index = array_count_values($index);
331            foreach ($index as $val => $cnt) {
332                if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen) {
333                    $result[$val] = $cnt;
334                }
335            }
336        } elseif (!is_null($key)) {
337            $metaname = $this->cleanName($key);
338            $index = $this->getIndex($metaname.'_i', '');
339            $val_idx = array();
340            foreach ($index as $wid => $line) {
341                $freq = $this->countTuples($line);
342                if ($freq >= $min && (!$max || $freq <= $max)) {
343                    $val_idx[$wid] = $freq;
344                }
345            }
346            if (!empty($val_idx)) {
347                $words = $this->getIndex($metaname.'_w', '');
348                foreach ($val_idx as $wid => $freq) {
349                    if (strlen($words[$wid]) >= $minlen) {
350                        $result[$words[$wid]] = $freq;
351                    }
352                }
353            }
354        } else {
355            $PagewordIndex = PagewordIndex::getInstance();
356            $lengths = $PagewordIndex->listIndexLengths();
357            foreach ($lengths as $length) {
358                if ($length < $minlen) continue;
359                $index = $this->getIndex('i', $length);
360                $words = null;
361                foreach ($index as $wid => $line) {
362                    $freq = $this->countTuples($line);
363                    if ($freq >= $min && (!$max || $freq <= $max)) {
364                        if ($words === null) {
365                            $words = $this->getIndex('w', $length);
366                        }
367                        $result[$words[$wid]] = $freq;
368                    }
369                }
370            }
371        }
372
373        arsort($result);
374        return $result;
375    }
376}
377