xref: /dokuwiki/inc/indexer.php (revision fe15e2c063a38f65804c55e581c72b96ac36edf7)
1<?php
/**
 * Functions to create the fulltext search index
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */
9use dokuwiki\Utf8\Clean;
10use dokuwiki\Extension\Event;
11use dokuwiki\Search\Indexer;
12
// Base version of the index format. It is the leading part of the tag that
// idx_get_version() builds and idx_addPage() stores next to each indexed page,
// so changing it makes existing tags stop matching and forces a rebuild.
define('INDEXER_VERSION', 8);

// Minimum token length to use in the index (note, this doesn't apply to numeric tokens).
// Only set here when nothing defined it earlier.
if (!defined('IDX_MINWORDLENGTH')) {
    define('IDX_MINWORDLENGTH', 2);
}
18
/**
 * Build the version identifier of the index data format.
 *
 * The identifier starts with INDEXER_VERSION. Every entry that handlers of
 * the INDEXER_VERSION_GET event add to the event data is appended as
 * "+name=version", ordered by name. The result is kept in a static variable,
 * so the event is triggered only for the first call.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string the plain INDEXER_VERSION when nothing was added, otherwise the composed string
 */
function idx_get_version()
{
    static $indexer_version = null;
    if ($indexer_version != null) {
        return $indexer_version;
    }

    $composed = INDEXER_VERSION;

    // The DokuWiki version is handed to the handlers for their convenience
    $data = ['dokuwiki' => $composed];
    Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

    // It already leads the identifier, so it must not take part in the sorted list
    unset($data['dokuwiki']);
    ksort($data);
    foreach ($data as $plugin => $vers) {
        $composed .= '+' . $plugin . '=' . $vers;
    }

    $indexer_version = $composed;
    return $indexer_version;
}
50
/**
 * Compute the "length" of a word as used by the index.
 *
 * Starts from the byte length and adds a bonus for every byte in the range
 * 0xE2-0xEF (lead bytes of three-byte UTF-8 sequences): the bonus is the
 * byte value minus 0xE1. Strings without such bytes simply get strlen().
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    // If left alone, all chinese "words" would end up in w3.idx,
    // so the "length" of such a "word" is faked
    $bonus = 0;
    if (preg_match_all('/[\xE2-\xEF]/', $w, $found)) {
        foreach ($found[0] as $lead) {
            $bonus += ord($lead) - 0xE1;
        }
    }
    return strlen($w) + $bonus;
}
71
/**
 * Return the shared Indexer instance.
 *
 * The instance is created on the first call and kept in a static variable;
 * every later call returns that same object.
 *
 * @return Indexer    an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $Indexer = null;
    if ($Indexer === null) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}
87
/**
 * Returns words that will be ignored.
 *
 * Reads inc/lang/<lang>/stopwords.txt for the configured language, one word
 * per line. The list is loaded once and kept in a static variable; it is
 * returned by reference. A missing or unreadable file yields an empty list.
 *
 * @return array                list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
        $lines = file_exists($swfile) ? file($swfile, FILE_IGNORE_NEW_LINES) : false;
        // file() returns false when the file exists but cannot be read; never
        // cache that as the word list, callers expect an array
        $stopwords = is_array($lines) ? $lines : [];
    }
    return $stopwords;
}
109
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Steps, in order:
 *  - a page that no longer exists is removed from the index (if it was indexed)
 *  - an index tag that matches idx_get_version() and is newer than the page
 *    file ends the run early, unless $force is set
 *  - a page whose 'internal index' metadata is false is removed from the index
 *  - otherwise the page text and metadata keys are handed to the Indexer and
 *    the '.indexed' tag file is rewritten on success
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  false when nothing was done or the index was locked,
 *                         otherwise the result reported by the Indexer; in
 *                         verbose mode a completed indexing run always returns true
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $tagFile = metaFN($page, '.indexed');

    // the page was deleted: drop it from the index if it is still in there
    if (!page_exists($page)) {
        if (!file_exists($tagFile)) {
            if ($verbose) {
                echo "Indexer: $page does not exist, ignoring" . DOKU_LF;
            }
            return false;
        }
        $result = idx_get_indexer()->deletePage($page);
        if ($result === "locked") {
            if ($verbose) {
                echo "Indexer: locked" . DOKU_LF;
            }
            return false;
        }
        @unlink($tagFile);
        return $result;
    }

    // nothing to do when the tag carries the current version and is newer than the page
    $upToDate = !$force
        && file_exists($tagFile)
        && trim(io_readFile($tagFile)) == idx_get_version()
        && @filemtime($tagFile) > @filemtime(wikiFN($page));
    if ($upToDate) {
        if ($verbose) {
            echo "Indexer: index for $page up to date" . DOKU_LF;
        }
        return false;
    }

    // indexing explicitly switched off for this page: make sure it is not in the index
    if (p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) === false) {
        $result = false;
        if (file_exists($tagFile)) {
            $result = idx_get_indexer()->deletePage($page);
            if ($result === "locked") {
                if ($verbose) {
                    echo "Indexer: locked" . DOKU_LF;
                }
                return false;
            }
            @unlink($tagFile);
        }
        if ($verbose) {
            echo "Indexer: index disabled for $page" . DOKU_LF;
        }
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) {
            echo "Indexer: getting the PID failed for $page" . DOKU_LF;
        }
        return false;
    }

    // collect what goes into the index besides the page text
    $body = '';
    $metadata = ['title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED)];

    $linked = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = ($linked !== null) ? array_keys($linked) : [];

    $used = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = ($used !== null) ? array_keys($used) : [];

    // let handlers adjust the data; the raw wiki text is appended unless the default was prevented
    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $event = new Event('INDEXER_PAGE_ADD', $data);
    if ($event->advise_before()) {
        $data['body'] = $data['body'] . " " . rawWiki($page);
    }
    $event->advise_after();
    unset($event);
    // pull the (possibly modified) page, body, metadata and pid back into local variables
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) {
            echo "Indexer: locked" . DOKU_LF;
        }
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) {
                echo "Indexer: locked" . DOKU_LF;
            }
            return false;
        }
    }

    // remember which index version this page was indexed with
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }
    if ($verbose) {
        echo "Indexer: finished" . DOKU_LF;
        return true;
    }
    return $result;
}
214
/**
 * Find tokens in the fulltext index
 *
 * Forwards the word list to the lookup() method of the shared Indexer
 * and returns whatever it reports.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    return idx_get_indexer()->lookup($words);
}
232
/**
 * Split a string into tokens
 *
 * Forwards both arguments to the tokenizer() method of the shared Indexer.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    return idx_get_indexer()->tokenizer($string, $wc);
}
246
/* For compatibility */
248
/**
 * Read the lines of an index file (if it exists).
 *
 * The file is looked up as <indexdir>/<idx><suffix>.idx. Lines are returned
 * as read by file() without flags, i.e. including their line endings.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array the lines of the file, an empty array when it does not exist
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $path = "{$conf['indexdir']}/{$idx}{$suffix}.idx";
    return file_exists($path) ? file($path) : [];
}
265
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Scans the index directory for files named i<number>.idx and returns the
 * numbers as a sorted list. When $conf['readdircache'] is non-zero the list
 * is cached in <indexdir>/lengths.idx and served from there while that file
 * is younger than $conf['readdircache'] seconds. Writing the cache is best
 * effort: a cache file that cannot be opened is silently skipped.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array sorted list of int lengths, empty when the index directory cannot be opened
 */
function idx_listIndexLengths()
{
    global $conf;
    $cacheFile = $conf['indexdir'] . '/lengths.idx';
    $docache = ($conf['readdircache'] != 0);

    // serve from the cache file while it is still fresh
    if ($docache) {
        clearstatcache();
        if (
            file_exists($cacheFile)
            && (time() < @filemtime($cacheFile) + $conf['readdircache'])
        ) {
            $lengths = @file($cacheFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                return array_map('intval', $lengths);
            }
        }
    }

    // (re)build the list from the directory listing
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cacheFile, 'w');
        // fwrite()/fclose() throw a TypeError on PHP 8 when given false,
        // which the @ operator cannot suppress, so only use a valid handle
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}
327
/**
 * Get the word lengths that have been indexed.
 *
 * With an array filter, its keys are taken as candidate lengths and only
 * those for which <indexdir>/i<length>.idx exists are returned. With any
 * other filter, all lengths from idx_listIndexLengths() that are greater
 * than or equal to the filter (compared as integers) are returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;

    if (!is_array($filter)) {
        // keep all the values equal or superior
        $atLeast = function ($length) use ($filter) {
            return (int)$length >= (int)$filter;
        };
        return array_values(array_filter(idx_listIndexLengths(), $atLeast));
    }

    // testing if index files exist only
    $prefix = $conf['indexdir'] . "/i";
    $hasIndexFile = function ($length) use ($prefix) {
        return file_exists($prefix . $length . '.idx');
    };
    return array_values(array_filter(array_keys($filter), $hasIndexFile));
}
360
/**
 * Clean a name of a key for use as a file name.
 *
 * The name is cast to string, trimmed and romanized. Runs of spaces, dots,
 * slashes, colons and hyphens are then collapsed into a single underscore,
 * every remaining character that is not a letter, digit or underscore is
 * removed, and the result is lowercased.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $romanized = Clean::romanize(trim((string)$name));
    // the two patterns are applied one after the other
    $cleaned = preg_replace(
        ['#[ \./\\:-]+#', '/[^A-Za-z0-9_]/'],
        ['_', ''],
        $romanized
    );
    return strtolower($cleaned);
}
379
//Setup VIM: ex: et ts=4 :
381