1<?php
2
3/**
4 * Functions to create the fulltext search index
5 *
6 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
7 * @author     Andreas Gohr <andi@splitbrain.org>
8 * @author     Tom N Harris <tnharris@whoopdedo.org>
9 */
10
11use dokuwiki\Utf8\Clean;
12use dokuwiki\Extension\Event;
13use dokuwiki\Search\Indexer;
14
// Version tag of the index format; changing it forces a rebuild on upgrade
define('INDEXER_VERSION', 8);

// Minimum token length to use in the index (this does not apply to numeric tokens).
// Only set when nothing else has defined the constant before.
if (!defined('IDX_MINWORDLENGTH')) {
    define('IDX_MINWORDLENGTH', 2);
}
20
/**
 * Build the version identifier of the index, taking event handlers into account.
 *
 * The identifier starts with INDEXER_VERSION. The INDEXER_VERSION_GET event is
 * triggered so that plugins which modify what gets indexed can add their own
 * version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 * Every entry except 'dokuwiki' is appended, sorted by key, as "+name=version".
 * The result is kept in a static variable and reused by later calls.
 *
 * @triggers INDEXER_VERSION_GET
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string the bare INDEXER_VERSION when nothing was added, the combined string otherwise
 */
function idx_get_version()
{
    static $cached = null;
    if ($cached != null) {
        return $cached;
    }

    // the core version is included in the event data for the convenience of plugins
    $data = ['dokuwiki' => INDEXER_VERSION];
    Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

    // the core version always comes first, so it must not take part in the sorting
    unset($data['dokuwiki']);
    ksort($data);

    $suffix = '';
    foreach ($data as $plugin => $vers) {
        $suffix .= '+' . $plugin . '=' . $vers;
    }

    // without additions the constant is returned untouched (no string conversion)
    $cached = ($suffix === '') ? INDEXER_VERSION : INDEXER_VERSION . $suffix;
    return $cached;
}
52
/**
 * Measure the length of a string for indexing purposes.
 *
 * Starts from the byte length and adds an extra amount for every byte in the
 * range 0xE2-0xEF found in the string, which makes the result differ from
 * strlen() for asian characters.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    $length = strlen($w);

    // If left alone, all chinese "words" would end up in w3.idx,
    // so the "length" of such a "word" is faked
    $found = [];
    if (!preg_match_all('/[\xE2-\xEF]/', $w, $found)) {
        return $length;
    }

    $extra = 0;
    foreach ($found[0] as $byte) {
        // 0xE2 adds 1, 0xE3 adds 2, ... 0xEF adds 14
        $extra += ord($byte) - 0xE1;
    }
    return $length + $extra;
}
73
/**
 * Get the shared instance of the indexer.
 *
 * The Indexer is created on the first call and the same object is
 * handed out by every later call.
 *
 * @return Indexer    an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $instance = null;
    $instance ??= new Indexer();
    return $instance;
}
89
/**
 * Returns words that will be ignored.
 *
 * The list is read from inc/lang/<lang>/stopwords.txt (one word per line) for the
 * configured language. It is loaded once and then served from a static variable;
 * the function returns a reference to that list.
 *
 * @return array                list of stop words, empty when the file is missing or unreadable
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $stopwords = [];
        $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
        if (file_exists($swfile)) {
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false when reading fails; never cache that in place of the array
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}
111
/**
 * Adds/updates the search index for the given page
 *
 * Depending on the state of the page this either removes it from the index
 * (page no longer exists, or indexing is disabled through its 'internal index'
 * metadata), skips it (the .indexed tag file carries the current indexer version
 * and is newer than the page file) or (re)indexes its words and metadata keys.
 *
 * A "locked" answer from the Indexer aborts the run with false.
 *
 * @triggers INDEXER_PAGE_ADD with the keys 'page', 'body', 'metadata' and 'pid';
 *           unless the default action is prevented, the raw wiki text is appended to 'body'
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  false when nothing was done or the index was locked, otherwise
 *                         the result reported by the Indexer; in verbose mode true is
 *                         returned once indexing has been attempted
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $idxtag = metaFN($page, '.indexed');

    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) {
                echo "Indexer: $page does not exist, ignoring" . DOKU_LF;
            }
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) {
                echo "Indexer: locked" . DOKU_LF;
            }
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed: same indexer version and tag file newer than the page
    if (!$force && file_exists($idxtag)) {
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) {
                    echo "Indexer: index for $page up to date" . DOKU_LF;
                }
                return false;
            }
        }
    }

    // the page asked not to be indexed: remove whatever is in the index for it
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) {
                    echo "Indexer: locked" . DOKU_LF;
                }
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) {
            echo "Indexer: index disabled for $page" . DOKU_LF;
        }
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) {
            echo "Indexer: getting the PID failed for $page" . DOKU_LF;
        }
        return false;
    }

    // collect the metadata keys that go into the index
    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = ($references !== null) ? array_keys($references) : [];

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = ($media !== null) ? array_keys($media) : [];

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) {
        $data['body'] = $data['body'] . " " . rawWiki($page);
    }
    $evt->advise_after();
    unset($evt);

    // Take over what the event handlers left in the event data. Only the known keys are
    // read, so extra keys added by a handler cannot overwrite unrelated local variables
    // (which extract() would allow).
    $page = $data['page'] ?? $page;
    $body = $data['body'] ?? $body;
    $metadata = $data['metadata'] ?? $metadata;

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) {
            echo "Indexer: locked" . DOKU_LF;
        }
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) {
                echo "Indexer: locked" . DOKU_LF;
            }
            return false;
        }
    }

    // remember which indexer version the page was indexed with
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }

    if ($verbose) {
        echo "Indexer: finished" . DOKU_LF;
        // note: verbose runs report true here regardless of $result
        return true;
    }
    return $result;
}
216
/**
 * Find tokens in the fulltext index
 *
 * Hands the given list of words to the lookup() method of the shared
 * Indexer and returns the matching pages it reports for each of them.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for (passed on by reference)
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    return idx_get_indexer()->lookup($words);
}
234
/**
 * Split a string into tokens
 *
 * Convenience wrapper around the tokenizer() method of the shared Indexer.
 *
 * @param string $string text to split
 * @param bool $wc passed through unchanged to the Indexer's tokenizer
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    return idx_get_indexer()->tokenizer($string, $wc);
}
248
249/* For compatibility */
250
/**
 * Read the list of words in an index (if it exists).
 *
 * The index is the file <indexdir>/<idx><suffix>.idx. Its lines are returned
 * as they are stored, including their line endings.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array the lines of the index, empty when the file is missing or cannot be read
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
    if (!file_exists($fn)) {
        return [];
    }
    // file() returns false on failure (e.g. the file vanished after the check above);
    // callers are promised an array, so map that to an empty list
    $lines = file($fn);
    return ($lines === false) ? [] : $lines;
}
267
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Returns a sorted array of the numbers N for which a file iN.idx exists in the
 * index directory. When $conf['readdircache'] is not 0, the list is cached in
 * lengths.idx inside the index directory and that cache is used as long as it is
 * younger than $conf['readdircache'] seconds.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array list of integer lengths, empty when the index directory cannot be opened
 */
function idx_listIndexLengths()
{
    global $conf;
    $cachefile = $conf['indexdir'] . '/lengths.idx';
    $docache = ($conf['readdircache'] != 0);

    // use the cache file when caching is enabled and the file is still fresh
    if ($docache) {
        clearstatcache();
        if (
            file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])
        ) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                return array_map('intval', $lengths);
            }
        }
    }

    // no usable cache: scan the index directory for iN.idx files
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (str_starts_with($f, 'i') && str_ends_with($f, '.idx')) {
            // what remains between the leading 'i' and the '.idx' extension
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file; writing the cache is best effort and must never break the lookup
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // fwrite()/fclose() throw a TypeError on PHP 8 when handed false, so check first
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}
329
/**
 * Get the word lengths that have been indexed.
 *
 * The meaning of $filter depends on its type:
 *  - array: its keys are taken as lengths and only those for which an
 *    iN.idx file exists in the index directory are returned
 *  - anything else: it is cast to int and all lengths reported by
 *    idx_listIndexLengths() that are equal or greater are returned
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;

    if (!is_array($filter)) {
        // keep all the values equal or superior to the requested minimum
        $atLeast = static function ($length) use ($filter) {
            return (int)$length >= (int)$filter;
        };
        return array_values(array_filter(idx_listIndexLengths(), $atLeast));
    }

    // only test whether the index files exist
    $prefix = $conf['indexdir'] . "/i";
    $hasIndexFile = static function ($key) use ($prefix) {
        return file_exists($prefix . $key . '.idx');
    };
    return array_values(array_filter(array_keys($filter), $hasIndexFile));
}
362
/**
 * Clean a name of a key for use as a file name.
 *
 * The name is cast to string, trimmed and romanized. Two replacements are then
 * applied one after the other: the characters matched by the first pattern
 * (runs of them) become a single underscore, and everything that is still not
 * a letter, number, or underscore is removed. The result is lower-cased.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $romanized = Clean::romanize(trim((string)$name));

    // preg_replace() works through the pattern/replacement pairs in order
    $cleaned = preg_replace(
        ['#[ \./\\:-]+#', '/[^A-Za-z0-9_]/'],
        ['_', ''],
        $romanized
    );

    return strtolower($cleaned);
}
381
382//Setup VIM: ex: et ts=4 :
383