1 <?php
2 
3 /**
4  * Functions to create the fulltext search index
5  *
6  * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
7  * @author     Andreas Gohr <andi@splitbrain.org>
8  * @author     Tom N Harris <tnharris@whoopdedo.org>
9  */
10 
11 use dokuwiki\Utf8\Clean;
12 use dokuwiki\Extension\Event;
13 use dokuwiki\Search\Indexer;
14 
// Index format version tag; bumping it forces a rebuild of the index on upgrade
define('INDEXER_VERSION', 8);

// Minimum token length to use in the index (this does not apply to numeric tokens).
// The default is only set when the constant has not been defined already.
defined('IDX_MINWORDLENGTH') || define('IDX_MINWORDLENGTH', 2);
20 
/**
 * Build the version tag of the indexer, including contributions from plugins.
 *
 * The indexer is only compatible with data written by the same version.
 * The tag starts with INDEXER_VERSION; every entry that event handlers add to
 * the event data is appended as "+name=version", ordered by name. The result
 * is kept in a static variable and reused by later calls.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string  the bare INDEXER_VERSION when no plugin adds an entry,
 *                     otherwise the combined string
 */
function idx_get_version()
{
    static $indexer_version = null;
    if ($indexer_version != null) {
        return $indexer_version;
    }

    // the core version is handed to the event handlers under the 'dokuwiki' key
    $data = ['dokuwiki' => INDEXER_VERSION];
    Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

    // the core version always leads the tag, so it must not take part in the sorting
    unset($data['dokuwiki']);
    ksort($data);

    $tag = INDEXER_VERSION;
    foreach ($data as $plugin => $vers) {
        $tag .= '+' . $plugin . '=' . $vers;
    }

    $indexer_version = $tag;
    return $indexer_version;
}
52 
/**
 * Measure the length of a string.
 *
 * Differs from strlen in handling of asian characters: on top of the byte
 * count, every byte in the range 0xE2-0xEF adds (byte value - 0xE1) to the
 * reported length.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    $bytes = strlen($w);

    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    $bonus = 0;
    if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        foreach ($leadbytes[0] as $lead) {
            $bonus += ord($lead) - 0xE1;
        }
    }

    return $bytes + $bonus;
}
73 
/**
 * Get the shared instance of the indexer.
 *
 * The Indexer object is created on the first call and the same object is
 * handed out by every later call.
 *
 * @return Indexer    an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $Indexer = null;
    $Indexer ??= new Indexer();
    return $Indexer;
}
89 
/**
 * Returns words that will be ignored.
 *
 * The list is read from inc/lang/<configured language>/stopwords.txt below
 * DOKU_INC, one word per line. It is loaded on the first call and kept in a
 * static variable; the function returns a reference to that variable.
 *
 * A missing or unreadable stop word file results in an empty list.
 *
 * @return array                list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $stopwords = [];
        $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
        if (file_exists($swfile)) {
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false when reading fails; keep the empty list in that
            // case so callers always receive an array
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}
111 
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Steps:
 *  - a page that no longer exists is removed from the index if it still has
 *    an '.indexed' tag file
 *  - unless $force is set, nothing is done when the tag file holds the current
 *    indexer version and is newer than the page file
 *  - a page whose 'internal index' metadata is false is removed from the index
 *  - otherwise the page text and selected metadata are written to the index
 *    and the tag file is refreshed
 *
 * @triggers INDEXER_PAGE_ADD
 * The event data holds the keys 'page', 'body', 'metadata' and 'pid'. When the
 * default action is not prevented, the raw wiki text is appended to 'body'.
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  false when nothing was indexed or the index was locked,
 *                         otherwise the result of the last Indexer call. Note that
 *                         in verbose mode a run that reaches the end always
 *                         returns true, whatever that result was.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $idxtag = metaFN($page, '.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) echo "Indexer: $page does not exist, ignoring" . DOKU_LF;
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed: the tag must carry the current version and be
    // newer than the page itself
    if (!$force && file_exists($idxtag)) {
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) echo "Indexer: index for $page up to date" . DOKU_LF;
                return false;
            }
        }
    }

    // pages that opted out of indexing are removed from an existing index
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) echo "Indexer: locked" . DOKU_LF;
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) echo "Indexer: index disabled for $page" . DOKU_LF;
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) echo "Indexer: getting the PID failed for $page" . DOKU_LF;
        return false;
    }

    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    // only array values can provide keys; anything else counts as "no entries"
    // instead of raising a TypeError in array_keys()
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = is_array($references) ? array_keys($references) : [];

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = is_array($media) ? array_keys($media) : [];

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Take over only the values this function needs from the (possibly modified)
    // event data. A blanket extract() would let any additional key overwrite
    // unrelated local variables such as $Indexer or $verbose.
    $page = $data['page'] ?? $page;
    $body = $data['body'] ?? $body;
    $metadata = $data['metadata'] ?? $metadata;

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) echo "Indexer: locked" . DOKU_LF;
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
    }

    // remember which indexer version wrote the data for this page
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }
    if ($verbose) {
        echo "Indexer: finished" . DOKU_LF;
        return true;
    }
    return $result;
}
216 
/**
 * Find tokens in the fulltext index
 *
 * Hands the given list of words to the lookup() method of the shared Indexer
 * and returns a list of matching pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for (passed by reference)
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    return idx_get_indexer()->lookup($words);
}
234 
/**
 * Split a string into tokens
 *
 * Convenience wrapper around the tokenizer() method of the shared Indexer.
 *
 * @param string $string  text to split
 * @param bool $wc        passed through to the Indexer unchanged
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    return idx_get_indexer()->tokenizer($string, $wc);
}
248 
249 /* For compatibility */
250 
/**
 * Read the list of words in an index (if it exists).
 *
 * The index file is "<indexdir>/<idx><suffix>.idx". Its lines are returned
 * as they are stored, including the line endings. A missing or unreadable
 * file yields an empty array.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
    if (!file_exists($fn)) return [];

    $lines = file($fn);
    // file() returns false when reading fails; callers expect an array
    return ($lines === false) ? [] : $lines;
}
267 
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * When $conf['readdircache'] is non-zero, the result is cached in
 * "<indexdir>/lengths.idx" and that file is used as long as it is younger
 * than the configured number of seconds. Writing the cache is best effort:
 * a cache file that cannot be opened is skipped silently.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array  sorted list of integers; empty when the index directory cannot be opened
 */
function idx_listIndexLengths()
{
    global $conf;
    $cachefile = $conf['indexdir'] . '/lengths.idx';

    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (
            file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])
        ) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                return array_map('intval', $lengths);
            }
        }
    }

    // no usable cache: collect the lengths from the names of the i<length>.idx files
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (str_starts_with($f, 'i') && str_ends_with($f, '.idx')) {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // fopen() returns false on failure; passing that on to fwrite() would
        // raise a TypeError that the @ operator cannot suppress
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}
329 
/**
 * Get the word lengths that have been indexed.
 *
 * Two modes, depending on the type of $filter:
 *  - array: its keys are taken as candidate lengths; those for which an
 *    "i<length>.idx" file exists in the index directory are returned
 *  - anything else: treated as a minimum; all lengths reported by
 *    idx_listIndexLengths() that are equal or greater are returned
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;

    if (!is_array($filter)) {
        // keep all the values equal or superior
        return array_values(array_filter(
            idx_listIndexLengths(),
            static fn($length) => (int)$length >= (int)$filter
        ));
    }

    // testing if index files exist only
    $prefix = $conf['indexdir'] . "/i";
    $found = [];
    foreach ($filter as $key => $unused) {
        if (file_exists($prefix . $key . '.idx')) {
            $found[] = $key;
        }
    }
    return $found;
}
362 
/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, replaces runs of separator characters with
 * a single underscore, strips away anything that's not a letter, number, or
 * underscore, and lower-cases the result.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $romanized = Clean::romanize(trim((string)$name));

    // runs of space . / : - become one underscore
    // NOTE(review): once PHP has unescaped the single-quoted literal, the class
    // reads "\:" (an escaped colon), so a backslash is not turned into an
    // underscore here; it is simply dropped by the next step. Confirm this is intended.
    $separated = preg_replace('#[ \./\\:-]+#', '_', $romanized);

    return strtolower(preg_replace('/[^A-Za-z0-9_]/', '', $separated));
}
381 
382 //Setup VIM: ex: et ts=4 :
383