1 <?php
2 
3 namespace dokuwiki\Search;
4 
5 use dokuwiki\Utf8\Asian;
6 use dokuwiki\Utf8\Clean;
7 use dokuwiki\Utf8\PhpString;
8 use dokuwiki\Extension\Event;
9 
10 /**
11  * Class that encapsulates operations on the indexer database.
12  *
13  * @author Tom N Harris <tnharris@whoopdedo.org>
14  */
15 class Indexer
16 {
    /**
     * Per-instance cache mapping page names to their numeric PID.
     *
     * Consulted by getPID() and getPIDNoLock() before the page index is touched. getPIDNoLock()
     * discards the oldest entry once the cache holds more than ten entries; renamePage() and
     * clear() empty it completely.
     *
     * @var array $pidCache Cache for getPID()
     */
    protected $pidCache = [];
21 
22     /**
23      * Adds the contents of a page to the fulltext index
24      *
25      * The added text replaces previous words for the same page.
26      * An empty value erases the page.
27      *
28      * @param string    $page   a page name
29      * @param string    $text   the body of the page
30      * @return string|boolean  the function completed successfully
31      *
32      * @author Tom N Harris <tnharris@whoopdedo.org>
33      * @author Andreas Gohr <andi@splitbrain.org>
34      */
35     public function addPageWords($page, $text)
36     {
37         if (!$this->lock())
38             return "locked";
39 
40         // load known documents
41         $pid = $this->getPIDNoLock($page);
42         if ($pid === false) {
43             $this->unlock();
44             return false;
45         }
46 
47         $pagewords = [];
48         // get word usage in page
49         $words = $this->getPageWords($text);
50         if ($words === false) {
51             $this->unlock();
52             return false;
53         }
54 
55         if (!empty($words)) {
56             foreach (array_keys($words) as $wlen) {
57                 $index = $this->getIndex('i', $wlen);
58                 foreach ($words[$wlen] as $wid => $freq) {
59                     $idx = ($wid < count($index)) ? $index[$wid] : '';
60                     $index[$wid] = $this->updateTuple($idx, $pid, $freq);
61                     $pagewords[] = "$wlen*$wid";
62                 }
63                 if (!$this->saveIndex('i', $wlen, $index)) {
64                     $this->unlock();
65                     return false;
66                 }
67             }
68         }
69 
70         // Remove obsolete index entries
71         $pageword_idx = $this->getIndexKey('pageword', '', $pid);
72         if ($pageword_idx !== '') {
73             $oldwords = explode(':', $pageword_idx);
74             $delwords = array_diff($oldwords, $pagewords);
75             $upwords = [];
76             foreach ($delwords as $word) {
77                 if ($word != '') {
78                     [$wlen, $wid] = explode('*', $word);
79                     $wid = (int)$wid;
80                     $upwords[$wlen][] = $wid;
81                 }
82             }
83             foreach ($upwords as $wlen => $widx) {
84                 $index = $this->getIndex('i', $wlen);
85                 foreach ($widx as $wid) {
86                     $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
87                 }
88                 $this->saveIndex('i', $wlen, $index);
89             }
90         }
91         // Save the reverse index
92         $pageword_idx = implode(':', $pagewords);
93         if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
94             $this->unlock();
95             return false;
96         }
97 
98         $this->unlock();
99         return true;
100     }
101 
102     /**
103      * Split the words in a page and add them to the index.
104      *
105      * @param string    $text   content of the page
106      * @return array            list of word IDs and number of times used
107      *
108      * @author Andreas Gohr <andi@splitbrain.org>
109      * @author Christopher Smith <chris@jalakai.co.uk>
110      * @author Tom N Harris <tnharris@whoopdedo.org>
111      */
112     protected function getPageWords($text)
113     {
114 
115         $tokens = $this->tokenizer($text);
116         $tokens = array_count_values($tokens);  // count the frequency of each token
117 
118         $words = [];
119         foreach ($tokens as $w => $c) {
120             $l = wordlen($w);
121             if (isset($words[$l])) {
122                 $words[$l][$w] = $c + ($words[$l][$w] ?? 0);
123             } else {
124                 $words[$l] = [$w => $c];
125             }
126         }
127 
128         // arrive here with $words = array(wordlen => array(word => frequency))
129         $index = [];   //resulting index
130         foreach (array_keys($words) as $wlen) {
131             $word_idx = $this->getIndex('w', $wlen);
132             $word_idx_modified = false;
133             foreach ($words[$wlen] as $word => $freq) {
134                 $word = (string)$word;
135                 $wid = array_search($word, $word_idx, true);
136                 if ($wid === false) {
137                     $wid = count($word_idx);
138                     $word_idx[] = $word;
139                     $word_idx_modified = true;
140                 }
141                 if (!isset($index[$wlen]))
142                     $index[$wlen] = [];
143                 $index[$wlen][$wid] = $freq;
144             }
145             // save back the word index
146             if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx))
147                 return false;
148         }
149 
150         return $index;
151     }
152 
153     /**
154      * Add/update keys to/of the metadata index.
155      *
156      * Adding new keys does not remove other keys for the page.
157      * An empty value will erase the key.
158      * The $key parameter can be an array to add multiple keys. $value will
159      * not be used if $key is an array.
160      *
161      * @param string    $page   a page name
162      * @param mixed     $key    a key string or array of key=>value pairs
163      * @param mixed     $value  the value or list of values
164      * @return boolean|string     the function completed successfully
165      *
166      * @author Tom N Harris <tnharris@whoopdedo.org>
167      * @author Michael Hamann <michael@content-space.de>
168      */
169     public function addMetaKeys($page, $key, $value = null)
170     {
171         if (!is_array($key)) {
172             $key = [$key => $value];
173         } elseif (!is_null($value)) {
174             // $key is array, but $value is not null
175             trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
176         }
177 
178         if (!$this->lock())
179             return "locked";
180 
181         // load known documents
182         $pid = $this->getPIDNoLock($page);
183         if ($pid === false) {
184             $this->unlock();
185             return false;
186         }
187 
188         // Special handling for titles so the index file is simpler
189         if (isset($key['title'])) {
190             $value = $key['title'];
191             if (is_array($value)) {
192                 $value = $value[0];
193             }
194             $this->saveIndexKey('title', '', $pid, $value);
195             unset($key['title']);
196         }
197 
198         foreach ($key as $name => $values) {
199             $metaname = idx_cleanName($name);
200             $this->addIndexKey('metadata', '', $metaname);
201             $metaidx = $this->getIndex($metaname . '_i', '');
202             $metawords = $this->getIndex($metaname . '_w', '');
203             $addwords = false;
204 
205             if (!is_array($values)) $values = [$values];
206 
207             $val_idx = $this->getIndexKey($metaname . '_p', '', $pid);
208             if ($val_idx !== '') {
209                 $val_idx = explode(':', $val_idx);
210                 // -1 means remove, 0 keep, 1 add
211                 $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
212             } else {
213                 $val_idx = [];
214             }
215 
216             foreach ($values as $val) {
217                 $val = (string)$val;
218                 if ($val !== "") {
219                     $id = array_search($val, $metawords, true);
220                     if ($id === false) {
221                         // didn't find $val, so we'll add it to the end of metawords and create a placeholder in metaidx
222                         $id = count($metawords);
223                         $metawords[$id] = $val;
224                         $metaidx[$id] = '';
225                         $addwords = true;
226                     }
227                     // test if value is already in the index
228                     if (isset($val_idx[$id]) && $val_idx[$id] <= 0) {
229                         $val_idx[$id] = 0;
230                     } else { // else add it
231                         $val_idx[$id] = 1;
232                     }
233                 }
234             }
235 
236             if ($addwords) {
237                 $this->saveIndex($metaname . '_w', '', $metawords);
238             }
239             $vals_changed = false;
240             foreach ($val_idx as $id => $action) {
241                 if ($action == -1) {
242                     $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
243                     $vals_changed = true;
244                     unset($val_idx[$id]);
245                 } elseif ($action == 1) {
246                     $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
247                     $vals_changed = true;
248                 }
249             }
250 
251             if ($vals_changed) {
252                 $this->saveIndex($metaname . '_i', '', $metaidx);
253                 $val_idx = implode(':', array_keys($val_idx));
254                 $this->saveIndexKey($metaname . '_p', '', $pid, $val_idx);
255             }
256 
257             unset($metaidx);
258             unset($metawords);
259         }
260 
261         $this->unlock();
262         return true;
263     }
264 
265     /**
266      * Rename a page in the search index without changing the indexed content. This function doesn't check if the
267      * old or new name exists in the filesystem. It returns an error if the old page isn't in the page list of the
268      * indexer and it deletes all previously indexed content of the new page.
269      *
270      * @param string $oldpage The old page name
271      * @param string $newpage The new page name
272      * @return string|bool If the page was successfully renamed, can be a message in the case of an error
273      */
274     public function renamePage($oldpage, $newpage)
275     {
276         if (!$this->lock()) return 'locked';
277 
278         $pages = $this->getPages();
279 
280         $id = array_search($oldpage, $pages, true);
281         if ($id === false) {
282             $this->unlock();
283             return 'page is not in index';
284         }
285 
286         $new_id = array_search($newpage, $pages, true);
287         if ($new_id !== false) {
288             // make sure the page is not in the index anymore
289             if (!$this->deletePageNoLock($newpage)) {
290                 return false;
291             }
292 
293             $pages[$new_id] = 'deleted:' . time() . random_int(0, 9999);
294         }
295 
296         $pages[$id] = $newpage;
297 
298         // update index
299         if (!$this->saveIndex('page', '', $pages)) {
300             $this->unlock();
301             return false;
302         }
303 
304         // reset the pid cache
305         $this->pidCache = [];
306 
307         $this->unlock();
308         return true;
309     }
310 
311     /**
312      * Renames a meta value in the index. This doesn't change the meta value in the pages, it assumes that all pages
313      * will be updated.
314      *
315      * @param string $key       The metadata key of which a value shall be changed
316      * @param string $oldvalue  The old value that shall be renamed
317      * @param string $newvalue  The new value to which the old value shall be renamed, if exists values will be merged
318      * @return bool|string      If renaming the value has been successful, false or error message on error.
319      */
320     public function renameMetaValue($key, $oldvalue, $newvalue)
321     {
322         if (!$this->lock()) return 'locked';
323 
324         // change the relation references index
325         $metavalues = $this->getIndex($key, '_w');
326         $oldid = array_search($oldvalue, $metavalues, true);
327         if ($oldid !== false) {
328             $newid = array_search($newvalue, $metavalues, true);
329             if ($newid !== false) {
330                 // free memory
331                 unset($metavalues);
332 
333                 // okay, now we have two entries for the same value. we need to merge them.
334                 $indexline = $this->getIndexKey($key . '_i', '', $oldid);
335                 if ($indexline != '') {
336                     $newindexline = $this->getIndexKey($key . '_i', '', $newid);
337                     $pagekeys     = $this->getIndex($key . '_p', '');
338                     $parts = explode(':', $indexline);
339                     foreach ($parts as $part) {
340                         [$id, $count] = explode('*', $part);
341                         $newindexline =  $this->updateTuple($newindexline, $id, $count);
342 
343                         $keyline = explode(':', $pagekeys[$id]);
344                         // remove old meta value
345                         $keyline = array_diff($keyline, [$oldid]);
346                         // add new meta value when not already present
347                         if (!in_array($newid, $keyline)) {
348                             $keyline[] = $newid;
349                         }
350                         $pagekeys[$id] = implode(':', $keyline);
351                     }
352                     $this->saveIndex($key . '_p', '', $pagekeys);
353                     unset($pagekeys);
354                     $this->saveIndexKey($key . '_i', '', $oldid, '');
355                     $this->saveIndexKey($key . '_i', '', $newid, $newindexline);
356                 }
357             } else {
358                 $metavalues[$oldid] = $newvalue;
359                 if (!$this->saveIndex($key . '_w', '', $metavalues)) {
360                     $this->unlock();
361                     return false;
362                 }
363             }
364         }
365 
366         $this->unlock();
367         return true;
368     }
369 
370     /**
371      * Remove a page from the index
372      *
373      * Erases entries in all known indexes.
374      *
375      * @param string    $page   a page name
376      * @return string|boolean  the function completed successfully
377      *
378      * @author Tom N Harris <tnharris@whoopdedo.org>
379      */
380     public function deletePage($page)
381     {
382         if (!$this->lock())
383             return "locked";
384 
385         $result = $this->deletePageNoLock($page);
386 
387         $this->unlock();
388 
389         return $result;
390     }
391 
392     /**
393      * Remove a page from the index without locking the index, only use this function if the index is already locked
394      *
395      * Erases entries in all known indexes.
396      *
397      * @param string    $page   a page name
398      * @return boolean          the function completed successfully
399      *
400      * @author Tom N Harris <tnharris@whoopdedo.org>
401      */
402     protected function deletePageNoLock($page)
403     {
404         // load known documents
405         $pid = $this->getPIDNoLock($page);
406         if ($pid === false) {
407             return false;
408         }
409 
410         // Remove obsolete index entries
411         $pageword_idx = $this->getIndexKey('pageword', '', $pid);
412         if ($pageword_idx !== '') {
413             $delwords = explode(':', $pageword_idx);
414             $upwords = [];
415             foreach ($delwords as $word) {
416                 if ($word != '') {
417                     [$wlen, $wid] = explode('*', $word);
418                     $wid = (int)$wid;
419                     $upwords[$wlen][] = $wid;
420                 }
421             }
422             foreach ($upwords as $wlen => $widx) {
423                 $index = $this->getIndex('i', $wlen);
424                 foreach ($widx as $wid) {
425                     $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
426                 }
427                 $this->saveIndex('i', $wlen, $index);
428             }
429         }
430         // Save the reverse index
431         if (!$this->saveIndexKey('pageword', '', $pid, "")) {
432             return false;
433         }
434 
435         $this->saveIndexKey('title', '', $pid, "");
436         $keyidx = $this->getIndex('metadata', '');
437         foreach ($keyidx as $metaname) {
438             $val_idx = explode(':', $this->getIndexKey($metaname . '_p', '', $pid));
439             $meta_idx = $this->getIndex($metaname . '_i', '');
440             foreach ($val_idx as $id) {
441                 if ($id === '') continue;
442                 $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
443             }
444             $this->saveIndex($metaname . '_i', '', $meta_idx);
445             $this->saveIndexKey($metaname . '_p', '', $pid, '');
446         }
447 
448         return true;
449     }
450 
451     /**
452      * Clear the whole index
453      *
454      * @return bool If the index has been cleared successfully
455      */
456     public function clear()
457     {
458         global $conf;
459 
460         if (!$this->lock()) return false;
461 
462         @unlink($conf['indexdir'] . '/page.idx');
463         @unlink($conf['indexdir'] . '/title.idx');
464         @unlink($conf['indexdir'] . '/pageword.idx');
465         @unlink($conf['indexdir'] . '/metadata.idx');
466         $dir = @opendir($conf['indexdir']);
467         if ($dir !== false) {
468             while (($f = readdir($dir)) !== false) {
469                 if (
470                     str_ends_with($f, '.idx') &&
471                     (str_starts_with($f, 'i') ||
472                      str_starts_with($f, 'w') ||
473                      str_ends_with($f, '_w.idx') ||
474                      str_ends_with($f, '_i.idx') ||
475                      str_ends_with($f, '_p.idx'))
476                 )
477                     @unlink($conf['indexdir'] . "/$f");
478             }
479         }
480         @unlink($conf['indexdir'] . '/lengths.idx');
481 
482         // clear the pid cache
483         $this->pidCache = [];
484 
485         $this->unlock();
486         return true;
487     }
488 
489     /**
490      * Split the text into words for fulltext search
491      *
492      * TODO: does this also need &$stopwords ?
493      *
494      * @triggers INDEXER_TEXT_PREPARE
495      * This event allows plugins to modify the text before it gets tokenized.
496      * Plugins intercepting this event should also intercept INDEX_VERSION_GET
497      *
498      * @param string    $text   plain text
499      * @param boolean   $wc     are wildcards allowed?
500      * @return array            list of words in the text
501      *
502      * @author Tom N Harris <tnharris@whoopdedo.org>
503      * @author Andreas Gohr <andi@splitbrain.org>
504      */
505     public function tokenizer($text, $wc = false)
506     {
507         $wc = ($wc) ? '' : '\*';
508         $stopwords =& idx_get_stopwords();
509 
510         // prepare the text to be tokenized
511         $evt = new Event('INDEXER_TEXT_PREPARE', $text);
512         if ($evt->advise_before(true)) {
513             if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
514                 $text = Asian::separateAsianWords($text);
515             }
516         }
517         $evt->advise_after();
518         unset($evt);
519 
520         $text = strtr(
521             $text,
522             ["\r" => ' ', "\n" => ' ', "\t" => ' ', "\xC2\xAD" => '']
523         );
524         if (preg_match('/[^0-9A-Za-z ]/u', $text))
525             $text = Clean::stripspecials($text, ' ', '\._\-:' . $wc);
526 
527         $wordlist = explode(' ', $text);
528         foreach ($wordlist as $i => $word) {
529             $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
530                 PhpString::strtolower($word) : strtolower($word);
531         }
532 
533         foreach ($wordlist as $i => $word) {
534             if (
535                 (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH)
536                 || in_array($word, $stopwords, true)
537             )
538                 unset($wordlist[$i]);
539         }
540         return array_values($wordlist);
541     }
542 
543     /**
544      * Get the numeric PID of a page
545      *
546      * @param string $page The page to get the PID for
547      * @return bool|int The page id on success, false on error
548      */
549     public function getPID($page)
550     {
551         // return PID without locking when it is in the cache
552         if (isset($this->pidCache[$page])) return $this->pidCache[$page];
553 
554         if (!$this->lock())
555             return false;
556 
557         // load known documents
558         $pid = $this->getPIDNoLock($page);
559         if ($pid === false) {
560             $this->unlock();
561             return false;
562         }
563 
564         $this->unlock();
565         return $pid;
566     }
567 
568     /**
569      * Get the numeric PID of a page without locking the index.
570      * Only use this function when the index is already locked.
571      *
572      * @param string $page The page to get the PID for
573      * @return bool|int The page id on success, false on error
574      */
575     protected function getPIDNoLock($page)
576     {
577         // avoid expensive addIndexKey operation for the most recently requested pages by using a cache
578         if (isset($this->pidCache[$page])) return $this->pidCache[$page];
579         $pid = $this->addIndexKey('page', '', $page);
580         // limit cache to 10 entries by discarding the oldest element as in DokuWiki usually only the most recently
581         // added item will be requested again
582         if (count($this->pidCache) > 10) array_shift($this->pidCache);
583         $this->pidCache[$page] = $pid;
584         return $pid;
585     }
586 
587     /**
588      * Get the page id of a numeric PID
589      *
590      * @param int $pid The PID to get the page id for
591      * @return string The page id
592      */
593     public function getPageFromPID($pid)
594     {
595         return $this->getIndexKey('page', '', $pid);
596     }
597 
598     /**
599      * Find pages in the fulltext index containing the words,
600      *
601      * The search words must be pre-tokenized, meaning only letters and
602      * numbers with an optional wildcard
603      *
604      * The returned array will have the original tokens as key. The values
605      * in the returned list is an array with the page names as keys and the
606      * number of times that token appears on the page as value.
607      *
608      * @param array  $tokens list of words to search for
609      * @return array         list of page names with usage counts
610      *
611      * @author Tom N Harris <tnharris@whoopdedo.org>
612      * @author Andreas Gohr <andi@splitbrain.org>
613      */
614     public function lookup(&$tokens)
615     {
616         $result = [];
617         $wids = $this->getIndexWords($tokens, $result);
618         if (empty($wids)) return [];
619         // load known words and documents
620         $page_idx = $this->getIndex('page', '');
621         $docs = [];
622         foreach (array_keys($wids) as $wlen) {
623             $wids[$wlen] = array_unique($wids[$wlen]);
624             $index = $this->getIndex('i', $wlen);
625             foreach ($wids[$wlen] as $ixid) {
626                 if ($ixid < count($index))
627                     $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
628             }
629         }
630         // merge found pages into final result array
631         $final = [];
632         foreach ($result as $word => $res) {
633             $final[$word] = [];
634             foreach ($res as $wid) {
635                 // handle the case when ($ixid < count($index)) has been false
636                 // and thus $docs[$wid] hasn't been set.
637                 if (!isset($docs[$wid])) continue;
638                 $hits = &$docs[$wid];
639                 foreach ($hits as $hitkey => $hitcnt) {
640                     // make sure the document still exists
641                     if (!page_exists($hitkey, '', false)) continue;
642                     if (!isset($final[$word][$hitkey]))
643                         $final[$word][$hitkey] = $hitcnt;
644                     else $final[$word][$hitkey] += $hitcnt;
645                 }
646             }
647         }
648         return $final;
649     }
650 
651     /**
652      * Find pages containing a metadata key.
653      *
654      * The metadata values are compared as case-sensitive strings. Pass a
655      * callback function that returns true or false to use a different
656      * comparison function. The function will be called with the $value being
657      * searched for as the first argument, and the word in the index as the
658      * second argument. The function preg_match can be used directly if the
659      * values are regexes.
660      *
661      * @param string    $key    name of the metadata key to look for
662      * @param string    $value  search term to look for, must be a string or array of strings
663      * @param callback  $func   comparison function
664      * @return array            lists with page names, keys are query values if $value is array
665      *
666      * @author Tom N Harris <tnharris@whoopdedo.org>
667      * @author Michael Hamann <michael@content-space.de>
668      */
669     public function lookupKey($key, &$value, $func = null)
670     {
671         if (!is_array($value))
672             $value_array = [$value];
673         else $value_array =& $value;
674 
675         // the matching ids for the provided value(s)
676         $value_ids = [];
677 
678         $metaname = idx_cleanName($key);
679 
680         // get all words in order to search the matching ids
681         if ($key == 'title') {
682             $words = $this->getIndex('title', '');
683         } else {
684             $words = $this->getIndex($metaname . '_w', '');
685         }
686 
687         if (!is_null($func)) {
688             foreach ($value_array as $val) {
689                 foreach ($words as $i => $word) {
690                     if (call_user_func_array($func, [$val, $word]))
691                         $value_ids[$i][] = $val;
692                 }
693             }
694         } else {
695             foreach ($value_array as $val) {
696                 $xval = $val;
697                 $caret = '^';
698                 $dollar = '$';
699                 // check for wildcards
700                 if (str_starts_with($xval, '*')) {
701                     $xval = substr($xval, 1);
702                     $caret = '';
703                 }
704                 if (str_ends_with($xval, '*')) {
705                     $xval = substr($xval, 0, -1);
706                     $dollar = '';
707                 }
708                 if (!$caret || !$dollar) {
709                     $re = $caret . preg_quote($xval, '/') . $dollar;
710                     foreach (array_keys(preg_grep('/' . $re . '/', $words)) as $i)
711                         $value_ids[$i][] = $val;
712                 } elseif (($i = array_search($val, $words, true)) !== false) {
713                     $value_ids[$i][] = $val;
714                 }
715             }
716         }
717 
718         unset($words); // free the used memory
719 
720         // initialize the result so it won't be null
721         $result = [];
722         foreach ($value_array as $val) {
723             $result[$val] = [];
724         }
725 
726         $page_idx = $this->getIndex('page', '');
727 
728         // Special handling for titles
729         if ($key == 'title') {
730             foreach ($value_ids as $pid => $val_list) {
731                 $page = $page_idx[$pid];
732                 foreach ($val_list as $val) {
733                     $result[$val][] = $page;
734                 }
735             }
736         } else {
737             // load all lines and pages so the used lines can be taken and matched with the pages
738             $lines = $this->getIndex($metaname . '_i', '');
739 
740             foreach ($value_ids as $value_id => $val_list) {
741                 // parse the tuples of the form page_id*1:page2_id*1 and so on, return value
742                 // is an array with page_id => 1, page2_id => 1 etc. so take the keys only
743                 $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
744                 foreach ($val_list as $val) {
745                     $result[$val] = [...$result[$val], ...$pages];
746                 }
747             }
748         }
749         if (!is_array($value)) $result = $result[$value];
750         return $result;
751     }
752 
753     /**
754      * Find the index ID of each search term.
755      *
756      * The query terms should only contain valid characters, with a '*' at
757      * either the beginning or end of the word (or both).
758      * The $result parameter can be used to merge the index locations with
759      * the appropriate query term.
760      *
761      * @param array  $words  The query terms.
762      * @param array  $result Set to word => array("length*id" ...)
763      * @return array         Set to length => array(id ...)
764      *
765      * @author Tom N Harris <tnharris@whoopdedo.org>
766      */
767     protected function getIndexWords(&$words, &$result)
768     {
769         $tokens = [];
770         $tokenlength = [];
771         $tokenwild = [];
772         foreach ($words as $word) {
773             $result[$word] = [];
774             $caret = '^';
775             $dollar = '$';
776             $xword = $word;
777             $wlen = wordlen($word);
778 
779             // check for wildcards
780             if (str_starts_with($xword, '*')) {
781                 $xword = substr($xword, 1);
782                 $caret = '';
783                 --$wlen;
784             }
785             if (str_ends_with($xword, '*')) {
786                 $xword = substr($xword, 0, -1);
787                 $dollar = '';
788                 --$wlen;
789             }
790             if ($wlen < IDX_MINWORDLENGTH && $caret && $dollar && !is_numeric($xword))
791                 continue;
792             if (!isset($tokens[$xword]))
793                 $tokenlength[$wlen][] = $xword;
794             if (!$caret || !$dollar) {
795                 $re = $caret . preg_quote($xword, '/') . $dollar;
796                 $tokens[$xword][] = [$word, '/' . $re . '/'];
797                 if (!isset($tokenwild[$xword]))
798                     $tokenwild[$xword] = $wlen;
799             } else {
800                 $tokens[$xword][] = [$word, null];
801             }
802         }
803         asort($tokenwild);
804         // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
805         // $tokenlength = array( base word length => base word ... )
806         // $tokenwild = array( base word => base word length ... )
807         $length_filter = $tokenwild === [] ? $tokenlength : min(array_keys($tokenlength));
808         $indexes_known = $this->indexLengths($length_filter);
809         if ($tokenwild !== []) sort($indexes_known);
810         // get word IDs
811         $wids = [];
812         foreach ($indexes_known as $ixlen) {
813             $word_idx = $this->getIndex('w', $ixlen);
814             // handle exact search
815             if (isset($tokenlength[$ixlen])) {
816                 foreach ($tokenlength[$ixlen] as $xword) {
817                     $wid = array_search($xword, $word_idx, true);
818                     if ($wid !== false) {
819                         $wids[$ixlen][] = $wid;
820                         foreach ($tokens[$xword] as $w)
821                             $result[$w[0]][] = "$ixlen*$wid";
822                     }
823                 }
824             }
825             // handle wildcard search
826             foreach ($tokenwild as $xword => $wlen) {
827                 if ($wlen >= $ixlen) break;
828                 foreach ($tokens[$xword] as $w) {
829                     if (is_null($w[1])) continue;
830                     foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
831                         $wids[$ixlen][] = $wid;
832                         $result[$w[0]][] = "$ixlen*$wid";
833                     }
834                 }
835             }
836         }
837         return $wids;
838     }
839 
840     /**
841      * Return a list of all pages
842      * Warning: pages may not exist!
843      *
844      * @param string    $key    list only pages containing the metadata key (optional)
845      * @return array            list of page names
846      *
847      * @author Tom N Harris <tnharris@whoopdedo.org>
848      */
849     public function getPages($key = null)
850     {
851         $page_idx = $this->getIndex('page', '');
852         if (is_null($key)) return $page_idx;
853 
854         $metaname = idx_cleanName($key);
855 
856         // Special handling for titles
857         if ($key == 'title') {
858             $title_idx = $this->getIndex('title', '');
859             array_splice($page_idx, count($title_idx));
860             foreach ($title_idx as $i => $title)
861                 if ($title === "") unset($page_idx[$i]);
862             return array_values($page_idx);
863         }
864 
865         $pages = [];
866         $lines = $this->getIndex($metaname . '_i', '');
867         foreach ($lines as $line) {
868             $pages = array_merge($pages, $this->parseTuples($page_idx, $line));
869         }
870         return array_keys($pages);
871     }
872 
873     /**
874      * Return a list of words sorted by number of times used
875      *
876      * @param int       $min    bottom frequency threshold
877      * @param int       $max    upper frequency limit. No limit if $max<$min
878      * @param int       $minlen minimum length of words to count
879      * @param string    $key    metadata key to list. Uses the fulltext index if not given
880      * @return array            list of words as the keys and frequency as values
881      *
882      * @author Tom N Harris <tnharris@whoopdedo.org>
883      */
884     public function histogram($min = 1, $max = 0, $minlen = 3, $key = null)
885     {
886         if ($min < 1)
887             $min = 1;
888         if ($max < $min)
889             $max = 0;
890 
891         $result = [];
892 
893         if ($key == 'title') {
894             $index = $this->getIndex('title', '');
895             $index = array_count_values($index);
896             foreach ($index as $val => $cnt) {
897                 if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen)
898                     $result[$val] = $cnt;
899             }
900         } elseif (!is_null($key)) {
901             $metaname = idx_cleanName($key);
902             $index = $this->getIndex($metaname . '_i', '');
903             $val_idx = [];
904             foreach ($index as $wid => $line) {
905                 $freq = $this->countTuples($line);
906                 if ($freq >= $min && (!$max || $freq <= $max))
907                     $val_idx[$wid] = $freq;
908             }
909             if ($val_idx !== []) {
910                 $words = $this->getIndex($metaname . '_w', '');
911                 foreach ($val_idx as $wid => $freq) {
912                     if (strlen($words[$wid]) >= $minlen)
913                         $result[$words[$wid]] = $freq;
914                 }
915             }
916         } else {
917             $lengths = idx_listIndexLengths();
918             foreach ($lengths as $length) {
919                 if ($length < $minlen) continue;
920                 $index = $this->getIndex('i', $length);
921                 $words = null;
922                 foreach ($index as $wid => $line) {
923                     $freq = $this->countTuples($line);
924                     if ($freq >= $min && (!$max || $freq <= $max)) {
925                         if ($words === null)
926                             $words = $this->getIndex('w', $length);
927                         $result[$words[$wid]] = $freq;
928                     }
929                 }
930             }
931         }
932 
933         arsort($result);
934         return $result;
935     }
936 
937     /**
938      * Lock the indexer.
939      *
940      * @author Tom N Harris <tnharris@whoopdedo.org>
941      *
942      * @return bool|string
943      */
944     protected function lock()
945     {
946         global $conf;
947         $status = true;
948         $run = 0;
949         $lock = $conf['lockdir'] . '/_indexer.lock';
950         while (!@mkdir($lock)) {
951             usleep(50);
952             if (is_dir($lock) && time() - @filemtime($lock) > 60 * 5) {
953                 // looks like a stale lock - remove it
954                 if (!@rmdir($lock)) {
955                     $status = "removing the stale lock failed";
956                     return false;
957                 } else {
958                     $status = "stale lock removed";
959                 }
960             } elseif ($run++ == 1000) {
961                 // we waited 5 seconds for that lock
962                 return false;
963             }
964         }
965         if ($conf['dperm']) {
966             chmod($lock, $conf['dperm']);
967         }
968         return $status;
969     }
970 
971     /**
972      * Release the indexer lock.
973      *
974      * @author Tom N Harris <tnharris@whoopdedo.org>
975      *
976      * @return bool
977      */
978     protected function unlock()
979     {
980         global $conf;
981         @rmdir($conf['lockdir'] . '/_indexer.lock');
982         return true;
983     }
984 
985     /**
986      * Retrieve the entire index.
987      *
988      * The $suffix argument is for an index that is split into
989      * multiple parts. Different index files should use different
990      * base names.
991      *
992      * @param string    $idx    name of the index
993      * @param string    $suffix subpart identifier
994      * @return array            list of lines without CR or LF
995      *
996      * @author Tom N Harris <tnharris@whoopdedo.org>
997      */
998     protected function getIndex($idx, $suffix)
999     {
1000         global $conf;
1001         $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
1002         if (!file_exists($fn)) return [];
1003         return file($fn, FILE_IGNORE_NEW_LINES);
1004     }
1005 
1006     /**
1007      * Replace the contents of the index with an array.
1008      *
1009      * @param string    $idx    name of the index
1010      * @param string    $suffix subpart identifier
1011      * @param array     $lines  list of lines without LF
1012      * @return bool             If saving succeeded
1013      *
1014      * @author Tom N Harris <tnharris@whoopdedo.org>
1015      */
1016     protected function saveIndex($idx, $suffix, &$lines)
1017     {
1018         global $conf;
1019         $fn = $conf['indexdir'] . '/' . $idx . $suffix;
1020         $fh = @fopen($fn . '.tmp', 'w');
1021         if (!$fh) return false;
1022         fwrite($fh, implode("\n", $lines));
1023         if (!empty($lines))
1024             fwrite($fh, "\n");
1025         fclose($fh);
1026         if ($conf['fperm'])
1027             chmod($fn . '.tmp', $conf['fperm']);
1028         io_rename($fn . '.tmp', $fn . '.idx');
1029         return true;
1030     }
1031 
1032     /**
1033      * Retrieve a line from the index.
1034      *
1035      * @param string    $idx    name of the index
1036      * @param string    $suffix subpart identifier
1037      * @param int       $id     the line number
1038      * @return string           a line with trailing whitespace removed
1039      *
1040      * @author Tom N Harris <tnharris@whoopdedo.org>
1041      */
1042     protected function getIndexKey($idx, $suffix, $id)
1043     {
1044         global $conf;
1045         $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
1046         if (!file_exists($fn)) return '';
1047         $fh = @fopen($fn, 'r');
1048         if (!$fh) return '';
1049         $ln = -1;
1050         while (($line = fgets($fh)) !== false) {
1051             if (++$ln == $id) break;
1052         }
1053         fclose($fh);
1054         return rtrim((string)$line);
1055     }
1056 
1057     /**
1058      * Write a line into the index.
1059      *
1060      * @param string    $idx    name of the index
1061      * @param string    $suffix subpart identifier
1062      * @param int       $id     the line number
1063      * @param string    $line   line to write
1064      * @return bool             If saving succeeded
1065      *
1066      * @author Tom N Harris <tnharris@whoopdedo.org>
1067      */
1068     protected function saveIndexKey($idx, $suffix, $id, $line)
1069     {
1070         global $conf;
1071         if (!str_ends_with($line, "\n"))
1072             $line .= "\n";
1073         $fn = $conf['indexdir'] . '/' . $idx . $suffix;
1074         $fh = @fopen($fn . '.tmp', 'w');
1075         if (!$fh) return false;
1076         $ih = @fopen($fn . '.idx', 'r');
1077         if ($ih) {
1078             $ln = -1;
1079             while (($curline = fgets($ih)) !== false) {
1080                 fwrite($fh, (++$ln == $id) ? $line : $curline);
1081             }
1082             if ($id > $ln) {
1083                 while ($id > ++$ln)
1084                     fwrite($fh, "\n");
1085                 fwrite($fh, $line);
1086             }
1087             fclose($ih);
1088         } else {
1089             $ln = -1;
1090             while ($id > ++$ln)
1091                 fwrite($fh, "\n");
1092             fwrite($fh, $line);
1093         }
1094         fclose($fh);
1095         if ($conf['fperm'])
1096             chmod($fn . '.tmp', $conf['fperm']);
1097         io_rename($fn . '.tmp', $fn . '.idx');
1098         return true;
1099     }
1100 
1101     /**
1102      * Retrieve or insert a value in the index.
1103      *
1104      * @param string    $idx    name of the index
1105      * @param string    $suffix subpart identifier
1106      * @param string    $value  line to find in the index
1107      * @return int|bool          line number of the value in the index or false if writing the index failed
1108      *
1109      * @author Tom N Harris <tnharris@whoopdedo.org>
1110      */
1111     protected function addIndexKey($idx, $suffix, $value)
1112     {
1113         $index = $this->getIndex($idx, $suffix);
1114         $id = array_search($value, $index, true);
1115         if ($id === false) {
1116             $id = count($index);
1117             $index[$id] = $value;
1118             if (!$this->saveIndex($idx, $suffix, $index)) {
1119                 throw new \RuntimeException("Failed to write $idx index");
1120             }
1121         }
1122         return $id;
1123     }
1124 
1125     /**
1126      * Get the list of lengths indexed in the wiki.
1127      *
1128      * Read the index directory or a cache file and returns
1129      * a sorted array of lengths of the words used in the wiki.
1130      *
1131      * @author YoBoY <yoboy.leguesh@gmail.com>
1132      *
1133      * @return array
1134      */
1135     protected function listIndexLengths()
1136     {
1137         return idx_listIndexLengths();
1138     }
1139 
1140     /**
1141      * Get the word lengths that have been indexed.
1142      *
1143      * Reads the index directory and returns an array of lengths
1144      * that there are indices for.
1145      *
1146      * @author YoBoY <yoboy.leguesh@gmail.com>
1147      *
1148      * @param array|int $filter
1149      * @return array
1150      */
1151     protected function indexLengths($filter)
1152     {
1153         global $conf;
1154         $idx = [];
1155         if (is_array($filter)) {
1156             // testing if index files exist only
1157             $path = $conf['indexdir'] . "/i";
1158             foreach (array_keys($filter) as $key) {
1159                 if (file_exists($path . $key . '.idx'))
1160                     $idx[] = $key;
1161             }
1162         } else {
1163             $lengths = idx_listIndexLengths();
1164             foreach ($lengths as $length) {
1165                 // keep all the values equal or superior
1166                 if ((int)$length >= (int)$filter)
1167                     $idx[] = $length;
1168             }
1169         }
1170         return $idx;
1171     }
1172 
1173     /**
1174      * Insert or replace a tuple in a line.
1175      *
1176      * @author Tom N Harris <tnharris@whoopdedo.org>
1177      *
1178      * @param string $line
1179      * @param string|int $id
1180      * @param int    $count
1181      * @return string
1182      */
1183     protected function updateTuple($line, $id, $count)
1184     {
1185         if ($line != '') {
1186             $line = preg_replace('/(^|:)' . preg_quote($id, '/') . '\*\d*/', '', $line);
1187         }
1188         $line = trim($line, ':');
1189         if ($count) {
1190             if ($line) {
1191                 return "$id*$count:" . $line;
1192             } else {
1193                 return "$id*$count";
1194             }
1195         }
1196         return $line;
1197     }
1198 
1199     /**
1200      * Split a line into an array of tuples.
1201      *
1202      * @author Tom N Harris <tnharris@whoopdedo.org>
1203      * @author Andreas Gohr <andi@splitbrain.org>
1204      *
1205      * @param array $keys
1206      * @param string $line
1207      * @return array
1208      */
1209     protected function parseTuples(&$keys, $line)
1210     {
1211         $result = [];
1212         if ($line == '') return $result;
1213         $parts = explode(':', $line);
1214         foreach ($parts as $tuple) {
1215             if ($tuple === '') continue;
1216             [$key, $cnt] = explode('*', $tuple);
1217             if (!$cnt) continue;
1218             if (isset($keys[$key])) {
1219                 $key = $keys[$key];
1220                 if ($key === false || is_null($key)) continue;
1221             }
1222             $result[$key] = $cnt;
1223         }
1224         return $result;
1225     }
1226 
1227     /**
1228      * Sum the counts in a list of tuples.
1229      *
1230      * @author Tom N Harris <tnharris@whoopdedo.org>
1231      *
1232      * @param string $line
1233      * @return int
1234      */
1235     protected function countTuples($line)
1236     {
1237         $freq = 0;
1238         $parts = explode(':', $line);
1239         foreach ($parts as $tuple) {
1240             if ($tuple === '') continue;
1241             [/* pid */, $cnt] = explode('*', $tuple);
1242             $freq += (int)$cnt;
1243         }
1244         return $freq;
1245     }
1246 }
1247