1<?php
2/**
3 * Functions to create the fulltext search index
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 * @author     Tom N Harris <tnharris@whoopdedo.org>
8 */
9
10use dokuwiki\Extension\Event;
11use dokuwiki\Search\Indexer;
12
// Version tag of the index data; bumping it forces a rebuild on upgrade
define('INDEXER_VERSION', 8);

// Minimum token length to use in the index (note, this doesn't apply to numeric tokens).
// Only set here when nothing else has defined it before this file is loaded.
defined('IDX_MINWORDLENGTH') || define('IDX_MINWORDLENGTH', 2);
18
/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The result is computed once per request and memoised in a static variable.
 * It starts with INDEXER_VERSION and, for every entry plugins added to the
 * event data, appends "+name=version" in key-sorted order.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string the bare INDEXER_VERSION when no plugin contributed, a string otherwise
 */
function idx_get_version(){
    static $indexer_version = null;

    // strict check: a falsy but already computed version must still count as cached
    if ($indexer_version !== null) {
        return $indexer_version;
    }

    $version = INDEXER_VERSION;

    // DokuWiki version is included for the convenience of plugins
    $data = array('dokuwiki' => $version);
    Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

    // the core version already leads the string, so it must not be appended again
    unset($data['dokuwiki']);
    ksort($data);
    foreach ($data as $plugin => $vers) {
        $version .= '+'.$plugin.'='.$vers;
    }

    $indexer_version = $version;
    return $indexer_version;
}
49
/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * Starts from the byte length and, for every byte in the range 0xE2-0xEF
 * found in the string, adds (byte value - 0xE1) on top of it.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w){
    $length = strlen($w);

    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    $found = array();
    if (!preg_match_all('/[\xE2-\xEF]/', $w, $found)) {
        return $length;
    }

    foreach ($found[0] as $leadbyte) {
        $length += ord($leadbyte) - 0xE1;
    }
    return $length;
}
69
/**
 * Create an instance of the indexer.
 *
 * The instance is created on the first call and the same object is
 * handed out on every later call.
 *
 * @return Indexer    an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $instance = null;
    if ($instance === null) {
        $instance = new Indexer();
    }
    return $instance;
}
84
/**
 * Returns words that will be ignored.
 *
 * The list is read from inc/lang/<configured language>/stopwords.txt, one
 * word per line, and cached in a static variable for later calls. A missing
 * or unreadable file results in an empty list.
 *
 * @return array                list of stop words (returned by reference)
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if ($stopwords === null) {
        global $conf;
        $stopwords = array();
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        if (file_exists($swfile)) {
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() yields false when reading fails; keep the empty list in
            // that case so callers always receive an array
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}
105
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * The page's ".indexed" meta file records the indexer version the page was
 * last indexed with. A page that no longer exists, or whose 'internal index'
 * metadata is false, is removed from the index instead of being added.
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  false when nothing was indexed (up to date, missing page,
 *                         locked index, PID failure); the indexer's result otherwise;
 *                         always true after a completed run in verbose mode
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false, $force=false) {
    $idxtag = metaFN($page,'.indexed');

    // the page was deleted: drop whatever is still in the index for it
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) {
                print("Indexer: $page does not exist, ignoring".DOKU_LF);
            }
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) {
                print("Indexer: locked".DOKU_LF);
            }
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // nothing to do when the tag file carries the current indexer version
    // and is newer than the page itself (unless a reindex is forced)
    $uptodate = !$force
        && file_exists($idxtag)
        && trim(io_readFile($idxtag)) == idx_get_version()
        && @filemtime($idxtag) > @filemtime(wikiFN($page));
    if ($uptodate) {
        if ($verbose) {
            print("Indexer: index for $page up to date".DOKU_LF);
        }
        return false;
    }

    // indexing explicitly switched off for this page: make sure it is not in the index
    if (p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) {
                    print("Indexer: locked".DOKU_LF);
                }
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) {
            print("Indexer: index disabled for $page".DOKU_LF);
        }
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) {
            print("Indexer: getting the PID failed for $page".DOKU_LF);
        }
        return false;
    }

    // collect the metadata keys that go into the index next to the page text
    $body = '';
    $metadata = array();
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = ($references !== null) ? array_keys($references) : array();

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = ($media !== null) ? array_keys($media) : array();

    // event handlers may alter page, body, metadata and pid; the raw wiki text
    // is only appended when the default action was not prevented
    $data = array(
        'page' => $page,
        'body' => $body,
        'metadata' => $metadata,
        'pid' => $pid,
    );
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) {
        $data['body'] = $data['body'] . " " . rawWiki($page);
    }
    $evt->advise_after();
    unset($evt);
    // import the (possibly modified) event data back into the local variables
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) {
            print("Indexer: locked".DOKU_LF);
        }
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) {
                print("Indexer: locked".DOKU_LF);
            }
            return false;
        }
    }

    // remember which indexer version this page was indexed with
    if ($result) {
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    }

    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        return true;
    }
    return $result;
}
211
/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one. The work is delegated to the shared Indexer
 * instance; the word list is handed over by reference.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    return idx_get_indexer()->lookup($words);
}
228
/**
 * Split a string into tokens
 *
 * Thin wrapper that forwards both arguments to the tokenizer of the
 * shared Indexer instance.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    return idx_get_indexer()->tokenizer($string, $wc);
}
241
242/* For compatibility */
243
/**
 * Read the list of words in an index (if it exists).
 *
 * The file read is <indexdir>/<idx><suffix>.idx. Each returned entry is one
 * line of that file, still carrying its line ending. A missing or unreadable
 * file yields an empty array.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return array();

    // file() returns false when the file cannot be read (e.g. it vanished
    // after the check above); never pass that on to callers expecting an array
    $lines = file($fn);
    return ($lines === false) ? array() : $lines;
}
259
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * The lengths are taken from the names of the i<number>.idx files in the
 * index directory. When $conf['readdircache'] is non-zero the result is
 * cached in lengths.idx and reused until the cache file is older than that
 * many seconds. Writing the cache is best effort: a failure to open the
 * file is ignored and the freshly read list is still returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array list of integer lengths; empty when the index directory cannot be opened
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';

    // a readdircache of 0 switches the cache file off entirely
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (file_exists($cachefile)
        && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                return array_map('intval', $lengths);
            }
        }
    }

    // no usable cache: derive the lengths from the file names in the index directory
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return array();
    }
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // only write when the file could be opened; handing false to
        // fwrite()/fclose() raises a TypeError on PHP 8
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}
318
/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * With an array filter, the array's keys are treated as the lengths to
 * probe and only those with an existing i<length>.idx file are returned.
 * With a scalar filter, all known lengths greater than or equal to it are
 * returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $found = array();

    if (!is_array($filter)) {
        // keep all the values equal or superior
        foreach (idx_listIndexLengths() as $length) {
            if ((int)$length >= (int)$filter) {
                $found[] = $length;
            }
        }
        return $found;
    }

    // testing if index files exist only
    $prefix = $conf['indexdir']."/i";
    foreach (array_keys($filter) as $wlen) {
        if (file_exists($prefix.$wlen.'.idx')) {
            $found[] = $wlen;
        }
    }
    return $found;
}
350
/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore. Before stripping, every run of
 * spaces, dots, slashes, colons and dashes is collapsed into a single
 * underscore; the result is lower-cased.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $romanized = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));

    // the patterns are applied in order: separators become "_" first, then
    // everything outside [A-Za-z0-9_] is removed
    // NOTE(review): as written, the backslash in the first pattern only escapes
    // the colon, so a literal backslash is removed by the second pattern
    $cleaned = preg_replace(
        array('#[ \./\\:-]+#', '/[^A-Za-z0-9_]/'),
        array('_', ''),
        $romanized
    );

    return strtolower($cleaned);
}
368
369//Setup VIM: ex: et ts=4 :
370