<?php
/**
 * Functions to create the fulltext search index
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */
use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;
use dokuwiki\Utf8\Clean;
12dbc189b2SAndreas Gohr
// Version tag used to force a rebuild of the index on upgrade
define('INDEXER_VERSION', 8);

// Minimum token length to use in the index (note, this doesn't apply to numeric tokens).
// Only defined here when nothing else has defined it before this file is loaded.
if (!defined('IDX_MINWORDLENGTH')) {
    define('IDX_MINWORDLENGTH', 2);
}
1833815ce2SChris Smith
/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The value is computed on the first call and then served from a static cache.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string INDEXER_VERSION, followed by one "+name=version" part per
 *                    event data entry (sorted by name) when plugins added any
 */
function idx_get_version() {
    static $indexer_version = null;
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = ['dokuwiki' => $version];
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

        // the core version always leads the tag, so it is removed before
        // the remaining entries are sorted and appended
        unset($data['dokuwiki']);
        ksort($data);
        foreach ($data as $plugin => $vers) {
            $version .= '+' . $plugin . '=' . $vers;
        }
        $indexer_version = $version;
    }
    return $indexer_version;
}
497c2ef4e8STom N Harris
/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The result is the byte length plus a bonus for every byte in the range
 * 0xE2-0xEF (UTF-8 lead bytes of three-byte sequences): each such byte adds
 * its value minus 0xE1.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w the word to measure
 * @return int the (faked) word length
 */
function wordlen($w) {
    // cast so that null or scalar input does not trigger PHP 8.1 deprecations
    $w = (string)$w;
    $length = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        foreach ($leadbytes[0] as $byte) {
            $length += ord($byte) - 0xE1;
        }
    }
    return $length;
}
69d5b23302STom N Harris
/**
 * Create an instance of the indexer.
 *
 * The instance is created on the first call and reused by all later calls.
 *
 * @return Indexer    an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $Indexer = null;
    if ($Indexer === null) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}
8400803e56STom N Harris
/**
 * Returns words that will be ignored.
 *
 * The list is read from inc/lang/<configured language>/stopwords.txt below
 * DOKU_INC on the first call and cached for all later calls. A missing or
 * unreadable file results in an empty list.
 *
 * @return array                list of stop words (returned by reference)
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $stopwords = [];
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        if (file_exists($swfile)) {
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false on failure; never cache that as the word list
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}
10500803e56STom N Harris
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Pages that no longer exist, or whose 'internal index' metadata is false,
 * are removed from the index instead. The marker file "<page>.indexed" in the
 * meta directory stores the indexer version the page was last indexed with.
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  false when nothing was done, the index was locked or
 *                         the PID lookup failed; otherwise the result of the
 *                         last indexer call (always true on the final path
 *                         when $verbose is set)
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false, $force=false) {
    $idxtag = metaFN($page,'.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed: the marker must carry the current indexer
    // version and be newer than the page file
    if (!$force && file_exists($idxtag)) {
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    // indexing explicitly disabled for this page: drop any existing index data
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }

    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    // only the keys of the relation arrays are indexed; anything that is not
    // an array (e.g. missing metadata) is treated as "no relations"
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = is_array($references) ? array_keys($references) : [];

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = is_array($media) ? array_keys($media) : [];

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Take over only the values this function uses from the (possibly plugin
    // modified) event data. A blanket extract() would let arbitrary keys
    // overwrite unrelated locals such as $Indexer or $verbose.
    $page = $data['page'] ?? $page;
    $body = $data['body'] ?? $body;
    $metadata = $data['metadata'] ?? $metadata;

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // remember which indexer version this page was indexed with
    if ($result) {
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    }
    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        // NOTE(review): the verbose path reports true even when $result is
        // falsy; kept as is because callers may rely on it - confirm.
        return true;
    }
    return $result;
}
21100803e56STom N Harris
/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one. The call is delegated to the shared Indexer instance.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for (passed by reference)
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}
228488dd6ceSAndreas Gohr
/**
 * Split a string into tokens
 *
 * Thin wrapper around the tokenizer of the shared Indexer instance.
 *
 * @param string $string the text to split
 * @param bool $wc passed through unchanged to the indexer's tokenizer
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}
24100803e56STom N Harris
/* For compatibility */
243488dd6ceSAndreas Gohr
/**
 * Read the list of words in an index (if it exists).
 *
 * Reads the file "<idx><suffix>.idx" from the configured index directory.
 * Line endings are kept on the returned lines.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx    base name of the index
 * @param string $suffix suffix appended to the base name
 * @return array the lines of the index file; empty when the file is missing or unreadable
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return [];
    $lines = file($fn);
    // file() returns false on failure; keep the documented array return type
    return ($lines === false) ? [] : $lines;
}
259f5eb7cf0SAndreas Gohr
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * When $conf['readdircache'] is non-zero, the result is cached in
 * "lengths.idx" inside the index directory and reused for that many seconds.
 * Writing the cache is best effort: failures are ignored.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array list of integer word lengths; empty when the index directory cannot be opened
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';
    $docache = ($conf['readdircache'] != 0);

    // serve from the cache file as long as it is fresh enough
    if ($docache) {
        clearstatcache();
        if (file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = [];
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // scan the index directory for files named i<number>.idx
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // fwrite()/fclose() throw a TypeError on PHP 8 when given false,
        // so only touch the handle when opening succeeded
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}
31800803e56STom N Harris
/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter array: its keys are the lengths to check, only those
 *                          with an existing i<length>.idx file are returned;
 *                          otherwise: minimum length, all indexed lengths
 *                          greater than or equal to it are returned
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $idx = [];

    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir']."/i";
        foreach (array_keys($filter) as $key) {
            if (file_exists($path.$key.'.idx')) {
                $idx[] = $key;
            }
        }
        return $idx;
    }

    // keep all the values equal or superior
    $minimum = (int)$filter;
    foreach (idx_listIndexLengths() as $length) {
        if ((int)$length >= $minimum) {
            $idx[] = $length;
        }
    }
    return $idx;
}
350f5eb7cf0SAndreas Gohr
/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore. Runs of spaces, dots, slashes,
 * colons and dashes are collapsed into a single underscore first, and
 * the result is lowercased.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $name = Clean::romanize(trim((string)$name));
    // NOTE(review): after PHP string unescaping the class below reads
    // "[ \./\:-]", i.e. it does not contain a backslash; backslashes are
    // removed by the second replacement instead. Left unchanged because
    // existing file names may depend on it - confirm intent.
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}
368f5eb7cf0SAndreas Gohr
//Setup VIM: ex: et ts=4 :
370