xref: /dokuwiki/inc/indexer.php (revision 6c16a3a9aa602bb7e269fb6d5d18e1353e17f97f)
1b4ce25e9SAndreas Gohr<?php
2d4f83172SAndreas Gohr
3b4ce25e9SAndreas Gohr/**
4fcd3bb7cSAndreas Gohr * Functions to create the fulltext search index
5b4ce25e9SAndreas Gohr *
6b4ce25e9SAndreas Gohr * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
7b4ce25e9SAndreas Gohr * @author     Andreas Gohr <andi@splitbrain.org>
800803e56STom N Harris * @author     Tom N Harris <tnharris@whoopdedo.org>
9b4ce25e9SAndreas Gohr */
10d4f83172SAndreas Gohr
1124870174SAndreas Gohruse dokuwiki\Utf8\Clean;
12dbc189b2SAndreas Gohruse dokuwiki\Extension\Event;
136225b270SMichael Großeuse dokuwiki\Search\Indexer;
14dbc189b2SAndreas Gohr
// Version tag of the index format; used to force a rebuild on upgrade
define('INDEXER_VERSION', 8);

// Minimum token length to use in the index (this doesn't apply to numeric tokens).
// Only defined here when nothing has defined it before.
defined('IDX_MINWORDLENGTH') || define('IDX_MINWORDLENGTH', 2);
2033815ce2SChris Smith
/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The result is computed once per request and then served from a static cache.
 * It is INDEXER_VERSION, followed by one "+name=version" part for every entry
 * that is left in the event data after the event ran (sorted by name).
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string plain INDEXER_VERSION when nothing was added, the combined string otherwise
 */
function idx_get_version()
{
    static $indexer_version = null;
    if ($indexer_version != null) {
        return $indexer_version;
    }

    // DokuWiki version is included for the convenience of plugins
    $versions = ['dokuwiki' => INDEXER_VERSION];
    Event::createAndTrigger('INDEXER_VERSION_GET', $versions, null, false);

    // the core version always comes first, so it is taken out before sorting
    unset($versions['dokuwiki']);
    ksort($versions);

    $tag = INDEXER_VERSION;
    foreach ($versions as $plugin => $vers) {
        $tag .= '+' . $plugin . '=' . $vers;
    }

    $indexer_version = $tag;
    return $indexer_version;
}
527c2ef4e8STom N Harris
/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * Starts from the byte length and, for every byte in the range 0xE2-0xEF
 * found in the string, adds (byte value - 0xE1) on top of it.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    $bytes = strlen($w);

    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (!preg_match_all('/[\xE2-\xEF]/', $w, $matches)) {
        return $bytes;
    }

    $leads = $matches[0];
    return $bytes + array_sum(array_map('ord', $leads)) - 0xE1 * count($leads);
}
73d5b23302STom N Harris
/**
 * Create an instance of the indexer.
 *
 * The instance is created on the first call and the same object is
 * handed out on every later call.
 *
 * @return Indexer    an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $instance;
    $instance ??= new Indexer();
    return $instance;
}
8900803e56STom N Harris
/**
 * Returns words that will be ignored.
 *
 * The list is read from inc/lang/<$conf['lang']>/stopwords.txt below DOKU_INC,
 * one word per line, and kept in a static cache for later calls. A missing or
 * unreadable file results in an empty list.
 *
 * @return array                list of stop words (returned by reference)
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $stopwords = [];
        $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
        if (file_exists($swfile)) {
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false when the read fails (e.g. file not readable or
            // removed after the existence check); never cache that as the word list
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}
11100803e56STom N Harris
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Steps, in order:
 *  - a page that no longer exists is removed from the index if it still has an index tag file
 *  - unless $force is set, a page whose tag file carries the current indexer version and is
 *    newer than the page file is skipped
 *  - a page whose 'internal index' metadata is exactly false is removed from the index
 *  - otherwise the INDEXER_PAGE_ADD event is triggered and the page words and metadata keys
 *    are handed to the indexer; on success the tag file is (re)written
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  false when the page was skipped, the index was locked or getting
 *                         the PID failed; otherwise the result of the last indexer call.
 *                         Note: in verbose mode the final step always returns true.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $idxtag = metaFN($page, '.indexed');

    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) echo "Indexer: $page does not exist, ignoring" . DOKU_LF;
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if (!$force && file_exists($idxtag)) {
        // loose comparison on purpose: the version may be an int while the file content is a string
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) echo "Indexer: index for $page up to date" . DOKU_LF;
                return false;
            }
        }
    }

    // only an explicit false disables indexing for the page
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) echo "Indexer: locked" . DOKU_LF;
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) echo "Indexer: index disabled for $page" . DOKU_LF;
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) echo "Indexer: getting the PID failed for $page" . DOKU_LF;
        return false;
    }

    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    if ($references !== null) {
        $metadata['relation_references'] = array_keys($references);
    } else {
        $metadata['relation_references'] = [];
    }

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    if ($media !== null) {
        $metadata['relation_media'] = array_keys($media);
    } else {
        $metadata['relation_media'] = [];
    }

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    // the raw wiki text is appended unless a handler prevented the default action
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Take over what the event handlers may have changed. Only the known fields are
    // read back explicitly (instead of extract()), so additional keys in the event
    // data cannot overwrite unrelated local variables such as $Indexer or $verbose.
    if (array_key_exists('page', $data)) {
        $page = $data['page'];
    }
    if (array_key_exists('body', $data)) {
        $body = $data['body'];
    }
    if (array_key_exists('metadata', $data)) {
        $metadata = $data['metadata'];
    }

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) echo "Indexer: locked" . DOKU_LF;
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
    }

    // remember the indexer version the page was indexed with
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }
    if ($verbose) {
        echo "Indexer: finished" . DOKU_LF;
        // verbose mode reports true here regardless of $result
        return true;
    }
    return $result;
}
21600803e56STom N Harris
/**
 * Find tokens in the fulltext index
 *
 * Hands the given words to the lookup() method of the shared indexer
 * instance, which returns a list of matching pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for (passed by reference)
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    return idx_get_indexer()->lookup($words);
}
234488dd6ceSAndreas Gohr
/**
 * Split a string into tokens
 *
 * Thin wrapper around the tokenizer() method of the shared indexer instance;
 * both arguments are passed through unchanged.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    return idx_get_indexer()->tokenizer($string, $wc);
}
24800803e56STom N Harris
24900803e56STom N Harris/* For compatibility */
250488dd6ceSAndreas Gohr
/**
 * Read the list of words in an index (if it exists).
 *
 * The file read is <$conf['indexdir']>/<$idx><$suffix>.idx. Lines are returned
 * as file() delivers them without flags, i.e. including their line endings.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array lines of the index file; empty when the file is missing or cannot be read
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
    if (!file_exists($fn)) {
        return [];
    }

    // file() returns false when reading fails; keep the documented array return type
    $lines = file($fn);
    return $lines === false ? [] : $lines;
}
267f5eb7cf0SAndreas Gohr
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * When $conf['readdircache'] is not 0, the result is cached in lengths.idx inside
 * the index directory and reused while that file is younger than readdircache seconds.
 * Otherwise (or when the cache is stale or unreadable) the index directory is scanned
 * for files named i<number>.idx.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array sorted list of integer lengths; empty when the index directory cannot be opened
 */
function idx_listIndexLengths()
{
    global $conf;
    $cachefile = $conf['indexdir'] . '/lengths.idx';

    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);
    if ($docache) {
        clearstatcache();
        if (
            file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])
        ) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = [];
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // no usable cache: collect the lengths from the file names in the index directory
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (str_starts_with($f, 'i') && str_ends_with($f, '.idx')) {
            // the part between the leading "i" and the trailing ".idx"
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // Writing the cache is best effort. The handle must be checked, because
        // passing false to fwrite()/fclose() throws a TypeError on PHP 8 that
        // the @ operator does not suppress.
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}
32900803e56STom N Harris
/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * With an array as filter, its keys are taken as lengths and only those with an
 * existing i<length>.idx file in the index directory are returned. With any other
 * value, all indexed lengths greater than or equal to the filter are returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;

    if (!is_array($filter)) {
        // keep all the values equal or superior
        $minimum = (int)$filter;
        $matching = [];
        foreach (idx_listIndexLengths() as $length) {
            if ((int)$length >= $minimum) {
                $matching[] = $length;
            }
        }
        return $matching;
    }

    // testing if index files exist only
    $prefix = $conf['indexdir'] . "/i";
    $existing = array_filter(
        array_keys($filter),
        static fn ($length) => file_exists($prefix . $length . '.idx')
    );
    return array_values($existing);
}
362f5eb7cf0SAndreas Gohr
/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * Before stripping, every run of spaces, dots, slashes, colons and hyphens
 * is collapsed into a single underscore. The result is lower-cased.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $romanized = Clean::romanize(trim((string)$name));

    // the two replacements are applied one after the other, in this order
    $cleaned = preg_replace(
        ['#[ \./\\:-]+#', '/[^A-Za-z0-9_]/'],
        ['_', ''],
        $romanized
    );

    return strtolower($cleaned);
}
381f5eb7cf0SAndreas Gohr
38200803e56STom N Harris//Setup VIM: ex: et ts=4 :
383