<?php

/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

use dokuwiki\Utf8\Clean;
use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH', 2);

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The value is computed once per request and memoized in a static variable.
 * It starts as INDEXER_VERSION; every entry that an event handler adds to the
 * event data is appended as "+name=version", sorted by name.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string the plain INDEXER_VERSION, or a string with plugin versions appended
 */
function idx_get_version()
{
    static $indexer_version = null;
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = ['dokuwiki' => $version];
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
        unset($data['dokuwiki']); // this needs to be first
        ksort($data);
        foreach ($data as $plugin => $vers) {
            $version .= '+' . $plugin . '=' . $vers;
        }
        $indexer_version = $version;
    }
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The result is the byte length plus, for every byte in the range
 * 0xE2-0xEF found in the string, that byte's value minus 0xE1.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        foreach ($leadbytes[0] as $b) {
            $l += ord($b) - 0xE1;
        }
    }
    return $l;
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on first use and then shared by all later calls.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $Indexer;
    if (!isset($Indexer)) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read once from inc/lang/<configured language>/stopwords.txt
 * and memoized; the result is returned by reference.
 *
 * @return array list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
        $stopwords = [];
        if (file_exists($swfile)) {
            // file() returns false when the file cannot be read; keep the
            // documented array return type in that case
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * A page that no longer exists, or whose 'internal index' metadata is false,
 * is removed from the index instead (only if it carries an '.indexed' tag file).
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the function completed successfully
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $idxtag = metaFN($page, '.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring" . DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked" . DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if (!$force && file_exists($idxtag)) {
        // loose comparison on purpose: the tag file holds a string while
        // idx_get_version() may return an int
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) print("Indexer: index for $page up to date" . DOKU_LF);
                return false;
            }
        }
    }

    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked" . DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page" . DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page" . DOKU_LF);
        return false;
    }
    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    // array_keys() throws on non-array input in PHP 8, so anything that is
    // not an array is treated like missing metadata
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = is_array($references) ? array_keys($references) : [];

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = is_array($media) ? array_keys($media) : [];

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Take back only the values used below. A blanket extract($data) would
    // let an event handler overwrite unrelated locals such as $Indexer.
    $page = $data['page'] ?? $page;
    $body = $data['body'] ?? $body;
    $metadata = $data['metadata'] ?? $metadata;

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked" . DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked" . DOKU_LF);
            return false;
        }
    }

    // $page may have been changed by an event handler, so the tag file name
    // is computed again instead of reusing $idxtag
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }
    if ($verbose) {
        // in verbose mode true is returned here regardless of $result
        print("Indexer: finished" . DOKU_LF);
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Thin wrapper that forwards both arguments to Indexer::tokenizer().
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * Reads <indexdir>/<idx><suffix>.idx line by line; line endings are kept.
 * A missing or unreadable file yields an empty array.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
    if (!file_exists($fn)) return [];
    $lines = file($fn);
    // file() returns false on read failure; honour the array return type
    return $lines === false ? [] : $lines;
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * The cache file <indexdir>/lengths.idx is only used when
 * $conf['readdircache'] is non-zero; it counts as fresh for that many
 * seconds after its modification time.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array
 */
function idx_listIndexLengths()
{
    global $conf;
    $cachefile = $conf['indexdir'] . '/lengths.idx';

    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);
    if ($docache) {
        clearstatcache();
        if (
            file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])
        ) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                return array_map('intval', $lengths);
            }
        }
    }

    // no usable cache: scan the index directory for i<number>.idx files
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        // Best effort only. A failed fopen() followed by fwrite(false, ...)
        // raises a TypeError on PHP 8 that '@' cannot suppress, so the
        // cache is written in one call that simply returns false on failure.
        @file_put_contents($cachefile, implode("\n", $idx));
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * When $filter is an array, its keys are the candidate lengths and only
 * those with an existing i<length>.idx file are returned. Otherwise $filter
 * is a minimum length and all indexed lengths greater than or equal to it
 * are returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;
    $idx = [];
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir'] . "/i";
        foreach (array_keys($filter) as $key) {
            if (file_exists($path . $key . '.idx')) {
                $idx[] = $key;
            }
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter) {
                $idx[] = $length;
            }
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $name = Clean::romanize(trim((string)$name));
    // NOTE(review): after PHP string unescaping this class reads [ \./\:-],
    // so a literal backslash is not turned into '_' here; it is removed by
    // the next replacement instead. Left unchanged because the result is
    // used as a file name and existing index files may depend on it.
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :