<?php

/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

use dokuwiki\Utf8\Clean;
use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH', 2);

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The result is computed once per request and then served from a static cache.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string INDEXER_VERSION, followed by "+plugin=version" for every
 *                    entry plugins added to the event data (sorted by plugin name)
 */
function idx_get_version()
{
    static $indexer_version = null;
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = ['dokuwiki' => $version];
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
        unset($data['dokuwiki']); // this needs to be first
        ksort($data);
        foreach ($data as $plugin => $vers) {
            $version .= '+' . $plugin . '=' . $vers;
        }
        $indexer_version = $version;
    }
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The byte length is increased by (lead byte - 0xE1) for every byte in the
 * range 0xE2-0xEF found in the string.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        foreach ($leadbytes[0] as $b) {
            $l += ord($b) - 0xE1;
        }
    }
    return $l;
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on first use and reused on every later call.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $Indexer;
    if (!isset($Indexer)) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read from inc/lang/<lang>/stopwords.txt once and then cached.
 * The array is returned by reference.
 *
 * @return array list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
        $stopwords = [];
        if (file_exists($swfile)) {
            $words = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false when the file cannot be read; never cache
            // that, callers are promised an array
            if ($words !== false) {
                $stopwords = $words;
            }
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Pages that no longer exist, or whose 'internal index' metadata is false,
 * are removed from the index if they carry an '.indexed' tag file.
 *
 * NOTE: when $verbose is set and the indexing steps were reached, true is
 * returned regardless of whether they succeeded.
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the function completed successfully
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $idxtag = metaFN($page, '.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring" . DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked" . DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if (!$force && file_exists($idxtag)) {
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) print("Indexer: index for $page up to date" . DOKU_LF);
                return false;
            }
        }
    }

    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked" . DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page" . DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page" . DOKU_LF);
        return false;
    }
    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    if ($references !== null) {
        $metadata['relation_references'] = array_keys($references);
    } else {
        $metadata['relation_references'] = [];
    }

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    if ($media !== null) {
        $metadata['relation_media'] = array_keys($media);
    } else {
        $metadata['relation_media'] = [];
    }

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Take over only the values that are used below. A blanket extract()
    // would let any key a plugin adds to the event data overwrite unrelated
    // locals such as $Indexer, $verbose or $idxtag.
    if (array_key_exists('page', $data)) $page = $data['page'];
    if (array_key_exists('body', $data)) $body = $data['body'];
    if (array_key_exists('metadata', $data)) $metadata = $data['metadata'];

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked" . DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked" . DOKU_LF);
            return false;
        }
    }

    // the tag file records which indexer version wrote the index for this page
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }
    if ($verbose) {
        print("Indexer: finished" . DOKU_LF);
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Thin wrapper around Indexer::tokenizer().
 *
 * @param string $string
 * @param bool $wc passed through to the tokenizer unchanged
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array lines of <indexdir>/<idx><suffix>.idx including their line
 *               endings; empty when the file is missing or unreadable
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
    if (!file_exists($fn)) return [];
    $lines = file($fn);
    // file() returns false on read errors; keep the documented array return
    return ($lines === false) ? [] : $lines;
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * The cache file (<indexdir>/lengths.idx) is only used when
 * $conf['readdircache'] is non-zero, and only while it is younger than
 * that many seconds.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array
 */
function idx_listIndexLengths()
{
    global $conf;
    $cachefile = $conf['indexdir'] . '/lengths.idx';

    // testing what we have to do, create a cache file or not.
    $docache = !($conf['readdircache'] == 0);

    if ($docache) {
        clearstatcache();
        if (
            file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])
        ) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = [];
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // no usable cache: collect the lengths from the i<length>.idx file names
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        // Best effort only. A separate fopen()/fwrite() pair must not be used
        // here without checking the handle: fwrite(false, ...) throws a
        // TypeError on PHP 8, which the @ operator does not suppress.
        @file_put_contents($cachefile, implode("\n", $idx));
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter when an array, its keys are the lengths to test for
 *                          an existing index file; otherwise the minimum length
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;
    $idx = [];
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir'] . "/i";
        foreach (array_keys($filter) as $key) {
            if (file_exists($path . $key . '.idx')) {
                $idx[] = $key;
            }
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter) {
                $idx[] = $length;
            }
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $name = Clean::romanize(trim((string)$name));
    // Runs of space . / : - become a single underscore.
    // NOTE(review): the "\\:" in this single-quoted literal reaches PCRE as
    // "\:" (an escaped colon), so backslashes are not matched here; they are
    // removed by the next replacement instead. Left as is because existing
    // index file names depend on the current result.
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :