<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */
use dokuwiki\Utf8\Clean;
use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
// the guard lets a value defined earlier (e.g. by local configuration) take precedence
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The value is built on the first call and kept in a static variable, so the
 * INDEXER_VERSION_GET event is triggered at most once per process.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string INDEXER_VERSION itself when no plugin added data, otherwise
 *                    INDEXER_VERSION followed by "+plugin=version" for each plugin,
 *                    ordered by plugin name
 */
function idx_get_version() {
    static $indexer_version = null;
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = ['dokuwiki' => $version];
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
        unset($data['dokuwiki']); // this needs to be first
        // sort by plugin name so the resulting tag does not depend on handler order
        ksort($data);
        foreach ($data as $plugin => $vers) {
            $version .= '+'.$plugin.'='.$vers;
        }
        $indexer_version = $version;
    }
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The result is the byte length of the string plus, for every byte in the
 * range 0xE2-0xEF it contains, that byte's value minus 0xE1.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w) {
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        foreach ($leadbytes[0] as $b) {
            $l += ord($b) - 0xE1;
        }
    }
    return $l;
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on the first call and reused on every later call.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $Indexer;
    if (!isset($Indexer)) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read from inc/lang/<configured language>/stopwords.txt below
 * DOKU_INC on the first call and kept in a static variable afterwards. When
 * the file does not exist or cannot be read the list is empty.
 *
 * @return array list of stop words (returned by reference)
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        $stopwords = [];
        if (file_exists($swfile)) {
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false on failure; never hand that out as the word list
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * The page is removed from the index instead of being added when it does not
 * exist any more or when its 'internal index' metadata is false. Unless $force
 * is set, nothing is done when the page's '.indexed' tag file holds the current
 * indexer version and is newer than the page file.
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  false when nothing was done, the index was locked or the
 *                         page id could not be obtained; otherwise the result of the
 *                         last indexer call. In verbose mode true is returned once
 *                         the end of the function is reached.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false, $force=false) {
    $idxtag = metaFN($page,'.indexed');

    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if (!$force && file_exists($idxtag)) {
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    // pages that opted out of indexing are removed from an existing index
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }

    // collect the metadata that is indexed next to the page text
    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    if (($references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED)) !== null) {
        $metadata['relation_references'] = array_keys($references);
    } else {
        $metadata['relation_references'] = [];
    }

    if (($media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED)) !== null) {
        $metadata['relation_media'] = array_keys($media);
    } else {
        $metadata['relation_media'] = [];
    }

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    // the raw wiki text is only appended when the default action was not prevented
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);
    // event handlers may have changed the data; bring every entry of $data
    // (page, body, metadata, pid, ...) back into the local scope
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // the tag file records which indexer version wrote the index for this page
    if ($result) {
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    }
    // NOTE(review): in verbose mode true is returned even when $result is falsy;
    // kept as is because callers may rely on it - confirm before changing
    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for (passed by reference
 *                           on to the indexer's lookup method)
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Thin wrapper around the tokenizer method of the shared indexer instance.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * The file read is <indexdir>/<idx><suffix>.idx. Lines are returned as read,
 * including their line endings. A missing or unreadable file gives an empty list.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return [];
    $lines = file($fn);
    // file() returns false on failure; callers are promised an array
    return ($lines === false) ? [] : $lines;
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * With $conf['readdircache'] set to 0 the directory is always scanned and no
 * cache is written. Otherwise <indexdir>/lengths.idx is used as long as it is
 * younger than $conf['readdircache'] seconds and is rewritten after each scan.
 * Failing to write the cache is ignored.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';

    // testing what we have to do, create a cache file or not.
    if ($conf['readdircache'] == 0) {
        $docache = false;
    } else {
        clearstatcache();
        if (file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = [];
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
        $docache = true;
    }

    // no usable cache: collect the numeric part of every i<number>.idx file name
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // fwrite()/fclose() throw a TypeError on PHP 8 when given false,
        // which the @ operator does not suppress
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * When $filter is an array, its keys are taken as the lengths to test and only
 * those with an existing i<length>.idx file are returned. Otherwise $filter is
 * a minimum and every indexed length equal to or above it is returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $idx = [];
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir']."/i";
        foreach (array_keys($filter) as $key) {
            if (file_exists($path.$key.'.idx')) {
                $idx[] = $key;
            }
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter) {
                $idx[] = $length;
            }
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * Runs of spaces, dots, slashes, colons and dashes are turned into a single
 * underscore before the stripping; the result is lower-cased.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $name = Clean::romanize(trim((string)$name));
    // NOTE(review): inside the single quoted string "\\:" reaches PCRE as "\:",
    // i.e. an escaped colon, so a backslash is not part of this class and gets
    // removed by the next replacement instead of becoming "_". Left unchanged
    // because the result is used for file names - confirm before changing.
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :