<?php

/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

use dokuwiki\Utf8\Clean;
use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH', 2);

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The result is computed once per request and then served from a static
 * cache. It starts with INDEXER_VERSION and gets one "+name=version" suffix
 * per entry that event handlers added, sorted by name.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string the bare INDEXER_VERSION if no handler added anything,
 *                    otherwise the version string with the plugin suffixes
 */
function idx_get_version()
{
    static $indexer_version = null;
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = ['dokuwiki' => $version];
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
        unset($data['dokuwiki']); // this needs to be first
        ksort($data);
        foreach ($data as $plugin => $vers) {
            $version .= '+' . $plugin . '=' . $vers;
        }
        $indexer_version = $version;
    }
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The byte length is increased by (lead byte - 0xE1) for every byte in the
 * range 0xE2-0xEF found in the string.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        foreach ($leadbytes[0] as $b) {
            $l += ord($b) - 0xE1;
        }
    }
    return $l;
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on first use and reused on every later call.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $Indexer;
    if (!isset($Indexer)) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read once from inc/lang/<lang>/stopwords.txt (one word per
 * line) and cached in a static variable that is returned by reference.
 *
 * @return array list of stop words, empty if the file is missing or unreadable
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
        $stopwords = [];
        if (file_exists($swfile)) {
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false on a read error; never cache/return that
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Steps, in order:
 *  - a page that no longer exists is removed from the index (if it was indexed)
 *  - unless $force is set, a page whose '.indexed' tag carries the current
 *    indexer version and is newer than the page file is skipped
 *  - a page whose 'internal index' metadata is false is removed from the index
 *  - otherwise the INDEXER_PAGE_ADD event is triggered and the resulting body
 *    and metadata are written to the index
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string $page name of the page to index
 * @param boolean $verbose print status messages
 * @param boolean $force force reindexing even when the index is up to date
 * @return string|boolean false when nothing was done, the index was locked or
 *                        indexing failed; otherwise the indexer's result.
 *                        In verbose mode the final indexing step always
 *                        reports true (see note at the end of the function).
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $idxtag = metaFN($page, '.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) echo "Indexer: $page does not exist, ignoring" . DOKU_LF;
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if (!$force && file_exists($idxtag)) {
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) echo "Indexer: index for $page up to date" . DOKU_LF;
                return false;
            }
        }
    }

    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) echo "Indexer: locked" . DOKU_LF;
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) echo "Indexer: index disabled for $page" . DOKU_LF;
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) echo "Indexer: getting the PID failed for $page" . DOKU_LF;
        return false;
    }
    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = ($references !== null) ? array_keys($references) : [];

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = ($media !== null) ? array_keys($media) : [];

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    // handlers may have put their own text into 'body'; the raw wiki text is
    // appended unless the default action was prevented
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Take over what the event handlers left in $data. Only the known keys are
    // read (instead of extract()), so a handler that adds arbitrary keys can
    // not overwrite unrelated locals such as $Indexer or $verbose.
    $page = $data['page'] ?? $page;
    $body = $data['body'] ?? $body;
    $metadata = $data['metadata'] ?? $metadata;

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) echo "Indexer: locked" . DOKU_LF;
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
    }

    // remember which indexer version wrote the index for this page
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }
    if ($verbose) {
        echo "Indexer: finished" . DOKU_LF;
        // NOTE(review): verbose mode reports true here even when $result is
        // falsy. Kept as is because callers may depend on it.
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one. Thin wrapper around Indexer::lookup().
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array $words list of words to search for (passed by reference)
 * @return array list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Thin wrapper around Indexer::tokenizer().
 *
 * @param string $string
 * @param bool $wc passed through to the tokenizer unchanged
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * The file read is <indexdir>/<idx><suffix>.idx. Lines are returned as
 * file() delivers them, i.e. including their trailing newline.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array lines of the index, empty if the file is missing or unreadable
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
    if (!file_exists($fn)) return [];
    $lines = file($fn);
    // file() returns false on a read error; callers expect an array
    return ($lines === false) ? [] : $lines;
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * With $conf['readdircache'] set to 0 the directory is always scanned and
 * nothing is cached. Otherwise <indexdir>/lengths.idx is used as long as it
 * is younger than $conf['readdircache'] seconds and rewritten after a scan.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array list of int lengths, empty if the directory can not be opened
 */
function idx_listIndexLengths()
{
    global $conf;
    $cachefile = $conf['indexdir'] . '/lengths.idx';

    // testing what we have to do, create a cache file or not.
    if ($conf['readdircache'] == 0) {
        $docache = false;
    } else {
        clearstatcache();
        if (
            file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])
        ) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = [];
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
        $docache = true;
    }

    // no usable cache: collect the N of every iN.idx file in the directory
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (str_starts_with($f, 'i') && str_ends_with($f, '.idx')) {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // Writing the cache is best effort. fwrite()/fclose() throw a
        // TypeError on PHP 8 when given false, which @ does not suppress.
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter array: its keys are the lengths to test, only those
 *                          with an existing i<length>.idx file are returned;
 *                          int: minimum length, all indexed lengths equal or
 *                          above it are returned
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;
    $idx = [];
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir'] . "/i";
        foreach (array_keys($filter) as $key) {
            if (file_exists($path . $key . '.idx')) {
                $idx[] = $key;
            }
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter) {
                $idx[] = $length;
            }
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * Runs of space, dot, slash, colon and dash are collapsed into a single
 * underscore first; the result is lower-cased.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $name = Clean::romanize(trim((string)$name));
    // NOTE(review): in this single-quoted string "\\:" reaches PCRE as "\:",
    // an escaped colon, so a literal backslash is NOT matched here and is
    // removed by the next replacement instead. Left untouched on purpose:
    // the result is used as a file name, so changing it would rename
    // existing index files.
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :