<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */
use dokuwiki\Utf8\Clean;
use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH', 2);

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The value is computed once per request and cached in a static variable.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string INDEXER_VERSION, followed by "+plugin=version" for every entry added by plugins
 */
function idx_get_version()
{
    static $indexer_version = null;
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = ['dokuwiki' => $version];
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
        unset($data['dokuwiki']); // this needs to be first
        // sort by plugin name so the resulting tag does not depend on handler order
        ksort($data);
        foreach ($data as $plugin => $vers) {
            $version .= '+'.$plugin.'='.$vers;
        }
        $indexer_version = $version;
    }
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The result is the byte length plus, for every byte in the range
 * 0xE2-0xEF found in the string, the value of that byte minus 0xE1.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        foreach ($leadbytes[0] as $b) {
            $l += ord($b) - 0xE1;
        }
    }
    return $l;
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on first use and then shared by all callers.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $Indexer;
    if (!isset($Indexer)) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read from inc/lang/<lang>/stopwords.txt for the configured
 * language and cached for the rest of the request. A missing or unreadable
 * file yields an empty list.
 *
 * @return array list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $stopwords = [];
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        if (file_exists($swfile)) {
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false on failure; never cache or return a non-array
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Pages that no longer exist, or whose 'internal index' metadata is false,
 * are removed from the index if they were indexed before.
 *
 * Note: when $verbose is set and the indexing code path is reached, the
 * function prints "Indexer: finished" and returns true regardless of the
 * indexer result. This long-standing behavior is kept for callers.
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the function completed successfully
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $idxtag = metaFN($page, '.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if (!$force && file_exists($idxtag)) {
        // loose comparison on purpose: the tag file holds a string while
        // idx_get_version() may return the integer INDEXER_VERSION
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }
    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    // only the keys (targets) of the relation arrays are indexed;
    // anything that is not an array is treated as "no relations"
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = is_array($references) ? array_keys($references) : [];

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = is_array($media) ? array_keys($media) : [];

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Take back what event handlers may have changed, but only the fields
    // this function actually uses. A bare extract($data) would let any extra
    // key overwrite unrelated locals such as $Indexer, $verbose or $result.
    $wanted = ['page' => true, 'body' => true, 'metadata' => true];
    extract(array_intersect_key($data, $wanted));

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // remember which indexer version wrote the data for this page
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }
    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Delegates to the tokenizer of the shared Indexer instance.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * The file read is <indexdir>/<idx><suffix>.idx. Lines are returned as
 * read by file(), i.e. including their trailing newline.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array empty when the file does not exist or cannot be read
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return [];
    $lines = file($fn);
    // file() returns false on failure; keep the documented array return type
    return ($lines === false) ? [] : $lines;
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * When $conf['readdircache'] is non-zero, the result is cached in
 * <indexdir>/lengths.idx and reused for that many seconds. Failing to
 * write the cache file is not an error; the freshly read list is returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array
 */
function idx_listIndexLengths()
{
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';

    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (
            file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])
        ) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = [];
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // no usable cache: collect the <n> of every i<n>.idx file in the index directory
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // fopen() returns false on failure; handing that to fwrite()/fclose()
        // throws a TypeError on PHP 8 which the @ operator does not suppress
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }

    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * If $filter is an array, its keys are taken as candidate lengths and only
 * those with an existing i<length>.idx file are returned. Otherwise all
 * indexed lengths greater than or equal to $filter are returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;
    $idx = [];
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir']."/i";
        foreach (array_keys($filter) as $key) {
            if (file_exists($path.$key.'.idx')) {
                $idx[] = $key;
            }
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter) {
                $idx[] = $length;
            }
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $name = Clean::romanize(trim((string)$name));
    // runs of space, dot, slash, colon and dash become a single underscore
    // NOTE(review): after PHP string unescaping the class contains "\:" (an
    // escaped colon), so a literal backslash is not matched here; it is
    // removed by the next replacement instead. Left as is because changing
    // it would alter the file names derived from existing keys.
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :