<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */
use dokuwiki\Utf8\Clean;
use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH', 2);

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The result is computed once per request and cached in a static variable.
 * It is the bare INDEXER_VERSION when no handler adds anything, otherwise
 * INDEXER_VERSION followed by "+name=version" for every entry added to the
 * event data, sorted by name.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string
 */
function idx_get_version()
{
    static $indexer_version = null;
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = ['dokuwiki' => $version];
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
        // the core version already leads $version, so it must not take part
        // in the sorted list of plugin versions appended below
        unset($data['dokuwiki']);
        ksort($data);
        foreach ($data as $plugin => $vers) {
            $version .= '+' . $plugin . '=' . $vers;
        }
        $indexer_version = $version;
    }
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * Starts from the byte length and adds (byte value - 0xE1) for every byte
 * in the range 0xE2-0xEF found in the string.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        foreach ($leadbytes[0] as $b) {
            $l += ord($b) - 0xE1;
        }
    }
    return $l;
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on first use and reused on every later call.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $Indexer;
    if (!isset($Indexer)) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read once from the stopwords.txt of the configured language
 * and kept in a static variable; a missing file yields an empty list.
 * The function returns by reference.
 *
 * @return array list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
        if (file_exists($swfile)) {
            $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
        } else {
            $stopwords = [];
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Returns false when the page does not exist and was never indexed, when the
 * index is already up to date, when the indexer reports "locked", or when no
 * PID could be obtained. For deleted pages and pages with indexing disabled
 * the result of Indexer::deletePage() is returned.
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the function completed successfully
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $idxtag = metaFN($page, '.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if (!$force && file_exists($idxtag)) {
        // loose comparison on purpose: idx_get_version() may return the
        // integer INDEXER_VERSION while the tag file always holds a string
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    // a page whose 'internal index' metadata is exactly false is removed
    // from the index (if it was indexed before) and not indexed again
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }

    // collect the metadata keys that are indexed alongside the page text
    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    if ($references !== null) {
        $metadata['relation_references'] = array_keys($references);
    } else {
        $metadata['relation_references'] = [];
    }

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    if ($media !== null) {
        $metadata['relation_media'] = array_keys($media);
    } else {
        $metadata['relation_media'] = [];
    }

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    // the raw wiki text is only appended when advise_before() returns true
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);
    // copy the entries of $data back into local variables
    // NOTE(review): extract() imports every key of $data, so an unexpected key
    // could overwrite other locals such as $Indexer or $verbose - confirm that
    // event handlers only touch page, body, metadata and pid
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // the tag file records which indexer version wrote the index for this page
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }
    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        // NOTE(review): in verbose mode true is returned even when $result is
        // falsy; left unchanged because callers may depend on it - confirm
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for (passed by reference on to the indexer)
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Thin wrapper around Indexer::tokenizer() of the shared indexer instance.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * The file read is <indexdir>/<idx><suffix>.idx; lines are returned as
 * produced by file() without flags, i.e. including their line endings.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
    if (!file_exists($fn)) return [];
    return file($fn);
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * When $conf['readdircache'] is non-zero, the list is cached in
 * <indexdir>/lengths.idx and reused while that file is younger than
 * $conf['readdircache'] seconds. Writing the cache is best effort: when the
 * file can not be opened the freshly scanned list is still returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array
 */
function idx_listIndexLengths()
{
    global $conf;
    $cachefile = $conf['indexdir'] . '/lengths.idx';

    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (
            file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])
        ) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = [];
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // no usable cache: collect the numeric part of every i<number>.idx file
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // fopen() returns false when the file can not be opened; passing that
        // on to fwrite()/fclose() throws a TypeError on PHP 8, which the @
        // operator does not suppress
        if ($handle !== false) {
            fwrite($handle, implode("\n", $idx));
            fclose($handle);
        }
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * With an array filter, the array keys are taken as lengths and only those
 * with an existing i<length>.idx file are returned. With a scalar filter,
 * all indexed lengths greater than or equal to it are returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;
    $idx = [];
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir'] . "/i";
        foreach (array_keys($filter) as $key) {
            if (file_exists($path . $key . '.idx')) {
                $idx[] = $key;
            }
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter) {
                $idx[] = $length;
            }
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore. Runs of spaces, dots, slashes,
 * colons and dashes are turned into a single underscore first, and the
 * result is lower-cased.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $name = Clean::romanize(trim((string)$name));
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :