<?php

/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

use dokuwiki\Utf8\Clean;
use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH', 2);

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The result is computed once per request and memoised in a static variable.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string INDEXER_VERSION, followed by "+plugin=version" for every
 *                    entry plugins added to the event data (sorted by plugin name)
 */
function idx_get_version()
{
    static $indexer_version = null;
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = ['dokuwiki' => $version];
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
        unset($data['dokuwiki']); // this needs to be first
        ksort($data);
        foreach ($data as $plugin => $vers) {
            $version .= '+' . $plugin . '=' . $vers;
        }
        $indexer_version = $version;
    }
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * The byte length is increased by (lead byte - 0xE1) for every byte in the
 * range 0xE2-0xEF found in the string.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
        foreach ($leadbytes[0] as $b) {
            $l += ord($b) - 0xE1;
        }
    }
    return $l;
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on first use and shared by all later calls.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $Indexer;
    if (!isset($Indexer)) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read once from inc/lang/<lang>/stopwords.txt (one word per line)
 * and cached; a missing or unreadable file yields an empty list.
 *
 * @return array list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
        $stopwords = [];
        if (file_exists($swfile)) {
            $lines = file($swfile, FILE_IGNORE_NEW_LINES);
            // file() returns false on failure; never cache a non-array
            if ($lines !== false) {
                $stopwords = $lines;
            }
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * A page that no longer exists, or whose 'internal index' metadata is false,
 * is removed from the index if it had been indexed before.
 *
 * Note: when $verbose is set and the indexing step was reached, true is
 * returned after printing "Indexer: finished" regardless of the indexer result.
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the function completed successfully
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $idxtag = metaFN($page, '.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) echo "Indexer: $page does not exist, ignoring" . DOKU_LF;
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if (!$force && file_exists($idxtag)) {
        // compare as strings: the tag file stores the textual version
        if (trim(io_readFile($idxtag)) === (string)idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) echo "Indexer: index for $page up to date" . DOKU_LF;
                return false;
            }
        }
    }

    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) echo "Indexer: locked" . DOKU_LF;
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) echo "Indexer: index disabled for $page" . DOKU_LF;
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) echo "Indexer: getting the PID failed for $page" . DOKU_LF;
        return false;
    }

    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    // array_keys() throws a TypeError on non-arrays, so anything else counts as "no references"
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = is_array($references) ? array_keys($references) : [];

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = is_array($media) ? array_keys($media) : [];

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Event handlers may have modified the data. Read back only the keys that are
    // used below instead of extract()ing the array, so that a handler cannot
    // clobber unrelated local variables ($Indexer, $verbose, ...).
    $page = $data['page'] ?? $page;
    $body = $data['body'] ?? $body;
    $metadata = $data['metadata'] ?? $metadata;

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) echo "Indexer: locked" . DOKU_LF;
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
    }

    // remember which indexer version wrote the data for this page
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }
    if ($verbose) {
        echo "Indexer: finished" . DOKU_LF;
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Delegates to the tokenizer of the shared Indexer instance.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * The file read is <indexdir>/<idx><suffix>.idx; lines keep their trailing
 * newline. A missing or unreadable file yields an empty array.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
    if (!file_exists($fn)) return [];
    $lines = file($fn);
    // file() returns false on failure; callers expect an array
    return $lines === false ? [] : $lines;
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * The cache file (<indexdir>/lengths.idx) is only used when
 * $conf['readdircache'] is non-zero and the file is younger than that many
 * seconds. Failing to write the cache is not an error.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array
 */
function idx_listIndexLengths()
{
    global $conf;
    $cachefile = $conf['indexdir'] . '/lengths.idx';
    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (
            file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])
        ) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                return array_map('intval', $lengths);
            }
        }
    }

    // no usable cache: collect the lengths from the i<length>.idx file names
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return [];
    }
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (str_starts_with($f, 'i') && str_ends_with($f, '.idx')) {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // fwrite()/fclose() throw a TypeError on PHP 8 when handed false,
        // and @ does not suppress that, so only write with a valid handle
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * When $filter is an array, its keys are the candidate lengths and only those
 * with an existing i<length>.idx file are returned. Otherwise all indexed
 * lengths greater than or equal to $filter are returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;
    $idx = [];
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir'] . "/i";
        foreach (array_keys($filter) as $key) {
            if (file_exists($path . $key . '.idx')) {
                $idx[] = $key;
            }
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter) {
                $idx[] = $length;
            }
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $name = Clean::romanize(trim((string)$name));
    // Runs of space, dot, slash, colon and dash become a single underscore.
    // Note: in this single-quoted string "\\:" reaches PCRE as "\:" (an escaped
    // colon), so a literal backslash is not part of the class; it is removed by
    // the next replacement instead.
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :