<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\FulltextIndex;
use dokuwiki\Search\MetadataIndex;

// Version tag used to force rebuild on upgrade
const INDEXER_VERSION = 8;

/**
 * Class DokuWiki Indexer (Singleton)
 *
 * Front end that coordinates the page list (page.idx), the MetadataIndex
 * and the FulltextIndex when a page is added, removed or renamed.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer extends AbstractIndex
{
    /** @var Indexer $instance */
    protected static $instance = null;

    /**
     * Get new or existing singleton instance of the Indexer
     *
     * @return Indexer
     */
    public static function getInstance()
    {
        if (static::$instance === null) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Dispatch Indexing request for the page, called by TaskRunner::runIndexer()
     *
     * Pages that no longer exist are removed from the index, all others
     * are (re)indexed.
     *
     * @param string $page    name of the page to index
     * @param bool   $verbose print status messages
     * @param bool   $force   force reindexing even when the index is up to date
     * @return bool If the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Satoshi Sahara <sahara.satoshi@gmail.com>
     */
    public function dispatch($page, $verbose = false, $force = false)
    {
        // page was deleted but may still be in the index
        if (!page_exists($page)) {
            return $this->deletePage($page, $verbose, $force);
        }

        // update search index
        return $this->addPage($page, $verbose, $force);
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The value is computed once per request and memoised in a static local.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     *
     * @return int|string plain INDEXER_VERSION when no plugin contributes,
     *                    otherwise "<version>+<plugin>=<vers>..." sorted by plugin name
     */
    public function getVersion()
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            // DokuWiki version is included for the convenience of plugins
            $data = array('dokuwiki' => $version);
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            // the core version already leads $version; drop it before sorting
            // so it stays first and is not appended a second time
            unset($data['dokuwiki']);
            ksort($data);
            foreach ($data as $plugin => $vers) {
                $version .= '+'.$plugin.'='.$vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Adds/updates the search index for the given page
     *
     * Locking is handled internally.
     *
     * @triggers INDEXER_PAGE_ADD
     * Event data holds the keys 'page', 'body', 'metadata' and 'pid'. Only
     * 'page', 'body' and 'metadata' are read back after the event.
     *
     * @param string $page    name of the page to index
     * @param bool   $verbose print status messages
     * @param bool   $force   force reindexing even when the index is up to date
     * @return bool If the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Satoshi Sahara <sahara.satoshi@gmail.com>
     */
    public function addPage($page, $verbose = false, $force = false)
    {
        // check if indexing needed for the existing page (full text and/or metadata indexing)
        $idxtag = metaFN($page, '.indexed');
        if (!$force && file_exists($idxtag)) {
            // Compare as strings: getVersion() may return an int, and a loose
            // int/string comparison would accept a stale tag like "8+plugin=1"
            // as equal to 8 on PHP < 8.
            if (trim(io_readFile($idxtag)) === (string) $this->getVersion()) {
                $last = @filemtime($idxtag);
                if ($last > @filemtime(wikiFN($page))) {
                    if ($verbose) dbglog("Indexer: index for {$page} up to date");
                    return true;
                }
            }
        }

        // register the page to the page.idx
        $pid = $this->getPID($page);
        if ($pid === false) {
            if ($verbose) dbglog("Indexer: getting the PID failed for {$page}");
            // a warning (not E_USER_ERROR) so that the documented false return
            // actually reaches the caller instead of halting the script
            trigger_error("Failed to get PID for {$page}", E_USER_WARNING);
            return false;
        }

        // prepare metadata indexing
        $metadata = array();
        $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

        $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
        $metadata['relation_references'] = is_array($references) ? array_keys($references) : array();

        $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
        $metadata['relation_media'] = is_array($media) ? array_keys($media) : array();

        // check if full text indexing allowed: anything but an explicit false enables it
        $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
        if ($indexenabled !== false) $indexenabled = true;
        $metadata['internal_index'] = $indexenabled;

        $body = '';
        $data = compact('page', 'body', 'metadata', 'pid');
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) $data['body'] = $data['body'].' '.rawWiki($page);
        $event->advise_after();
        unset($event);

        // Read back only the known keys instead of extract()ing the whole
        // array, so event handlers cannot overwrite unrelated local variables.
        if (array_key_exists('page', $data)) $page = $data['page'];
        if (array_key_exists('body', $data)) $body = $data['body'];
        if (array_key_exists('metadata', $data)) $metadata = $data['metadata'];

        // handlers may have toggled the flag; it is not a metadata key to be indexed
        $indexenabled = $metadata['internal_index'];
        unset($metadata['internal_index']);

        // Access to Metadata Index
        $MetadataIndex = MetadataIndex::getInstance();
        $result = $MetadataIndex->addMetaKeys($page, $metadata);
        if ($verbose) dbglog("Indexer: addMetaKeys({$page}) ".($result ? 'done' : 'failed'));
        if (!$result) {
            return false;
        }

        // Access to Fulltext Index
        $FulltextIndex = FulltextIndex::getInstance();
        if ($indexenabled) {
            $result = $FulltextIndex->addPagewords($page, $body);
            if ($verbose) dbglog("Indexer: addPageWords({$page}) ".($result ? 'done' : 'failed'));
            if (!$result) {
                return false;
            }
        } else {
            if ($verbose) dbglog("Indexer: full text indexing disabled for {$page}");
            // ensure the page content deleted from the Fulltext index
            $result = $FulltextIndex->deletePageWords($page);
            if ($verbose) dbglog("Indexer: deletePageWords({$page}) ".($result ? 'done' : 'failed'));
            if (!$result) {
                return false;
            }
        }

        // update index tag file
        io_saveFile($idxtag, $this->getVersion());
        if ($verbose) dbglog("Indexer: finished");

        return $result;
    }

    /**
     * Remove a page from the index, erases entries in all known indexes
     *
     * Locking is handled internally.
     *
     * @param string $page    name of the page to remove
     * @param bool   $verbose print status messages
     * @param bool   $force   proceed even when the page's .indexed tag file is missing
     * @return bool If the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Satoshi Sahara <sahara.satoshi@gmail.com>
     */
    public function deletePage($page, $verbose = false, $force = false)
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            if ($verbose) dbglog("Indexer: {$page}.indexed file does not exist, ignoring");
            return true;
        }

        // remove obsoleted content from Fulltext index
        $FulltextIndex = FulltextIndex::getInstance();
        $result = $FulltextIndex->deletePageWords($page);
        if ($verbose) dbglog("Indexer: deletePageWords({$page}) ".($result ? 'done' : 'failed'));
        if (!$result) {
            return false;
        }

        // delete all keys of the page from metadata index
        $MetadataIndex = MetadataIndex::getInstance();
        $result = $MetadataIndex->deleteMetaKeys($page);
        if ($verbose) dbglog("Indexer: deleteMetaKeys({$page}) ".($result ? 'done' : 'failed'));
        if (!$result) {
            return false;
        }

        // mark the page as deleted in the page.idx
        $pid = $this->getPID($page);
        if ($pid !== false) {
            if (!$this->lock()) return false;
            $result = $this->saveIndexKey('page', '', $pid, self::INDEX_MARK_DELETED.$page);
            if ($verbose) dbglog("Indexer: update page.idx ".($result ? 'done' : 'failed'));
            $this->unlock();

            // Only touch the cache for a real PID: with $pid === false PHP
            // would coerce the key to 0 and drop an unrelated cache slot.
            unset(static::$pidCache[$pid]);
        } else {
            if ($verbose) dbglog("Indexer: {$page} not found in the page.idx, ignoring");
            $result = true;
        }

        @unlink($idxtag);
        return $result;
    }

    /**
     * Rename a page in the search index without changing the indexed content.
     * This function doesn't check if the old or new name exists in the filesystem.
     * It returns an error if the old page isn't in the page list of the indexer
     * and it deletes all previously indexed content of the new page.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     * @return bool If the page was successfully renamed
     */
    public function renamePage($oldpage, $newpage)
    {
        $index = $this->getIndex('page', '');

        // check if oldpage found in page.idx
        $oldPid = array_search($oldpage, $index, true);
        if ($oldPid === false) return false;

        // check if newpage found in page.idx
        $newPid = array_search($newpage, $index, true);
        if ($newPid !== false) {
            // Force the deletion: the target is listed in page.idx, so it must
            // be removed even if its .indexed tag file is missing. Otherwise
            // page.idx would end up with two entries for $newpage.
            // Note: $index is no longer valid after deletePage()!
            if (!$this->deletePage($newpage, false, true)) return false;
        }

        // update page.idx
        if (!$this->lock()) return false;
        $result = $this->saveIndexKey('page', '', $oldPid, $newpage);
        $this->unlock();

        // reset the pid cache
        $this->resetPIDCache();

        return $result;
    }

    /**
     * Clear the Page Index
     *
     * Clears the Metadata and Fulltext indexes (without letting them take
     * their own lock), removes page.idx and resets the PID cache.
     *
     * @param bool $requireLock acquire and release the index lock around the operation
     * @return bool false when the lock could not be acquired, true otherwise
     */
    public function clear($requireLock = true)
    {
        global $conf;

        if ($requireLock && !$this->lock()) return false;

        // clear Metadata Index
        $MetadataIndex = MetadataIndex::getInstance();
        $MetadataIndex->clear(false);

        // clear Fulltext Index
        $FulltextIndex = FulltextIndex::getInstance();
        $FulltextIndex->clear(false);

        @unlink($conf['indexdir'].'/page.idx');

        // clear the pid cache
        $this->resetPIDCache();

        if ($requireLock) $this->unlock();
        return true;
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * @param string $key    metadata key to list. Uses the fulltext index if not given
     * @param int    $min    bottom frequency threshold (values below 1 are raised to 1)
     * @param int    $max    upper frequency limit. No limit if $max<$min
     * @param int    $minlen minimum length of words to count
     * @return array list of words as the keys and frequency as values,
     *               ordered by descending frequency
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($key = null, $min = 1, $max = 0, $minlen = 3)
    {
        if ($min < 1) $min = 1;
        if ($max < $min) $max = 0;

        $result = array();

        if ($key === 'title') {
            // titles are counted directly from the title index
            $index = $this->getIndex('title', '');
            $index = array_count_values($index);
            foreach ($index as $val => $cnt) {
                if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen) {
                    $result[$val] = $cnt;
                }
            }
        } elseif (!is_null($key)) {
            $metaname = $this->cleanName($key);
            $index = $this->getIndex($metaname.'_i', '');
            $val_idx = array();
            foreach ($index as $wid => $line) {
                $freq = $this->countTuples($line);
                if ($freq >= $min && (!$max || $freq <= $max)) {
                    $val_idx[$wid] = $freq;
                }
            }
            // load the value list only when at least one entry qualified
            if (!empty($val_idx)) {
                $words = $this->getIndex($metaname.'_w', '');
                foreach ($val_idx as $wid => $freq) {
                    // skip ids without a matching value line
                    if (!isset($words[$wid])) continue;
                    if (strlen($words[$wid]) >= $minlen) {
                        $result[$words[$wid]] = $freq;
                    }
                }
            }
        } else {
            $FulltextIndex = FulltextIndex::getInstance();
            $lengths = $FulltextIndex->listIndexLengths();
            foreach ($lengths as $length) {
                if ($length < $minlen) continue;
                $index = $this->getIndex('i', $length);
                // the word list for this length is loaded lazily on first hit
                $words = null;
                foreach ($index as $wid => $line) {
                    $freq = $this->countTuples($line);
                    if ($freq >= $min && (!$max || $freq <= $max)) {
                        if ($words === null) {
                            $words = $this->getIndex('w', $length);
                        }
                        // skip ids without a matching word line
                        if (!isset($words[$wid])) continue;
                        $result[$words[$wid]] = $freq;
                    }
                }
            }
        }

        arsort($result);
        return $result;
    }
}