<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\FulltextIndex;
use dokuwiki\Search\MetadataIndex;

// Version tag used to force rebuild on upgrade
const INDEXER_VERSION = 8;

/**
 * Class DokuWiki Indexer (Singleton)
 *
 * Front end that keeps the page list (page.idx), the metadata index and the
 * fulltext index in step: it adds, deletes and renames pages and can clear
 * all of them at once.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer extends AbstractIndex
{
    /** @var Indexer $instance */
    protected static $instance = null;

    /**
     * Get new or existing singleton instance of the Indexer
     *
     * @return Indexer
     */
    public static function getInstance()
    {
        if (is_null(static::$instance)) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Dispatch Indexing request for the page, called by TaskRunner::runIndexer()
     *
     * Pages that no longer exist are removed from the index, all other pages
     * are (re)indexed.
     *
     * @param string    $page   name of the page to index
     * @param bool      $verbose    print status messages
     * @param bool      $force  force reindexing even when the index is up to date
     * @return bool  If the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Satoshi Sahara <sahara.satoshi@gmail.com>
     */
    public function dispatch($page, $verbose = false, $force = false)
    {
        // check if page was deleted but is still in the index
        if (!page_exists($page)) {
            return $this->deletePage($page, $verbose, $force);
        }

        // update search index
        return $this->addPage($page, $verbose, $force);
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once per request and memoised. It is the plain
     * INDEXER_VERSION when no plugin contributes version info, otherwise a
     * string of the form "8+pluginA=1+pluginB=2" (plugins sorted by name).
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     *
     * @return int|string
     */
    public function getVersion()
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            // DokuWiki version is included for the convenience of plugins
            $data = array('dokuwiki' => $version);
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first
            ksort($data);
            foreach ($data as $plugin => $vers) {
                $version .= '+'.$plugin.'='.$vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Adds/updates the search index for the given page
     *
     * Locking is handled internally.
     *
     * The page is skipped (and true returned) when its ".indexed" tag file
     * carries the current indexer version and is newer than the page file,
     * unless $force is set.
     *
     * @triggers INDEXER_PAGE_ADD
     *
     * @param string    $page   name of the page to index
     * @param bool      $verbose    print status messages
     * @param bool      $force  force reindexing even when the index is up to date
     * @return bool  If the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Satoshi Sahara <sahara.satoshi@gmail.com>
     */
    public function addPage($page, $verbose = false, $force = false)
    {
        // check if indexing needed for the existing page (full text and/or metadata indexing)
        $idxtag = metaFN($page, '.indexed');
        if (!$force && file_exists($idxtag)) {
            // compare as strings: a loose comparison would treat numeric
            // strings such as "8" and "8.0" as the same version
            if (trim(io_readFile($idxtag)) === (string) $this->getVersion()) {
                $last = @filemtime($idxtag);
                if ($last > @filemtime(wikiFN($page))) {
                    if ($verbose) dbglog("Indexer: index for {$page} up to date");
                    return true;
                }
            }
        }

        // register the page to the page.idx
        $pid = $this->getPID($page);
        if ($pid === false) {
            if ($verbose) dbglog("Indexer: getting the PID failed for {$page}");
            // a warning, not E_USER_ERROR: the latter aborts the script by
            // default, so the documented "return false" was never reached
            trigger_error("Failed to get PID for {$page}", E_USER_WARNING);
            return false;
        }

        // prepare metadata indexing
        $metadata = array();
        $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

        $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
        $metadata['relation_references'] = is_array($references) ? array_keys($references) : array();

        $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
        $metadata['relation_media'] = is_array($media) ? array_keys($media) : array();

        // check if full text indexing allowed
        $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
        if ($indexenabled !== false) $indexenabled = true;
        $metadata['internal_index'] = $indexenabled;

        $body = '';
        $data = compact('page', 'body', 'metadata', 'pid');
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) $data['body'] = $data['body'].' '.rawWiki($page);
        $event->advise_after();
        unset($event);

        // take back only the values that were handed to the event, so extra
        // keys added by a handler cannot overwrite unrelated local variables
        $handedOver = array_flip(array('page', 'body', 'metadata', 'pid'));
        extract(array_intersect_key($data, $handedOver));
        $indexenabled = $metadata['internal_index'];
        unset($metadata['internal_index']);

        // Access to Metadata Index
        $MetadataIndex = MetadataIndex::getInstance();
        $result = $MetadataIndex->addMetaKeys($page, $metadata);
        if ($verbose) dbglog("Indexer: addMetaKeys({$page}) ".($result ? 'done' : 'failed'));
        if (!$result) {
            return false;
        }

        // Access to Fulltext Index
        $FulltextIndex = FulltextIndex::getInstance();
        if ($indexenabled) {
            $result = $FulltextIndex->addPagewords($page, $body);
            if ($verbose) dbglog("Indexer: addPageWords({$page}) ".($result ? 'done' : 'failed'));
            if (!$result) {
                return false;
            }
        } else {
            if ($verbose) dbglog("Indexer: full text indexing disabled for {$page}");
            // ensure the page content deleted from the Fulltext index
            $result = $FulltextIndex->deletePageWords($page);
            if ($verbose) dbglog("Indexer: deletePageWords({$page}) ".($result ? 'done' : 'failed'));
            if (!$result) {
                return false;
            }
        }

        // update index tag file
        io_saveFile($idxtag, $this->getVersion());
        if ($verbose) dbglog("Indexer: finished");

        return $result;
    }

    /**
     * Remove a page from the index
     *
     * Erases entries in all known indexes. Locking is handled internally.
     *
     * Without $force nothing is done (and true returned) when the page has no
     * ".indexed" tag file.
     *
     * @param string    $page   name of the page to remove from the index
     * @param bool      $verbose    print status messages
     * @param bool      $force  run the removal even when the .indexed tag file is missing
     * @return bool  If the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Satoshi Sahara <sahara.satoshi@gmail.com>
     */
    public function deletePage($page, $verbose = false, $force = false)
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            if ($verbose) dbglog("Indexer: {$page}.indexed file does not exist, ignoring");
            return true;
        }

        // remove obsoleted content from Fulltext index
        $FulltextIndex = FulltextIndex::getInstance();
        $result = $FulltextIndex->deletePageWords($page);
        if ($verbose) dbglog("Indexer: deletePageWords({$page}) ".($result ? 'done' : 'failed'));
        if (!$result) {
            return false;
        }

        // delete all keys of the page from metadata index
        $MetadataIndex = MetadataIndex::getInstance();
        $result = $MetadataIndex->deleteMetaKeys($page);
        if ($verbose) dbglog("Indexer: deleteMetaKeys({$page}) ".($result ? 'done' : 'failed'));
        if (!$result) {
            return false;
        }

        // mark the page as deleted in the page.idx
        $pid = $this->getPID($page);
        if ($pid !== false) {
            if (!$this->lock()) return false;
            $result = $this->saveIndexKey('page', '', $pid, self::INDEX_MARK_DELETED.$page);
            if ($verbose) dbglog("Indexer: update page.idx ".($result ? 'done' : 'failed'));
            $this->unlock();
        } else {
            if ($verbose) dbglog("Indexer: {$page} not found in the page.idx, ignoring");
            $result = true;
        }

        // Drop cached PIDs the same way renamePage() and clear() do. Indexing
        // the cache with $pid directly is unsafe here: $pid may be false,
        // which PHP coerces to array key 0 and would evict an unrelated entry.
        $this->resetPIDCache();
        @unlink($idxtag);
        return $result;
    }

    /**
     * Rename a page in the search index without changing the indexed content.
     * This function doesn't check if the old or new name exists in the filesystem.
     * It returns an error if the old page isn't in the page list of the indexer
     * and it deletes all previously indexed content of the new page.
     *
     * Renaming a page to its own name is a no-op; otherwise the "delete the
     * new page first" step would erase the very content to be kept.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     * @return bool           If the page was successfully renamed
     */
    public function renamePage($oldpage, $newpage)
    {
        $index = $this->getIndex('page', '');
        // check if oldpage found in page.idx
        $oldPid = array_search($oldpage, $index, true);
        if ($oldPid === false) return false;

        // nothing to do, and deleting $newpage below would wipe $oldpage
        if ($oldpage === $newpage) return true;

        // check if newpage found in page.idx
        $newPid = array_search($newpage, $index, true);
        if ($newPid !== false) {
            $result = $this->deletePage($newpage);
            if (!$result) return false;
            // Note: $index is no longer valid after deletePage()!
            unset($index);
        }

        // update page.idx
        if (!$this->lock()) return false;
        $result = $this->saveIndexKey('page', '', $oldPid, $newpage);
        $this->unlock();

        // reset the pid cache
        $this->resetPIDCache();

        return $result;
    }

    /**
     * Clear the Page Index
     *
     * Clears the metadata index and the fulltext index (both without taking
     * their own lock), removes page.idx and resets the PID cache.
     *
     * @param bool   $requireLock  acquire the index lock before and release it after clearing
     * @return bool  If the index has been cleared successfully
     */
    public function clear($requireLock = true)
    {
        global $conf;

        if ($requireLock && !$this->lock()) return false;

        // clear Metadata Index
        $MetadataIndex = MetadataIndex::getInstance();
        $MetadataIndex->clear(false);

        // clear Fulltext Index
        $FulltextIndex = FulltextIndex::getInstance();
        $FulltextIndex->clear(false);

        @unlink($conf['indexdir'].'/page.idx');

        // clear the pid cache
        $this->resetPIDCache();

        if ($requireLock) $this->unlock();
        return true;
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * @param int       $min    bottom frequency threshold (values below 1 are treated as 1)
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param int       $minlen minimum length of words to count
     * @param string    $key    metadata key to list. Uses the fulltext index if not given
     * @return array            list of words as the keys and frequency as values,
     *                          most frequent first
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($min = 1, $max = 0, $minlen = 3, $key = null)
    {
        if ($min < 1) $min = 1;
        if ($max < $min) $max = 0;

        $result = array();

        if ($key === 'title') {
            // titles are stored one per page, so count identical lines
            $index = $this->getIndex('title', '');
            $index = array_count_values($index);
            foreach ($index as $val => $cnt) {
                if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen) {
                    $result[$val] = $cnt;
                }
            }
        } elseif (!is_null($key)) {
            $metaname = $this->cleanName($key);
            $index = $this->getIndex($metaname.'_i', '');
            $val_idx = array();
            foreach ($index as $wid => $line) {
                $freq = $this->countTuples($line);
                if ($freq >= $min && (!$max || $freq <= $max)) {
                    $val_idx[$wid] = $freq;
                }
            }
            if (!empty($val_idx)) {
                $words = $this->getIndex($metaname.'_w', '');
                foreach ($val_idx as $wid => $freq) {
                    // skip rows missing from the value list instead of
                    // raising a notice and recording an empty key
                    if (!isset($words[$wid])) continue;
                    if (strlen($words[$wid]) >= $minlen) {
                        $result[$words[$wid]] = $freq;
                    }
                }
            }
        } else {
            $FulltextIndex = FulltextIndex::getInstance();
            $lengths = $FulltextIndex->listIndexLengths();
            foreach ($lengths as $length) {
                if ($length < $minlen) continue;
                $index = $this->getIndex('i', $length);
                $words = null;
                foreach ($index as $wid => $line) {
                    $freq = $this->countTuples($line);
                    if ($freq >= $min && (!$max || $freq <= $max)) {
                        // load the word list lazily, only when a row qualifies
                        if ($words === null) {
                            $words = $this->getIndex('w', $length);
                        }
                        // skip rows missing from the word list
                        if (!isset($words[$wid])) continue;
                        $result[$words[$wid]] = $freq;
                    }
                }
            }
        }

        arsort($result);
        return $result;
    }
}