<?php

namespace dokuwiki\Search;

use dokuwiki\Debug\DebugHelper;
use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\CollectionSearch;
use dokuwiki\Search\Collection\PageFulltextCollection;
use dokuwiki\Search\Collection\PageMetaCollection;
use dokuwiki\Search\Collection\PageTitleCollection;
use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexIntegrityException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Exception\SearchException;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Index\TupleOps;

// Version tag used to force rebuild on upgrade
const INDEXER_VERSION = 8;

/**
 * Class DokuWiki Indexer
 *
 * Manages the page search index by delegating to Collection classes.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been registered.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger) {
            ($this->logger)($message);
        }
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once per request and cached in a static variable.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string the bare INDEXER_VERSION, or a string with "+plugin=version" suffixes
     */
    public function getVersion(): int|string
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first
            ksort($data);
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty entries of the page index are always skipped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new MemoryIndex('page');
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * A page needs indexing when forced, when it has no .indexed tag file, when the
     * tag was written by a different indexer version, or when the tag is not newer
     * than the page file.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) return true;

        // compare as strings: a loose comparison would treat tags like "8.0" or "08"
        // as equal to the integer version 8
        if (trim(io_readFile($idxtag)) !== (string)$this->getVersion()) return true;

        $last = @filemtime($idxtag);
        return $last <= @filemtime(wikiFN($page));
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally.
     *
     * @triggers INDEXER_PAGE_ADD
     * Plugins may modify the page name, body and metadata to be indexed. When the
     * default action is not prevented, the raw wiki text is appended to the body.
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): void
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for $page up to date");
            return;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // index title
        $this->storeEntity(new PageTitleCollection($pageIndex), $data['page'], [$data['metadata']['title']]);
        unset($data['metadata']['title']);

        // index fulltext
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
            $this->storeEntity(new PageFulltextCollection($pageIndex), $data['page'], $words);
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
            // clear any previously stored fulltext data
            $this->storeEntity(new PageFulltextCollection($pageIndex), $data['page'], []);
        }
        unset($data['metadata']['internal_index']);

        // index metadata keys; scalar values are wrapped, null and '' mean "no values"
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            $this->storeEntity(new PageMetaCollection($key, $pageIndex), $data['page'], $values);
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): void
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: $page.indexed file does not exist, ignoring");
            return;
        }

        $pageIndex = new FileIndex('page', '', true);

        $this->storeEntity(new PageTitleCollection($pageIndex), $page, []);
        $this->storeEntity(new PageFulltextCollection($pageIndex), $page, []);

        foreach ($this->getMetadataRegistryKeys() as $key) {
            $this->storeEntity(new PageMetaCollection($key, $pageIndex), $page, []);
        }

        $this->log("Indexer: deleted $page from index");
        @unlink($idxtag);
    }

    /**
     * Rename a page in the search index
     *
     * The page must already have been moved on disk before calling this.
     * Clears the old page's data and re-indexes under the new name.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): void
    {
        $this->deletePage($oldpage, true);
        $this->addPage($newpage, true);
    }

    /**
     * Clear all page indexes
     *
     * Deletes the index files below $conf['indexdir'] while holding the 'page' lock.
     */
    public function clear(): void
    {
        global $conf;

        Lock::acquire('page');
        try {
            // clear metadata indexes
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                @unlink($conf['indexdir'] . '/' . $clean . '_w.idx');
                @unlink($conf['indexdir'] . '/' . $clean . '_i.idx');
                @unlink($conf['indexdir'] . '/' . $clean . '_p.idx');
            }

            // clear fulltext indexes
            $files = glob($conf['indexdir'] . '/i*.idx');
            if ($files) foreach ($files as $f) @unlink($f);
            $files = glob($conf['indexdir'] . '/w*.idx');
            if ($files) foreach ($files as $f) @unlink($f);

            @unlink($conf['indexdir'] . '/pageword.idx');
            @unlink($conf['indexdir'] . '/lengths.idx');

            // clear title and page indexes
            @unlink($conf['indexdir'] . '/title.idx');
            @unlink($conf['indexdir'] . '/page.idx');
            @unlink($conf['indexdir'] . '/metadata.idx');
        } finally {
            // never keep the lock when anything above fails unexpectedly
            Lock::release('page');
        }
    }

    /**
     * Check the structural integrity of all search indexes
     *
     * Checks the fulltext and title collections and one metadata collection per
     * key found in the metadata registry.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     */
    public function checkIntegrity(): void
    {
        (new PageFulltextCollection())->checkIntegrity();
        (new PageTitleCollection())->checkIntegrity();

        foreach ($this->getMetadataRegistryKeys() as $key) {
            (new PageMetaCollection($key))->checkIntegrity();
        }
    }

    /**
     * Whether the search index is empty (no fulltext data indexed yet)
     *
     * @return bool
     */
    public function isIndexEmpty(): bool
    {
        return (new PageFulltextCollection())->getTokenIndexMaximum() === 0;
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * The registry is the file metadata.idx in $conf['indexdir'], one key per line.
     *
     * @return string[] list of metadata key names, empty when the registry is missing or unreadable
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) return [];
        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * The registry file is only rewritten when at least one key was not yet registered.
     *
     * @param string[] $keys metadata key names to ensure are registered
     */
    protected function updateMetadataRegistry(array $keys): void
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        $existing = $this->getMetadataRegistryKeys();

        $added = false;
        foreach ($keys as $key) {
            // array keys that look like integers arrive as int; the registry holds strings.
            // Compare strictly so numeric-looking names such as "1e1" and "10" stay distinct.
            $key = (string)$key;
            if (!in_array($key, $existing, true)) {
                $existing[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($fn, implode("\n", $existing) . "\n");
        }
    }

    /**
     * Write the values of one entity to a collection while holding the collection's lock
     *
     * The lock is released even when writing fails, so a failed write does not
     * leave the collection locked. The exception itself is passed on to the caller.
     *
     * @param PageTitleCollection|PageFulltextCollection|PageMetaCollection $collection
     * @param string $entity entity (page) name
     * @param array $values values to store; callers pass an empty list to clear stored data
     */
    private function storeEntity(
        PageTitleCollection|PageFulltextCollection|PageMetaCollection $collection,
        string $entity,
        array $values
    ): void {
        $collection->lock();
        try {
            $collection->addEntity($entity, $values);
        } finally {
            $collection->unlock();
        }
    }

    // region Deprecated methods

    /**
     * Find pages containing a metadata value
     *
     * @param string $key metadata key name
     * @param string|string[] $value search term(s)
     * @param callable|null $func ignored, kept for backward compatibility
     * @return array
     *
     * @deprecated 2026-04-07 use MetadataSearch::lookupKey() instead
     */
    public function lookupKey($key, &$value, $func = null)
    {
        DebugHelper::dbgDeprecatedFunction(MetadataSearch::class . '::lookupKey()');
        return (new MetadataSearch())->lookupKey($key, $value);
    }

    /**
     * Return a list of all indexed pages, optionally filtered by metadata key
     *
     * @param string|null $key metadata key name
     * @return string[]
     *
     * @deprecated 2026-04-07 use MetadataSearch::getPages() or Indexer::getAllPages() instead
     */
    public function getPages($key = null)
    {
        DebugHelper::dbgDeprecatedFunction(MetadataSearch::class . '::getPages()');
        return (new MetadataSearch())->getPages($key);
    }

    /**
     * Add metadata values for a page
     *
     * The key 'title' is written to the title collection, every other key to its
     * own metadata collection.
     *
     * @param string $page page name
     * @param string $key metadata key name
     * @param string|string[]|null $value value(s) to add
     * @return bool false when a SearchException occurred
     *
     * @deprecated 2026-04-07 use Collection classes directly instead
     */
    public function addMetaKeys($page, $key, $value = null)
    {
        DebugHelper::dbgDeprecatedFunction('Collection classes');
        try {
            if ($key === 'title') {
                $collection = new PageTitleCollection();
            } else {
                $collection = new PageMetaCollection($key);
            }
            $values = is_array($value) ? $value : ($value !== null && $value !== '' ? [$value] : []);
            $this->storeEntity($collection, $page, $values);

            // The title lives in its own collection. Registered keys are treated as
            // PageMetaCollection names by deletePage(), clear() and checkIntegrity(),
            // and addPage() never registers 'title' either.
            if ($key !== 'title') {
                $this->updateMetadataRegistry([$key]);
            }
            return true;
        } catch (SearchException) {
            return false;
        }
    }

    /**
     * Rename a metadata value in the index
     *
     * When the new value is not yet known, the token is renamed in place. When both
     * values exist, the frequency data of the old value is merged into the new one
     * and the reverse index rows of the affected entities are updated.
     *
     * @param string $key metadata key name
     * @param string $oldvalue old value
     * @param string $newvalue new value
     * @return bool false when a SearchException occurred; true otherwise, including
     *              when the old value is not in the index
     *
     * @deprecated 2026-04-07 use Collection classes directly instead
     */
    public function renameMetaValue($key, $oldvalue, $newvalue)
    {
        DebugHelper::dbgDeprecatedFunction('Collection classes');
        try {
            $collection = new PageMetaCollection($key);
            $collection->lock();

            try {
                $tokenIndex = $collection->getTokenIndex();

                // find old value — search() is read-only, won't create entries
                $matches = $tokenIndex->search('/^' . preg_quote($oldvalue, '/') . '$/');
                if ($matches === []) {
                    return true;
                }
                $oldid = array_key_first($matches);

                // check if new value already exists (read-only lookup)
                $newMatches = $tokenIndex->search('/^' . preg_quote($newvalue, '/') . '$/');

                if ($newMatches !== []) {
                    // both values exist — merge frequency data from old to new
                    $newid = array_key_first($newMatches);
                    $freqIndex = $collection->getFrequencyIndex();
                    $reverseIndex = $collection->getReverseIndex();
                    $oldFreqLine = $freqIndex->retrieveRow($oldid);

                    if ($oldFreqLine !== '') {
                        $newFreqLine = $freqIndex->retrieveRow($newid);
                        foreach (TupleOps::parseTuples($oldFreqLine) as $entityId => $count) {
                            $newFreqLine = TupleOps::updateTuple($newFreqLine, $entityId, $count);

                            // update reverse index: remove old token, add new
                            $reverseRow = $reverseIndex->retrieveRow((int)$entityId);
                            $keyline = explode(':', $reverseRow);
                            $keyline = array_diff($keyline, [(string)$oldid]);
                            if (!in_array((string)$newid, $keyline, true)) {
                                $keyline[] = $newid;
                            }
                            $reverseIndex->changeRow(
                                (int)$entityId,
                                implode(':', array_filter($keyline, static fn($v) => $v !== ''))
                            );
                        }
                        $freqIndex->changeRow($oldid, '');
                        $freqIndex->changeRow($newid, $newFreqLine);
                    }
                } else {
                    // new value doesn't exist — simple rename
                    $tokenIndex->changeRow($oldid, $newvalue);
                }

                return true;
            } finally {
                // release the lock on every path, including failures that are
                // turned into a false return value below
                $collection->unlock();
            }
        } catch (SearchException) {
            return false;
        }
    }

    /**
     * Get the page ID for a page name
     *
     * @param string $page page name
     * @return int|false false when a SearchException occurred
     *
     * @deprecated 2026-04-07 use FileIndex directly instead
     */
    public function getPID($page)
    {
        DebugHelper::dbgDeprecatedFunction(FileIndex::class);
        try {
            return (new FileIndex('page', '', true))->accessCachedValue($page);
        } catch (SearchException) {
            return false;
        }
    }

    /**
     * Find tokens in the fulltext index
     *
     * Tokens that are not valid search terms are skipped. Only pages that exist
     * are reported.
     *
     * @param array $tokens list of words to search for
     * @return array list of pages found [word => [page => count, ...]]
     *
     * @deprecated 2026-04-07 use CollectionSearch on PageFulltextCollection instead
     */
    public function lookup($tokens)
    {
        DebugHelper::dbgDeprecatedFunction(CollectionSearch::class);
        $collection = new PageFulltextCollection();
        $search = new CollectionSearch($collection);
        $termMap = [];
        foreach ($tokens as $token) {
            if (!Tokenizer::isValidSearchTerm($token)) continue;
            $term = $search->addTerm($token);
            $termMap[$token] = $term;
        }

        if ($termMap === []) return [];
        $search->execute();

        $result = [];
        foreach ($termMap as $word => $term) {
            $freqs = $term->getEntityFrequencies();
            // filter to only existing pages
            $filtered = array_filter(
                $freqs,
                static fn($page) => page_exists($page, '', false),
                ARRAY_FILTER_USE_KEY
            );
            $result[$word] = $filtered;
        }
        return $result;
    }

    // endregion
}