<?php

namespace dokuwiki\Search;

use dokuwiki\Debug\DebugHelper;
use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\PageFulltextCollection;
use dokuwiki\Search\Collection\PageMetaCollection;
use dokuwiki\Search\Collection\PageTitleCollection;
use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexIntegrityException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;
use dokuwiki\Search\Index\MemoryIndex;

// Version tag used to force rebuild on upgrade
const INDEXER_VERSION = 9;

/**
 * Class DokuWiki Indexer
 *
 * Manages the page search index by delegating to Collection classes.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been registered.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger) {
            ($this->logger)($message);
        }
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once per request and memoised in a static variable,
     * so the event below is triggered only on the first call.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string the bare INDEXER_VERSION, or a string with "+plugin=version" suffixes
     */
    public function getVersion(): int|string
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first
            ksort($data); // stable order, independent of plugin load order
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty rows of the page index are skipped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new MemoryIndex('page');
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * Indexing is needed when it is forced, when the page has no index tag yet,
     * when the tag was written by a different indexer version, or when the page
     * file is not older than the tag.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) return true;

        // compare as strings: a loose comparison would accept e.g. "09" or "9.0" for version 9
        if (trim(io_readFile($idxtag)) !== (string) $this->getVersion()) return true;

        // File modification times only have a resolution of one second. When the tag and
        // the page carry the same timestamp we can not tell whether the page was saved
        // again after it had been indexed, so a tie has to be treated as "needs indexing".
        // Only a tag that is strictly newer than the page proves the index is up to date.
        $last = @filemtime($idxtag);
        return $last <= @filemtime(wikiFN($page));
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally.
     *
     * @triggers INDEXER_PAGE_ADD
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @return bool true if the page was indexed, false if there was nothing to do
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): bool
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for $page up to date");
            return false;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data; the raw wiki text is only appended to the
        // body when no plugin prevented the default action
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // index title
        (new PageTitleCollection($pageIndex))->lock()
            ->addEntity($data['page'], [$data['metadata']['title']])->unlock();
        unset($data['metadata']['title']);

        // index fulltext
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
            (new PageFulltextCollection($pageIndex))->lock()->addEntity($data['page'], $words)->unlock();
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
            // clear any previously stored fulltext data
            (new PageFulltextCollection($pageIndex))->lock()->addEntity($data['page'], [])->unlock();
        }
        unset($data['metadata']['internal_index']);

        // index metadata keys (title and internal_index were removed above, so only
        // the remaining keys - including any added by plugins - are stored here)
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                // wrap scalars; null and the empty string mean "no value"
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($data['page'], $values)->unlock();
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
        return true;
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @return bool true if the page was removed, false if there was nothing to do
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: $page.indexed file does not exist, ignoring");
            return false;
        }

        $pageIndex = new FileIndex('page', '', true);

        // storing an empty value list clears whatever was indexed for the page
        (new PageTitleCollection($pageIndex))->lock()->addEntity($page, [])->unlock();
        (new PageFulltextCollection($pageIndex))->lock()->addEntity($page, [])->unlock();

        foreach ($this->getMetadataRegistryKeys() as $key) {
            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($page, [])->unlock();
        }

        $this->log("Indexer: deleted $page from index");
        @unlink($idxtag);
        return true;
    }

    /**
     * Rename a page in the search index
     *
     * This renames the page's entity entry in place: its entity ID (the row in the
     * page index) is kept and only its name is changed. Because every collection
     * (title, fulltext and all metadata keys such as relation_references) is keyed by
     * that entity ID, all token, frequency and reverse associations are preserved and
     * transparently belong to the new name afterwards.
     *
     * In particular this keeps the renamed page's *outgoing* references intact. That is
     * essential during multi-step operations such as namespace moves: a page renamed
     * early on must still be discoverable as a backlink source for pages that are moved
     * later. Re-indexing from disk instead would lose this, because the destination page
     * has usually not been written to disk yet when this method is called.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @return bool true if the page was renamed, false if there was nothing to do
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): bool
    {
        if ($oldpage === $newpage) return false;

        $pageIndex = new FileIndex('page', '', true);

        $oldId = null;
        $newId = null;
        try {
            // locate the existing entity rows; stop as soon as both are known
            foreach ($pageIndex as $rid => $value) {
                if ($value === $oldpage) $oldId = $rid;
                if ($value === $newpage) $newId = $rid;
                if ($oldId !== null && $newId !== null) break;
            }

            if ($oldId !== null) {
                // If the new name already has its own entity, drop its indexed data first.
                // deletePage() intentionally keeps the entity row in page.idx, so we additionally
                // blank that row - an empty entry is the index's "removed" marker (see getAllPages()).
                // Otherwise two rows would carry the new name and a lookup could resolve to the
                // now-empty one instead of the renamed entity that holds the data.
                if ($newId !== null) {
                    $this->deletePage($newpage, true);
                    $pageIndex->changeRow($newId, '');
                }

                // rename in place — keeps the entity ID and thus all index associations
                $pageIndex->changeRow($oldId, $newpage);
            }
        } finally {
            // always unlock the page index again, also when one of the writes above
            // threw - otherwise a failed rename would leave the index locked
            $pageIndex->unlock();
        }

        // nothing to rename if the old page was never indexed
        if ($oldId === null) {
            $this->log("Indexer: $oldpage is not in the index, nothing to rename");
            return false;
        }

        $this->log("Indexer: renamed $oldpage to $newpage in index");
        return true;
    }

    /**
     * Clear all page indexes
     *
     * Removes the metadata, fulltext, title and page index files as well as the
     * metadata registry from the index directory. Files that can not be removed
     * are silently skipped.
     */
    public function clear(): void
    {
        global $conf;

        Lock::acquire('page');
        try {
            // clear metadata indexes (must happen before metadata.idx itself is removed)
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                @unlink($conf['indexdir'] . '/' . $clean . '_w.idx');
                @unlink($conf['indexdir'] . '/' . $clean . '_i.idx');
                @unlink($conf['indexdir'] . '/' . $clean . '_p.idx');
            }

            // clear fulltext indexes
            $files = glob($conf['indexdir'] . '/i*.idx');
            if ($files) foreach ($files as $f) @unlink($f);
            $files = glob($conf['indexdir'] . '/w*.idx');
            if ($files) foreach ($files as $f) @unlink($f);

            @unlink($conf['indexdir'] . '/pageword.idx');
            @unlink($conf['indexdir'] . '/lengths.idx');

            // clear title and page indexes
            @unlink($conf['indexdir'] . '/title.idx');
            @unlink($conf['indexdir'] . '/page.idx');
            @unlink($conf['indexdir'] . '/metadata.idx');
        } finally {
            // never leave the lock behind, even if something above failed unexpectedly
            Lock::release('page');
        }
    }

    /**
     * Check the structural integrity of all search indexes
     *
     * Checks the fulltext and title collections and one metadata collection per
     * key listed in the metadata registry.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     */
    public function checkIntegrity(): void
    {
        (new PageFulltextCollection())->checkIntegrity();
        (new PageTitleCollection())->checkIntegrity();

        foreach ($this->getMetadataRegistryKeys() as $key) {
            (new PageMetaCollection($key))->checkIntegrity();
        }
    }

    /**
     * Whether the search index is empty (no fulltext data indexed yet)
     *
     * @return bool
     */
    public function isIndexEmpty(): bool
    {
        return (new PageFulltextCollection())->getTokenIndexMaximum() === 0;
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * The registry is the file metadata.idx in the index directory, holding one
     * key per line. A missing, unreadable or empty registry yields an empty list.
     *
     * @return string[] list of metadata key names
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) return [];
        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * Keys that are already registered are left alone; the registry file is only
     * rewritten when at least one new key was added.
     *
     * @param string[] $keys metadata key names to ensure are registered
     *
     * @internal Only marked public for access via LegacyIndexer
     */
    public function updateMetadataRegistry(array $keys): void
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        $existing = file_exists($fn) ? file($fn, FILE_IGNORE_NEW_LINES) : [];
        if (!$existing) $existing = [];

        $added = false;
        foreach ($keys as $key) {
            // Keys taken from array_keys() may be integers while the registry holds
            // strings, so normalise to string and compare strictly. A loose comparison
            // would treat numeric-looking keys such as "1e3" and "1000" as the same key.
            $key = (string) $key;
            if (!in_array($key, $existing, true)) {
                $existing[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($fn, implode("\n", $existing) . "\n");
        }
    }

    /**
     * Return a list of all indexed pages, optionally filtered by metadata key
     *
     * Kept on Indexer (not just LegacyIndexer) because several plugins call it
     * directly on `new Indexer()` instances rather than going through
     * idx_get_indexer().
     *
     * @param string|null $key metadata key name
     * @return string[]
     *
     * @deprecated 2026-04-07 use MetadataSearch::getPages() or Indexer::getAllPages() instead
     */
    public function getPages($key = null)
    {
        DebugHelper::dbgDeprecatedFunction(MetadataSearch::class . '::getPages()');
        return (new MetadataSearch())->getPages($key);
    }
}