<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\PageFulltextCollection;
use dokuwiki\Search\Collection\PageMetaCollection;
use dokuwiki\Search\Collection\PageTitleCollection;
use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexIntegrityException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;

// Version tag used to force rebuild on upgrade
const INDEXER_VERSION = 8;

/**
 * Class DokuWiki Indexer
 *
 * Manages the page search index by delegating to Collection classes.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been set.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger) {
            ($this->logger)($message);
        }
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once and cached in a static variable, so the
     * event is triggered at most once per request.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string the bare INDEXER_VERSION, or that version followed by
     *                    "+plugin=version" for each plugin entry (sorted by plugin name)
     */
    public function getVersion()
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first
            ksort($data);
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty entries of the page index are always skipped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new Index\MemoryIndex('page');
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * Indexing is needed when it is forced, when the page has no '.indexed'
     * tag file, when the tag was written by a different indexer version, or
     * when the tag is not newer than the page file.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) {
            return true;
        }

        // Compare as strings: a loose comparison would treat numeric look-alikes
        // such as "8.0" or "08" as equal to version 8.
        if (trim(io_readFile($idxtag)) !== (string) $this->getVersion()) {
            return true;
        }

        $last = @filemtime($idxtag);
        return $last <= @filemtime(wikiFN($page));
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally.
     *
     * @triggers INDEXER_PAGE_ADD
     * Plugins may modify the page, body and metadata that get indexed. When
     * the default action is not prevented, the raw wiki text of the page is
     * appended to the body.
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): void
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for {$page} up to date");
            return;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // index title
        $this->storeEntity(new PageTitleCollection($pageIndex), $data['page'], [$data['metadata']['title']]);
        unset($data['metadata']['title']);

        // index fulltext
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
            // an empty word list clears any previously stored fulltext data
            $words = [];
        }
        $this->storeEntity(new PageFulltextCollection($pageIndex), $data['page'], $words);
        unset($data['metadata']['internal_index']);

        // index metadata keys (title and internal_index were removed above)
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                // wrap scalars; null and empty string mean "no value"
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            $this->storeEntity(new PageMetaCollection($key, $pageIndex), $data['page'], $values);
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): void
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: {$page}.indexed file does not exist, ignoring");
            return;
        }

        $pageIndex = new FileIndex('page', '', true);

        // storing an empty value list clears the page's data in each collection
        $this->storeEntity(new PageTitleCollection($pageIndex), $page, []);
        $this->storeEntity(new PageFulltextCollection($pageIndex), $page, []);

        foreach ($this->getMetadataRegistryKeys() as $key) {
            $this->storeEntity(new PageMetaCollection($key, $pageIndex), $page, []);
        }

        $this->log("Indexer: deleted {$page} from index");
        // the tag may be missing when deletion was forced, hence the suppression
        @unlink($idxtag);
    }

    /**
     * Rename a page in the search index
     *
     * The page must already have been moved on disk before calling this.
     * Clears the old page's data and re-indexes under the new name.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): void
    {
        $this->deletePage($oldpage, true);
        $this->addPage($newpage, true);
    }

    /**
     * Clear all page indexes
     *
     * Removes the metadata, fulltext, title and page index files as well as
     * the metadata registry from the index directory while holding the
     * 'page' lock. The lock is released even if clearing fails midway.
     */
    public function clear(): void
    {
        global $conf;

        Lock::acquire('page');
        try {
            $dir = $conf['indexdir'];

            // clear metadata indexes
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                @unlink($dir . '/' . $clean . '_w.idx');
                @unlink($dir . '/' . $clean . '_i.idx');
                @unlink($dir . '/' . $clean . '_p.idx');
            }

            // clear fulltext indexes
            foreach (['/i*.idx', '/w*.idx'] as $pattern) {
                $files = glob($dir . $pattern);
                if ($files) {
                    foreach ($files as $f) {
                        @unlink($f);
                    }
                }
            }

            @unlink($dir . '/pageword.idx');
            @unlink($dir . '/lengths.idx');

            // clear title and page indexes
            @unlink($dir . '/title.idx');
            @unlink($dir . '/page.idx');
            // the registry goes last: it was needed above to find the metadata indexes
            @unlink($dir . '/metadata.idx');
        } finally {
            Lock::release('page');
        }
    }

    /**
     * Check the structural integrity of all search indexes
     *
     * Covers the fulltext and title collections and one metadata collection
     * per key listed in the metadata registry.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     */
    public function checkIntegrity(): void
    {
        (new PageFulltextCollection())->checkIntegrity();
        (new PageTitleCollection())->checkIntegrity();

        foreach ($this->getMetadataRegistryKeys() as $key) {
            (new PageMetaCollection($key))->checkIntegrity();
        }
    }

    /**
     * Whether the search index is empty (no fulltext data indexed yet)
     *
     * @return bool
     */
    public function isIndexEmpty(): bool
    {
        return (new PageFulltextCollection())->getTokenIndexMaximum() === 0;
    }

    /**
     * Store values for a page in a collection while holding the collection's lock
     *
     * The lock is released even when addEntity() throws, so a failed write
     * does not leave the index locked.
     *
     * NOTE(review): assumes lock() returns the collection itself (fluent API),
     * as implied by the lock()->addEntity()->unlock() call chains this helper
     * replaces - confirm against the Collection classes.
     *
     * @param object $collection collection providing lock(), addEntity() and unlock()
     * @param string $page the page the values belong to
     * @param array $values the values to store; an empty list clears the page's data
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function storeEntity(object $collection, string $page, array $values): void
    {
        // acquire outside the try block: a failed lock() must not trigger unlock()
        $locked = $collection->lock();
        try {
            $locked->addEntity($page, $values);
        } finally {
            $locked->unlock();
        }
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * The registry is the file metadata.idx in the index directory and holds
     * one key per line.
     *
     * @return string[] list of metadata key names, empty when the registry is missing or unreadable
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) {
            return [];
        }
        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * Keys that are already registered are left alone; the registry file is
     * only rewritten when at least one new key was added.
     *
     * @param string[] $keys metadata key names to ensure are registered
     */
    protected function updateMetadataRegistry(array $keys): void
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        $existing = file_exists($fn) ? file($fn, FILE_IGNORE_NEW_LINES) : [];
        if (!$existing) {
            $existing = [];
        }

        $added = false;
        foreach ($keys as $key) {
            // Registry lines are strings while array keys may arrive as ints, so
            // normalise first. The comparison must be strict: loosely, numeric
            // look-alikes such as '1e1' and '10' are equal and a new key would be lost.
            $key = (string) $key;
            if (!in_array($key, $existing, true)) {
                $existing[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($fn, implode("\n", $existing) . "\n");
        }
    }
}