<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\PageFulltextCollection;
use dokuwiki\Search\Collection\PageMetaCollection;
use dokuwiki\Search\Collection\PageTitleCollection;
use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;

// Version tag used to force rebuild on upgrade
const INDEXER_VERSION = 8;

/**
 * Class DokuWiki Indexer
 *
 * Manages the page search index by delegating to Collection classes.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been set.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger !== null) {
            ($this->logger)($message);
        }
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once and then cached for the rest of the request.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string the core version, followed by "+plugin=version" for each plugin entry
     */
    public function getVersion()
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first
            ksort($data); // stable order, independent of the order the plugins registered in
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty entries of the page index are skipped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new Index\MemoryIndex('page');
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * Indexing is needed when it is forced, when the page has no .indexed tag,
     * when the tag was written by a different indexer version, or when the tag
     * is not newer than the page file.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) return true;

        // compare as strings: a loose comparison would treat tags like "8.0" or "08" as version 8
        if (trim(io_readFile($idxtag)) !== (string)$this->getVersion()) return true;

        $last = @filemtime($idxtag);
        return $last <= @filemtime(wikiFN($page));
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally.
     *
     * @triggers INDEXER_PAGE_ADD
     * Handlers receive the page, body, metadata and pid and may modify them. The raw
     * wiki text is appended to the body unless the default action is prevented.
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): void
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for {$page} up to date");
            return;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // index title
        $this->writeLocked(new PageTitleCollection($pageIndex), $data['page'], [$data['metadata']['title']]);
        unset($data['metadata']['title']);

        // index fulltext
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
            // clear any previously stored fulltext data
            $words = [];
        }
        $this->writeLocked(new PageFulltextCollection($pageIndex), $data['page'], $words);
        unset($data['metadata']['internal_index']);

        // index metadata keys (title and internal_index were removed above)
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                // a scalar becomes a single value, null and the empty string mean "no values"
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $data['page'], $values);
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), (string)$this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): void
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: {$page}.indexed file does not exist, ignoring");
            return;
        }

        $pageIndex = new FileIndex('page', '', true);

        // writing an empty value list clears whatever was stored for the page
        $this->writeLocked(new PageTitleCollection($pageIndex), $page, []);
        $this->writeLocked(new PageFulltextCollection($pageIndex), $page, []);

        foreach ($this->getMetadataRegistryKeys() as $key) {
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $page, []);
        }

        $this->log("Indexer: deleted {$page} from index");
        @unlink($idxtag);
    }

    /**
     * Rename a page in the search index
     *
     * The page must already have been moved on disk before calling this.
     * Clears the old page's data and re-indexes under the new name.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): void
    {
        $this->deletePage($oldpage, true);
        $this->addPage($newpage, true);
    }

    /**
     * Clear all page indexes
     *
     * Deletes the index files below $conf['indexdir'] while holding the 'page' lock.
     * The lock is released again even if something in between throws.
     */
    public function clear(): void
    {
        global $conf;
        $dir = $conf['indexdir'];

        Lock::acquire('page');
        try {
            // clear metadata indexes (the registry is read before metadata.idx is removed below)
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                foreach (['_w.idx', '_i.idx', '_p.idx'] as $suffix) {
                    @unlink($dir . '/' . $clean . $suffix);
                }
            }

            // clear fulltext indexes
            foreach (['/i*.idx', '/w*.idx'] as $pattern) {
                foreach (glob($dir . $pattern) ?: [] as $file) {
                    @unlink($file);
                }
            }

            @unlink($dir . '/pageword.idx');
            @unlink($dir . '/lengths.idx');

            // clear title and page indexes
            @unlink($dir . '/title.idx');
            @unlink($dir . '/page.idx');
            @unlink($dir . '/metadata.idx');
        } finally {
            Lock::release('page');
        }
    }

    /**
     * Replace an entity's data in a collection while holding the collection's lock
     *
     * On success this performs the call chain lock()->addEntity()->unlock(). When
     * addEntity() throws, the lock is released before the exception is passed on,
     * so a failed write does not leave the collection locked.
     *
     * @param object $collection collection offering lock(), addEntity() and unlock()
     * @param string $entity the entity (page) whose data is replaced
     * @param array $values the new values, an empty array clears the entity's data
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function writeLocked(object $collection, string $entity, array $values): void
    {
        // a failing lock() leaves nothing to release, so it stays outside the try
        $locked = $collection->lock();
        try {
            $written = $locked->addEntity($entity, $values);
        } catch (\Throwable $e) {
            $locked->unlock();
            throw $e;
        }
        $written->unlock();
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * The registry is the file metadata.idx in $conf['indexdir'], one key per line.
     *
     * @return string[] list of metadata key names, empty when the registry is missing or unreadable
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) return [];
        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * Keys that are not registered yet are appended; the file is only rewritten
     * when at least one key was added.
     *
     * @param string[] $keys metadata key names to ensure are registered
     */
    protected function updateMetadataRegistry(array $keys): void
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        $existing = $this->getMetadataRegistryKeys();

        $added = false;
        foreach ($keys as $key) {
            // array keys may arrive as integers; the registry stores strings. Comparing
            // strictly avoids numeric look-alikes ("1e3" vs "1000") being treated as equal.
            $key = (string)$key;
            if (!in_array($key, $existing, true)) {
                $existing[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($fn, implode("\n", $existing) . "\n");
        }
    }
}