<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\PageFulltextCollection;
use dokuwiki\Search\Collection\PageMetaCollection;
use dokuwiki\Search\Collection\PageTitleCollection;
use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;

// Version tag used to force rebuild on upgrade
const INDEXER_VERSION = 8;

/**
 * Class DokuWiki Indexer
 *
 * Manages the page search index by delegating to Collection classes.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been registered.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger) {
            ($this->logger)($message);
        }
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once and cached in a static variable.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string the bare INDEXER_VERSION when no plugin contributes,
     *                    otherwise a string with "+plugin=version" suffixes
     */
    public function getVersion()
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first
            ksort($data); // stable order, independent of plugin load order
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty entries of the page index are always dropped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new Index\MemoryIndex('page');
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * Indexing is needed when it is forced, when the page has no .indexed tag
     * file, when the tag was written by a different indexer version, or when
     * the tag is not newer than the page file.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) return true;

        // compare as strings: the version may be an int, the tag file content never is
        if (trim((string)io_readFile($idxtag)) !== (string)$this->getVersion()) return true;

        $last = @filemtime($idxtag);
        return $last <= @filemtime(wikiFN($page));
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally.
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @triggers INDEXER_PAGE_ADD
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): void
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for {$page} up to date");
            return;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data; the raw wiki text is only appended
        // when no plugin prevented the default action
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // index title
        $this->writeLocked(new PageTitleCollection($pageIndex), $data['page'], [$data['metadata']['title']]);
        unset($data['metadata']['title']);

        // index fulltext
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
            // clear any previously stored fulltext data
            $words = [];
        }
        $this->writeLocked(new PageFulltextCollection($pageIndex), $data['page'], $words);
        unset($data['metadata']['internal_index']);

        // index metadata keys; title and internal_index were removed above,
        // so only the remaining (possibly plugin supplied) keys are left
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $data['page'], $values);
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): void
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: {$page}.indexed file does not exist, ignoring");
            return;
        }

        $pageIndex = new FileIndex('page', '', true);

        // writing an empty value list clears the stored data of the page
        $this->writeLocked(new PageTitleCollection($pageIndex), $page, []);
        $this->writeLocked(new PageFulltextCollection($pageIndex), $page, []);

        foreach ($this->getMetadataRegistryKeys() as $key) {
            $this->writeLocked(new PageMetaCollection($key, $pageIndex), $page, []);
        }

        $this->log("Indexer: deleted {$page} from index");
        @unlink($idxtag);
    }

    /**
     * Rename a page in the search index
     *
     * The page must already have been moved on disk before calling this.
     * Clears the old page's data and re-indexes under the new name.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): void
    {
        $this->deletePage($oldpage, true);
        $this->addPage($newpage, true);
    }

    /**
     * Clear all page indexes
     *
     * Deletes the index files below $conf['indexdir'] while holding the 'page'
     * lock. The lock is released even when clearing is interrupted by an exception.
     */
    public function clear(): void
    {
        global $conf;

        Lock::acquire('page');
        try {
            // clear metadata indexes; must happen before metadata.idx itself is removed
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                @unlink($conf['indexdir'] . '/' . $clean . '_w.idx');
                @unlink($conf['indexdir'] . '/' . $clean . '_i.idx');
                @unlink($conf['indexdir'] . '/' . $clean . '_p.idx');
            }

            // clear fulltext indexes
            $files = glob($conf['indexdir'] . '/i*.idx');
            if ($files) foreach ($files as $f) @unlink($f);
            $files = glob($conf['indexdir'] . '/w*.idx');
            if ($files) foreach ($files as $f) @unlink($f);

            @unlink($conf['indexdir'] . '/pageword.idx');
            @unlink($conf['indexdir'] . '/lengths.idx');

            // clear title and page indexes
            @unlink($conf['indexdir'] . '/title.idx');
            @unlink($conf['indexdir'] . '/page.idx');
            @unlink($conf['indexdir'] . '/metadata.idx');
        } finally {
            Lock::release('page');
        }
    }

    /**
     * Replace the values stored for a page in a single collection
     *
     * Acquires the collection lock, writes the values and releases the lock
     * again, also when writing fails, so a failed write does not leave the
     * lock behind.
     *
     * NOTE(review): assumes lock() returns the collection itself (fluent
     * interface), as the chained calls this helper replaces suggest - confirm
     * against the Collection classes.
     *
     * @param object $collection collection offering lock(), addEntity() and unlock()
     * @param string $page The page the values belong to
     * @param array $values The values to store, an empty list clears the page's data
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    protected function writeLocked($collection, string $page, array $values): void
    {
        $locked = $collection->lock();
        try {
            $locked->addEntity($page, $values);
        } finally {
            $locked->unlock();
        }
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * The registry is the file metadata.idx in the index directory, one key per line.
     *
     * @return string[] list of metadata key names, empty when the registry is missing or unreadable
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) return [];
        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * Keys already present are left alone; the registry file is only written
     * when at least one key was added.
     *
     * @param string[] $keys metadata key names to ensure are registered
     */
    protected function updateMetadataRegistry(array $keys): void
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        $existing = $this->getMetadataRegistryKeys();

        $added = false;
        foreach ($keys as $key) {
            // array keys that look like integers arrive as int, the registry holds strings
            $key = (string)$key;
            // strict comparison: loosely, numeric strings like '1e3' and '1000' are equal
            if (!in_array($key, $existing, true)) {
                $existing[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($fn, implode("\n", $existing) . "\n");
        }
    }
}