<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\PageFulltextCollection;
use dokuwiki\Search\Collection\PageMetaCollection;
use dokuwiki\Search\Collection\PageTitleCollection;
use dokuwiki\Search\Exception\IndexAccessException;
use dokuwiki\Search\Exception\IndexIntegrityException;
use dokuwiki\Search\Exception\IndexLockException;
use dokuwiki\Search\Exception\IndexWriteException;
use dokuwiki\Search\Index\FileIndex;
use dokuwiki\Search\Index\Lock;

// Version tag used to force rebuild on upgrade
const INDEXER_VERSION = 8;

/**
 * Class DokuWiki Indexer
 *
 * Manages the page search index by delegating to Collection classes.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been set.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger) {
            ($this->logger)($message);
        }
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The result is computed once and then cached in a static variable.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string
     */
    public function getVersion()
    {
        static $indexer_version = null;
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first
            ksort($data); // stable order, independent of plugin load order
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }
        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty entries in the page index are always dropped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new Index\MemoryIndex('page');
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * Indexing is needed when it is forced, when the page has no .indexed tag
     * file, when the tag was written by a different indexer version, or when
     * the tag is not newer than the page file.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) return true;

        // compare as strings: the tag file holds text while getVersion() may return an int
        if (trim(io_readFile($idxtag)) !== (string)$this->getVersion()) return true;

        $last = @filemtime($idxtag);
        return $last <= @filemtime(wikiFN($page));
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally.
     *
     * @triggers INDEXER_PAGE_ADD
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): void
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for {$page} up to date");
            return;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' => p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            // default action: append the raw wiki text to whatever body plugins provided
            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // NOTE(review): in the lock()->addEntity()->unlock() chains below, unlock() is not
        // reached when addEntity() throws - confirm the collections release their lock elsewhere.

        // index title
        (new PageTitleCollection($pageIndex))->lock()
            ->addEntity($data['page'], [$data['metadata']['title']])->unlock();
        unset($data['metadata']['title']);

        // index fulltext
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
            (new PageFulltextCollection($pageIndex))->lock()->addEntity($data['page'], $words)->unlock();
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
            // clear any previously stored fulltext data
            (new PageFulltextCollection($pageIndex))->lock()->addEntity($data['page'], [])->unlock();
        }
        unset($data['metadata']['internal_index']);

        // index metadata keys (title and internal_index were removed above)
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                // wrap scalars; null and empty string mean "no values"
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($data['page'], $values)->unlock();
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): void
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: {$page}.indexed file does not exist, ignoring");
            return;
        }

        $pageIndex = new FileIndex('page', '', true);

        // storing an empty value list is how a page's data is cleared from a collection
        (new PageTitleCollection($pageIndex))->lock()->addEntity($page, [])->unlock();
        (new PageFulltextCollection($pageIndex))->lock()->addEntity($page, [])->unlock();

        foreach ($this->getMetadataRegistryKeys() as $key) {
            (new PageMetaCollection($key, $pageIndex))->lock()->addEntity($page, [])->unlock();
        }

        $this->log("Indexer: deleted {$page} from index");
        @unlink($idxtag);
    }

    /**
     * Rename a page in the search index
     *
     * The page must already have been moved on disk before calling this.
     * Clears the old page's data and re-indexes under the new name.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): void
    {
        $this->deletePage($oldpage, true);
        $this->addPage($newpage, true);
    }

    /**
     * Clear all page indexes
     *
     * Deletes the index files from the index directory while holding the
     * 'page' lock. The lock is released even if clearing is interrupted.
     */
    public function clear(): void
    {
        global $conf;

        $dir = $conf['indexdir'];

        Lock::acquire('page');
        try {
            // clear metadata indexes (must happen before metadata.idx itself is removed)
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                @unlink($dir . '/' . $clean . '_w.idx');
                @unlink($dir . '/' . $clean . '_i.idx');
                @unlink($dir . '/' . $clean . '_p.idx');
            }

            // clear fulltext indexes
            foreach (['/i*.idx', '/w*.idx'] as $pattern) {
                foreach (glob($dir . $pattern) ?: [] as $f) {
                    @unlink($f);
                }
            }

            @unlink($dir . '/pageword.idx');
            @unlink($dir . '/lengths.idx');

            // clear title and page indexes
            @unlink($dir . '/title.idx');
            @unlink($dir . '/page.idx');
            @unlink($dir . '/metadata.idx');
        } finally {
            Lock::release('page');
        }
    }

    /**
     * Check the structural integrity of all search indexes
     *
     * Checks the fulltext and title collections and one metadata collection
     * per key listed in the metadata registry.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     */
    public function checkIntegrity(): void
    {
        (new PageFulltextCollection())->checkIntegrity();
        (new PageTitleCollection())->checkIntegrity();

        foreach ($this->getMetadataRegistryKeys() as $key) {
            (new PageMetaCollection($key))->checkIntegrity();
        }
    }

    /**
     * Whether the search index is empty (no fulltext data indexed yet)
     *
     * @return bool
     */
    public function isIndexEmpty(): bool
    {
        return (new PageFulltextCollection())->getTokenIndexMaximum() === 0;
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * The registry is the file metadata.idx in the index directory, holding
     * one key per line. A missing, unreadable or empty file yields an empty list.
     *
     * @return string[] list of metadata key names
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;
        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) return [];
        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * Keys that are not yet registered are appended; the registry file is only
     * rewritten when at least one key was added.
     *
     * @param string[] $keys metadata key names to ensure are registered
     */
    protected function updateMetadataRegistry(array $keys): void
    {
        global $conf;

        $existing = $this->getMetadataRegistryKeys();

        $added = false;
        foreach ($keys as $key) {
            // array_keys() yields ints for numeric keys while the registry holds strings;
            // normalize and compare strictly so that e.g. '1e1' is not mistaken for '10'
            $key = (string)$key;
            if (!in_array($key, $existing, true)) {
                $existing[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($conf['indexdir'] . '/metadata.idx', implode("\n", $existing) . "\n");
        }
    }
}