* @author Tom N Harris */
class Indexer
{
    /** @var callable|null Logging callback, receives a string message */
    protected $logger;

    /**
     * Set a logging callback
     *
     * The callback receives a single string message. Use this to integrate
     * with different output mechanisms (TaskRunner echo, CLI output, Logger, etc.)
     *
     * @param callable $logger
     * @return static this instance, to allow call chaining
     */
    public function setLogger(callable $logger): static
    {
        $this->logger = $logger;
        return $this;
    }

    /**
     * Send a message to the registered logger
     *
     * Does nothing when no logger has been registered.
     *
     * @param string $message
     */
    protected function log(string $message): void
    {
        if ($this->logger) {
            ($this->logger)($message);
        }
    }

    /**
     * Version of the indexer taking into consideration the external tokenizer.
     * The indexer is only compatible with data written by the same version.
     *
     * The computed version is kept in a static local variable, so the event is
     * only triggered the first time this method is called.
     *
     * @triggers INDEXER_VERSION_GET
     * Plugins that modify what gets indexed should hook this event and
     * add their version info to the event data like so:
     *     $data[$plugin_name] = $plugin_version;
     *
     * @return int|string
     */
    public function getVersion()
    {
        static $indexer_version = null;

        // strict check: only an unset cache triggers the computation
        if ($indexer_version === null) {
            $version = INDEXER_VERSION;

            $data = ['dokuwiki' => $version];
            Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
            unset($data['dokuwiki']); // this needs to be first

            // sort by plugin name so the version string does not depend on handler order
            ksort($data);
            foreach ($data as $plugin => $vers) {
                $version .= '+' . $plugin . '=' . $vers;
            }
            $indexer_version = $version;
        }

        return $indexer_version;
    }

    /**
     * Return a list of all indexed pages
     *
     * Empty entries in the page index are always skipped.
     *
     * @param bool $existsFilter only return pages that exist on disk
     * @return string[] list of page names (keys are the RIDs in the page index)
     */
    public function getAllPages(bool $existsFilter = false): array
    {
        $pageIndex = new Index\MemoryIndex('page');
        return array_filter(
            iterator_to_array($pageIndex),
            static fn($v) => $v !== '' && (!$existsFilter || page_exists($v, '', false))
        );
    }

    /**
     * Check if a page needs (re-)indexing
     *
     * Indexing is needed when it is forced, when the page has no `.indexed` tag
     * file, when the tag file was written by a different indexer version, or when
     * the tag file is not newer than the page file.
     *
     * @param string $page
     * @param bool $force
     * @return bool true if indexing is needed
     */
    public function needsIndexing(string $page, bool $force = false): bool
    {
        $idxtag = metaFN($page, '.indexed');
        if ($force || !file_exists($idxtag)) {
            return true;
        }

        // loose comparison on purpose: the version may be an int while the file holds a string
        if (trim(io_readFile($idxtag)) != $this->getVersion()) {
            return true;
        }

        $last = @filemtime($idxtag);
        return $last <= @filemtime(wikiFN($page));
    }

    /**
     * Add/update the search index for a page
     *
     * Locking is handled internally. Each collection is locked only while it is
     * written and the lock is released again even when the write fails.
     *
     * @triggers INDEXER_PAGE_ADD
     *
     * @param string $page The page to index
     * @param bool $force force reindexing even when the index is up to date
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function addPage(string $page, bool $force = false): void
    {
        if (!$this->needsIndexing($page, $force)) {
            $this->log("Indexer: index for {$page} up to date");
            return;
        }

        // create shared writable page index early so we can resolve the PID for plugins
        $pageIndex = new FileIndex('page', '', true);

        // prepare event data
        $data = [
            'page' => $page,
            'body' => '',
            'metadata' => [
                'title' => p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED),
                'relation_references' => array_keys(
                    p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'relation_media' => array_keys(
                    p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED) ?? []
                ),
                'internal_index' =>
                    p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED) !== false,
            ],
            'pid' => $pageIndex->accessCachedValue($page),
        ];

        // let plugins modify the data
        $event = new Event('INDEXER_PAGE_ADD', $data);
        if ($event->advise_before()) {
            $data['body'] = $data['body'] . ' ' . rawWiki($data['page']);
        }
        $event->advise_after();
        unset($event);

        // index title, then drop it so it is not handled as a generic metadata key below
        $this->writeEntityLocked(
            new PageTitleCollection($pageIndex),
            $data['page'],
            [$data['metadata']['title']]
        );
        unset($data['metadata']['title']);

        // index fulltext
        if ($data['metadata']['internal_index']) {
            $words = Tokenizer::getWords($data['body']);
            $this->writeEntityLocked(new PageFulltextCollection($pageIndex), $data['page'], $words);
        } else {
            $this->log("Indexer: full text indexing disabled for {$data['page']}");
            // clear any previously stored fulltext data
            $this->writeEntityLocked(new PageFulltextCollection($pageIndex), $data['page'], []);
        }
        unset($data['metadata']['internal_index']);

        // index metadata keys
        foreach ($data['metadata'] as $key => $values) {
            if (!is_array($values)) {
                // scalar values become a one element list, empty values clear the key
                $values = ($values !== null && $values !== '') ? [$values] : [];
            }
            $this->writeEntityLocked(new PageMetaCollection($key, $pageIndex), $data['page'], $values);
        }

        // update metadata registry
        $this->updateMetadataRegistry(array_keys($data['metadata']));

        // update index tag file
        io_saveFile(metaFN($data['page'], '.indexed'), $this->getVersion());
        $this->log("Indexer: finished indexing {$data['page']}");
    }

    /**
     * Remove a page from the index
     *
     * Clears the page's data from all collections. The entity persists in page.idx.
     *
     * @param string $page The page to remove
     * @param bool $force force deletion even when no .indexed tag exists
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function deletePage(string $page, bool $force = false): void
    {
        $idxtag = metaFN($page, '.indexed');
        if (!$force && !file_exists($idxtag)) {
            $this->log("Indexer: {$page}.indexed file does not exist, ignoring");
            return;
        }

        $pageIndex = new FileIndex('page', '', true);

        // storing an empty value list clears the page from each collection
        $this->writeEntityLocked(new PageTitleCollection($pageIndex), $page, []);
        $this->writeEntityLocked(new PageFulltextCollection($pageIndex), $page, []);
        foreach ($this->getMetadataRegistryKeys() as $key) {
            $this->writeEntityLocked(new PageMetaCollection($key, $pageIndex), $page, []);
        }

        $this->log("Indexer: deleted {$page} from index");
        @unlink($idxtag);
    }

    /**
     * Rename a page in the search index
     *
     * The page must already have been moved on disk before calling this.
     * Clears the old page's data and re-indexes under the new name.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    public function renamePage(string $oldpage, string $newpage): void
    {
        $this->deletePage($oldpage, true);
        $this->addPage($newpage, true);
    }

    /**
     * Clear all page indexes
     *
     * Deletes the index files of all registered metadata keys, the fulltext
     * indexes, the title and page indexes and the metadata registry itself.
     * The 'page' lock is held while the files are removed.
     */
    public function clear(): void
    {
        global $conf;

        Lock::acquire('page');
        try {
            // clear metadata indexes
            foreach ($this->getMetadataRegistryKeys() as $key) {
                $clean = PageMetaCollection::cleanName($key);
                @unlink($conf['indexdir'] . '/' . $clean . '_w.idx');
                @unlink($conf['indexdir'] . '/' . $clean . '_i.idx');
                @unlink($conf['indexdir'] . '/' . $clean . '_p.idx');
            }

            // clear fulltext indexes
            $files = glob($conf['indexdir'] . '/i*.idx');
            if ($files) {
                foreach ($files as $f) {
                    @unlink($f);
                }
            }
            $files = glob($conf['indexdir'] . '/w*.idx');
            if ($files) {
                foreach ($files as $f) {
                    @unlink($f);
                }
            }
            @unlink($conf['indexdir'] . '/pageword.idx');
            @unlink($conf['indexdir'] . '/lengths.idx');

            // clear title and page indexes
            @unlink($conf['indexdir'] . '/title.idx');
            @unlink($conf['indexdir'] . '/page.idx');
            @unlink($conf['indexdir'] . '/metadata.idx');
        } finally {
            // never leave the lock behind, even if something above throws
            Lock::release('page');
        }
    }

    /**
     * Check the structural integrity of all search indexes
     *
     * Checks the fulltext collection, the title collection and one metadata
     * collection per registered metadata key.
     *
     * @throws IndexIntegrityException when a structural inconsistency is found
     */
    public function checkIntegrity(): void
    {
        (new PageFulltextCollection())->checkIntegrity();
        (new PageTitleCollection())->checkIntegrity();
        foreach ($this->getMetadataRegistryKeys() as $key) {
            (new PageMetaCollection($key))->checkIntegrity();
        }
    }

    /**
     * Whether the search index is empty (no fulltext data indexed yet)
     *
     * @return bool
     */
    public function isIndexEmpty(): bool
    {
        return (new PageFulltextCollection())->getTokenIndexMaximum() === 0;
    }

    /**
     * Get the list of known metadata keys from the metadata registry
     *
     * The registry is the file metadata.idx in the index directory and holds
     * one key per line.
     *
     * @return string[] list of metadata key names, empty when there is no registry
     */
    protected function getMetadataRegistryKeys(): array
    {
        global $conf;

        $fn = $conf['indexdir'] . '/metadata.idx';
        if (!file_exists($fn)) {
            return [];
        }

        $keys = file($fn, FILE_IGNORE_NEW_LINES);
        return $keys ?: [];
    }

    /**
     * Update the metadata registry with new keys
     *
     * Keys are appended in the given order. The registry file is only rewritten
     * when at least one key was not registered before.
     *
     * @param string[] $keys metadata key names to ensure are registered
     */
    protected function updateMetadataRegistry(array $keys): void
    {
        global $conf;

        $fn = $conf['indexdir'] . '/metadata.idx';
        $existing = file_exists($fn) ? file($fn, FILE_IGNORE_NEW_LINES) : [];
        if (!$existing) {
            $existing = [];
        }

        $added = false;
        foreach ($keys as $key) {
            // Array keys that look like integers arrive as int, the registry holds strings.
            // Compare strictly as strings so that numeric-looking keys such as "1e3" and
            // "1000" are not mistaken for each other.
            $key = (string)$key;
            if (!in_array($key, $existing, true)) {
                $existing[] = $key;
                $added = true;
            }
        }

        if ($added) {
            io_saveFile($fn, implode("\n", $existing) . "\n");
        }
    }

    /**
     * Replace the values stored for an entity while holding the collection's lock
     *
     * The lock is released in all cases, so a failing write does not leave the
     * collection locked.
     *
     * NOTE(review): this assumes unlock() is safe to call after addEntity() has
     * thrown - confirm against the collection implementation.
     *
     * @param object $collection collection offering lock(), addEntity() and unlock()
     * @param string $entity name of the entity (page) to store the values for
     * @param array $values values to store, an empty list clears the entity
     *
     * @throws IndexAccessException
     * @throws IndexLockException
     * @throws IndexWriteException
     */
    private function writeEntityLocked(object $collection, string $entity, array $values): void
    {
        // a failing lock() must not trigger unlock(), so it stays outside the try block
        $collection->lock();
        try {
            $collection->addEntity($entity, $values);
        } finally {
            $collection->unlock();
        }
    }

    // region Deprecated methods

    /**
     * Find pages containing a metadata value
     *
     * @param string $key metadata key name
     * @param string|string[] $value search term(s)
     * @param callable|null $func comparison function
     * @return array
     *
     * @deprecated 2026-04-07 use MetadataSearch::lookupKey() instead
     */
    public function lookupKey($key, &$value, $func = null)
    {
        DebugHelper::dbgDeprecatedFunction(MetadataSearch::class . '::lookupKey()');
        return (new MetadataSearch())->lookupKey($key, $value, $func);
    }

    /**
     * Return a list of all indexed pages, optionally filtered by metadata key
     *
     * @param string|null $key metadata key name
     * @return string[]
     *
     * @deprecated 2026-04-07 use MetadataSearch::getPages() or Indexer::getAllPages() instead
     */
    public function getPages($key = null)
    {
        DebugHelper::dbgDeprecatedFunction(MetadataSearch::class . '::getPages()');
        return (new MetadataSearch())->getPages($key);
    }

    /**
     * Add metadata values for a page
     *
     * The key 'title' is stored in the title collection, all other keys in their
     * own metadata collection. The key is added to the metadata registry.
     *
     * @param string $page page name
     * @param string $key metadata key name
     * @param string|string[]|null $value value(s) to add
     * @return bool|string true on success, false when a SearchException occurred
     *
     * @deprecated 2026-04-07 use Collection classes directly instead
     */
    public function addMetaKeys($page, $key, $value = null)
    {
        DebugHelper::dbgDeprecatedFunction('Collection classes');
        try {
            if ($key === 'title') {
                $collection = new PageTitleCollection();
            } else {
                $collection = new PageMetaCollection($key);
            }

            if (is_array($value)) {
                $values = $value;
            } else {
                $values = ($value !== null && $value !== '') ? [$value] : [];
            }

            $this->writeEntityLocked($collection, $page, $values);

            // NOTE(review): this also registers 'title', which addPage() deliberately
            // keeps out of the registry - confirm whether that is intended
            $this->updateMetadataRegistry([$key]);
            return true;
        } catch (SearchException $e) {
            return false;
        }
    }

    /**
     * Rename a metadata value in the index
     *
     * When the new value is not known yet, the old token is simply renamed. When
     * both values exist, the frequency data of the old value is merged into the
     * new one and the reverse index of every affected entity is updated.
     *
     * @param string $key metadata key name
     * @param string $oldvalue old value
     * @param string $newvalue new value
     * @return bool|string true on success (including when the old value is unknown),
     *                     false when a SearchException occurred
     *
     * @deprecated 2026-04-07 use Collection classes directly instead
     */
    public function renameMetaValue($key, $oldvalue, $newvalue)
    {
        DebugHelper::dbgDeprecatedFunction('Collection classes');
        try {
            $collection = new PageMetaCollection($key);
            $collection->lock();
            try {
                $tokenIndex = $collection->getTokenIndex(0);

                // find old value — search() is read-only, won't create entries
                $matches = $tokenIndex->search('/^' . preg_quote($oldvalue, '/') . '$/');
                if (empty($matches)) {
                    return true;
                }
                $oldid = array_key_first($matches);

                // check if new value already exists (read-only lookup)
                $newMatches = $tokenIndex->search('/^' . preg_quote($newvalue, '/') . '$/');

                if (!empty($newMatches)) {
                    // both values exist — merge frequency data from old to new
                    $newid = array_key_first($newMatches);
                    $freqIndex = $collection->getFrequencyIndex(0);
                    $reverseIndex = $collection->getReverseIndex();

                    $oldFreqLine = $freqIndex->retrieveRow($oldid);
                    if ($oldFreqLine !== '') {
                        $newFreqLine = $freqIndex->retrieveRow($newid);
                        foreach (TupleOps::parseTuples($oldFreqLine) as $entityId => $count) {
                            $newFreqLine = TupleOps::updateTuple($newFreqLine, $entityId, $count);

                            // update reverse index: remove old token, add new
                            $reverseRow = $reverseIndex->retrieveRow((int)$entityId);
                            $keyline = explode(':', $reverseRow);
                            $keyline = array_diff($keyline, [(string)$oldid]);
                            if (!in_array((string)$newid, $keyline)) {
                                $keyline[] = $newid;
                            }
                            $reverseIndex->changeRow(
                                (int)$entityId,
                                implode(':', array_filter($keyline, fn($v) => $v !== ''))
                            );
                        }
                        $freqIndex->changeRow($oldid, '');
                        $freqIndex->changeRow($newid, $newFreqLine);
                    }
                } else {
                    // new value doesn't exist — simple rename
                    $tokenIndex->changeRow($oldid, $newvalue);
                }

                return true;
            } finally {
                // release the lock on every path, including when an index operation threw
                // NOTE(review): assumes unlock() is safe after a failed operation - confirm
                $collection->unlock();
            }
        } catch (SearchException $e) {
            return false;
        }
    }

    /**
     * Get the page ID for a page name
     *
     * @param string $page page name
     * @return int|false the page ID, false when a SearchException occurred
     *
     * @deprecated 2026-04-07 use FileIndex directly instead
     */
    public function getPID($page)
    {
        DebugHelper::dbgDeprecatedFunction(FileIndex::class);
        try {
            return (new FileIndex('page', '', true))->accessCachedValue($page);
        } catch (SearchException $e) {
            return false;
        }
    }

    /**
     * Find tokens in the fulltext index
     *
     * Tokens that are rejected as search terms are skipped. Only pages that
     * exist are part of the result.
     *
     * @param array $tokens list of words to search for
     * @return array list of pages found [word => [page => count, ...]]
     *
     * @deprecated 2026-04-07 use CollectionSearch on PageFulltextCollection instead
     */
    public function lookup(&$tokens)
    {
        DebugHelper::dbgDeprecatedFunction(CollectionSearch::class);
        $collection = new PageFulltextCollection();
        $search = new CollectionSearch($collection);

        $termMap = [];
        foreach ($tokens as $token) {
            try {
                $term = $search->addTerm($token);
                $termMap[$token] = $term;
            } catch (SearchException $e) {
                // skip invalid terms
            }
        }

        if (empty($termMap)) {
            return [];
        }

        $search->execute();

        $result = [];
        foreach ($termMap as $word => $term) {
            $freqs = $term->getEntityFrequencies();

            // filter to only existing pages
            $filtered = [];
            foreach ($freqs as $page => $count) {
                if (page_exists($page, '', false)) {
                    $filtered[$page] = $count;
                }
            }
            $result[$word] = $filtered;
        }

        return $result;
    }

    // endregion
}