<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\CollectionSearch;
use dokuwiki\Search\Collection\PageMetaCollection;
use dokuwiki\Search\Collection\PageTitleCollection;
use dokuwiki\Search\Query\QueryParser;
use dokuwiki\Utf8;

/**
 * Class DokuWiki Metadata Search
 *
 * Provides search operations on metadata indexes using the Collection/Index architecture.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
class MetadataSearch
{
    /**
     * Quicksearch for pagenames
     *
     * By default it only matches the pagename and ignores the namespace.
     * This can be changed with the second parameter.
     * The third parameter allows to search in titles as well.
     *
     * The function always returns titles as well
     *
     * @triggers SEARCH_QUERY_PAGELOOKUP
     * @param string $id page id
     * @param bool $in_ns match against namespace as well?
     * @param bool $in_title search in title?
     * @param int|string|null $after only show results with mtime after this date,
     *                               accepts timestamp or strtotime arguments
     * @param int|string|null $before only show results with mtime before this date,
     *                                accepts timestamp or strtotime arguments
     *
     * @return string[]
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Adrian Lang <lang@cosmocode.de>
     */
    public function pageLookup(
        string $id,
        bool $in_ns = false,
        bool $in_title = false,
        int|string|null $after = null,
        int|string|null $before = null,
    ): array {
        $data = [
            'id' => $id,
            'in_ns' => $in_ns,
            'in_title' => $in_title,
            'after' => $after,
            'before' => $before,
            'has_titles' => true, // for plugin backward compatibility check
        ];

        return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, $this->pageLookupCallBack(...));
    }

    /**
     * Returns list of pages as array(pageid => First Heading)
     *
     * Reads the keys 'id', 'in_ns', 'in_title', 'after' and 'before' from the event data.
     *
     * @param array $data event data
     * @return string[]
     * @throws IndexUsageException
     */
    public function pageLookupCallBack(array &$data): array
    {
        $parsedQuery = (new QueryParser())->convert($data['id']);
        $ns = $parsedQuery['ns'] ? cleanID($parsedQuery['ns'][0]) . ':' : null;
        $notns = $parsedQuery['notns'] ? cleanID($parsedQuery['notns'][0]) . ':' : null;
        // when a namespace restriction was parsed out, only the remaining words are searched for
        $query = ($ns || $notns) ? implode(' ', $parsedQuery['highlight']) : $data['id'];
        $cleaned = cleanID($query);

        if ($cleaned === '') {
            return [];
        }

        // Namespace restriction shared by the name and the title lookup.
        // $page is left untyped on purpose: it may arrive as an int array key.
        $inScope = static fn ($page): bool =>
            (!$ns || str_starts_with($page, $ns))
            && (!$notns || !str_starts_with($page, $notns));

        // find pages matching by page name
        $pages = [];
        foreach ($this->getPages() as $page) {
            if (!$inScope($page)) {
                continue;
            }

            $match = $data['in_ns'] ? $page : noNSorNS($page);
            if (str_contains($match, $cleaned)) {
                $pages[$page] = p_get_first_heading($page, METADATA_DONT_RENDER);
            }
        }

        // additionally find pages matching by title
        if ($data['in_title']) {
            $search = new CollectionSearch(new PageTitleCollection());
            $search->caseInsensitive();
            // titles are matched against the raw query, not the cleaned page ID
            $search->addTerm('*' . $query . '*');
            $terms = $search->execute();
            $term = reset($terms);
            if ($term) {
                foreach ($term->getEntityTokens() as $page => $titles) {
                    if (!$inScope($page)) {
                        continue;
                    }

                    // a page already found by name keeps the heading stored above
                    if (!isset($pages[$page])) {
                        $pages[$page] = $titles[0];
                    }
                }
            }
        }

        $pages = static::filterPages($pages, false, $data['after'], $data['before']);
        uksort($pages, $this->pagesorter(...));
        return $pages;
    }

    /**
     * Return a list of all indexed pages, optionally limited to those that have a specific metadata key
     *
     * When a key is given, only pages that have any value stored for that metadata key are returned.
     * This does not filter by the metadata value itself.
     *
     * @param string|null $key metadata key name, or null for all pages
     * @return string[] list of page names
     */
    public function getPages(?string $key = null): array
    {
        return match ($key) {
            null => (new Indexer())->getAllPages(),
            'title' => (new PageTitleCollection())->getEntitiesWithData(),
            default => (new PageMetaCollection($key))->getEntitiesWithData(),
        };
    }

    /**
     * Find pages containing a metadata value
     *
     * Values are compared as case-sensitive strings. Wildcard matching with * at
     * the start and/or end is supported (e.g. '*foo', 'bar*', '*baz*').
     *
     * When $value is a string, the result is a flat list of matching page names.
     * When $value is an array, each value is searched independently and the result
     * is an associative array keyed by the search values, each containing a list
     * of matching page names.
     *
     * $value is taken by reference, so callers have to pass a variable; this method
     * does not modify it.
     *
     * @param string $key name of the metadata key to look for
     * @param string|string[] $value search term or array of search terms
     * @return array flat list of page names (scalar $value) or [value => [pageName, ...]] (array $value)
     *
     * @author Michael Hamann <michael@content-space.de>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function lookupKey(string $key, string|array &$value): array
    {
        $isScalar = !is_array($value);
        $valueArray = $isScalar ? [$value] : $value;

        $collection = ($key === 'title') ? new PageTitleCollection() : new PageMetaCollection($key);

        $search = new CollectionSearch($collection);
        foreach ($valueArray as $v) {
            $search->addTerm($v);
        }
        $terms = $search->execute();

        // every requested value gets an entry, even when nothing matched it
        $result = [];
        foreach ($valueArray as $v) {
            $term = $terms[$v] ?? null;
            $result[$v] = $term ? array_keys($term->getEntityFrequencies()) : [];
        }

        return $isScalar ? $result[$value] : $result;
    }

    /**
     * Returns the backlinks for a given page
     *
     * @param string $id The id for which links shall be returned
     * @param bool $ignore_perms Ignore the fact that pages are hidden or read-protected
     * @return string[] The pages that contain links to the given page
     *
     * @throws IndexUsageException
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function backlinks(string $id, bool $ignore_perms = false): array
    {
        return $this->lookupRelation('relation_references', $id, $ignore_perms);
    }

    /**
     * Returns the pages that use a given media file
     *
     * @param string $id The media id to look for
     * @param bool $ignore_perms Ignore hidden pages and acls (optional, default: false)
     * @return string[] A list of pages that use the given media file
     *
     * @throws IndexUsageException
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function mediause(string $id, bool $ignore_perms = false): array
    {
        return $this->lookupRelation('relation_media', $id, $ignore_perms);
    }

    /**
     * Look up the pages that store $id under a relation metadata key
     *
     * Shared implementation of backlinks() and mediause(): the matches are
     * filtered through filterPages() and returned as a sorted list.
     *
     * @param string $key relation metadata key to search in
     * @param string $id the page or media id to look for
     * @param bool $ignore_perms skip the hidden page and ACL checks
     * @return string[] sorted list of matching page names
     *
     * @throws IndexUsageException
     */
    private function lookupRelation(string $key, string $id, bool $ignore_perms): array
    {
        $result = $this->lookupKey($key, $id);
        if (!count($result)) {
            return $result;
        }

        // filterPages() expects the page IDs as array keys
        $result = array_flip($result);
        $result = static::filterPages($result, $ignore_perms);
        $result = array_keys($result);

        Utf8\Sort::sort($result);
        return $result;
    }

    /**
     * Filter a list of pages by visibility, existence, permissions, and time range
     *
     * String dates are converted with strtotime(). A string that strtotime() cannot
     * parse yields false, which leaves that bound disabled instead of raising an error.
     *
     * @param array $pages pages to filter (keys are page IDs)
     * @param bool $ignorePerms skip visibility and ACL checks
     * @param int|string|null $after only keep pages modified after this date
     * @param int|string|null $before only keep pages modified before this date
     * @return array filtered pages
     */
    public static function filterPages(array $pages, bool $ignorePerms = false, $after = null, $before = null): array
    {
        if ($after) {
            $after = is_int($after) ? $after : strtotime($after);
        }
        if ($before) {
            $before = is_int($before) ? $before : strtotime($before);
        }

        return array_filter($pages, static function ($value, $id) use ($ignorePerms, $after, $before) {
            if (!$ignorePerms) {
                if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ) {
                    return false;
                }
            }
            if (!page_exists($id, '', false)) {
                return false;
            }
            if ($after || $before) {
                $mTime = filemtime(wikiFN($id));
                if ($after && $after > $mTime) {
                    return false;
                }
                if ($before && $before < $mTime) {
                    return false;
                }
            }
            return true;
        }, ARRAY_FILTER_USE_BOTH);
    }

    /**
     * Sort pages based on their namespace level first, then on their string
     * values. This makes higher hierarchy pages rank higher than lower hierarchy
     * pages.
     *
     * @param string $a
     * @param string $b
     * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b,
     *             and 0 if they are equal.
     */
    protected function pagesorter(string $a, string $b): int
    {
        // fewer ':' separators means a higher position in the namespace hierarchy
        $diff = substr_count($a, ':') - substr_count($b, ':');
        return $diff ?: Utf8\Sort::strcmp($a, $b);
    }
}