<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\CollectionSearch;
use dokuwiki\Search\Collection\PageMetaCollection;
use dokuwiki\Search\Collection\PageTitleCollection;
use dokuwiki\Search\Query\QueryParser;
use dokuwiki\Utf8\Sort;

/**
 * Class DokuWiki Metadata Search
 *
 * Provides search operations on metadata indexes using the Collection/Index architecture.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
class MetadataSearch
{
    /**
     * Quicksearch for pagenames
     *
     * By default it only matches the pagename and ignores the namespace.
     * This can be changed with the second parameter.
     * The third parameter allows to search in titles as well.
     *
     * The function always returns titles as well
     *
     * @triggers SEARCH_QUERY_PAGELOOKUP
     * @param string $id page id
     * @param bool $in_ns match against namespace as well?
     * @param bool $in_title search in title?
     * @param int|string|null $after only show results with mtime after this date,
     *                                accepts timestamp or strtotime arguments
     * @param int|string|null $before only show results with mtime before this date,
     *                                 accepts timestamp or strtotime arguments
     *
     * @return string[] map of page id => first heading; empty when the event yields no array result
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Adrian Lang <lang@cosmocode.de>
     *
     */
    public function pageLookup(
        string $id,
        bool $in_ns = false,
        bool $in_title = false,
        int|string|null $after = null,
        int|string|null $before = null
    ): array {
        $data = [
            'id' => $id,
            'in_ns' => $in_ns,
            'in_title' => $in_title,
            'after' => $after,
            'before' => $before,
            'has_titles' => true, // for plugin backward compatibility check
        ];

        $result = Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, $this->pageLookupCallBack(...));

        // NOTE(review): the event result is whatever the handlers leave behind; presumably it is
        // null when a handler prevents the default action without supplying a result (confirm
        // against Event::trigger()). Normalise so the declared array return type always holds.
        return is_array($result) ? $result : [];
    }

    /**
     * Returns list of pages as array(pageid => First Heading)
     *
     * Default action of the SEARCH_QUERY_PAGELOOKUP event. Uses the keys 'id', 'in_ns',
     * 'in_title', 'after' and 'before' of the event data.
     *
     * @param array $data event data
     * @return string[]
     */
    public function pageLookupCallBack(array $data): array
    {
        $parsedQuery = (new QueryParser())->convert($data['id']);
        $ns = $parsedQuery['ns'] ? cleanID($parsedQuery['ns'][0]) . ':' : null;
        $notns = $parsedQuery['notns'] ? cleanID($parsedQuery['notns'][0]) . ':' : null;

        // with a namespace restriction only the remaining (highlight) words form the query
        $query = ($ns || $notns) ? implode(' ', $parsedQuery['highlight']) : $data['id'];
        $cleaned = cleanID($query);

        if ($cleaned === '') return [];

        // true when a page is excluded by the namespace restrictions of the query;
        // the cast covers numeric page ids that arrive as integer array keys
        $outOfScope = static fn($page): bool =>
            ($ns && !str_starts_with((string)$page, $ns))
            || ($notns && str_starts_with((string)$page, $notns));

        // find pages matching by page name
        $pages = [];
        foreach ($this->getPages() as $page) {
            if ($outOfScope($page)) continue;

            $match = $data['in_ns'] ? $page : noNSorNS($page);
            if (str_contains($match, $cleaned)) {
                $pages[$page] = p_get_first_heading($page, METADATA_DONT_RENDER);
            }
        }

        // additionally find pages matching by title
        if ($data['in_title']) {
            $search = new CollectionSearch(new PageTitleCollection());
            $search->caseInsensitive();
            // the title search uses the uncleaned query, wrapped in wildcards for substring matching
            $search->addTerm('*' . $query . '*');
            $terms = $search->execute();

            // only one term was added, so only the first result entry is of interest
            $term = reset($terms);
            if ($term) {
                foreach ($term->getEntityTokens() as $page => $titles) {
                    if ($outOfScope($page)) continue;

                    // page name matches take precedence over title matches
                    if (!isset($pages[$page])) {
                        $pages[$page] = $titles[0];
                    }
                }
            }
        }

        $pages = static::filterPages($pages, false, $data['after'], $data['before']);
        uksort($pages, $this->pagesorter(...));
        return $pages;
    }

    /**
     * Return a list of all indexed pages, optionally limited to those that have a specific metadata key
     *
     * When a key is given, only pages that have any value stored for that metadata key are returned.
     * This does not filter by the metadata value itself.
     *
     * @param string|null $key metadata key name, or null for all pages
     * @return string[] list of page names
     */
    public function getPages(?string $key = null): array
    {
        return match ($key) {
            null => (new Indexer())->getAllPages(),
            // titles live in their own collection
            'title' => (new PageTitleCollection())->getEntitiesWithData(),
            default => (new PageMetaCollection($key))->getEntitiesWithData(),
        };
    }

    /**
     * Find pages containing a metadata value
     *
     * Values are compared as case-sensitive strings. Wildcard matching with * at
     * the start and/or end is supported (e.g. '*foo', 'bar*', '*baz*').
     *
     * When $value is a string, the result is a flat list of matching page names.
     * When $value is an array, each value is searched independently and the result
     * is an associative array keyed by the search values, each containing a list
     * of matching page names.
     *
     * @param string $key name of the metadata key to look for
     * @param string|string[] $value search term or array of search terms (taken by reference, not modified)
     * @return array flat list of page names (scalar $value) or [value => [pageName, ...]] (array $value)
     *
     * @author Michael Hamann <michael@content-space.de>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function lookupKey(string $key, string|array &$value): array
    {
        $isScalar = !is_array($value);
        $valueArray = $isScalar ? [$value] : $value;

        $collection = ($key === 'title') ? new PageTitleCollection() : new PageMetaCollection($key);

        $search = new CollectionSearch($collection);
        foreach ($valueArray as $v) {
            $search->addTerm($v);
        }
        $terms = $search->execute();

        // every requested value gets an entry, even when nothing matched
        $result = [];
        foreach ($valueArray as $v) {
            $term = $terms[$v] ?? null;
            $result[$v] = $term ? array_keys($term->getEntityFrequencies()) : [];
        }

        return $isScalar ? $result[$value] : $result;
    }

    /**
     * Returns the backlinks for a given page
     *
     * @param string $id The id for which links shall be returned
     * @param bool $ignore_perms Ignore the fact that pages are hidden or read-protected
     * @return string[] The pages that contain links to the given page
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function backlinks(string $id, bool $ignore_perms = false): array
    {
        return $this->pagesReferencing('relation_references', $id, $ignore_perms);
    }

    /**
     * Returns the pages that use a given media file
     *
     * @param string $id The media id to look for
     * @param bool $ignore_perms Ignore hidden pages and acls (optional, default: false)
     * @return string[] A list of pages that use the given media file
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function mediause(string $id, bool $ignore_perms = false): array
    {
        return $this->pagesReferencing('relation_media', $id, $ignore_perms);
    }

    /**
     * Look up the pages whose metadata key contains the given id, filter and sort them
     *
     * Shared implementation of backlinks() and mediause().
     *
     * @param string $key metadata key to search in
     * @param string $id page or media id to look for
     * @param bool $ignore_perms skip visibility and ACL checks
     * @return string[] sorted list of matching page names
     */
    protected function pagesReferencing(string $key, string $id, bool $ignore_perms): array
    {
        $result = $this->lookupKey($key, $id);
        if ($result === []) return $result;

        // filterPages() inspects array keys, so flip the list into a map and back
        $result = array_keys(static::filterPages(array_flip($result), $ignore_perms));

        Sort::sort($result);
        return $result;
    }

    /**
     * Filter a list of pages by visibility, existence, permissions, and time range
     *
     * Integer limits are used as timestamps, string limits are passed through strtotime().
     * A limit that is empty or cannot be parsed is ignored.
     *
     * @param array $pages pages to filter (keys are page IDs)
     * @param bool $ignorePerms skip visibility and ACL checks
     * @param int|string|null $after only keep pages modified after this date
     * @param int|string|null $before only keep pages modified before this date
     * @return array filtered pages, keys and values preserved
     */
    public static function filterPages(
        array $pages,
        bool $ignorePerms = false,
        int|string|null $after = null,
        int|string|null $before = null
    ): array {
        // strtotime() returns false for unparsable input, which the truthiness checks below
        // treat the same as "no limit"
        if ($after) $after = is_int($after) ? $after : strtotime($after);
        if ($before) $before = is_int($before) ? $before : strtotime($before);

        return array_filter($pages, static function ($id) use ($ignorePerms, $after, $before) {
            if (!$ignorePerms) {
                if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ) {
                    return false;
                }
            }
            if (!page_exists($id, '', false)) {
                return false;
            }
            if ($after || $before) {
                $mTime = filemtime(wikiFN($id));
                if ($after && $after > $mTime) return false;
                if ($before && $before < $mTime) return false;
            }
            return true;
        }, ARRAY_FILTER_USE_KEY);
    }

    /**
     * Sort pages based on their namespace level first, then on their string
     * values. This makes higher hierarchy pages rank higher than lower hierarchy
     * pages.
     *
     * @param string $a
     * @param string $b
     * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b,
     *             and 0 if they are equal.
     */
    protected function pagesorter(string $a, string $b): int
    {
        // fewer namespace separators means higher up in the hierarchy
        $diff = substr_count($a, ':') - substr_count($b, ':');
        return $diff ?: Sort::strcmp($a, $b);
    }
}