<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\CollectionSearch;
use dokuwiki\Search\Collection\PageMetaCollection;
use dokuwiki\Search\Collection\PageTitleCollection;
use dokuwiki\Search\Exception\IndexUsageException;
use dokuwiki\Search\Query\QueryParser;
use dokuwiki\Utf8;

/**
 * Class DokuWiki Metadata Search
 *
 * Provides search operations on metadata indexes using the Collection/Index architecture.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
class MetadataSearch
{
    /**
     * Quicksearch for pagenames
     *
     * By default it only matches the pagename and ignores the namespace.
     * This can be changed with the second parameter.
     * The third parameter allows to search in titles as well.
     *
     * The function always returns titles as well
     *
     * @triggers SEARCH_QUERY_PAGELOOKUP
     * @param string $id page id
     * @param bool $in_ns match against namespace as well?
     * @param bool $in_title search in title?
     * @param int|string|null $after only show results with mtime after this date,
     *                                accepts timestamp or strtotime arguments
     * @param int|string|null $before only show results with mtime before this date,
     *                                accepts timestamp or strtotime arguments
     *
     * @return string[]
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Adrian Lang <lang@cosmocode.de>
     */
    public function pageLookup(
        string $id,
        bool $in_ns = false,
        bool $in_title = false,
        int|string|null $after = null,
        int|string|null $before = null
    ): array {
        $data = [
            'id' => $id,
            'in_ns' => $in_ns,
            'in_title' => $in_title,
            'after' => $after,
            'before' => $before,
            'has_titles' => true, // for plugin backward compatibility check
        ];

        return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, $this->pageLookupCallBack(...));
    }

    /**
     * Returns list of pages as array(pageid => First Heading)
     *
     * Default action of the SEARCH_QUERY_PAGELOOKUP event. Reads the keys
     * 'id', 'in_ns', 'in_title', 'after' and 'before' from the event data.
     *
     * @param array $data event data
     * @return string[]
     * @throws IndexUsageException
     */
    public function pageLookupCallBack(array &$data): array
    {
        $parsedQuery = (new QueryParser())->convert($data['id']);
        $ns = $parsedQuery['ns'] ? cleanID($parsedQuery['ns'][0]) . ':' : null;
        $notns = $parsedQuery['notns'] ? cleanID($parsedQuery['notns'][0]) . ':' : null;

        // when a namespace restriction was given, only the remaining terms form the search string
        $query = ($ns !== null || $notns !== null) ? implode(' ', $parsedQuery['highlight']) : $data['id'];
        $cleaned = cleanID($query);
        if ($cleaned === '') {
            return [];
        }

        // namespace include/exclude test shared by the page name and the title lookup
        $inScope = static function ($page) use ($ns, $notns): bool {
            if ($ns !== null && !str_starts_with($page, $ns)) {
                return false;
            }
            if ($notns !== null && str_starts_with($page, $notns)) {
                return false;
            }
            return true;
        };

        // find pages matching by page name
        $pages = [];
        foreach ($this->getPages() as $page) {
            if (!$inScope($page)) {
                continue;
            }

            $match = $data['in_ns'] ? $page : noNSorNS($page);
            if (str_contains($match, $cleaned)) {
                $pages[$page] = p_get_first_heading($page, METADATA_DONT_RENDER);
            }
        }

        // additionally find pages matching by title
        if ($data['in_title']) {
            // case-insensitive substring comparison of the search string against the indexed title
            $titleMatcher = static fn($search, $title) => stripos($title, $search) !== false;

            foreach ($this->lookupKey('title', $query, $titleMatcher) as $page) {
                if (!$inScope($page) || isset($pages[$page])) {
                    continue;
                }
                $pages[$page] = p_get_first_heading($page, METADATA_DONT_RENDER);
            }
        }

        $pages = static::filterPages($pages, false, $data['after'], $data['before']);
        uksort($pages, $this->pagesorter(...));
        return $pages;
    }

    /**
     * Return a list of all indexed pages, optionally limited to those that have a specific metadata key
     *
     * When a key is given, only pages that have any value stored for that metadata key are returned.
     * This does not filter by the metadata value itself.
     *
     * @param string|null $key metadata key name, or null for all pages
     * @return string[] list of page names
     */
    public function getPages(?string $key = null): array
    {
        if ($key === null) {
            return (new Indexer())->getAllPages();
        }

        // titles live in their own collection, every other key in a generic metadata collection
        $collection = ($key === 'title') ? new PageTitleCollection() : new PageMetaCollection($key);
        return $collection->getEntitiesWithData();
    }

    /**
     * Find pages containing a metadata value
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function. The function will be called with the $value being
     * searched for as the first argument, and the word in the index as the
     * second argument. The function preg_match can be used directly if the
     * values are regexes.
     *
     * When $value is a string, the result is a flat list of matching page names.
     * When $value is an array, each value is searched independently and the result
     * is an associative array keyed by the search values, each containing a list
     * of matching page names.
     *
     * Without a callback, values support wildcard matching with * at the start
     * and/or end (e.g. '*foo', 'bar*', '*baz*').
     *
     * @param string $key name of the metadata key to look for
     * @param string|string[] $value search term or array of search terms (not modified by this method)
     * @param callable|null $func comparison function: fn($searchValue, $indexWord) => bool
     * @return array flat list of page names (scalar $value) or [value => [pageName, ...]] (array $value)
     *
     * @throws IndexUsageException
     * @author Michael Hamann <michael@content-space.de>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function lookupKey(string $key, string|array &$value, ?callable $func = null): array
    {
        $isScalar = !is_array($value);
        $valueArray = $isScalar ? [$value] : $value;

        $collection = ($key === 'title') ? new PageTitleCollection() : new PageMetaCollection($key);
        $result = (new CollectionSearch($collection))->lookup($valueArray, $func);

        if (!$isScalar) {
            return $result;
        }

        // Guard against the lookup result lacking an entry for the search value; without it
        // a missing key would yield null and violate the declared array return type.
        return $result[$value] ?? [];
    }

    /**
     * Returns the backlinks for a given page
     *
     * @param string $id The id for which links shall be returned
     * @param bool $ignore_perms Ignore the fact that pages are hidden or read-protected
     * @return string[] The pages that contain links to the given page
     *
     * @throws IndexUsageException
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function backlinks(string $id, bool $ignore_perms = false): array
    {
        return $this->lookupRelation('relation_references', $id, $ignore_perms);
    }

    /**
     * Returns the pages that use a given media file
     *
     * @param string $id The media id to look for
     * @param bool $ignore_perms Ignore hidden pages and acls (optional, default: false)
     * @return string[] A list of pages that use the given media file
     *
     * @throws IndexUsageException
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function mediause(string $id, bool $ignore_perms = false): array
    {
        return $this->lookupRelation('relation_media', $id, $ignore_perms);
    }

    /**
     * Look up the pages referencing $id under a relation metadata key, filtered and sorted
     *
     * @param string $key relation metadata key to search in
     * @param string $id the page or media id being referenced
     * @param bool $ignore_perms skip visibility and ACL checks
     * @return string[] sorted list of referencing pages
     *
     * @throws IndexUsageException
     */
    private function lookupRelation(string $key, string $id, bool $ignore_perms): array
    {
        $result = $this->lookupKey($key, $id);
        if ($result === []) {
            return $result;
        }

        // filterPages() works on array keys, so flip the page list before filtering and back afterwards
        $result = array_keys(static::filterPages(array_flip($result), $ignore_perms));

        Utf8\Sort::sort($result);
        return $result;
    }

    /**
     * Filter a list of pages by visibility, existence, permissions, and time range
     *
     * String values for $after/$before are converted with strtotime(); a value that
     * is empty or cannot be parsed results in that bound not being applied.
     *
     * @param array $pages pages to filter (keys are page IDs)
     * @param bool $ignorePerms skip visibility and ACL checks
     * @param int|string|null $after only keep pages modified after this date
     * @param int|string|null $before only keep pages modified before this date
     * @return array filtered pages
     */
    public static function filterPages(array $pages, bool $ignorePerms = false, $after = null, $before = null): array
    {
        if ($after) {
            $after = is_int($after) ? $after : strtotime($after);
        }
        if ($before) {
            $before = is_int($before) ? $before : strtotime($before);
        }

        // only the keys (page IDs) matter for the decision, values are carried along untouched
        return array_filter($pages, static function ($id) use ($ignorePerms, $after, $before) {
            if (!$ignorePerms && (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ)) {
                return false;
            }
            if (!page_exists($id, '', false)) {
                return false;
            }
            if ($after || $before) {
                $mTime = filemtime(wikiFN($id));
                if ($after && $after > $mTime) {
                    return false;
                }
                if ($before && $before < $mTime) {
                    return false;
                }
            }
            return true;
        }, ARRAY_FILTER_USE_KEY);
    }

    /**
     * Sort pages based on their namespace level first, then on their string
     * values. This makes higher hierarchy pages rank higher than lower hierarchy
     * pages.
     *
     * @param string $a
     * @param string $b
     * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b,
     *             and 0 if they are equal.
     */
    protected function pagesorter(string $a, string $b): int
    {
        // fewer namespace separators means higher up in the hierarchy
        $depthDiff = substr_count($a, ':') - substr_count($b, ':');
        if ($depthDiff !== 0) {
            return $depthDiff;
        }
        return Utf8\Sort::strcmp($a, $b);
    }
}