xref: /dokuwiki/inc/Search/MetadataSearch.php (revision 6734bb8cef71e8b4af23e627d4db5430304d55a2)
1fe2d1da1SSatoshi Sahara<?php
246b83514SSatoshi Sahara
3fe2d1da1SSatoshi Saharanamespace dokuwiki\Search;
4fe2d1da1SSatoshi Sahara
5fe2d1da1SSatoshi Saharause dokuwiki\Extension\Event;
6*6734bb8cSAndreas Gohruse dokuwiki\Search\Collection\CollectionSearch;
7*6734bb8cSAndreas Gohruse dokuwiki\Search\Collection\PageMetaCollection;
8*6734bb8cSAndreas Gohruse dokuwiki\Search\Collection\PageTitleCollection;
9*6734bb8cSAndreas Gohruse dokuwiki\Search\Exception\IndexUsageException;
100b1bbbbbSAndreas Gohruse dokuwiki\Search\Query\QueryParser;
11a02395a1SSatoshi Saharause dokuwiki\Utf8;
12fe2d1da1SSatoshi Sahara
/**
 * Class DokuWiki Metadata Search
 *
 * Provides search operations on metadata indexes using the Collection/Index architecture:
 * page name / title quick lookup, generic metadata key lookups, backlinks and media usage.
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
class MetadataSearch
{
    /**
     * Quicksearch for pagenames
     *
     * By default it only matches the pagename and ignores the namespace.
     * This can be changed with the second parameter.
     * The third parameter allows to search in titles as well.
     *
     * The function always returns titles as well
     *
     * @triggers SEARCH_QUERY_PAGELOOKUP
     * @param string $id page id
     * @param bool $in_ns match against namespace as well?
     * @param bool $in_title search in title?
     * @param int|string|null $after only show results with mtime after this date,
     *                               accepts timestamp or strtotime arguments
     * @param int|string|null $before only show results with mtime before this date,
     *                                accepts timestamp or strtotime arguments
     *
     * @return string[] page id => first heading
     * @author   Andreas Gohr <andi@splitbrain.org>
     * @author   Adrian Lang <lang@cosmocode.de>
     */
    public function pageLookup(
        string $id,
        bool $in_ns = false,
        bool $in_title = false,
        int|string|null $after = null,
        int|string|null $before = null
    ): array {
        $data = [
            'id' => $id,
            'in_ns' => $in_ns,
            'in_title' => $in_title,
            'after' => $after,
            'before' => $before,
            'has_titles' => true, // for plugin backward compatibility check
        ];
        return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, $this->pageLookupCallBack(...));
    }

    /**
     * Returns list of pages as array(pageid => First Heading)
     *
     * Default action of the SEARCH_QUERY_PAGELOOKUP event. Reads the keys
     * 'id', 'in_ns', 'in_title', 'after' and 'before' from the event data.
     *
     * @param array $data event data
     * @return string[] page id => first heading, sorted by namespace depth, then by name
     * @throws IndexUsageException
     */
    public function pageLookupCallBack(array &$data): array
    {
        $parsedQuery = (new QueryParser())->convert($data['id']);
        // only the first ns/notns restriction of the parsed query is honoured
        $ns = $parsedQuery['ns'] ? cleanID($parsedQuery['ns'][0]) . ':' : null;
        $notns = $parsedQuery['notns'] ? cleanID($parsedQuery['notns'][0]) . ':' : null;
        // with a namespace restriction, search only for the remaining terms
        $query = ($ns !== null || $notns !== null) ? implode(' ', $parsedQuery['highlight']) : $data['id'];
        $cleaned = cleanID($query);

        if ($cleaned === '') {
            return [];
        }

        // shared namespace restriction for both the name and the title matches
        $inScope = static function (string $page) use ($ns, $notns): bool {
            if ($ns !== null && !str_starts_with($page, $ns)) {
                return false;
            }
            return $notns === null || !str_starts_with($page, $notns);
        };

        // find pages matching by page name
        $pages = [];
        foreach ($this->getPages() as $page) {
            if (!$inScope($page)) {
                continue;
            }

            $match = $data['in_ns'] ? $page : noNSorNS($page);
            if (str_contains($match, $cleaned)) {
                $pages[$page] = p_get_first_heading($page, METADATA_DONT_RENDER);
            }
        }

        // additionally find pages matching by title
        if ($data['in_title']) {
            foreach ($this->lookupKey('title', $query, static::titleContains(...)) as $page) {
                if (isset($pages[$page]) || !$inScope($page)) {
                    continue;
                }
                $pages[$page] = p_get_first_heading($page, METADATA_DONT_RENDER);
            }
        }

        $pages = static::filterPages($pages, false, $data['after'], $data['before']);
        uksort($pages, $this->pagesorter(...));
        return $pages;
    }

    /**
     * Case-insensitive "title contains search term" comparison used for title lookups
     *
     * Uses mb_stripos() when the mbstring extension is available so that non-ASCII
     * characters are compared case-insensitively as well; plain stripos() only
     * folds ASCII letters and is used as fallback.
     *
     * @param string $search the term being searched for
     * @param string $title the title stored in the index
     * @return bool true if $title contains $search
     */
    protected static function titleContains($search, $title): bool
    {
        $search = (string)$search;
        $title = (string)$title;

        if (function_exists('mb_stripos')) {
            return mb_stripos($title, $search, 0, 'UTF-8') !== false;
        }
        return stripos($title, $search) !== false;
    }

    /**
     * Return a list of all indexed pages, optionally limited to those that have a specific metadata key
     *
     * When a key is given, only pages that have any value stored for that metadata key are returned.
     * This does not filter by the metadata value itself.
     *
     * @param string|null $key metadata key name, or null for all pages
     * @return string[] list of page names
     */
    public function getPages(?string $key = null): array
    {
        if ($key === null) {
            return (new Indexer())->getAllPages();
        }

        // titles live in their own collection, everything else is generic page metadata
        $collection = $key === 'title' ? new PageTitleCollection() : new PageMetaCollection($key);
        return $collection->getEntitiesWithData();
    }

    /**
     * Find pages containing a metadata value
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function. The function will be called with the $value being
     * searched for as the first argument, and the word in the index as the
     * second argument. The function preg_match can be used directly if the
     * values are regexes.
     *
     * When $value is a string, the result is a flat list of matching page names.
     * When $value is an array, each value is searched independently and the result
     * is an associative array keyed by the search values, each containing a list
     * of matching page names.
     *
     * Without a callback, values support wildcard matching with * at the start
     * and/or end (e.g. '*foo', 'bar*', '*baz*').
     *
     * $value is declared by reference, so callers must pass a variable;
     * this method never writes to it.
     *
     * @param string $key name of the metadata key to look for
     * @param string|string[] $value search term or array of search terms
     * @param callable|null $func comparison function: fn($searchValue, $indexWord) => bool
     * @return array flat list of page names (scalar $value) or [value => [pageName, ...]] (array $value)
     *
     * @throws IndexUsageException
     * @author Michael Hamann <michael@content-space.de>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function lookupKey(string $key, string|array &$value, ?callable $func = null): array
    {
        $isScalar = !is_array($value);
        $valueArray = $isScalar ? [$value] : $value;

        $collection = $key === 'title' ? new PageTitleCollection() : new PageMetaCollection($key);
        $result = (new CollectionSearch($collection))->lookup($valueArray, $func);

        if (!$isScalar) {
            return $result;
        }
        // a missing entry for the searched value means "no matches", not an error
        return $result[$value] ?? [];
    }

    /**
     * Returns the backlinks for a given page
     *
     * @param string $id The id for which links shall be returned
     * @param bool $ignore_perms Ignore the fact that pages are hidden or read-protected
     * @return string[] The pages that contain links to the given page
     *
     * @throws IndexUsageException
     * @author     Andreas Gohr <andi@splitbrain.org>
     */
    public function backlinks(string $id, bool $ignore_perms = false): array
    {
        return $this->lookupRelation('relation_references', $id, $ignore_perms);
    }

    /**
     * Returns the pages that use a given media file
     *
     * @param string $id The media id to look for
     * @param bool $ignore_perms Ignore hidden pages and acls (optional, default: false)
     * @return string[] A list of pages that use the given media file
     *
     * @throws IndexUsageException
     * @author     Andreas Gohr <andi@splitbrain.org>
     */
    public function mediause(string $id, bool $ignore_perms = false): array
    {
        return $this->lookupRelation('relation_media', $id, $ignore_perms);
    }

    /**
     * Look up the pages that store $id under the given relation metadata key
     *
     * Shared implementation of backlinks() and mediause(): the matches are
     * passed through filterPages() and returned sorted.
     *
     * @param string $key relation metadata key, e.g. 'relation_references'
     * @param string $id the page or media id to look for
     * @param bool $ignorePerms skip visibility and ACL checks
     * @return string[] sorted list of matching pages
     *
     * @throws IndexUsageException
     */
    protected function lookupRelation(string $key, string $id, bool $ignorePerms): array
    {
        $pages = $this->lookupKey($key, $id);
        if ($pages === []) {
            return $pages;
        }

        // filterPages() expects the page ids as array keys
        $pages = array_keys(static::filterPages(array_flip($pages), $ignorePerms));

        Utf8\Sort::sort($pages);
        return $pages;
    }

    /**
     * Filter a list of pages by visibility, existence, permissions, and time range
     *
     * Integer bounds are used as timestamps, any other non-empty value is passed to
     * strtotime(). A string that strtotime() cannot parse disables that bound.
     * Both bounds are inclusive.
     *
     * @param array $pages pages to filter (keys are page IDs)
     * @param bool $ignorePerms skip visibility and ACL checks
     * @param int|string|null $after only keep pages modified at or after this date
     * @param int|string|null $before only keep pages modified at or before this date
     * @return array filtered pages, keys and values preserved
     */
    public static function filterPages(array $pages, bool $ignorePerms = false, $after = null, $before = null): array
    {
        if ($after) {
            $after = is_int($after) ? $after : strtotime($after);
        }
        if ($before) {
            $before = is_int($before) ? $before : strtotime($before);
        }

        return array_filter(
            $pages,
            static function ($value, $id) use ($ignorePerms, $after, $before) {
                if (!$ignorePerms && (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ)) {
                    return false;
                }
                if (!page_exists($id, '', false)) {
                    return false;
                }
                if ($after || $before) {
                    $mTime = filemtime(wikiFN($id));
                    if ($after && $after > $mTime) {
                        return false;
                    }
                    if ($before && $before < $mTime) {
                        return false;
                    }
                }
                return true;
            },
            ARRAY_FILTER_USE_BOTH
        );
    }

    /**
     * Sort pages based on their namespace level first, then on their string
     * values. This makes higher hierarchy pages rank higher than lower hierarchy
     * pages.
     *
     * @param string $a
     * @param string $b
     * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b,
     *             and 0 if they are equal.
     */
    protected function pagesorter(string $a, string $b): int
    {
        // fewer namespace separators means higher up in the hierarchy
        $diff = substr_count($a, ':') - substr_count($b, ':');
        return $diff ?: Utf8\Sort::strcmp($a, $b);
    }
}
268