xref: /dokuwiki/inc/Search/MetadataSearch.php (revision 5e9d26e3624fd22ca3c57447be9e16d0b502761e)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Search\Collection\CollectionSearch;
7use dokuwiki\Search\Collection\PageMetaCollection;
8use dokuwiki\Search\Collection\PageTitleCollection;
9use dokuwiki\Search\Exception\IndexUsageException;
10use dokuwiki\Search\Query\QueryParser;
11use dokuwiki\Utf8;
12
13/**
14 * Class DokuWiki Metadata Search
15 *
16 * Provides search operations on metadata indexes using the Collection/Index architecture.
17 *
18 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
19 * @author     Andreas Gohr <andi@splitbrain.org>
20 */
class MetadataSearch
{
    /**
     * Quick search for page names
     *
     * By default only the page name itself is matched and the namespace part of
     * the page ID is ignored. Pass $in_ns to match against the full page ID and
     * $in_title to additionally search the page titles.
     *
     * The search itself is implemented in pageLookupCallBack(), which is handed
     * to the SEARCH_QUERY_PAGELOOKUP event as its callback. The event data always
     * carries 'has_titles' => true.
     *
     * @triggers SEARCH_QUERY_PAGELOOKUP
     * @param string $id page id (search query)
     * @param bool $in_ns match against namespace as well?
     * @param bool $in_title search in title?
     * @param int|string|null $after only show results with mtime after this date,
     *                               accepts timestamp or strtotime arguments
     * @param int|string|null $before only show results with mtime before this date,
     *                                accepts timestamp or strtotime arguments
     *
     * @return string[] page id => first heading
     * @author   Andreas Gohr <andi@splitbrain.org>
     * @author   Adrian Lang <lang@cosmocode.de>
     */
    public function pageLookup(
        string $id,
        bool $in_ns = false,
        bool $in_title = false,
        int|string|null $after = null,
        int|string|null $before = null
    ): array {
        $data = [
            'id' => $id,
            'in_ns' => $in_ns,
            'in_title' => $in_title,
            'after' => $after,
            'before' => $before,
            'has_titles' => true, // for plugin backward compatibility check
        ];

        return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, $this->pageLookupCallBack(...));
    }

    /**
     * Returns list of pages as array(pageid => First Heading)
     *
     * Pages are matched by name (and by title when $data['in_title'] is set),
     * restricted to the namespace include/exclude given in the query, filtered
     * through filterPages() using $data['after'] / $data['before'] and finally
     * sorted with pagesorter().
     *
     * @param array $data event data (keys: id, in_ns, in_title, after, before)
     * @return string[] page id => first heading
     * @throws IndexUsageException
     */
    public function pageLookupCallBack(array &$data): array
    {
        $parsedQuery = (new QueryParser())->convert($data['id']);
        $ns = $parsedQuery['ns'] ? cleanID($parsedQuery['ns'][0]) . ':' : null;
        $notns = $parsedQuery['notns'] ? cleanID($parsedQuery['notns'][0]) . ':' : null;

        // with a namespace restriction only the remaining query words are searched for
        $query = ($ns || $notns) ? implode(' ', $parsedQuery['highlight']) : $data['id'];
        $cleaned = cleanID($query);

        if ($cleaned === '') return [];

        // true when a page satisfies the namespace include/exclude of the query
        $inScope = static fn($page): bool =>
            (!$ns || str_starts_with($page, $ns)) && (!$notns || !str_starts_with($page, $notns));

        // find pages matching by page name
        $pages = [];
        foreach ($this->getPages() as $page) {
            if (!$inScope($page)) continue;

            $match = $data['in_ns'] ? $page : noNSorNS($page);
            if (str_contains($match, $cleaned)) {
                $pages[$page] = p_get_first_heading($page, METADATA_DONT_RENDER);
            }
        }

        // additionally find pages matching by title
        if ($data['in_title']) {
            // case-insensitive substring comparison; non-ASCII needs the UTF-8 aware helpers
            $titleContains = static function ($search, $title) {
                if (Utf8\Clean::isASCII($search)) {
                    return stripos($title, $search) !== false;
                }
                return Utf8\PhpString::strpos(
                    Utf8\PhpString::strtolower($title),
                    Utf8\PhpString::strtolower($search)
                ) !== false;
            };

            foreach ($this->lookupKey('title', $query, $titleContains) as $page) {
                if (!$inScope($page)) continue;

                // keep the heading already fetched by the name match, if any
                $pages[$page] ??= p_get_first_heading($page, METADATA_DONT_RENDER);
            }
        }

        $pages = static::filterPages($pages, false, $data['after'], $data['before']);
        uksort($pages, $this->pagesorter(...));
        return $pages;
    }

    /**
     * Return a list of all indexed pages, optionally limited to those that have a specific metadata key
     *
     * When a key is given, only pages that have any value stored for that metadata key are returned.
     * This does not filter by the metadata value itself.
     *
     * @param string|null $key metadata key name, or null for all pages
     * @return string[] list of page names
     */
    public function getPages(?string $key = null): array
    {
        if ($key === null) {
            return (new Indexer())->getAllPages();
        }

        return $this->collectionFor($key)->getEntitiesWithData();
    }

    /**
     * Find pages containing a metadata value
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function. The function will be called with the $value being
     * searched for as the first argument, and the word in the index as the
     * second argument. The function preg_match can be used directly if the
     * values are regexes.
     *
     * When $value is a string, the result is a flat list of matching page names.
     * When $value is an array, each value is searched independently and the result
     * is an associative array keyed by the search values, each containing a list
     * of matching page names.
     *
     * Without a callback, values support wildcard matching with * at the start
     * and/or end (e.g. '*foo', 'bar*', '*baz*').
     *
     * $value is accepted by reference but is not modified by this method; the
     * search works on a local copy.
     *
     * @param string $key name of the metadata key to look for
     * @param string|string[] $value search term or array of search terms
     * @param callable|null $func comparison function: fn($searchValue, $indexWord) => bool
     * @return array flat list of page names (scalar $value) or [value => [pageName, ...]] (array $value)
     *
     * @throws IndexUsageException
     * @author Michael Hamann <michael@content-space.de>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function lookupKey(string $key, string|array &$value, ?callable $func = null): array
    {
        $isScalar = !is_array($value);
        $valueArray = $isScalar ? [$value] : $value;

        $result = (new CollectionSearch($this->collectionFor($key)))->lookup($valueArray, $func);

        if (!$isScalar) {
            return $result;
        }

        // A missing entry for the searched value means "no matches"; without the
        // fallback this would return null and violate the declared array return type.
        return $result[$value] ?? [];
    }

    /**
     * Returns the backlinks for a given page
     *
     * @param string $id The id for which links shall be returned
     * @param bool $ignore_perms Ignore the fact that pages are hidden or read-protected
     * @return string[] The pages that contain links to the given page
     *
     * @throws IndexUsageException
     * @author     Andreas Gohr <andi@splitbrain.org>
     */
    public function backlinks(string $id, bool $ignore_perms = false): array
    {
        return $this->findReferencingPages('relation_references', $id, $ignore_perms);
    }

    /**
     * Returns the pages that use a given media file
     *
     * @param string $id           The media id to look for
     * @param bool   $ignore_perms Ignore hidden pages and acls (optional, default: false)
     * @return string[] A list of pages that use the given media file
     *
     * @throws IndexUsageException
     * @author     Andreas Gohr <andi@splitbrain.org>
     */
    public function mediause(string $id, bool $ignore_perms = false): array
    {
        return $this->findReferencingPages('relation_media', $id, $ignore_perms);
    }

    /**
     * Filter a list of pages by visibility, existence, permissions, and time range
     *
     * String values for $after / $before are converted with strtotime(). A bound
     * that is empty, or that strtotime() cannot parse, is not applied.
     *
     * @param array $pages pages to filter (keys are page IDs)
     * @param bool $ignorePerms skip visibility and ACL checks
     * @param int|string|null $after only keep pages modified after this date
     * @param int|string|null $before only keep pages modified before this date
     * @return array filtered pages (keys and values preserved)
     */
    public static function filterPages(array $pages, bool $ignorePerms = false, $after = null, $before = null): array
    {
        if ($after) $after = is_int($after) ? $after : strtotime($after);
        if ($before) $before = is_int($before) ? $before : strtotime($before);

        $keep = static function ($id) use ($ignorePerms, $after, $before) {
            if (!$ignorePerms && (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ)) {
                return false;
            }
            if (!page_exists($id, '', false)) {
                return false;
            }
            if (!$after && !$before) {
                return true;
            }

            // the file is only touched when a time bound is actually in effect
            $mTime = filemtime(wikiFN($id));
            if ($after && $after > $mTime) return false;
            if ($before && $before < $mTime) return false;
            return true;
        };

        // only the keys (page IDs) matter for the decision; values are carried through
        return array_filter($pages, $keep, ARRAY_FILTER_USE_KEY);
    }

    /**
     * Sort pages based on their namespace level first, then on their string
     * values. This makes higher hierarchy pages rank higher than lower hierarchy
     * pages.
     *
     * @param string $a
     * @param string $b
     * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b,
     *             and 0 if they are equal.
     */
    protected function pagesorter(string $a, string $b): int
    {
        $levelDiff = substr_count($a, ':') - substr_count($b, ':');
        if ($levelDiff !== 0) {
            return $levelDiff;
        }

        return Utf8\Sort::strcmp($a, $b);
    }

    /**
     * Pick the collection that stores the given metadata key
     *
     * The 'title' key has its own collection; every other key uses a
     * PageMetaCollection created for that key.
     *
     * @param string $key metadata key name
     * @return PageTitleCollection|PageMetaCollection
     */
    private function collectionFor(string $key): PageTitleCollection|PageMetaCollection
    {
        return $key === 'title' ? new PageTitleCollection() : new PageMetaCollection($key);
    }

    /**
     * Shared implementation of backlinks() and mediause()
     *
     * Looks up all pages whose metadata $key contains $id, removes the ones
     * rejected by filterPages() and returns the rest sorted.
     *
     * @param string $key metadata key to search in
     * @param string $id page or media id that is referenced
     * @param bool $ignore_perms skip visibility and ACL checks
     * @return string[] sorted list of referencing pages
     *
     * @throws IndexUsageException
     */
    private function findReferencingPages(string $key, string $id, bool $ignore_perms): array
    {
        $found = $this->lookupKey($key, $id);
        if (!count($found)) return $found;

        // filterPages() works on keys, so flip the list into page => index and back
        $visible = array_keys(static::filterPages(array_flip($found), $ignore_perms));

        Utf8\Sort::sort($visible);
        return $visible;
    }
}
276