xref: /dokuwiki/inc/Search/MetadataSearch.php (revision 1148921de6af6909f19cb5b30b698d0f27d7751e)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Search\Collection\CollectionSearch;
7use dokuwiki\Search\Collection\PageMetaCollection;
8use dokuwiki\Search\Collection\PageTitleCollection;
9use dokuwiki\Search\Query\QueryParser;
10use dokuwiki\Utf8;
11
12/**
13 * Class DokuWiki Metadata Search
14 *
15 * Provides search operations on metadata indexes using the Collection/Index architecture.
16 *
17 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
18 * @author     Andreas Gohr <andi@splitbrain.org>
19 */
20class MetadataSearch
21{
22    /**
23     * Quicksearch for pagenames
24     *
25     * By default it only matches the pagename and ignores the namespace.
26     * This can be changed with the second parameter.
27     * The third parameter allows to search in titles as well.
28     *
29     * The function always returns titles as well
30     *
31     * @triggers SEARCH_QUERY_PAGELOOKUP
32     * @param string     $id       page id
33     * @param bool $in_ns    match against namespace as well?
34     * @param bool $in_title search in title?
35     * @param int|string|null $after    only show results with mtime after this date,
36     *                             accepts timestap or strtotime arguments
37     * @param int|string|null $before   only show results with mtime before this date,
38     *                             accepts timestap or strtotime arguments
39     *
40     * @return string[]
41     * @author   Andreas Gohr <andi@splitbrain.org>
42     * @author   Adrian Lang <lang@cosmocode.de>
43     *
44     */
45    public function pageLookup(
46        string     $id,
47        bool       $in_ns = false,
48        bool       $in_title = false,
49        int|string|null $after = null,
50        int|string|null $before = null): array
51    {
52        $data = [
53            'id' => $id,
54            'in_ns' => $in_ns,
55            'in_title' => $in_title,
56            'after' => $after,
57            'before' => $before
58        ];
59        $data['has_titles'] = true; // for plugin backward compatibility check
60        return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, $this->pageLookupCallBack(...));
61    }
62
63    /**
64     * Returns list of pages as array(pageid => First Heading)
65     *
66     * @param array $data event data
67     * @return string[]
68     * @throws IndexUsageException
69     */
70    public function pageLookupCallBack(array &$data): array
71    {
72        $parsedQuery = (new QueryParser)->convert($data['id']);
73        $ns = $parsedQuery['ns'] ? cleanID($parsedQuery['ns'][0]) . ':' : null;
74        $notns = $parsedQuery['notns'] ? cleanID($parsedQuery['notns'][0]) . ':' : null;
75        $query = ($ns || $notns) ? implode(' ', $parsedQuery['highlight']) : $data['id'];
76        $cleaned = cleanID($query);
77
78        if ($cleaned === '') return [];
79
80        // find pages matching by page name
81        $pages = [];
82        foreach ($this->getPages() as $page) {
83            if ($ns && !str_starts_with($page, $ns)) continue;
84            if ($notns && str_starts_with($page, $notns)) continue;
85
86            $match = $data['in_ns'] ? $page : noNSorNS($page);
87            if (str_contains($match, $cleaned)) {
88                $pages[$page] = p_get_first_heading($page, METADATA_DONT_RENDER);
89            }
90        }
91
92        // additionally find pages matching by title
93        if ($data['in_title']) {
94            $search = new CollectionSearch(new PageTitleCollection());
95            $search->caseInsensitive();
96            $search->addTerm('*' . $query . '*');
97            $terms = $search->execute();
98            $term = reset($terms);
99            if ($term) {
100                foreach ($term->getEntityTokens() as $page => $titles) {
101                    if ($ns && !str_starts_with($page, $ns)) continue;
102                    if ($notns && str_starts_with($page, $notns)) continue;
103
104                    if (!isset($pages[$page])) {
105                        $pages[$page] = $titles[0];
106                    }
107                }
108            }
109        }
110
111        $pages = static::filterPages($pages, false, $data['after'], $data['before']);
112        uksort($pages, $this->pagesorter(...));
113        return $pages;
114    }
115
116    /**
117     * Return a list of all indexed pages, optionally limited to those that have a specific metadata key
118     *
119     * When a key is given, only pages that have any value stored for that metadata key are returned.
120     * This does not filter by the metadata value itself.
121     *
122     * @param string|null $key metadata key name, or null for all pages
123     * @return string[] list of page names
124     */
125    public function getPages(?string $key = null): array
126    {
127        if ($key === null) {
128            return (new Indexer())->getAllPages();
129        }
130
131        if ($key === 'title') {
132            return (new PageTitleCollection())->getEntitiesWithData();
133        }
134
135        return (new PageMetaCollection($key))->getEntitiesWithData();
136    }
137
138    /**
139     * Find pages containing a metadata value
140     *
141     * Values are compared as case-sensitive strings. Wildcard matching with * at
142     * the start and/or end is supported (e.g. '*foo', 'bar*', '*baz*').
143     *
144     * When $value is a string, the result is a flat list of matching page names.
145     * When $value is an array, each value is searched independently and the result
146     * is an associative array keyed by the search values, each containing a list
147     * of matching page names.
148     *
149     * @param string $key name of the metadata key to look for
150     * @param string|string[] $value search term or array of search terms
151     * @return array flat list of page names (scalar $value) or [value => [pageName, ...]] (array $value)
152     *
153     * @author Michael Hamann <michael@content-space.de>
154     * @author Tom N Harris <tnharris@whoopdedo.org>
155     */
156    public function lookupKey(string $key, string|array &$value): array
157    {
158        $isScalar = !is_array($value);
159        $valueArray = $isScalar ? [$value] : $value;
160
161        $collection = ($key === 'title') ? new PageTitleCollection() : new PageMetaCollection($key);
162
163        $search = new CollectionSearch($collection);
164        foreach ($valueArray as $v) {
165            $search->addTerm($v);
166        }
167        $terms = $search->execute();
168
169        $result = [];
170        foreach ($valueArray as $v) {
171            $term = $terms[$v] ?? null;
172            $result[$v] = $term ? array_keys($term->getEntityFrequencies()) : [];
173        }
174
175        return $isScalar ? $result[$value] : $result;
176    }
177
178    /**
179     * Returns the backlinks for a given page
180     *
181     * @param string $id The id for which links shall be returned
182     * @param bool $ignore_perms Ignore the fact that pages are hidden or read-protected
183     * @return string[] The pages that contain links to the given page
184     *
185     * @throws IndexUsageException
186     * @author     Andreas Gohr <andi@splitbrain.org>
187     */
188    public function backlinks(string $id, bool $ignore_perms = false): array
189    {
190        $result = $this->lookupKey('relation_references', $id);
191        if (!count($result)) return $result;
192
193        $result = array_flip($result);
194        $result = static::filterPages($result, $ignore_perms);
195        $result = array_keys($result);
196
197        Utf8\Sort::sort($result);
198        return $result;
199    }
200
201    /**
202     * Returns the pages that use a given media file
203     *
204     * @param string $id           The media id to look for
205     * @param bool   $ignore_perms Ignore hidden pages and acls (optional, default: false)
206     * @return string[] A list of pages that use the given media file
207     *
208     * @author     Andreas Gohr <andi@splitbrain.org>
209     */
210    public function mediause(string $id, bool $ignore_perms = false): array
211    {
212        $result = $this->lookupKey('relation_media', $id);
213        if (!count($result)) return $result;
214
215        $result = array_flip($result);
216        $result = static::filterPages($result, $ignore_perms);
217        $result = array_keys($result);
218
219        Utf8\Sort::sort($result);
220        return $result;
221    }
222
223    /**
224     * Filter a list of pages by visibility, existence, permissions, and time range
225     *
226     * @param array $pages pages to filter (keys are page IDs)
227     * @param bool $ignorePerms skip visibility and ACL checks
228     * @param int|string|null $after only keep pages modified after this date
229     * @param int|string|null $before only keep pages modified before this date
230     * @return array filtered pages
231     */
232    public static function filterPages(array $pages, bool $ignorePerms = false, $after = null, $before = null): array
233    {
234        if ($after) $after = is_int($after) ? $after : strtotime($after);
235        if ($before) $before = is_int($before) ? $before : strtotime($before);
236
237        return array_filter($pages, static function ($value, $id) use ($ignorePerms, $after, $before) {
238            if (!$ignorePerms) {
239                if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ) {
240                    return false;
241                }
242            }
243            if (!page_exists($id, '', false)) {
244                return false;
245            }
246            if ($after || $before) {
247                $mTime = filemtime(wikiFN($id));
248                if ($after && $after > $mTime) return false;
249                if ($before && $before < $mTime) return false;
250            }
251            return true;
252        }, ARRAY_FILTER_USE_BOTH);
253    }
254
255    /**
256     * Sort pages based on their namespace level first, then on their string
257     * values. This makes higher hierarchy pages rank higher than lower hierarchy
258     * pages.
259     *
260     * @param string $a
261     * @param string $b
262     * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b,
263     *             and 0 if they are equal.
264     */
265    protected function pagesorter(string $a, string $b): int
266    {
267        $diff = substr_count($a, ':') - substr_count($b, ':');
268        return $diff ?: Utf8\Sort::strcmp($a, $b);
269    }
270}
271