xref: /dokuwiki/inc/Search/MetadataSearch.php (revision 9369b4a991666bc911474806b106d8958e79f4c1)
1fe2d1da1SSatoshi Sahara<?php
246b83514SSatoshi Sahara
3fe2d1da1SSatoshi Saharanamespace dokuwiki\Search;
4fe2d1da1SSatoshi Sahara
5fe2d1da1SSatoshi Saharause dokuwiki\Extension\Event;
66734bb8cSAndreas Gohruse dokuwiki\Search\Collection\CollectionSearch;
76734bb8cSAndreas Gohruse dokuwiki\Search\Collection\PageMetaCollection;
86734bb8cSAndreas Gohruse dokuwiki\Search\Collection\PageTitleCollection;
90b1bbbbbSAndreas Gohruse dokuwiki\Search\Query\QueryParser;
10*9369b4a9SAndreas Gohruse dokuwiki\Utf8\Sort;
11fe2d1da1SSatoshi Sahara
/**
 * Class DokuWiki Metadata Search
 *
 * Provides search operations on metadata indexes using the Collection/Index architecture.
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
class MetadataSearch
{
    /**
     * Quicksearch for pagenames
     *
     * By default it only matches the pagename and ignores the namespace.
     * This can be changed with the second parameter.
     * The third parameter allows to search in titles as well.
     *
     * The function always returns titles as well. The actual lookup is done by
     * pageLookupCallBack(), which is registered as the default action of the
     * SEARCH_QUERY_PAGELOOKUP event.
     *
     * @triggers SEARCH_QUERY_PAGELOOKUP
     * @param string $id search string; namespace restrictions recognised by the QueryParser are honoured
     * @param bool $in_ns match against namespace as well?
     * @param bool $in_title search in title?
     * @param int|string|null $after only show results with mtime after this date,
     *                             accepts timestamp or strtotime arguments
     * @param int|string|null $before only show results with mtime before this date,
     *                             accepts timestamp or strtotime arguments
     *
     * @return string[] page id => title
     * @author   Andreas Gohr <andi@splitbrain.org>
     * @author   Adrian Lang <lang@cosmocode.de>
     *
     */
    public function pageLookup(
        string $id,
        bool $in_ns = false,
        bool $in_title = false,
        int|string|null $after = null,
        int|string|null $before = null
    ): array {
        $data = [
            'id' => $id,
            'in_ns' => $in_ns,
            'in_title' => $in_title,
            'after' => $after,
            'before' => $before,
            'has_titles' => true, // for plugin backward compatibility check
        ];
        return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, $this->pageLookupCallBack(...));
    }

    /**
     * Returns list of pages as array(pageid => First Heading)
     *
     * Pages are matched by name first; when $data['in_title'] is set, pages whose
     * indexed title contains the query are added as well. The combined list is then
     * run through filterPages() and sorted with pagesorter().
     *
     * @param array $data event data (keys: id, in_ns, in_title, after, before)
     * @return string[] page id => title
     */
    public function pageLookupCallBack(array $data): array
    {
        $parsedQuery = (new QueryParser())->convert($data['id']);
        $ns = $parsedQuery['ns'] ? cleanID($parsedQuery['ns'][0]) . ':' : null;
        $notns = $parsedQuery['notns'] ? cleanID($parsedQuery['notns'][0]) . ':' : null;
        // with a namespace restriction only the remaining words are searched for
        $query = ($ns || $notns) ? implode(' ', $parsedQuery['highlight']) : $data['id'];
        $cleaned = cleanID($query);

        if ($cleaned === '') return [];

        // The namespace restriction applies to name matches and title matches alike.
        // $page is left untyped on purpose: array keys that look like integers arrive as int.
        $inScope = static function ($page) use ($ns, $notns): bool {
            if ($ns && !str_starts_with($page, $ns)) return false;
            if ($notns && str_starts_with($page, $notns)) return false;
            return true;
        };

        // find pages matching by page name
        $pages = [];
        foreach ($this->getPages() as $page) {
            if (!$inScope($page)) continue;

            $match = $data['in_ns'] ? $page : noNSorNS($page);
            if (str_contains($match, $cleaned)) {
                $pages[$page] = p_get_first_heading($page, METADATA_DONT_RENDER);
            }
        }

        // additionally find pages matching by title
        if ($data['in_title']) {
            $search = new CollectionSearch(new PageTitleCollection());
            $search->caseInsensitive();
            $search->addTerm('*' . $query . '*');
            $terms = $search->execute();
            $term = reset($terms); // only one term was added
            if ($term) {
                foreach ($term->getEntityTokens() as $page => $titles) {
                    if (!$inScope($page)) continue;

                    // keep the heading of a name match; fill in unset or null entries only
                    $pages[$page] ??= $titles[0];
                }
            }
        }

        $pages = static::filterPages($pages, false, $data['after'], $data['before']);
        uksort($pages, $this->pagesorter(...));
        return $pages;
    }

    /**
     * Return a list of all indexed pages, optionally limited to those that have a specific metadata key
     *
     * When a key is given, only pages that have any value stored for that metadata key are returned.
     * This does not filter by the metadata value itself.
     *
     * @param string|null $key metadata key name, or null for all pages
     * @return string[] list of page names
     */
    public function getPages(?string $key = null): array
    {
        if ($key === null) {
            return (new Indexer())->getAllPages();
        }

        // titles live in their own collection, every other key is a generic metadata collection
        $collection = ($key === 'title') ? new PageTitleCollection() : new PageMetaCollection($key);
        return $collection->getEntitiesWithData();
    }

    /**
     * Find pages containing a metadata value
     *
     * Values are compared as case-sensitive strings. Wildcard matching with * at
     * the start and/or end is supported (e.g. '*foo', 'bar*', '*baz*').
     *
     * When $value is a string, the result is a flat list of matching page names.
     * When $value is an array, each value is searched independently and the result
     * is an associative array keyed by the search values, each containing a list
     * of matching page names.
     *
     * $value is declared by reference but is never written to by this method.
     *
     * @param string $key name of the metadata key to look for
     * @param string|string[] $value search term or array of search terms
     * @return array flat list of page names (scalar $value) or [value => [pageName, ...]] (array $value)
     *
     * @author Michael Hamann <michael@content-space.de>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function lookupKey(string $key, string|array &$value): array
    {
        $isScalar = !is_array($value);
        $needles = $isScalar ? [$value] : $value;

        $collection = ($key === 'title') ? new PageTitleCollection() : new PageMetaCollection($key);

        // all terms are resolved in a single search run
        $search = new CollectionSearch($collection);
        foreach ($needles as $needle) {
            $search->addTerm($needle);
        }
        $terms = $search->execute();

        // every requested value gets an entry, even when nothing matched
        $result = [];
        foreach ($needles as $needle) {
            $term = $terms[$needle] ?? null;
            $result[$needle] = $term ? array_keys($term->getEntityFrequencies()) : [];
        }

        return $isScalar ? $result[$value] : $result;
    }

    /**
     * Returns the backlinks for a given page
     *
     * @param string $id The id for which links shall be returned
     * @param bool $ignore_perms Ignore the fact that pages are hidden or read-protected
     * @return string[] The pages that contain links to the given page
     *
     * @author     Andreas Gohr <andi@splitbrain.org>
     */
    public function backlinks(string $id, bool $ignore_perms = false): array
    {
        return $this->relationLookup('relation_references', $id, $ignore_perms);
    }

    /**
     * Returns the pages that use a given media file
     *
     * @param string $id The media id to look for
     * @param bool $ignore_perms Ignore hidden pages and acls (optional, default: false)
     * @return string[] A list of pages that use the given media file
     *
     * @author     Andreas Gohr <andi@splitbrain.org>
     */
    public function mediause(string $id, bool $ignore_perms = false): array
    {
        return $this->relationLookup('relation_media', $id, $ignore_perms);
    }

    /**
     * Shared implementation of backlinks() and mediause()
     *
     * Looks up the pages that store $id under the given metadata key, runs them
     * through filterPages() and returns the remaining page ids sorted.
     *
     * @param string $relation metadata key to search in
     * @param string $id page or media id to look for
     * @param bool $ignore_perms skip visibility and ACL checks
     * @return string[] sorted list of page ids
     */
    private function relationLookup(string $relation, string $id, bool $ignore_perms): array
    {
        $pages = $this->lookupKey($relation, $id);
        if ($pages === []) return $pages;

        // filterPages() expects the page ids as array keys
        $pages = array_keys(static::filterPages(array_flip($pages), $ignore_perms));

        Sort::sort($pages);
        return $pages;
    }

    /**
     * Filter a list of pages by visibility, existence, permissions, and time range
     *
     * String bounds are converted with strtotime(). A string that strtotime() cannot
     * parse results in false, which means that bound is not applied.
     *
     * @param array $pages pages to filter (keys are page IDs)
     * @param bool $ignorePerms skip visibility and ACL checks
     * @param int|string|null $after only keep pages modified after this date
     * @param int|string|null $before only keep pages modified before this date
     * @return array filtered pages, keys and values preserved
     */
    public static function filterPages(
        array $pages,
        bool $ignorePerms = false,
        int|string|null $after = null,
        int|string|null $before = null
    ): array {
        // normalise both bounds to unix timestamps; empty values mean "no bound"
        if ($after) $after = is_int($after) ? $after : strtotime($after);
        if ($before) $before = is_int($before) ? $before : strtotime($before);

        return array_filter($pages, static function ($value, $id) use ($ignorePerms, $after, $before) {
            if (!$ignorePerms) {
                if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ) {
                    return false;
                }
            }
            // indexed pages may have been deleted in the meantime
            if (!page_exists($id, '', false)) {
                return false;
            }
            // only touch the file system when a time bound is actually set
            if ($after || $before) {
                $mTime = filemtime(wikiFN($id));
                if ($after && $after > $mTime) return false;
                if ($before && $before < $mTime) return false;
            }
            return true;
        }, ARRAY_FILTER_USE_BOTH);
    }

    /**
     * Sort pages based on their namespace level first, then on their string
     * values. This makes higher hierarchy pages rank higher than lower hierarchy
     * pages.
     *
     * @param string $a
     * @param string $b
     * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b,
     *             and 0 if they are equal.
     */
    protected function pagesorter(string $a, string $b): int
    {
        // fewer colons means a shallower namespace, which sorts first
        $diff = substr_count($a, ':') - substr_count($b, ':');
        return $diff ?: Sort::strcmp($a, $b);
    }
}
273