xref: /dokuwiki/inc/Search/Collection/Term.php (revision 1148921de6af6909f19cb5b30b698d0f27d7751e)
1596d5287SAndreas Gohr<?php
2596d5287SAndreas Gohr
3596d5287SAndreas Gohrnamespace dokuwiki\Search\Collection;
4596d5287SAndreas Gohr
5596d5287SAndreas Gohruse dokuwiki\Search\Tokenizer;
6*1148921dSAndreas Gohruse dokuwiki\Utf8;
7596d5287SAndreas Gohr
/**
 * Represents a search term that can match one or more tokens in an index
 *
 * A term can contain wildcards (* at start/end) and thus may refer to various tokens
 * of different lengths. After a CollectionSearch executes, each Term holds the full
 * match detail: which tokens matched on which entities with what frequencies.
 */
class Term
{
    /** No wildcard: a token has to be identical to the base term */
    public const WILDCARD_NONE = 0;
    /** Leading wildcard (*term): a token has to end with the base term */
    public const WILDCARD_START = 1;
    /** Trailing wildcard (term*): a token has to start with the base term */
    public const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected string $original;

    /** @var string the base of the term without wildcard chars */
    protected string $base;

    /** @var string the quoted term to be used in a regular expression */
    protected string $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected int $length;

    /** @var int The type of wildcards: WILDCARD_NONE, or a combination of WILDCARD_START and WILDCARD_END */
    protected int $wildcard;

    /** @var bool Whether to match case-insensitively */
    protected bool $isCaseInsensitive = false;

    /**
     * @var array<string, array<string, int>> Match results: [entityName => [tokenName => freq, ...], ...]
     *
     * Note: PHP stores purely numeric string keys (e.g. "2024") as integers. The accessors
     * getEntityTokens() and getTokens() convert them back to strings.
     */
    protected array $matches = [];

    // region Setup

    /**
     * Parse a search term into its base, regex fragment and wildcard flags
     *
     * All leading and trailing asterisks are stripped from the base. A leading and/or
     * trailing asterisk sets the corresponding WILDCARD_* flag and adds '.*' to the
     * respective side of the quoted (regex) form.
     *
     * @param string $term the term as entered, optionally with * at start and/or end
     */
    public function __construct(string $term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->wildcard = self::WILDCARD_NONE;
        $this->length = Tokenizer::tokenLength($this->base);

        // handle wildcards; the flags are distinct bits, so both may be set at once
        if (str_starts_with($term, '*')) {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        if (str_ends_with($term, '*')) {
            $this->quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }
    }

    /**
     * Enable case-insensitive matching
     *
     * The fulltext token index is already lowercased by the Tokenizer, so this is only
     * needed for metadata/title searches where indexed values preserve case.
     *
     * The base term is lowercased here once; matches() lowercases each token value
     * before comparing. The original and quoted forms are left untouched.
     *
     * @return static
     */
    public function caseInsensitive(): static
    {
        $this->isCaseInsensitive = true;
        $this->base = Utf8\PhpString::strtolower($this->base);
        return $this;
    }

    /**
     * @return string the term exactly as passed to the constructor
     */
    public function getOriginal(): string
    {
        return $this->original;
    }

    /**
     * @return string the term without leading/trailing wildcard chars
     */
    public function getBase(): string
    {
        return $this->base;
    }

    /**
     * @return string the regex-quoted base, with '.*' on each side that had a wildcard
     */
    public function getQuoted(): string
    {
        return $this->quoted;
    }

    /**
     * @return int the token length of the base term as computed by Tokenizer::tokenLength()
     */
    public function getLength(): int
    {
        return $this->length;
    }

    /**
     * @return int WILDCARD_NONE, WILDCARD_START, WILDCARD_END or the sum of START and END
     */
    public function getWildcard(): int
    {
        return $this->wildcard;
    }

    // endregion

    // region Matching

    /**
     * Check if a token value matches this term
     *
     * Uses efficient string functions instead of regex:
     * exact match → ===, wildcards → str_starts_with/str_ends_with/str_contains.
     * When caseInsensitive() is set, the token value is lowercased before comparison.
     *
     * A term consisting only of asterisks has an empty base and both wildcard flags set,
     * so it matches every token value.
     *
     * @param string $tokenValue
     * @return bool
     */
    public function matches(string $tokenValue): bool
    {
        if ($this->isCaseInsensitive) {
            $tokenValue = Utf8\PhpString::strtolower($tokenValue);
        }

        return match ($this->wildcard) {
            self::WILDCARD_NONE => $this->base === $tokenValue,
            self::WILDCARD_END => str_starts_with($tokenValue, $this->base),
            self::WILDCARD_START => str_ends_with($tokenValue, $this->base),
            // both wildcards set: the base may appear anywhere in the token
            default => str_contains($tokenValue, $this->base),
        };
    }

    // endregion

    // region Results (populated by CollectionSearch at the end of execute())

    /**
     * Record that a token matched an entity with a given frequency
     *
     * When called multiple times for the same entity/token pair, frequencies are summed.
     *
     * @param string $entityName
     * @param string $tokenName
     * @param int $frequency
     * @return void
     * @internal Called by CollectionSearch::resolveAndPopulateTerms()
     */
    public function addMatch(string $entityName, string $tokenName, int $frequency): void
    {
        $this->matches[$entityName][$tokenName] =
            ($this->matches[$entityName][$tokenName] ?? 0) + $frequency;
    }

    // endregion

    // region Result accessors

    /**
     * Return the full match detail
     *
     * @return array<string, array<string, int>> [entityName => [tokenName => freq, ...], ...]
     */
    public function getMatches(): array
    {
        return $this->matches;
    }

    /**
     * Return the matching entities and their aggregated frequencies
     *
     * Values are the total frequency across all matching tokens for each entity.
     *
     * @return array<string, int> [entityName => totalFrequency, ...]
     */
    public function getEntityFrequencies(): array
    {
        return array_map('array_sum', $this->matches);
    }

    /**
     * Return the matched token names per entity
     *
     * Token names are always returned as strings, even when they are purely numeric
     * and were therefore stored as integer array keys.
     *
     * @return array<string, string[]> [entityName => [tokenName, ...], ...]
     */
    public function getEntityTokens(): array
    {
        return array_map(
            static fn(array $tokens): array => array_map('strval', array_keys($tokens)),
            $this->matches
        );
    }

    /**
     * Return all unique matched token values
     *
     * Tokens are listed in the order they were first recorded, across all entities.
     *
     * @return string[]
     */
    public function getTokens(): array
    {
        $tokens = [];
        foreach ($this->matches as $entityTokens) {
            // The array union keeps keys as they are. array_merge() must not be used here:
            // it renumbers integer keys, which is what numeric tokens like "2024" become.
            $tokens += $entityTokens;
        }

        return array_map('strval', array_keys($tokens));
    }

    // endregion
}
221