xref: /dokuwiki/inc/Search/Collection/Term.php (revision 9369b4a991666bc911474806b106d8958e79f4c1) !
1<?php
2
3namespace dokuwiki\Search\Collection;
4
5use dokuwiki\Utf8\PhpString;
6use dokuwiki\Search\Tokenizer;
7use dokuwiki\Utf8;
8
9/**
10 * Represents a search term that can match one or more tokens in an index
11 *
12 * A term can contain wildcards (* at start/end) and thus may refer to various tokens
13 * of different lengths. After a CollectionSearch executes, each Term holds the full
14 * match detail: which tokens matched on which entities with what frequencies.
15 */
class Term
{
    /** No wildcard: the token has to equal the base term */
    public const WILDCARD_NONE = 0;
    /** Leading wildcard (*term): the token has to end with the base term */
    public const WILDCARD_START = 1;
    /** Trailing wildcard (term*): the token has to start with the base term */
    public const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected string $original;

    /** @var string the base of the term without wildcard chars */
    protected string $base;

    /** @var string the quoted term to be used in a regular expression */
    protected string $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected int $length;

    /** @var int Bit mask of the WILDCARD_* constants; START and END may be combined */
    protected int $wildcard = self::WILDCARD_NONE;

    /** @var bool Whether to match case-insensitively */
    protected bool $isCaseInsensitive = false;

    /**
     * Match results: [entityName => [tokenName => freq, ...], ...]
     *
     * PHP stores numeric string keys (e.g. "2024") as integers, so entity and token
     * names consisting only of digits are held as int keys in this array.
     *
     * @var array<int|string, array<int|string, int>>
     */
    protected array $matches = [];

    // region Setup

    /**
     * Parse a search term into its base, regex fragment, length and wildcard type
     *
     * All leading and trailing asterisks are stripped to form the base. A term that
     * consists only of asterisks results in an empty base with both wildcard flags
     * set, which matches every token.
     *
     * @param string $term the term as given by the user, optionally with * at start and/or end
     */
    public function __construct(string $term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->length = Tokenizer::tokenLength($this->base);

        // the flags are distinct bits, so they are combined with a bitwise OR
        if (str_starts_with($term, '*')) {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        if (str_ends_with($term, '*')) {
            $this->quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }
    }

    /**
     * Enable case-insensitive matching
     *
     * The fulltext token index is already lowercased by the Tokenizer, so this is only
     * needed for metadata/title searches where indexed values preserve case.
     *
     * Only the base term is lowercased here; the quoted regular expression fragment
     * returned by getQuoted() keeps the case it was constructed with.
     *
     * @return static this term, to allow method chaining
     */
    public function caseInsensitive(): static
    {
        $this->isCaseInsensitive = true;
        $this->base = PhpString::strtolower($this->base);
        return $this;
    }

    /**
     * @return string the term exactly as passed to the constructor, including wildcard chars
     */
    public function getOriginal(): string
    {
        return $this->original;
    }

    /**
     * @return string the term without wildcard chars (lowercased once caseInsensitive() was called)
     */
    public function getBase(): string
    {
        return $this->base;
    }

    /**
     * @return string the regex-quoted base, wrapped in .* on each side that carries a wildcard
     */
    public function getQuoted(): string
    {
        return $this->quoted;
    }

    /**
     * @return int the length of the base term as computed by Tokenizer::tokenLength()
     */
    public function getLength(): int
    {
        return $this->length;
    }

    /**
     * @return int bit mask built from the WILDCARD_* constants
     */
    public function getWildcard(): int
    {
        return $this->wildcard;
    }

    // endregion

    // region Matching

    /**
     * Check if a token value matches this term
     *
     * Uses efficient string functions instead of regex:
     * exact match → ===, wildcards → str_starts_with/str_ends_with/str_contains.
     * When caseInsensitive() is set, the token value is lowercased before comparison.
     *
     * @param string $tokenValue the indexed value to test
     * @return bool true if the value satisfies the term and its wildcards
     */
    public function matches(string $tokenValue): bool
    {
        if ($this->isCaseInsensitive) {
            $tokenValue = PhpString::strtolower($tokenValue);
        }

        return match ($this->wildcard) {
            self::WILDCARD_NONE => $this->base === $tokenValue,
            self::WILDCARD_END => str_starts_with($tokenValue, $this->base),
            self::WILDCARD_START => str_ends_with($tokenValue, $this->base),
            // both wildcards set: the base may appear anywhere in the token
            default => str_contains($tokenValue, $this->base),
        };
    }

    // endregion

    // region Results (populated by CollectionSearch at the end of execute())

    /**
     * Record that a token matched an entity with a given frequency
     *
     * When called multiple times for the same entity/token pair, frequencies are summed.
     *
     * @param string $entityName
     * @param string $tokenName
     * @param int $frequency
     * @return void
     * @internal Called by CollectionSearch::resolveAndPopulateTerms()
     */
    public function addMatch(string $entityName, string $tokenName, int $frequency): void
    {
        $this->matches[$entityName][$tokenName] =
            ($this->matches[$entityName][$tokenName] ?? 0) + $frequency;
    }

    // endregion

    // region Result accessors

    /**
     * Return the full match detail
     *
     * Names that consist only of digits appear as integer keys because of PHP's
     * array key casting.
     *
     * @return array<int|string, array<int|string, int>> [entityName => [tokenName => freq, ...], ...]
     */
    public function getMatches(): array
    {
        return $this->matches;
    }

    /**
     * Return the matching entities and their aggregated frequencies
     *
     * Values are the total frequency across all matching tokens for each entity.
     *
     * @return array<int|string, int> [entityName => totalFrequency, ...]
     */
    public function getEntityFrequencies(): array
    {
        return array_map(array_sum(...), $this->matches);
    }

    /**
     * Return the matched token names per entity
     *
     * Token names are always returned as strings, even when they are numeric.
     *
     * @return array<int|string, string[]> [entityName => [tokenName, ...], ...]
     */
    public function getEntityTokens(): array
    {
        return array_map(
            // array_keys() yields ints for numeric token names, cast them back to string
            static fn (array $tokenFrequencies): array => array_map(
                static fn (int|string $tokenName): string => (string) $tokenName,
                array_keys($tokenFrequencies)
            ),
            $this->matches
        );
    }

    /**
     * Return all unique matched token values
     *
     * Tokens are listed in the order they were first recorded and are always
     * returned as strings, even when they are numeric.
     *
     * @return string[]
     */
    public function getTokens(): array
    {
        $unique = [];
        foreach ($this->matches as $tokenFrequencies) {
            // array union keeps the keys; array_merge() would renumber numeric token
            // names (stored as integer keys) and thereby lose the actual token
            $unique += $tokenFrequencies;
        }

        return array_map(
            static fn (int|string $tokenName): string => (string) $tokenName,
            array_keys($unique)
        );
    }

    // endregion
}
220