xref: /dokuwiki/inc/Search/Collection/Term.php (revision 1148921de6af6909f19cb5b30b698d0f27d7751e)
1<?php
2
3namespace dokuwiki\Search\Collection;
4
5use dokuwiki\Search\Tokenizer;
6use dokuwiki\Utf8;
7
8/**
9 * Represents a search term that can match one or more tokens in an index
10 *
11 * A term can contain wildcards (* at start/end) and thus may refer to various tokens
12 * of different lengths. After a CollectionSearch executes, each Term holds the full
13 * match detail: which tokens matched on which entities with what frequencies.
14 */
class Term
{

    /** No wildcard: the token has to equal the base term */
    public const WILDCARD_NONE = 0;
    /** Leading wildcard (*term): the token has to end with the base term */
    public const WILDCARD_START = 1;
    /** Trailing wildcard (term*): the token has to start with the base term */
    public const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected string $original;

    /** @var string the base of the term without wildcard chars */
    protected string $base;

    /** @var string the quoted term to be used in a regular expression */
    protected string $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected int $length;

    /** @var int The type of wildcards (combination of the WILDCARD_* flags) */
    protected int $wildcard;

    /** @var bool Whether to match case-insensitively */
    protected bool $isCaseInsensitive = false;

    /** @var array<string, array<string, int>> Match results: [entityName => [tokenName => freq, ...], ...] */
    protected array $matches = [];

    // region Setup

    /**
     * Parse a raw search term into its base, regex fragment and wildcard flags
     *
     * All asterisks at either end of the term are stripped to form the base. A leading
     * asterisk sets WILDCARD_START, a trailing one sets WILDCARD_END; a term with both
     * carries both flags (START | END). The regex fragment is the quoted base with '.*'
     * added on each side that has a wildcard.
     *
     * @param string $term the term as entered, optionally with * at the start and/or end
     */
    public function __construct(string $term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');

        // preg_quote_cb() and Tokenizer::tokenLength() are defined elsewhere in the project
        $quoted = preg_quote_cb($this->base);
        $this->length = Tokenizer::tokenLength($this->base);

        $wildcard = self::WILDCARD_NONE;

        if (str_starts_with($term, '*')) {
            $quoted = '.*' . $quoted;
            $wildcard |= self::WILDCARD_START;
        }

        if (str_ends_with($term, '*')) {
            $quoted .= '.*';
            $wildcard |= self::WILDCARD_END;
        }

        $this->quoted = $quoted;
        $this->wildcard = $wildcard;
    }

    /**
     * Enable case-insensitive matching
     *
     * The fulltext token index is already lowercased by the Tokenizer, so this is only
     * needed for metadata/title searches where indexed values preserve case.
     *
     * Only the base term is lowercased here; the quoted regex fragment and the length
     * keep the values computed by the constructor.
     *
     * @return static this term, to allow chaining
     */
    public function caseInsensitive(): static
    {
        $this->isCaseInsensitive = true;
        $this->base = Utf8\PhpString::strtolower($this->base);
        return $this;
    }

    /**
     * @return string the term exactly as passed to the constructor
     */
    public function getOriginal(): string
    {
        return $this->original;
    }

    /**
     * @return string the term without surrounding wildcard chars (lowercased after caseInsensitive())
     */
    public function getBase(): string
    {
        return $this->base;
    }

    /**
     * @return string the quoted base, wrapped in '.*' where the term had a wildcard
     */
    public function getQuoted(): string
    {
        return $this->quoted;
    }

    /**
     * @return int the token length of the base term as computed by the Tokenizer
     */
    public function getLength(): int
    {
        return $this->length;
    }

    /**
     * @return int WILDCARD_NONE, WILDCARD_START, WILDCARD_END or START and END combined
     */
    public function getWildcard(): int
    {
        return $this->wildcard;
    }

    // endregion

    // region Matching

    /**
     * Check if a token value matches this term
     *
     * Uses efficient string functions instead of regex:
     * exact match → ===, wildcards → str_starts_with/str_ends_with/str_contains.
     * When caseInsensitive() is set, the token value is lowercased before comparison.
     *
     * @param string $tokenValue
     * @return bool
     */
    public function matches(string $tokenValue): bool
    {
        if ($this->isCaseInsensitive) {
            $tokenValue = Utf8\PhpString::strtolower($tokenValue);
        }

        return match ($this->wildcard) {
            self::WILDCARD_NONE => $this->base === $tokenValue,
            self::WILDCARD_END => str_starts_with($tokenValue, $this->base),
            self::WILDCARD_START => str_ends_with($tokenValue, $this->base),
            // both wildcards set: the base may appear anywhere in the token
            default => str_contains($tokenValue, $this->base),
        };
    }

    // endregion

    // region Results (populated by CollectionSearch at the end of execute())

    /**
     * Record that a token matched an entity with a given frequency
     *
     * When called multiple times for the same entity/token pair, frequencies are summed.
     *
     * @param string $entityName
     * @param string $tokenName
     * @param int $frequency
     * @return void
     * @internal Called by CollectionSearch::resolveAndPopulateTerms()
     */
    public function addMatch(string $entityName, string $tokenName, int $frequency): void
    {
        $this->matches[$entityName][$tokenName] =
            ($this->matches[$entityName][$tokenName] ?? 0) + $frequency;
    }

    // endregion

    // region Result accessors

    /**
     * Return the full match detail
     *
     * Note that PHP stores integer-like string keys (e.g. a token "2024") as integers.
     *
     * @return array<string, array<string, int>> [entityName => [tokenName => freq, ...], ...]
     */
    public function getMatches(): array
    {
        return $this->matches;
    }

    /**
     * Return the matching entities and their aggregated frequencies
     *
     * Values are the total frequency across all matching tokens for each entity.
     *
     * @return array<string, int> [entityName => totalFrequency, ...]
     */
    public function getEntityFrequencies(): array
    {
        return array_map('array_sum', $this->matches);
    }

    /**
     * Return the matched token names per entity
     *
     * @return array<string, string[]> [entityName => [tokenName, ...], ...]
     */
    public function getEntityTokens(): array
    {
        return array_map(
            // integer-like tokens are stored as int keys, cast them back to strings
            static fn(array $tokens): array => array_map('strval', array_keys($tokens)),
            $this->matches
        );
    }

    /**
     * Return all unique matched token values
     *
     * Tokens are listed in the order they were first seen across all entities.
     *
     * @return string[]
     */
    public function getTokens(): array
    {
        $unique = [];
        foreach ($this->matches as $tokens) {
            // array union keeps keys untouched; array_merge() would renumber integer-like
            // tokens (e.g. "2024") and thereby lose them
            $unique += $tokens;
        }

        return array_map('strval', array_keys($unique));
    }

    // endregion
}
221