xref: /dokuwiki/inc/Search/Collection/Term.php (revision 6734bb8cef71e8b4af23e627d4db5430304d55a2)
1<?php
2
3namespace dokuwiki\Search\Collection;
4
5use dokuwiki\Search\Exception\SearchException;
6use dokuwiki\Search\Tokenizer;
7
/**
 * Represents a term that is searched on a frequency based index
 *
 * A term can contain wildcards and thus may refer to various tokens of different lengths.
 * A leading and/or trailing '*' in the term is treated as a wildcard. Matching tokens and
 * the aggregated entity frequencies are attached to the term later through addTokens(),
 * addEntityFrequency() and resolveEntities().
 */
class Term
{

    /** no wildcard: the term has to match a token exactly */
    public const WILDCARD_NONE = 0;
    /** the term started with a '*' */
    public const WILDCARD_START = 1;
    /** the term ended with a '*' */
    public const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected string $original;

    /** @var string the base of the term without wildcard chars */
    protected string $base;

    /** @var string the quoted term to be used in a regular expression ('.*' is added for each wildcard) */
    protected string $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected int $length;

    /** @var int The type of wildcards, a combination of the WILDCARD_* constants */
    protected int $wildcard;

    /** @var array<int, array<int, string>> The matching tokens for this term, keyed by group then token ID */
    protected array $tokens = [];

    /**
     * @var array<int|string, int> The entity frequencies this term matches (aggregated over all tokens),
     *                             keyed by entity ID or, after resolveEntities(), by entity name
     */
    protected array $frequencies = [];

    /**
     * Parse the given term into its base, regular expression and wildcard type
     *
     * @param string $term the search term, optionally starting and/or ending with a '*' wildcard
     * @throws SearchException when the base term is empty or shorter than the minimum word length
     */
    public function __construct(string $term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->wildcard = self::WILDCARD_NONE;
        $this->length = Tokenizer::tokenLength($this->base);

        // handle wildcard
        if (substr($term, 0, 1) === '*') {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        if (substr($term, -1, 1) === '*') {
            $this->quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }

        // ignore terms that are too short, with an exception on numbers
        // note: the numeric check runs on the original term, so it never applies to wildcard terms
        if ($this->length === 0 || ($this->length < Tokenizer::getMinWordLength() && !is_numeric($term))) {
            throw new SearchException('Too short term');
        }
    }

    /**
     * @return string the term as it was passed to the constructor, including wildcard chars
     */
    public function getOriginal(): string
    {
        return $this->original;
    }

    /**
     * @return string the term with leading and trailing wildcard chars removed
     */
    public function getBase(): string
    {
        return $this->base;
    }

    /**
     * @return string the quoted base term, wrapped in '.*' where the term had a wildcard
     */
    public function getQuoted(): string
    {
        return $this->quoted;
    }

    /**
     * @return int the length of the base term as reported by the Tokenizer
     */
    public function getLength(): int
    {
        return $this->length;
    }

    /**
     * @return int combination of the WILDCARD_* constants
     */
    public function getWildcard(): int
    {
        return $this->wildcard;
    }

    /**
     * @return array [entity => frequency, ...]
     */
    public function getEntityFrequencies(): array
    {
        return $this->frequencies;
    }

    /**
     * Add found token IDs for a specific index group
     *
     * Tokens that were set for the same group before are replaced.
     *
     * @param int $group Index group (length for split collections, 0 for non-split)
     * @param array $tokens [tokenID => tokenName, ...]
     * @return void
     * @internal
     */
    public function addTokens(int $group, array $tokens): void
    {
        $this->tokens[$group] = $tokens;
    }

    /**
     * Return all tokens that match the given term
     *
     * The token names of all groups are returned as one flat list, token IDs are dropped.
     *
     * @return string[]
     */
    public function getTokens(): array
    {
        if ($this->tokens === []) {
            return [];
        }

        return array_merge(...array_map('array_values', array_values($this->tokens)));
    }

    /**
     * Return all token IDs for a specific index group
     *
     * @param int $group Index group (length for split collections, 0 for non-split)
     * @return int[] empty when nothing was added for the group
     */
    public function getTokenIDsByGroup(int $group): array
    {
        return isset($this->tokens[$group]) ? array_keys($this->tokens[$group]) : [];
    }

    /**
     * Mathematically add the given frequency to existing frequency for the entityID
     *
     * @param int $entityID
     * @param int $frequency
     * @return void
     * @internal
     */
    public function addEntityFrequency(int $entityID, int $frequency): void
    {
        $this->frequencies[$entityID] = ($this->frequencies[$entityID] ?? 0) + $frequency;
    }

    /**
     * Update the entity frequencies to use actual entity names
     *
     * Entities that are missing from the given map can not be resolved and are dropped.
     * When several entity IDs resolve to the same name, their frequencies are summed up.
     *
     * @param array<int, string> $entityMap [entityID => entityName]
     * @return void
     */
    public function resolveEntities(array $entityMap): void
    {
        $resolved = [];
        foreach ($this->frequencies as $eid => $freq) {
            // an unknown ID would otherwise raise a warning and end up under an empty name
            if (!isset($entityMap[$eid])) {
                continue;
            }

            $name = $entityMap[$eid];
            $resolved[$name] = ($resolved[$name] ?? 0) + $freq;
        }
        $this->frequencies = $resolved;
    }
}
189