xref: /dokuwiki/inc/Search/Collection/Term.php (revision 6734bb8cef71e8b4af23e627d4db5430304d55a2)
1596d5287SAndreas Gohr<?php
2596d5287SAndreas Gohr
3596d5287SAndreas Gohrnamespace dokuwiki\Search\Collection;
4596d5287SAndreas Gohr
5596d5287SAndreas Gohruse dokuwiki\Search\Exception\SearchException;
6596d5287SAndreas Gohruse dokuwiki\Search\Tokenizer;
7596d5287SAndreas Gohr
/**
 * Represents a term that is searched on a frequency based index
 *
 * A term can contain wildcards and thus may refer to various tokens of different lengths.
 * A wildcard is a '*' at the very start and/or the very end of the term.
 *
 * The object is first created from the raw search term, is then fed with the matching
 * tokens (addTokens()) and the per-entity frequencies (addEntityFrequency()) and can finally
 * have its entity IDs replaced by entity names (resolveEntities()).
 */
class Term
{
    /** The term has neither a leading nor a trailing '*' */
    public const WILDCARD_NONE = 0;
    /** The term starts with a '*' */
    public const WILDCARD_START = 1;
    /** The term ends with a '*' */
    public const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected string $original;

    /** @var string the base of the term without wildcard chars */
    protected string $base;

    /** @var string the quoted term to be used in a regular expression */
    protected string $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected int $length;

    /** @var int The type of wildcards, a bit mask of the WILDCARD_* constants */
    protected int $wildcard;

    /** @var array<int, array<int, string>> The matching tokens for this term, keyed by group then token ID */
    protected array $tokens = [];

    /**
     * @var array<int|string, int> The entity frequencies this term matches (aggregated over all tokens),
     *                             keyed by entity ID or, after resolveEntities(), by entity name
     */
    protected array $frequencies = [];

    /**
     * Analyze the given search term
     *
     * Strips the wildcard characters, prepares the regular expression fragment and
     * remembers which kind of wildcards were used.
     *
     * @param string $term the search term, optionally with a leading and/or trailing '*'
     * @throws SearchException when the term (without wildcards) is empty or too short
     */
    public function __construct(string $term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->wildcard = self::WILDCARD_NONE;
        $this->length = Tokenizer::tokenLength($this->base);

        // a leading wildcard allows anything in front of the base
        if (substr($term, 0, 1) === '*') {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        // a trailing wildcard allows anything after the base
        if (substr($term, -1, 1) === '*') {
            $this->quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }

        // ignore terms that are too short, with an exception on numbers
        // note: the numeric check looks at the original term, so a short number
        // combined with a wildcard (e.g. "1*") is still considered too short
        if ($this->length === 0 || ($this->length < Tokenizer::getMinWordLength() && !is_numeric($term))) {
            throw new SearchException('Too short term');
        }
    }

    /**
     * @return string the term as it was passed to the constructor, including wildcard chars
     */
    public function getOriginal(): string
    {
        return $this->original;
    }

    /**
     * @return string the term with all leading and trailing '*' removed
     */
    public function getBase(): string
    {
        return $this->base;
    }

    /**
     * @return string the regular expression fragment for this term ('.*' stands in for wildcards)
     */
    public function getQuoted(): string
    {
        return $this->quoted;
    }

    /**
     * @return int the length of the base term as reported by the Tokenizer
     */
    public function getLength(): int
    {
        return $this->length;
    }

    /**
     * @return int bit mask of WILDCARD_START and WILDCARD_END, WILDCARD_NONE when no wildcard is used
     */
    public function getWildcard(): int
    {
        return $this->wildcard;
    }

    /**
     * @return array<int|string, int> [entity => frequency, ...]
     */
    public function getEntityFrequencies(): array
    {
        return $this->frequencies;
    }

    /**
     * Set the found tokens for a specific index group
     *
     * Tokens that were stored for the same group before are replaced, other groups are not touched.
     *
     * @param int $group Index group (length for split collections, 0 for non-split)
     * @param array $tokens [tokenID => tokenName, ...]
     * @return void
     * @internal
     */
    public function addTokens(int $group, array $tokens): void
    {
        $this->tokens[$group] = [];
        foreach ($tokens as $tokenID => $tokenName) {
            $this->tokens[$group][$tokenID] = $tokenName;
        }
    }

    /**
     * Return all tokens that match the given term
     *
     * The token names of all groups are combined into one flat list, token IDs are not preserved.
     *
     * @return string[]
     */
    public function getTokens(): array
    {
        // nothing to unpack into array_merge() when no group has been added yet
        if (empty($this->tokens)) return [];
        return array_merge(...array_map('array_values', array_values($this->tokens)));
    }

    /**
     * Return all token IDs for a specific index group
     *
     * @param int $group Index group (length for split collections, 0 for non-split)
     * @return int[] empty when nothing is known for the group
     */
    public function getTokenIDsByGroup(int $group): array
    {
        return isset($this->tokens[$group]) ? array_keys($this->tokens[$group]) : [];
    }

    /**
     * Mathematically add the given frequency to existing frequency for the entityID
     *
     * @param int $entityID
     * @param int $frequency
     * @return void
     * @internal
     */
    public function addEntityFrequency(int $entityID, int $frequency): void
    {
        $this->frequencies[$entityID] = ($this->frequencies[$entityID] ?? 0) + $frequency;
    }

    /**
     * Update the entity frequencies to use actual entity names
     *
     * Entities that have no name in the given map can not be resolved and are dropped.
     * Should several IDs resolve to the same name, their frequencies are summed up.
     *
     * @param array<int, string> $entityMap [entityID => entityName]
     * @return void
     */
    public function resolveEntities(array $entityMap): void
    {
        $resolved = [];
        foreach ($this->frequencies as $eid => $freq) {
            // an unknown ID would otherwise trigger a warning and end up under an empty name
            if (!isset($entityMap[$eid])) continue;

            $name = $entityMap[$eid];
            $resolved[$name] = ($resolved[$name] ?? 0) + $freq;
        }
        $this->frequencies = $resolved;
    }
}
189