xref: /dokuwiki/inc/Search/Collection/Term.php (revision e05998d5d6388950e9732477c1bca8f3aff6f193)
1596d5287SAndreas Gohr<?php
2596d5287SAndreas Gohr
3596d5287SAndreas Gohrnamespace dokuwiki\Search\Collection;
4596d5287SAndreas Gohr
5596d5287SAndreas Gohruse dokuwiki\Search\Exception\SearchException;
6596d5287SAndreas Gohruse dokuwiki\Search\Tokenizer;
7596d5287SAndreas Gohr
/**
 * Represents a term that is searched on a frequency based index
 *
 * A term can contain wildcards and thus may refer to various tokens of different lengths.
 * A leading and/or trailing asterisk in the given term is treated as a wildcard.
 *
 * The object starts out with no tokens and no frequencies. Those are filled in later through
 * addTokens() and addEntityFrequency(), and the frequencies can finally be re-keyed from
 * entity IDs to entity names via resolveEntities().
 */
class Term
{

    /** The term has no wildcards */
    const WILDCARD_NONE = 0;
    /** The term starts with a wildcard */
    const WILDCARD_START = 1;
    /** The term ends with a wildcard */
    const WILDCARD_END = 2;
    /** The term starts and ends with a wildcard (both flags combined) */
    const WILDCARD_BOTH = self::WILDCARD_START | self::WILDCARD_END;

    /** @var string the original term including wildcard chars */
    protected $original;

    /** @var string the base of the term without wildcard chars FIXME */
    protected $base;

    /** @var string the quoted term to be used in a regular expression */
    protected $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected $length;

    /** @var int The type of wildcards (combination of the WILDCARD_* flags) */
    protected $wildcard;

    /** @var array The matching tokens for this term [length => [tokenID => tokenName, ...], ...] */
    protected $tokens = [];

    /** @var array The entity frequencies this term matches (aggregated over all tokens) [entity => frequency] */
    protected $frequencies = [];

    /**
     * Parse the given term into its base, its regular expression fragment and its wildcard type
     *
     * @param string $term the search term, optionally prefixed and/or suffixed with an asterisk
     * @throws SearchException when the base term is empty or too short
     */
    public function __construct($term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->wildcard = self::WILDCARD_NONE;
        $this->length = Tokenizer::tokenLength($this->base);

        // a leading asterisk lets anything precede the base
        if (substr($term, 0, 1) === '*') {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        // a trailing asterisk lets anything follow the base
        if (substr($term, -1, 1) === '*') {
            $this->quoted = $this->quoted . '.*';
            $this->wildcard |= self::WILDCARD_END;
        }

        // ignore terms that are too short, with an exception on numbers
        // NOTE(review): the numeric exception checks the original term, so a short numeric
        // base combined with a wildcard (e.g. "1*") is still rejected - confirm this is intended
        if ($this->length === 0 || ($this->length < Tokenizer::getMinWordLength() && !is_numeric($term))) {
            throw new SearchException('Too short term');
        }
    }

    /**
     * @return string the term exactly as it was passed to the constructor
     */
    public function getOriginal()
    {
        return $this->original;
    }

    /**
     * @return string the term with leading and trailing asterisks removed
     */
    public function getBase()
    {
        return $this->base;
    }

    /**
     * @return string the quoted base, wrapped in '.*' on each side that had a wildcard
     */
    public function getQuoted()
    {
        return $this->quoted;
    }

    /**
     * @return int the length of the base as reported by Tokenizer::tokenLength()
     */
    public function getLength()
    {
        return $this->length;
    }

    /**
     * @return int one of the WILDCARD_* constants
     */
    public function getWildcard()
    {
        return $this->wildcard;
    }

    /**
     * Return the aggregated frequencies
     *
     * Keys are entity IDs until resolveEntities() has been called, entity names afterwards.
     *
     * @return array [entity => frequency, ...] empty when nothing has been added yet
     */
    public function getEntityFrequencies()
    {
        return $this->frequencies;
    }

    /**
     * Add found tokens IDs of a specific length
     *
     * Any tokens previously stored for the same length are replaced.
     *
     * @param int $length
     * @param array $tokens [tokenID => tokenName, ...]
     * @return void
     * @internal
     */
    public function addTokens($length, $tokens)
    {
        $this->tokens[$length] = [];
        foreach ($tokens as $tokenID => $tokenName) {
            $this->tokens[$length][$tokenID] = $tokenName;
        }
    }

    /**
     * Return all tokens that match the given term
     *
     * The token names of all lengths are flattened into a single list, token IDs are dropped.
     *
     * @return string[] empty when no tokens have been added
     */
    public function getTokens()
    {
        $perLength = array_map('array_values', array_values($this->tokens));

        // the leading empty array keeps array_merge() valid when there is nothing to spread
        return array_merge([], ...$perLength);
    }

    /**
     * Return all token IDs of the given length
     *
     * @param int $length
     * @return int[] empty when no tokens of that length are known
     */
    public function getTokenIDsByLength($length)
    {
        return isset($this->tokens[$length]) ? array_keys($this->tokens[$length]) : [];
    }

    /**
     * Mathematically add the given frequency to existing frequency for the entityID
     *
     * @param int $entityID
     * @param int $frequency
     * @return void
     * @internal
     */
    public function addEntityFrequency($entityID, $frequency)
    {
        if (!isset($this->frequencies[$entityID])) {
            $this->frequencies[$entityID] = 0;
        }

        $this->frequencies[$entityID] += $frequency;
    }

    /**
     * Update the entity frequencies to use actual entity names
     *
     * Entity IDs that have no entry in the given map are dropped from the result, because
     * there is no name to file their frequency under.
     *
     * @param array $entityMap [entityID => entityName]
     * @return void
     */
    public function resolveEntities($entityMap)
    {
        $resolved = [];
        foreach ($this->frequencies as $eid => $freq) {
            if (!isset($entityMap[$eid])) {
                continue;
            }
            $resolved[$entityMap[$eid]] = $freq;
        }
        $this->frequencies = $resolved;
    }
}
185