xref: /dokuwiki/inc/Search/Collection/Term.php (revision 596d5287d7a816d606ef4153ef9e0f4704bf8f73)
1*596d5287SAndreas Gohr<?php
2*596d5287SAndreas Gohr
3*596d5287SAndreas Gohrnamespace dokuwiki\Search\Collection;
4*596d5287SAndreas Gohr
5*596d5287SAndreas Gohruse dokuwiki\Search\Exception\SearchException;
6*596d5287SAndreas Gohruse dokuwiki\Search\Tokenizer;
7*596d5287SAndreas Gohr
/**
 * Represents a term that is searched on a frequency based index
 *
 * A term can contain wildcards and thus may refer to various tokens of different lengths.
 * Leading and trailing asterisks are treated as wildcards; the remaining "base" is what
 * gets measured and quoted for use in a regular expression.
 *
 * Matching tokens and entity frequencies are not looked up by this class itself, they are
 * fed in through addTokens() and addEntityFrequency().
 *
 * @fixme add standalone tests for this class
 */
class Term
{

    /** No wildcard given, the term has to match a token exactly */
    const WILDCARD_NONE = 0;
    /** The term started with an asterisk */
    const WILDCARD_START = 1;
    /** The term ended with an asterisk */
    const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected $original;

    /** @var string the base of the term without wildcard chars FIXME */
    protected $base;

    /** @var string the quoted term to be used in a regular expression */
    protected $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected $length;

    /** @var int The type of wildcards (bitmask of the WILDCARD_* constants) */
    protected $wildcard;

    /** @var array The matching tokens for this term [length => [tokenID => tokenName, ...], ...] */
    protected $tokens = [];

    /** @var array The entity frequencies this term matches (aggregated over all tokens) [entity => frequency] */
    protected $frequencies = [];

    /**
     * Analyze the given search term
     *
     * @param string $term the term as entered, may start and/or end with a * wildcard
     * @throws SearchException when the base term is empty, or shorter than the minimum word length
     *                         and not numeric
     */
    public function __construct($term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->wildcard = self::WILDCARD_NONE;
        $this->length = Tokenizer::tokenLength($this->base);

        // handle wildcards: the flags are independent bits, so both may be set at once
        if (substr($term, 0, 1) === '*') {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        if (substr($term, -1, 1) === '*') {
            $this->quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }

        // a term consisting of wildcards only has nothing to search for
        if ($this->length === 0) {
            throw new SearchException('Too short term');
        }

        // ignore terms that are too short, with an exception on numbers. The check runs on the
        // base so that a wildcard does not stop a number from being recognized as one
        if ($this->length < Tokenizer::getMinWordLength() && !is_numeric($this->base)) {
            throw new SearchException('Too short term');
        }
    }

    /**
     * @return string the term exactly as passed to the constructor
     */
    public function getOriginal()
    {
        return $this->original;
    }

    /**
     * @return string the term with leading and trailing asterisks removed
     */
    public function getBase()
    {
        return $this->base;
    }

    /**
     * @return string the regex-quoted base, with .* added wherever a wildcard was given
     */
    public function getQuoted()
    {
        return $this->quoted;
    }

    /**
     * @return int the length of the base term (not counting wildcards)
     */
    public function getLength()
    {
        return $this->length;
    }

    /**
     * @return int bitmask of the WILDCARD_* constants
     */
    public function getWildcard()
    {
        return $this->wildcard;
    }

    /**
     * Return the frequencies collected so far
     *
     * Keys are entity IDs until resolveEntities() has been called, entity names afterwards.
     *
     * @return array [entity => frequency, ...]
     */
    public function getEntityFrequencies()
    {
        return $this->frequencies;
    }

    /**
     * Add found tokens IDs of a specific length
     *
     * Tokens stored earlier for the same length are replaced, not extended.
     *
     * @param int $length
     * @param array $tokens [tokenID => tokenName, ...]
     * @return void
     * @internal
     */
    public function addTokens($length, $tokens)
    {
        $this->tokens[$length] = [];
        foreach ($tokens as $tokenID => $tokenName) {
            $this->tokens[$length][$tokenID] = $tokenName;
        }
    }

    /**
     * Return all tokens that match the given term
     *
     * The token names of all lengths are flattened into a single list, token IDs are dropped.
     *
     * @return string[] empty when no tokens have been added
     */
    public function getTokens()
    {
        $perLength = array_map('array_values', array_values($this->tokens));

        // the leading empty array keeps array_merge() valid when there is nothing to unpack
        return array_merge([], ...$perLength);
    }

    /**
     * Return all token IDs of the given length
     *
     * @param int $length
     * @return int[] empty when no tokens of that length are known
     */
    public function getTokenIDsByLength($length)
    {
        return isset($this->tokens[$length]) ? array_keys($this->tokens[$length]) : [];
    }

    /**
     * Mathematically add the given frequency to existing frequency for the entityID
     *
     * @param int $entityID
     * @param int $frequency
     * @return void
     * @internal
     */
    public function addEntityFrequency($entityID, $frequency)
    {
        if (!isset($this->frequencies[$entityID])) {
            $this->frequencies[$entityID] = 0;
        }

        $this->frequencies[$entityID] += $frequency;
    }

    /**
     * Update the entity frequencies to use actual entity names
     *
     * Entity IDs that are missing from the map are dropped. When several IDs resolve to the
     * same name their frequencies are summed up.
     *
     * @param array $entityMap [entityID => entityName]
     * @return void
     */
    public function resolveEntities($entityMap)
    {
        $resolved = [];
        foreach ($this->frequencies as $eid => $freq) {
            if (!isset($entityMap[$eid])) {
                // no name known for this ID, nothing sensible to report it under
                continue;
            }

            $name = $entityMap[$eid];
            if (!isset($resolved[$name])) {
                $resolved[$name] = 0;
            }
            $resolved[$name] += $freq;
        }
        $this->frequencies = $resolved;
    }
}
187