<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\SearchException;
use dokuwiki\Search\Tokenizer;

/**
 * Represents a term that is searched on a frequency based index
 *
 * A term can contain wildcards and thus may refer to various tokens of different lengths.
 * The matching tokens and the aggregated entity frequencies are attached to the term
 * after construction via addTokens() and addEntityFrequency().
 */
class Term
{
    /** No wildcard, the term has to match a token exactly */
    const WILDCARD_NONE = 0;
    /** The term started with a '*' */
    const WILDCARD_START = 1;
    /** The term ended with a '*' */
    const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected $original;

    /** @var string the base of the term without wildcard chars FIXME */
    protected $base;

    /** @var string the quoted term to be used in a regular expression */
    protected $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected $length;

    /** @var int The type of wildcards, a combination of the WILDCARD_* constants */
    protected $wildcard;

    /** @var array The matching tokens for this term [length => [tokenID => tokenName, ...], ...] */
    protected $tokens = [];

    /** @var array The entity frequencies this term matches (aggregated over all tokens) [entity => frequency] */
    protected $frequencies = [];

    /**
     * Parse the given term into its base, its regular expression form and its wildcard type
     *
     * @param string $term the search term, optionally starting and/or ending with a '*' wildcard
     * @throws SearchException when the base term is empty, or is shorter than the minimum
     *                         word length while the given term is not numeric
     */
    public function __construct($term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->wildcard = self::WILDCARD_NONE;
        $this->length = Tokenizer::tokenLength($this->base);

        // handle wildcards: each one widens the expression and sets its flag
        if (substr($term, 0, 1) === '*') {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        if (substr($term, -1, 1) === '*') {
            $this->quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }

        // ignore terms that are too short, with an exception on numbers
        if ($this->length === 0 || ($this->length < Tokenizer::getMinWordLength() && !is_numeric($term))) {
            throw new SearchException('Too short term');
        }
    }

    /**
     * @return string the term as it was passed to the constructor, including wildcard chars
     */
    public function getOriginal()
    {
        return $this->original;
    }

    /**
     * @return string the term with leading and trailing wildcard chars removed
     */
    public function getBase()
    {
        return $this->base;
    }

    /**
     * @return string the quoted base term, with '.*' in place of each wildcard
     */
    public function getQuoted()
    {
        return $this->quoted;
    }

    /**
     * @return int the length of the base term (not counting wildcards)
     */
    public function getLength()
    {
        return $this->length;
    }

    /**
     * @return int combination of the WILDCARD_* constants
     */
    public function getWildcard()
    {
        return $this->wildcard;
    }

    /**
     * Return the aggregated frequencies
     *
     * Keys are entity IDs until resolveEntities() has been called, entity names afterwards.
     *
     * @return array [entity => frequency, ...], empty when nothing has been added yet
     */
    public function getEntityFrequencies()
    {
        return $this->frequencies;
    }

    /**
     * Add found tokens IDs of a specific length
     *
     * Tokens previously stored for the same length are replaced.
     *
     * @param int $length
     * @param array $tokens [tokenID => tokenName, ...]
     * @return void
     * @internal
     */
    public function addTokens($length, $tokens)
    {
        $this->tokens[$length] = [];
        foreach ($tokens as $tokenID => $tokenName) {
            $this->tokens[$length][$tokenID] = $tokenName;
        }
    }

    /**
     * Return all tokens that match the given term
     *
     * The token names of all lengths are flattened into one list; token IDs are dropped.
     *
     * @return string[] empty when no tokens have been added yet
     */
    public function getTokens()
    {
        // the leading empty array keeps array_merge() valid when there are no token lists to spread
        return array_merge([], ...array_map('array_values', array_values($this->tokens)));
    }

    /**
     * Return all token IDs of the given length
     *
     * @param int $length
     * @return int[] empty when no tokens of that length are known
     */
    public function getTokenIDsByLength($length)
    {
        return isset($this->tokens[$length]) ? array_keys($this->tokens[$length]) : [];
    }

    /**
     * Mathematically add the given frequency to existing frequency for the entityID
     *
     * @param int $entityID
     * @param int $frequency
     * @return void
     * @internal
     */
    public function addEntityFrequency($entityID, $frequency)
    {
        if (!isset($this->frequencies[$entityID])) {
            $this->frequencies[$entityID] = 0;
        }

        $this->frequencies[$entityID] += $frequency;
    }

    /**
     * Update the entity frequencies to use actual entity names
     *
     * Afterwards the frequencies are keyed by entity name instead of entity ID.
     * Entity IDs that have no entry in the given map are dropped.
     *
     * @param array $entityMap [entityID => entityName]
     * @return void
     */
    public function resolveEntities($entityMap)
    {
        $resolved = [];
        foreach ($this->frequencies as $eid => $freq) {
            // an ID without a name can not be reported to the caller, skip it
            // instead of collecting it under an empty key
            if (!isset($entityMap[$eid])) {
                continue;
            }
            $resolved[$entityMap[$eid]] = $freq;
        }
        $this->frequencies = $resolved;
    }
}