<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Exception\SearchException;
use dokuwiki\Search\Tokenizer;

/**
 * Represents a term that is searched on a frequency based index
 *
 * A term can contain wildcards and thus may refer to various tokens of different lengths.
 */
class Term
{
    /** The term has no wildcard */
    public const WILDCARD_NONE = 0;
    /** The term starts with a wildcard ("*foo") */
    public const WILDCARD_START = 1;
    /** The term ends with a wildcard ("foo*") */
    public const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected string $original;

    /** @var string the base of the term without wildcard chars */
    protected string $base;

    /** @var string the quoted term to be used in a regular expression */
    protected string $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected int $length;

    /** @var int The type of wildcards (combination of the WILDCARD_* constants) */
    protected int $wildcard;

    /** @var array<int, array<int, string>> The matching tokens for this term, keyed by group then token ID */
    protected array $tokens = [];

    /**
     * @var array<int|string, int> The entity frequencies this term matches (aggregated over all tokens),
     *                             keyed by entity ID or name
     */
    protected array $frequencies = [];

    /**
     * Parse the given search term into its base, wildcard type and regular expression fragment
     *
     * @param string $term the term as entered, optionally starting and/or ending with '*'
     * @throws SearchException when the base term is empty, or shorter than the minimum word length
     *                         and the term is not numeric
     */
    public function __construct(string $term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->wildcard = self::WILDCARD_NONE;
        $this->length = Tokenizer::tokenLength($this->base);

        // a leading asterisk lets anything precede the base
        if (substr($term, 0, 1) === '*') {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        // a trailing asterisk lets anything follow the base
        if (substr($term, -1, 1) === '*') {
            $this->quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }

        // ignore terms that are too short, with an exception on numbers
        // NOTE(review): the numeric check runs on the original term, so a term with wildcards
        // (e.g. "1*") never qualifies for the exception - confirm this is intended
        if ($this->length === 0 || ($this->length < Tokenizer::getMinWordLength() && !is_numeric($term))) {
            throw new SearchException('Too short term');
        }
    }

    /**
     * @return string the term exactly as passed to the constructor
     */
    public function getOriginal(): string
    {
        return $this->original;
    }

    /**
     * @return string the term with leading and trailing '*' removed
     */
    public function getBase(): string
    {
        return $this->base;
    }

    /**
     * @return string the quoted base, with '.*' added on each side that had a wildcard
     */
    public function getQuoted(): string
    {
        return $this->quoted;
    }

    /**
     * @return int the length of the base term as reported by Tokenizer::tokenLength()
     */
    public function getLength(): int
    {
        return $this->length;
    }

    /**
     * @return int WILDCARD_NONE, WILDCARD_START, WILDCARD_END or both of the latter combined
     */
    public function getWildcard(): int
    {
        return $this->wildcard;
    }

    /**
     * @return array [entity => frequency, ...]
     */
    public function getEntityFrequencies(): array
    {
        return $this->frequencies;
    }

    /**
     * Set the found tokens for a specific index group
     *
     * Tokens stored earlier for the same group are replaced, other groups are left untouched.
     *
     * @param int $group Index group (length for split collections, 0 for non-split)
     * @param array $tokens [tokenID => tokenName, ...]
     * @return void
     * @internal
     */
    public function addTokens(int $group, array $tokens): void
    {
        $this->tokens[$group] = $tokens;
    }

    /**
     * Return all tokens that match the given term
     *
     * The token names of all groups are combined into one flat list, token IDs are dropped.
     *
     * @return string[]
     */
    public function getTokens(): array
    {
        if ($this->tokens === []) {
            return [];
        }

        return array_merge(...array_map('array_values', array_values($this->tokens)));
    }

    /**
     * Return all token IDs for a specific index group
     *
     * @param int $group Index group (length for split collections, 0 for non-split)
     * @return int[] empty when nothing has been stored for the group
     */
    public function getTokenIDsByGroup(int $group): array
    {
        return array_keys($this->tokens[$group] ?? []);
    }

    /**
     * Mathematically add the given frequency to existing frequency for the entityID
     *
     * @param int $entityID
     * @param int $frequency
     * @return void
     * @internal
     */
    public function addEntityFrequency(int $entityID, int $frequency): void
    {
        $this->frequencies[$entityID] = ($this->frequencies[$entityID] ?? 0) + $frequency;
    }

    /**
     * Update the entity frequencies to use actual entity names
     *
     * Entities that have no entry in the given map are dropped from the frequencies, because
     * there is no name they could be stored under.
     *
     * @param array<int, string> $entityMap [entityID => entityName]
     * @return void
     */
    public function resolveEntities(array $entityMap): void
    {
        $resolved = [];
        foreach ($this->frequencies as $eid => $freq) {
            // an unmapped ID would otherwise raise a warning and end up under an empty key
            if (!isset($entityMap[$eid])) {
                continue;
            }
            $resolved[$entityMap[$eid]] = $freq;
        }
        $this->frequencies = $resolved;
    }
}