<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Tokenizer;
use dokuwiki\Utf8;

/**
 * Represents a search term that can match one or more tokens in an index
 *
 * A term can contain wildcards (* at start/end) and thus may refer to various tokens
 * of different lengths. After a CollectionSearch executes, each Term holds the full
 * match detail: which tokens matched on which entities with what frequencies.
 */
class Term
{
    /** No wildcard: the token has to equal the base term */
    public const WILDCARD_NONE = 0;
    /** Leading wildcard (*term): the token has to end with the base term */
    public const WILDCARD_START = 1;
    /** Trailing wildcard (term*): the token has to start with the base term */
    public const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected string $original;

    /** @var string the base of the term without wildcard chars */
    protected string $base;

    /** @var string the quoted term to be used in a regular expression */
    protected string $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected int $length;

    /** @var int The type of wildcards (bit combination of the WILDCARD_* constants) */
    protected int $wildcard;

    /** @var bool Whether to match case-insensitively */
    protected bool $isCaseInsensitive = false;

    /** @var array<string, array<string, int>> Match results: [entityName => [tokenName => freq, ...], ...] */
    protected array $matches = [];

    // region Setup

    /**
     * Parse the given term into its base, its wildcard type and a regular expression fragment
     *
     * @param string $term the search term, optionally starting and/or ending with a * wildcard
     */
    public function __construct(string $term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->wildcard = self::WILDCARD_NONE;
        $this->length = Tokenizer::tokenLength($this->base);

        // a leading wildcard allows any prefix before the base term
        if (str_starts_with($term, '*')) {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        // a trailing wildcard allows any suffix after the base term
        if (str_ends_with($term, '*')) {
            $this->quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }
    }

    /**
     * Enable case-insensitive matching
     *
     * The fulltext token index is already lowercased by the Tokenizer, so this is only
     * needed for metadata/title searches where indexed values preserve case.
     *
     * Only the base term is lowercased here; the original term and the quoted regular
     * expression fragment keep their original casing.
     *
     * @return static
     */
    public function caseInsensitive(): static
    {
        $this->isCaseInsensitive = true;
        $this->base = Utf8\PhpString::strtolower($this->base);
        return $this;
    }

    /**
     * @return string the original term including wildcard chars
     */
    public function getOriginal(): string
    {
        return $this->original;
    }

    /**
     * @return string the term without wildcard chars
     */
    public function getBase(): string
    {
        return $this->base;
    }

    /**
     * @return string the quoted term to be used in a regular expression
     */
    public function getQuoted(): string
    {
        return $this->quoted;
    }

    /**
     * @return int the length of the base term (not counting wildcards)
     */
    public function getLength(): int
    {
        return $this->length;
    }

    /**
     * @return int the wildcard type (bit combination of the WILDCARD_* constants)
     */
    public function getWildcard(): int
    {
        return $this->wildcard;
    }

    // endregion

    // region Matching

    /**
     * Check if a token value matches this term
     *
     * Uses efficient string functions instead of regex:
     * exact match → ===, wildcards → str_starts_with/str_ends_with/str_contains.
     * When caseInsensitive() is set, the token value is lowercased before comparison.
     *
     * @param string $tokenValue
     * @return bool
     */
    public function matches(string $tokenValue): bool
    {
        if ($this->isCaseInsensitive) {
            $tokenValue = Utf8\PhpString::strtolower($tokenValue);
        }

        return match ($this->wildcard) {
            self::WILDCARD_NONE => $this->base === $tokenValue,
            self::WILDCARD_END => str_starts_with($tokenValue, $this->base),
            self::WILDCARD_START => str_ends_with($tokenValue, $this->base),
            // wildcards on both ends: the base term may appear anywhere in the token
            default => str_contains($tokenValue, $this->base),
        };
    }

    // endregion

    // region Results (populated by CollectionSearch at the end of execute())

    /**
     * Record that a token matched an entity with a given frequency
     *
     * When called multiple times for the same entity/token pair, frequencies are summed.
     *
     * @param string $entityName
     * @param string $tokenName
     * @param int $frequency
     * @return void
     * @internal Called by CollectionSearch::resolveAndPopulateTerms()
     */
    public function addMatch(string $entityName, string $tokenName, int $frequency): void
    {
        $this->matches[$entityName][$tokenName] =
            ($this->matches[$entityName][$tokenName] ?? 0) + $frequency;
    }

    // endregion

    // region Result accessors

    /**
     * Return the full match detail
     *
     * @return array<string, array<string, int>> [entityName => [tokenName => freq, ...], ...]
     */
    public function getMatches(): array
    {
        return $this->matches;
    }

    /**
     * Return the matching entities and their aggregated frequencies
     *
     * Values are the total frequency across all matching tokens for each entity.
     *
     * @return array<string, int> [entityName => totalFrequency, ...]
     */
    public function getEntityFrequencies(): array
    {
        return array_map('array_sum', $this->matches);
    }

    /**
     * Return the matched token names per entity
     *
     * PHP stores numeric token names (e.g. "2024") as integer array keys, so the names
     * are cast back to strings before they are returned.
     *
     * @return array<string, string[]> [entityName => [tokenName, ...], ...]
     */
    public function getEntityTokens(): array
    {
        return array_map(
            static fn(array $tokens): array => array_map('strval', array_keys($tokens)),
            $this->matches
        );
    }

    /**
     * Return all unique matched token values
     *
     * Tokens are returned in the order they were first recorded.
     *
     * @return string[]
     */
    public function getTokens(): array
    {
        $unique = [];
        foreach ($this->matches as $tokens) {
            // The + union keeps keys intact. array_merge() would renumber the integer keys
            // PHP creates for numeric token names, losing the names and the de-duplication.
            $unique += $tokens;
        }

        // numeric token names come back as integer keys, restore them to strings
        return array_map('strval', array_keys($unique));
    }

    // endregion
}