<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Utf8\PhpString;
use dokuwiki\Search\Tokenizer;
use dokuwiki\Utf8;

/**
 * Represents a search term that can match one or more tokens in an index
 *
 * A term can contain wildcards (* at start/end) and thus may refer to various tokens
 * of different lengths. After a CollectionSearch executes, each Term holds the full
 * match detail: which tokens matched on which entities with what frequencies.
 */
class Term
{
    /** No wildcard: the token has to equal the base term */
    public const WILDCARD_NONE = 0;
    /** Leading wildcard (*term): the token has to end with the base term */
    public const WILDCARD_START = 1;
    /** Trailing wildcard (term*): the token has to start with the base term */
    public const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected string $original;

    /** @var string the base of the term without wildcard chars */
    protected string $base;

    /** @var string the quoted term to be used in a regular expression */
    protected string $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected int $length;

    /** @var int The type of wildcards (bitmask of the WILDCARD_* constants) */
    protected int $wildcard = self::WILDCARD_NONE;

    /** @var bool Whether to match case-insensitively */
    protected bool $isCaseInsensitive = false;

    /** @var array<string, array<string, int>> Match results: [entityName => [tokenName => freq, ...], ...] */
    protected array $matches = [];

    // region Setup

    /**
     * Split the given term into its base and its wildcard flags
     *
     * All leading and trailing asterisks are stripped to form the base. A leading and a
     * trailing asterisk each set their own flag, so both flags may be set at once.
     *
     * @param string $term the search term, optionally prefixed and/or suffixed with *
     */
    public function __construct(string $term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->length = Tokenizer::tokenLength($this->base);

        // handle wildcards: widen the regex fragment and remember the flag
        if (str_starts_with($term, '*')) {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        if (str_ends_with($term, '*')) {
            $this->quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }
    }

    /**
     * Enable case-insensitive matching
     *
     * The fulltext token index is already lowercased by the Tokenizer, so this is only
     * needed for metadata/title searches where indexed values preserve case.
     *
     * Only the base term is lowercased here; the quoted regex fragment returned by
     * getQuoted() keeps the casing of the original term.
     *
     * @return static
     */
    public function caseInsensitive(): static
    {
        $this->isCaseInsensitive = true;
        $this->base = PhpString::strtolower($this->base);
        return $this;
    }

    /**
     * @return string the term exactly as passed to the constructor
     */
    public function getOriginal(): string
    {
        return $this->original;
    }

    /**
     * @return string the term without leading/trailing wildcard chars
     */
    public function getBase(): string
    {
        return $this->base;
    }

    /**
     * @return string the regex-quoted base, with .* in place of each wildcard
     */
    public function getQuoted(): string
    {
        return $this->quoted;
    }

    /**
     * @return int the token length of the base term
     */
    public function getLength(): int
    {
        return $this->length;
    }

    /**
     * @return int bitmask of the WILDCARD_* constants
     */
    public function getWildcard(): int
    {
        return $this->wildcard;
    }

    // endregion

    // region Matching

    /**
     * Check if a token value matches this term
     *
     * Uses efficient string functions instead of regex:
     * exact match → ===, wildcards → str_starts_with/str_ends_with/str_contains.
     * When caseInsensitive() is set, the token value is lowercased before comparison.
     *
     * @param string $tokenValue
     * @return bool
     */
    public function matches(string $tokenValue): bool
    {
        if ($this->isCaseInsensitive) {
            $tokenValue = PhpString::strtolower($tokenValue);
        }

        return match ($this->wildcard) {
            self::WILDCARD_NONE => $this->base === $tokenValue,
            self::WILDCARD_END => str_starts_with($tokenValue, $this->base),
            self::WILDCARD_START => str_ends_with($tokenValue, $this->base),
            // both wildcards set: the base may appear anywhere in the token
            default => str_contains($tokenValue, $this->base),
        };
    }

    // endregion

    // region Results (populated by CollectionSearch at the end of execute())

    /**
     * Record that a token matched an entity with a given frequency
     *
     * When called multiple times for the same entity/token pair, frequencies are summed.
     *
     * @param string $entityName
     * @param string $tokenName
     * @param int $frequency
     * @return void
     * @internal Called by CollectionSearch::resolveAndPopulateTerms()
     */
    public function addMatch(string $entityName, string $tokenName, int $frequency): void
    {
        $this->matches[$entityName][$tokenName] =
            ($this->matches[$entityName][$tokenName] ?? 0) + $frequency;
    }

    // endregion

    // region Result accessors

    /**
     * Return the full match detail
     *
     * Note that PHP stores purely numeric entity or token names as integer array keys.
     *
     * @return array<string, array<string, int>> [entityName => [tokenName => freq, ...], ...]
     */
    public function getMatches(): array
    {
        return $this->matches;
    }

    /**
     * Return the matching entities and their aggregated frequencies
     *
     * Values are the total frequency across all matching tokens for each entity.
     *
     * @return array<string, int> [entityName => totalFrequency, ...]
     */
    public function getEntityFrequencies(): array
    {
        return array_map(array_sum(...), $this->matches);
    }

    /**
     * Return the matched token names per entity
     *
     * Token names are always returned as strings, even when they are purely numeric
     * (PHP turns such array keys into integers internally).
     *
     * @return array<string, string[]> [entityName => [tokenName, ...], ...]
     */
    public function getEntityTokens(): array
    {
        return array_map(
            static fn(array $tokens): array => array_map(strval(...), array_keys($tokens)),
            $this->matches
        );
    }

    /**
     * Return all unique matched token values
     *
     * Tokens are listed in the order they were first recorded and are always strings.
     *
     * @return string[]
     */
    public function getTokens(): array
    {
        $seen = [];
        foreach ($this->matches as $tokens) {
            // array union keeps integer keys intact; array_merge() would renumber
            // numeric tokens such as "2024" and thereby lose them
            $seen += $tokens;
        }
        return array_map(strval(...), array_keys($seen));
    }

    // endregion
}