<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Utf8\PhpString;
use dokuwiki\Search\Tokenizer;
use dokuwiki\Utf8;

/**
 * Represents a search term that can match one or more tokens in an index
 *
 * A term can contain wildcards (* at start/end) and thus may refer to various tokens
 * of different lengths. After a CollectionSearch executes, each Term holds the full
 * match detail: which tokens matched on which entities with what frequencies.
 */
class Term
{
    public const WILDCARD_NONE = 0;
    public const WILDCARD_START = 1;
    public const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected string $original;

    /** @var string the base of the term without wildcard chars */
    protected string $base;

    /** @var string the quoted term to be used in a regular expression */
    protected string $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected int $length;

    /** @var int The type of wildcards (bitmask of the WILDCARD_* constants) */
    protected int $wildcard = self::WILDCARD_NONE;

    /** @var bool Whether to match case-insensitively */
    protected bool $isCaseInsensitive = false;

    /** @var array<string, array<string, int>> Match results: [entityName => [tokenName => freq, ...], ...] */
    protected array $matches = [];

    // region Setup

    /**
     * Parse a raw search term into its base, regex fragment, length and wildcard flags
     *
     * All leading and trailing '*' characters are stripped to form the base. A '*' at the
     * start and/or end of the raw term sets the corresponding WILDCARD_* flag and wraps the
     * quoted regex fragment with '.*' on that side.
     *
     * @param string $term the term as entered, optionally with a leading and/or trailing '*'
     */
    public function __construct(string $term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->length = Tokenizer::tokenLength($this->base);

        // handle wildcard: the flags are distinct bits, so they are combined with a bitwise OR
        if (str_starts_with($term, '*')) {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        if (str_ends_with($term, '*')) {
            $this->quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }
    }

    /**
     * Enable case-insensitive matching
     *
     * The fulltext token index is already lowercased by the Tokenizer, so this is only
     * needed for metadata/title searches where indexed values preserve case.
     *
     * Only the base used by matches() is lowercased here; the original term and the
     * quoted regex fragment are left untouched.
     *
     * @return static this term, to allow chaining
     */
    public function caseInsensitive(): static
    {
        $this->isCaseInsensitive = true;
        $this->base = PhpString::strtolower($this->base);
        return $this;
    }

    /**
     * @return string the term exactly as passed to the constructor
     */
    public function getOriginal(): string
    {
        return $this->original;
    }

    /**
     * @return string the term without surrounding wildcard characters
     */
    public function getBase(): string
    {
        return $this->base;
    }

    /**
     * @return string the regex-quoted base, wrapped in '.*' where a wildcard was given
     */
    public function getQuoted(): string
    {
        return $this->quoted;
    }

    /**
     * @return int the token length of the base as computed by Tokenizer::tokenLength()
     */
    public function getLength(): int
    {
        return $this->length;
    }

    /**
     * @return int bitmask of WILDCARD_START and WILDCARD_END, or WILDCARD_NONE
     */
    public function getWildcard(): int
    {
        return $this->wildcard;
    }

    // endregion

    // region Matching

    /**
     * Check if a token value matches this term
     *
     * Uses efficient string functions instead of regex:
     * exact match → ===, wildcards → str_starts_with/str_ends_with/str_contains.
     * When caseInsensitive() is set, the token value is lowercased before comparison.
     *
     * @param string $tokenValue the indexed value to test
     * @return bool true if the value satisfies the term under its wildcard mode
     */
    public function matches(string $tokenValue): bool
    {
        if ($this->isCaseInsensitive) {
            $tokenValue = PhpString::strtolower($tokenValue);
        }

        return match ($this->wildcard) {
            self::WILDCARD_NONE => $this->base === $tokenValue,
            // trailing wildcard: the token has to begin with the base
            self::WILDCARD_END => str_starts_with($tokenValue, $this->base),
            // leading wildcard: the token has to end with the base
            self::WILDCARD_START => str_ends_with($tokenValue, $this->base),
            // both wildcards: the base may appear anywhere in the token
            default => str_contains($tokenValue, $this->base),
        };
    }

    // endregion

    // region Results (populated by CollectionSearch at the end of execute())

    /**
     * Record that a token matched an entity with a given frequency
     *
     * When called multiple times for the same entity/token pair, frequencies are summed.
     *
     * @param string $entityName
     * @param string $tokenName
     * @param int $frequency
     * @return void
     * @internal Called by CollectionSearch::resolveAndPopulateTerms()
     */
    public function addMatch(string $entityName, string $tokenName, int $frequency): void
    {
        $this->matches[$entityName][$tokenName] =
            ($this->matches[$entityName][$tokenName] ?? 0) + $frequency;
    }

    // endregion

    // region Result accessors

    /**
     * Return the full match detail
     *
     * Note that PHP stores numeric token or entity names as integer array keys.
     *
     * @return array<string, array<string, int>> [entityName => [tokenName => freq, ...], ...]
     */
    public function getMatches(): array
    {
        return $this->matches;
    }

    /**
     * Return the matching entities and their aggregated frequencies
     *
     * Values are the total frequency across all matching tokens for each entity.
     *
     * @return array<string, int> [entityName => totalFrequency, ...]
     */
    public function getEntityFrequencies(): array
    {
        return array_map(array_sum(...), $this->matches);
    }

    /**
     * Return the matched token names per entity
     *
     * Token names are always returned as strings, even when they are numeric and were
     * therefore stored as integer array keys.
     *
     * @return array<string, string[]> [entityName => [tokenName, ...], ...]
     */
    public function getEntityTokens(): array
    {
        return array_map(
            static fn(array $tokens): array => array_map(strval(...), array_keys($tokens)),
            $this->matches
        );
    }

    /**
     * Return all unique matched token values
     *
     * Tokens are listed in order of first appearance across the recorded entities and are
     * always returned as strings.
     *
     * @return string[]
     */
    public function getTokens(): array
    {
        $unique = [];
        foreach ($this->matches as $tokens) {
            // The union operator keeps keys as they are. array_merge() would renumber
            // numeric token names (stored as integer keys) and so lose the actual tokens.
            $unique += $tokens;
        }

        return array_map(strval(...), array_keys($unique));
    }

    // endregion
}