<?php

namespace dokuwiki\Search\Collection;

use dokuwiki\Search\Tokenizer;
use dokuwiki\Utf8;

/**
 * Represents a search term that can match one or more tokens in an index
 *
 * A term can contain wildcards (* at start/end) and thus may refer to various tokens
 * of different lengths. After a CollectionSearch executes, each Term holds the full
 * match detail: which tokens matched on which entities with what frequencies.
 */
class Term
{
    /** No wildcard: the token has to equal the base term */
    public const WILDCARD_NONE = 0;
    /** Leading wildcard (*term): the token has to end with the base term */
    public const WILDCARD_START = 1;
    /** Trailing wildcard (term*): the token has to start with the base term */
    public const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected string $original;

    /** @var string the base of the term without wildcard chars */
    protected string $base;

    /** @var string the quoted term to be used in a regular expression */
    protected string $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected int $length;

    /** @var int The type of wildcards (bit combination of the WILDCARD_* constants) */
    protected int $wildcard;

    /** @var bool Whether to match case-insensitively */
    protected bool $isCaseInsensitive = false;

    /** @var array<string, array<string, int>> Match results: [entityName => [tokenName => freq, ...], ...] */
    protected array $matches = [];

    // region Setup

    /**
     * Parse a raw search term into its base, wildcard type and regex fragment
     *
     * All leading and trailing asterisks are stripped from the base. A leading and/or
     * trailing asterisk sets the matching WILDCARD_* flag and adds '.*' on that side
     * of the quoted regex fragment.
     *
     * @param string $term the term as entered, optionally with * at the start and/or end
     */
    public function __construct(string $term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->quoted = preg_quote_cb($this->base);
        $this->wildcard = self::WILDCARD_NONE;
        $this->length = Tokenizer::tokenLength($this->base);

        // handle wildcards: each side sets its own flag, both together give START|END
        if (str_starts_with($term, '*')) {
            $this->quoted = '.*' . $this->quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        if (str_ends_with($term, '*')) {
            $this->quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }
    }

    /**
     * Enable case-insensitive matching
     *
     * The fulltext token index is already lowercased by the Tokenizer, so this is only
     * needed for metadata/title searches where indexed values preserve case.
     *
     * Only the base term is lowercased here; the quoted regex fragment and the length
     * keep the values computed in the constructor.
     *
     * @return static
     */
    public function caseInsensitive(): static
    {
        $this->isCaseInsensitive = true;
        $this->base = Utf8\PhpString::strtolower($this->base);
        return $this;
    }

    /**
     * @return string the term exactly as passed to the constructor
     */
    public function getOriginal(): string
    {
        return $this->original;
    }

    /**
     * @return string the term without wildcard chars (lowercased after caseInsensitive())
     */
    public function getBase(): string
    {
        return $this->base;
    }

    /**
     * @return string the regex-quoted term, with '.*' in place of each wildcard
     */
    public function getQuoted(): string
    {
        return $this->quoted;
    }

    /**
     * @return int the length of the base term as reported by Tokenizer::tokenLength()
     */
    public function getLength(): int
    {
        return $this->length;
    }

    /**
     * @return int the wildcard type, a bit combination of the WILDCARD_* constants
     */
    public function getWildcard(): int
    {
        return $this->wildcard;
    }

    // endregion

    // region Matching

    /**
     * Check if a token value matches this term
     *
     * Uses efficient string functions instead of regex:
     * exact match → ===, wildcards → str_starts_with/str_ends_with/str_contains.
     * When caseInsensitive() is set, the token value is lowercased before comparison.
     *
     * @param string $tokenValue
     * @return bool
     */
    public function matches(string $tokenValue): bool
    {
        if ($this->isCaseInsensitive) {
            $tokenValue = Utf8\PhpString::strtolower($tokenValue);
        }

        return match ($this->wildcard) {
            self::WILDCARD_NONE => $this->base === $tokenValue,
            self::WILDCARD_END => str_starts_with($tokenValue, $this->base),
            self::WILDCARD_START => str_ends_with($tokenValue, $this->base),
            // both wildcards set (START|END): the base may appear anywhere in the token
            default => str_contains($tokenValue, $this->base),
        };
    }

    // endregion

    // region Results (populated by CollectionSearch at the end of execute())

    /**
     * Record that a token matched an entity with a given frequency
     *
     * When called multiple times for the same entity/token pair, frequencies are summed.
     *
     * @param string $entityName
     * @param string $tokenName
     * @param int $frequency
     * @return void
     * @internal Called by CollectionSearch::resolveAndPopulateTerms()
     */
    public function addMatch(string $entityName, string $tokenName, int $frequency): void
    {
        $this->matches[$entityName][$tokenName] =
            ($this->matches[$entityName][$tokenName] ?? 0) + $frequency;
    }

    // endregion

    // region Result accessors

    /**
     * Return the full match detail
     *
     * @return array<string, array<string, int>> [entityName => [tokenName => freq, ...], ...]
     */
    public function getMatches(): array
    {
        return $this->matches;
    }

    /**
     * Return the matching entities and their aggregated frequencies
     *
     * Values are the total frequency across all matching tokens for each entity.
     *
     * @return array<string, int> [entityName => totalFrequency, ...]
     */
    public function getEntityFrequencies(): array
    {
        return array_map('array_sum', $this->matches);
    }

    /**
     * Return the matched token names per entity
     *
     * Token names are always returned as strings, even for purely numeric tokens
     * (which PHP stores as integer array keys internally).
     *
     * @return array<string, string[]> [entityName => [tokenName, ...], ...]
     */
    public function getEntityTokens(): array
    {
        return array_map(
            static fn(array $tokens): array => array_map('strval', array_keys($tokens)),
            $this->matches
        );
    }

    /**
     * Return all unique matched token values
     *
     * Tokens are listed in order of first appearance and always returned as strings.
     *
     * @return string[]
     */
    public function getTokens(): array
    {
        $seen = [];
        foreach ($this->matches as $entityTokens) {
            // Array union keeps the keys as they are. array_merge() would renumber
            // numeric token names (e.g. "2024"), losing the token and its uniqueness.
            $seen += $entityTokens;
        }

        return array_map('strval', array_keys($seen));
    }

    // endregion
}