xref: /dokuwiki/inc/Search/Collection/Term.php (revision 9369b4a991666bc911474806b106d8958e79f4c1)
1596d5287SAndreas Gohr<?php
2596d5287SAndreas Gohr
3596d5287SAndreas Gohrnamespace dokuwiki\Search\Collection;
4596d5287SAndreas Gohr
5*9369b4a9SAndreas Gohruse dokuwiki\Utf8\PhpString;
6596d5287SAndreas Gohruse dokuwiki\Search\Tokenizer;
71148921dSAndreas Gohruse dokuwiki\Utf8;
8596d5287SAndreas Gohr
/**
 * Represents a search term that can match one or more tokens in an index
 *
 * A term can contain wildcards (* at start and/or end) and thus may refer to various tokens
 * of different lengths. After a CollectionSearch executes, each Term holds the full
 * match detail: which tokens matched on which entities with what frequencies.
 */
class Term
{
    /** No wildcard: the token has to equal the base term */
    public const WILDCARD_NONE = 0;
    /** The term started with a wildcard: the token has to end with the base term */
    public const WILDCARD_START = 1;
    /** The term ended with a wildcard: the token has to start with the base term */
    public const WILDCARD_END = 2;

    /** @var string the original term including wildcard chars */
    protected string $original;

    /** @var string the base of the term without wildcard chars */
    protected string $base;

    /** @var string the quoted term to be used in a regular expression */
    protected string $quoted;

    /** @var int the length of the base term (not counting wildcards) */
    protected int $length;

    /** @var int The type of wildcards, a bit combination of the WILDCARD_* constants */
    protected int $wildcard = self::WILDCARD_NONE;

    /** @var bool Whether to match case-insensitively */
    protected bool $isCaseInsensitive = false;

    /** @var array<string, array<string, int>> Match results: [entityName => [tokenName => freq, ...], ...] */
    protected array $matches = [];

    // region Setup

    /**
     * Split the given term into its base and its wildcard flags
     *
     * All asterisks at the start and end of the term are stripped to form the base.
     * The regular expression fragment returned by getQuoted() gets a `.*` added on each
     * side that carried a wildcard.
     *
     * @param string $term the search term, optionally prefixed and/or suffixed with *
     */
    public function __construct(string $term)
    {
        $this->original = $term;
        $this->base = trim($term, '*');
        $this->length = Tokenizer::tokenLength($this->base);

        $quoted = preg_quote_cb($this->base);

        // the flags are distinct bits, so both may be set at the same time
        if (str_starts_with($term, '*')) {
            $quoted = '.*' . $quoted;
            $this->wildcard |= self::WILDCARD_START;
        }

        if (str_ends_with($term, '*')) {
            $quoted .= '.*';
            $this->wildcard |= self::WILDCARD_END;
        }

        $this->quoted = $quoted;
    }

    /**
     * Enable case-insensitive matching
     *
     * The fulltext token index is already lowercased by the Tokenizer, so this is only
     * needed for metadata/title searches where indexed values preserve case.
     *
     * Only the base used by matches() is lowercased here; the value returned by
     * getQuoted() stays as it was built in the constructor.
     *
     * @return static
     */
    public function caseInsensitive(): static
    {
        $this->isCaseInsensitive = true;
        $this->base = PhpString::strtolower($this->base);
        return $this;
    }

    /**
     * @return string the term exactly as passed to the constructor
     */
    public function getOriginal(): string
    {
        return $this->original;
    }

    /**
     * @return string the term without leading and trailing wildcard chars
     */
    public function getBase(): string
    {
        return $this->base;
    }

    /**
     * @return string the term quoted for use in a regular expression, wildcards translated to .*
     */
    public function getQuoted(): string
    {
        return $this->quoted;
    }

    /**
     * @return int the token length of the base term as reported by the Tokenizer
     */
    public function getLength(): int
    {
        return $this->length;
    }

    /**
     * @return int bit combination of the WILDCARD_* constants
     */
    public function getWildcard(): int
    {
        return $this->wildcard;
    }

    // endregion

    // region Matching

    /**
     * Check if a token value matches this term
     *
     * Uses efficient string functions instead of regex:
     * exact match → ===, wildcards → str_starts_with/str_ends_with/str_contains.
     * When caseInsensitive() is set, the token value is lowercased before comparison.
     *
     * A term that consists of wildcard chars only has an empty base and thus matches
     * every token value.
     *
     * @param string $tokenValue
     * @return bool
     */
    public function matches(string $tokenValue): bool
    {
        if ($this->isCaseInsensitive) {
            $tokenValue = PhpString::strtolower($tokenValue);
        }

        return match ($this->wildcard) {
            self::WILDCARD_NONE => $this->base === $tokenValue,
            self::WILDCARD_END => str_starts_with($tokenValue, $this->base),
            self::WILDCARD_START => str_ends_with($tokenValue, $this->base),
            // wildcards on both sides: the base may occur anywhere in the token
            default => str_contains($tokenValue, $this->base),
        };
    }

    // endregion

    // region Results (populated by CollectionSearch at the end of execute())

    /**
     * Record that a token matched an entity with a given frequency
     *
     * When called multiple times for the same entity/token pair, frequencies are summed.
     *
     * @param string $entityName
     * @param string $tokenName
     * @param int $frequency
     * @return void
     * @internal Called by CollectionSearch::resolveAndPopulateTerms()
     */
    public function addMatch(string $entityName, string $tokenName, int $frequency): void
    {
        $this->matches[$entityName][$tokenName] =
            ($this->matches[$entityName][$tokenName] ?? 0) + $frequency;
    }

    // endregion

    // region Result accessors

    /**
     * Return the full match detail
     *
     * Be aware that PHP stores purely numeric names (e.g. "2024") as integer array keys.
     *
     * @return array<string, array<string, int>> [entityName => [tokenName => freq, ...], ...]
     */
    public function getMatches(): array
    {
        return $this->matches;
    }

    /**
     * Return the matching entities and their aggregated frequencies
     *
     * Values are the total frequency across all matching tokens for each entity.
     *
     * @return array<string, int> [entityName => totalFrequency, ...]
     */
    public function getEntityFrequencies(): array
    {
        return array_map(array_sum(...), $this->matches);
    }

    /**
     * Return the matched token names per entity
     *
     * @return array<string, string[]> [entityName => [tokenName, ...], ...]
     */
    public function getEntityTokens(): array
    {
        return array_map(
            // numeric token names come back as int keys, restore them to strings
            static fn (array $tokens): array => array_map(strval(...), array_keys($tokens)),
            $this->matches,
        );
    }

    /**
     * Return all unique matched token values
     *
     * Tokens are listed in the order they were first recorded.
     *
     * @return string[]
     */
    public function getTokens(): array
    {
        $seen = [];
        foreach ($this->matches as $tokens) {
            // The union keeps the keys as they are. array_merge() must not be used here
            // because it renumbers the integer keys PHP creates for numeric token names.
            $seen += $tokens;
        }

        return array_map(strval(...), array_keys($seen));
    }

    // endregion
}
220