xref: /dokuwiki/_test/tests/Search/Collection/TermTest.php (revision 21fbd01b3c3eea88b767376b7b158f31f0f63127)
1<?php
2
3namespace dokuwiki\test\Search\Collection;
4
5use dokuwiki\Search\Collection\FulltextCollection;
6use dokuwiki\Search\Collection\FulltextCollectionSearch;
7use dokuwiki\Search\Collection\Term;
8use dokuwiki\Search\Exception\SearchException;
9use dokuwiki\Search\Index\MemoryIndex;
10use dokuwiki\Search\Query\QueryParser;
11use dokuwiki\Search\Tokenizer;
12
/**
 * Unit tests for the search Term class.
 *
 * The tests cover wildcard detection, the quoted (regex) form of a term,
 * the rejection of too short terms, token bookkeeping per group and the
 * accumulation and resolution of entity frequencies.
 */
class TermTest extends \DokuWikiTest
{
    /**
     * A term without any wildcard exposes the same string as original, base and quoted form.
     */
    public function testBasicExact(): void
    {
        $term = new Term('dokuwiki');

        $this->assertEquals('dokuwiki', $term->getOriginal());
        $this->assertEquals('dokuwiki', $term->getBase());
        $this->assertEquals('dokuwiki', $term->getQuoted());
        $this->assertEquals(8, $term->getLength());
        $this->assertEquals(Term::WILDCARD_NONE, $term->getWildcard());
    }

    /**
     * A leading asterisk is removed from the base and shows up as `.*` prefix in the quoted form.
     */
    public function testBasicLeftWildcard(): void
    {
        $term = new Term('*wiki');

        $this->assertEquals('*wiki', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('.*wiki', $term->getQuoted());
        $this->assertEquals(4, $term->getLength());
        $this->assertEquals(Term::WILDCARD_START, $term->getWildcard());
    }

    /**
     * A trailing asterisk is removed from the base and shows up as `.*` suffix in the quoted form.
     */
    public function testBasicRightWildcard(): void
    {
        $term = new Term('wiki*');

        $this->assertEquals('wiki*', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('wiki.*', $term->getQuoted());
        $this->assertEquals(4, $term->getLength());
        $this->assertEquals(Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * Asterisks on both sides are reported as the sum of the start and end wildcard flags.
     */
    public function testBasicBothWildcard(): void
    {
        $term = new Term('*wiki*');

        $this->assertEquals('*wiki*', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('.*wiki.*', $term->getQuoted());
        $this->assertEquals(4, $term->getLength());
        $this->assertEquals(Term::WILDCARD_START + Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * Constructing a term from an empty string is expected to fail with a "short" SearchException.
     */
    public function testEmptyTerm(): void
    {
        $this->expectException(SearchException::class);
        $this->expectExceptionMessageMatches('/short/i');
        new Term('');
    }

    /**
     * Tokens added under several groups are all returned by getTokens(), while their IDs stay
     * retrievable per group. The same token ID (0) is deliberately used in two different groups.
     */
    public function testTokenAdding(): void
    {
        $term = new Term('*wiki*');
        $term->addTokens(8, [0 => 'dokuwiki']);
        $term->addTokens(5, [0 => 'wikis', 134 => 'awiki']);

        $this->assertEquals(['dokuwiki', 'wikis', 'awiki'], $term->getTokens());

        $this->assertEquals([0], $term->getTokenIDsByGroup(8));
        $this->assertEquals([0, 134], $term->getTokenIDsByGroup(5));
        $this->assertEquals([], $term->getTokenIDsByGroup(3));
    }

    /**
     * Frequencies added for the same entity are summed up, and resolveEntities() replaces the
     * entity IDs used as keys with the names given in the map.
     */
    public function testFrequencyAdding(): void
    {
        $term = new Term('dokuwiki');

        $term->addEntityFrequency(7, 7);
        $term->addEntityFrequency(7, 7);
        $term->addEntityFrequency(8, 1);

        $this->assertEquals([7 => 14, 8 => 1], $term->getEntityFrequencies());

        $map = [
            7 => 'page1',
            8 => 'page2',
        ];
        $term->resolveEntities($map);

        $this->assertEquals(['page1' => 14, 'page2' => 1], $term->getEntityFrequencies());
    }

    /**
     * Numeric terms should be allowed even if they are shorter than the minimum word length.
     */
    public function testNumericTerm(): void
    {
        $term = new Term('42');

        $this->assertEquals('42', $term->getOriginal());
        $this->assertEquals('42', $term->getBase());
        $this->assertEquals(2, $term->getLength());
        $this->assertEquals(Term::WILDCARD_NONE, $term->getWildcard());
    }

    /**
     * Special regex characters are escaped in the quoted form but left alone in original and base.
     */
    public function testSpecialCharactersQuoting(): void
    {
        $term = new Term('test.doc');

        $this->assertEquals('test.doc', $term->getOriginal());
        $this->assertEquals('test.doc', $term->getBase());
        // The dot should be escaped in the quoted version
        $this->assertEquals('test\\.doc', $term->getQuoted());
    }

    /**
     * A literal dot directly in front of a trailing wildcard is escaped, while the wildcard
     * itself is still translated to `.*`.
     */
    public function testSpecialCharactersWithWildcard(): void
    {
        $term = new Term('test.*');

        $this->assertEquals('test.*', $term->getOriginal());
        $this->assertEquals('test.', $term->getBase());
        // The dot should be escaped, but the wildcard * should become .*
        $this->assertEquals('test\\..*', $term->getQuoted());
        $this->assertEquals(Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * Wildcards on both ends are trimmed from the base.
     *
     * NOTE(review): the input contains no whitespace, so this does not show that spaces are kept
     * when trimming. It currently repeats testBasicBothWildcard without the length assertion.
     */
    public function testWildcardTrimming(): void
    {
        $term = new Term('*wiki*');

        $this->assertEquals('*wiki*', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('.*wiki.*', $term->getQuoted());
        $this->assertEquals(Term::WILDCARD_START + Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * A term one character shorter than the configured minimum word length is rejected.
     *
     * The test is skipped when the minimum word length is 1 or less, because no non-empty term
     * can be too short then.
     */
    public function testTooShortTerm(): void
    {
        $minLength = Tokenizer::getMinWordLength();

        // markTestSkipped() aborts the test, so everything below only runs for $minLength > 1
        if ($minLength <= 1) {
            $this->markTestSkipped('Minimum word length is too small for this test');
        }

        $this->expectException(SearchException::class);
        $this->expectExceptionMessageMatches('/short/i');
        // Create a term that's too short (one character less than minimum)
        new Term(str_repeat('a', $minLength - 1));
    }

    /**
     * A term consisting of wildcards only is expected to fail with a "short" SearchException.
     */
    public function testOnlyWildcards(): void
    {
        $this->expectException(SearchException::class);
        $this->expectExceptionMessageMatches('/short/i');
        new Term('***');
    }

    /**
     * Tokens spread over several groups are all returned by getTokens() and their IDs are
     * returned separately for each group.
     *
     * NOTE(review): the group keys used here do not always equal the length of the tokens filed
     * under them ('mwiki' under 4, 'pmwiki' under 8); the test treats the key as an opaque group.
     */
    public function testMultipleLengthTokens(): void
    {
        $term = new Term('*wiki*');

        // Add tokens to three different groups
        $term->addTokens(4, [10 => 'wiki', 11 => 'mwiki']);
        $term->addTokens(8, [20 => 'dokuwiki', 21 => 'pmwiki']);
        $term->addTokens(9, [30 => 'mediawiki']);

        // Every added token has to be part of the combined token list
        $allTokens = $term->getTokens();
        $this->assertCount(5, $allTokens);
        $this->assertContains('wiki', $allTokens);
        $this->assertContains('mwiki', $allTokens);
        $this->assertContains('dokuwiki', $allTokens);
        $this->assertContains('pmwiki', $allTokens);
        $this->assertContains('mediawiki', $allTokens);

        // Token IDs are available per group, unknown groups yield an empty list
        $this->assertEquals([10, 11], $term->getTokenIDsByGroup(4));
        $this->assertEquals([20, 21], $term->getTokenIDsByGroup(8));
        $this->assertEquals([30], $term->getTokenIDsByGroup(9));
        $this->assertEquals([], $term->getTokenIDsByGroup(5));
    }

    /**
     * Simulates a wildcard term matching several tokens on the same entity: the individual
     * frequencies have to add up per entity.
     */
    public function testFrequencyAggregationAcrossTokens(): void
    {
        $term = new Term('*wiki*');

        // Entity 1 has multiple matching tokens
        $term->addEntityFrequency(1, 5);  // first token appears 5 times
        $term->addEntityFrequency(1, 3);  // second token appears 3 times
        $term->addEntityFrequency(1, 2);  // third token appears 2 times

        // Entity 2 has one matching token
        $term->addEntityFrequency(2, 7);

        $frequencies = $term->getEntityFrequencies();
        $this->assertEquals(10, $frequencies[1]); // 5 + 3 + 2
        $this->assertEquals(7, $frequencies[2]);
    }

    /**
     * Asking for the token IDs of a group that has no tokens returns an empty list, both before
     * any token was added and after tokens were added to a different group.
     */
    public function testEmptyTokensByLength(): void
    {
        $term = new Term('dokuwiki');

        // Nothing has been added yet
        $this->assertEquals([], $term->getTokenIDsByGroup(8));

        // Tokens exist in group 4 only, group 8 is still empty
        $term->addTokens(4, [10 => 'wiki']);
        $this->assertEquals([], $term->getTokenIDsByGroup(8));
    }

    /**
     * An entity added with a frequency of zero is still listed in the frequencies.
     */
    public function testZeroFrequency(): void
    {
        $term = new Term('dokuwiki');

        $term->addEntityFrequency(1, 5);
        $term->addEntityFrequency(2, 0);  // Zero frequency
        $term->addEntityFrequency(3, 3);

        $frequencies = $term->getEntityFrequencies();
        $this->assertEquals(5, $frequencies[1]);
        $this->assertEquals(0, $frequencies[2]);  // Zero is stored
        $this->assertEquals(3, $frequencies[3]);
    }

    /**
     * Resolving entities keeps the frequency of each entity and yields exactly one entry per
     * mapped entity.
     *
     * NOTE(review): despite the method name, the map below covers every entity that was added,
     * so the handling of entity IDs missing from the map is not exercised by this test.
     */
    public function testResolveEntitiesPartialMap(): void
    {
        $term = new Term('dokuwiki');

        $term->addEntityFrequency(1, 5);
        $term->addEntityFrequency(2, 3);

        $map = [
            1 => 'page1',
            2 => 'page2',
        ];
        $term->resolveEntities($map);

        $frequencies = $term->getEntityFrequencies();
        $this->assertEquals(5, $frequencies['page1']);
        $this->assertEquals(3, $frequencies['page2']);
        $this->assertCount(2, $frequencies);
    }

    /**
     * The case of the given term is preserved in both the original and the base form.
     */
    public function testCaseSensitiveBase(): void
    {
        $term = new Term('DokuWiki');

        $this->assertEquals('DokuWiki', $term->getOriginal());
        $this->assertEquals('DokuWiki', $term->getBase());
    }

    /**
     * Several different special regex characters in one term are all escaped in the quoted form.
     */
    public function testComplexRegexCharacters(): void
    {
        $term = new Term('test[0-9]+.txt');

        $this->assertEquals('test[0-9]+.txt', $term->getOriginal());
        $this->assertEquals('test[0-9]+.txt', $term->getBase());

        // Each special character has to appear with a leading backslash
        $quoted = $term->getQuoted();
        $this->assertStringContainsString('\\[', $quoted);
        $this->assertStringContainsString('\\]', $quoted);
        $this->assertStringContainsString('\\+', $quoted);
        $this->assertStringContainsString('\\.', $quoted);
    }
}
278