xref: /dokuwiki/_test/tests/Search/Collection/TermTest.php (revision ede4646658cf51245060332d97a319a39c788ea1)
1*ede46466SAndreas Gohr<?php
2*ede46466SAndreas Gohr
3*ede46466SAndreas Gohrnamespace dokuwiki\test\Search\Collection;
4*ede46466SAndreas Gohr
5*ede46466SAndreas Gohruse dokuwiki\Search\Collection\FulltextCollection;
6*ede46466SAndreas Gohruse dokuwiki\Search\Collection\FulltextCollectionSearch;
7*ede46466SAndreas Gohruse dokuwiki\Search\Collection\Term;
8*ede46466SAndreas Gohruse dokuwiki\Search\Exception\SearchException;
9*ede46466SAndreas Gohruse dokuwiki\Search\Index\MemoryIndex;
10*ede46466SAndreas Gohruse dokuwiki\Search\Query\QueryParser;
11*ede46466SAndreas Gohruse dokuwiki\Search\Tokenizer;
12*ede46466SAndreas Gohr
/**
 * Unit tests for \dokuwiki\Search\Collection\Term.
 *
 * The tests cover wildcard detection and stripping, regex quoting of the base
 * term, rejection of too-short terms, token bookkeeping per token length, and
 * the accumulation and resolution of entity frequencies.
 */
class TermTest extends \DokuWikiTest
{
    /**
     * A plain term has no wildcard; original, base and quoted form are identical.
     */
    public function testBasicExact(): void
    {
        $term = new Term('dokuwiki');

        $this->assertEquals('dokuwiki', $term->getOriginal());
        $this->assertEquals('dokuwiki', $term->getBase());
        $this->assertEquals('dokuwiki', $term->getQuoted());
        $this->assertEquals(8, $term->getLength());
        $this->assertEquals(Term::WILDCARD_NONE, $term->getWildcard());
    }

    /**
     * A leading asterisk is stripped from the base and becomes `.*` in the quoted form.
     */
    public function testBasicLeftWildcard(): void
    {
        $term = new Term('*wiki');

        $this->assertEquals('*wiki', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('.*wiki', $term->getQuoted());
        $this->assertEquals(4, $term->getLength());
        $this->assertEquals(Term::WILDCARD_START, $term->getWildcard());
    }

    /**
     * A trailing asterisk is stripped from the base and becomes `.*` in the quoted form.
     */
    public function testBasicRightWildcard(): void
    {
        $term = new Term('wiki*');

        $this->assertEquals('wiki*', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('wiki.*', $term->getQuoted());
        $this->assertEquals(4, $term->getLength());
        $this->assertEquals(Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * Asterisks on both sides yield the sum of the start and end wildcard flags.
     */
    public function testBasicBothWildcard(): void
    {
        $term = new Term('*wiki*');

        $this->assertEquals('*wiki*', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('.*wiki.*', $term->getQuoted());
        $this->assertEquals(4, $term->getLength());
        $this->assertEquals(Term::WILDCARD_START + Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * An empty string is rejected with a SearchException mentioning "short".
     */
    public function testEmptyTerm(): void
    {
        $this->expectException(SearchException::class);
        $this->expectExceptionMessageMatches('/short/i');
        new Term('');
    }

    /**
     * Tokens added per length are all returned by getTokens(), and their IDs
     * can be retrieved per length.
     */
    public function testTokenAdding(): void
    {
        $term = new Term('*wiki*');
        $term->addTokens(8, [0 => 'dokuwiki']);
        $term->addTokens(5, [0 => 'wikis', 134 => 'awiki']);

        $this->assertEquals(['dokuwiki', 'wikis', 'awiki'], $term->getTokens());

        $this->assertEquals([0], $term->getTokenIDsByLength(8));
        $this->assertEquals([0, 134], $term->getTokenIDsByLength(5));
        $this->assertEquals([], $term->getTokenIDsByLength(3));
    }

    /**
     * Frequencies added for the same entity are summed, and resolveEntities()
     * re-keys the result by the mapped names.
     */
    public function testFrequencyAdding(): void
    {
        $term = new Term('dokuwiki');

        $term->addEntityFrequency(7, 7);
        $term->addEntityFrequency(7, 7);
        $term->addEntityFrequency(8, 1);

        $this->assertEquals([7 => 14, 8 => 1], $term->getEntityFrequencies());

        $map = [
            7 => 'page1',
            8 => 'page2',
        ];
        $term->resolveEntities($map);

        $this->assertEquals(['page1' => 14, 'page2' => 1], $term->getEntityFrequencies());
    }

    /**
     * A short numeric term is accepted.
     */
    public function testNumericTerm(): void
    {
        // Numeric terms should be allowed even if they're shorter than minimum word length.
        // NOTE(review): '42' only exercises that exemption when the configured minimum
        // word length is greater than 2 - confirm against Tokenizer::getMinWordLength().
        $term = new Term('42');

        $this->assertEquals('42', $term->getOriginal());
        $this->assertEquals('42', $term->getBase());
        $this->assertEquals(2, $term->getLength());
        $this->assertEquals(Term::WILDCARD_NONE, $term->getWildcard());
    }

    /**
     * A regex metacharacter inside the term is escaped in the quoted form.
     */
    public function testSpecialCharactersQuoting(): void
    {
        $term = new Term('test.doc');

        $this->assertEquals('test.doc', $term->getOriginal());
        $this->assertEquals('test.doc', $term->getBase());
        // The dot should be escaped in the quoted version
        $this->assertEquals('test\\.doc', $term->getQuoted());
    }

    /**
     * Escaping and wildcard expansion are combined correctly.
     */
    public function testSpecialCharactersWithWildcard(): void
    {
        $term = new Term('test.*');

        $this->assertEquals('test.*', $term->getOriginal());
        $this->assertEquals('test.', $term->getBase());
        // The literal dot is escaped, the trailing wildcard * becomes .*
        $this->assertEquals('test\\..*', $term->getQuoted());
        $this->assertEquals(Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * Wildcards on both ends are trimmed from the base term.
     */
    public function testWildcardTrimming(): void
    {
        // NOTE(review): this input contains no whitespace, so it only shows that the
        // asterisks are trimmed; how surrounding spaces are treated is not exercised.
        $term = new Term('*wiki*');

        $this->assertEquals('*wiki*', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('.*wiki.*', $term->getQuoted());
        $this->assertEquals(Term::WILDCARD_START + Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * A term one character shorter than the minimum word length is rejected.
     *
     * Skipped when the minimum word length is 1 or less, because no shorter
     * non-empty term can be built in that case.
     */
    public function testTooShortTerm(): void
    {
        $minLength = Tokenizer::getMinWordLength();

        if ($minLength <= 1) {
            // markTestSkipped() aborts the test, nothing below is executed
            $this->markTestSkipped('Minimum word length is too small for this test');
        }

        $this->expectException(SearchException::class);
        $this->expectExceptionMessageMatches('/short/i');
        // Create a term that's too short (one character less than minimum)
        new Term(str_repeat('a', $minLength - 1));
    }

    /**
     * A term consisting solely of wildcards is rejected as too short.
     */
    public function testOnlyWildcards(): void
    {
        $this->expectException(SearchException::class);
        $this->expectExceptionMessageMatches('/short/i');
        new Term('***');
    }

    /**
     * Tokens of several different lengths are tracked independently per length.
     */
    public function testMultipleLengthTokens(): void
    {
        $term = new Term('*wiki*');

        // Every token is filed under its real character length
        $term->addTokens(4, [10 => 'wiki']);
        $term->addTokens(5, [11 => 'mwiki']);
        $term->addTokens(6, [21 => 'pmwiki']);
        $term->addTokens(8, [20 => 'dokuwiki']);
        $term->addTokens(9, [30 => 'mediawiki']);

        // Check we get all tokens
        $allTokens = $term->getTokens();
        $this->assertCount(5, $allTokens);
        $this->assertContains('wiki', $allTokens);
        $this->assertContains('dokuwiki', $allTokens);
        $this->assertContains('mediawiki', $allTokens);

        // Check we can get token IDs by specific length
        $this->assertEquals([10], $term->getTokenIDsByLength(4));
        $this->assertEquals([11], $term->getTokenIDsByLength(5));
        $this->assertEquals([21], $term->getTokenIDsByLength(6));
        $this->assertEquals([20], $term->getTokenIDsByLength(8));
        $this->assertEquals([30], $term->getTokenIDsByLength(9));
        // No tokens were registered for length 7
        $this->assertEquals([], $term->getTokenIDsByLength(7));
    }

    /**
     * Several frequency additions for one entity add up to a single total.
     */
    public function testFrequencyAggregationAcrossTokens(): void
    {
        // Simulate a search where term matches multiple tokens on the same entity
        $term = new Term('*wiki*');

        // Entity 1 has multiple matching tokens
        $term->addEntityFrequency(1, 5);  // first token appears 5 times
        $term->addEntityFrequency(1, 3);  // second token appears 3 times
        $term->addEntityFrequency(1, 2);  // third token appears 2 times

        // Entity 2 has one matching token
        $term->addEntityFrequency(2, 7);

        $frequencies = $term->getEntityFrequencies();
        $this->assertEquals(10, $frequencies[1]); // 5 + 3 + 2
        $this->assertEquals(7, $frequencies[2]);
    }

    /**
     * Asking for token IDs of a length without tokens returns an empty array.
     */
    public function testEmptyTokensByLength(): void
    {
        $term = new Term('dokuwiki');

        // Before adding any tokens, getting by length should return empty
        $this->assertEquals([], $term->getTokenIDsByLength(8));

        // After adding tokens, querying a non-existent length returns empty
        $term->addTokens(4, [10 => 'wiki']);
        $this->assertEquals([], $term->getTokenIDsByLength(8));
    }

    /**
     * An entity added with a frequency of zero is still recorded.
     */
    public function testZeroFrequency(): void
    {
        $term = new Term('dokuwiki');

        $term->addEntityFrequency(1, 5);
        $term->addEntityFrequency(2, 0);  // Zero frequency
        $term->addEntityFrequency(3, 3);

        $frequencies = $term->getEntityFrequencies();
        $this->assertEquals(5, $frequencies[1]);
        $this->assertEquals(0, $frequencies[2]);  // Zero is stored
        $this->assertEquals(3, $frequencies[3]);
    }

    /**
     * Resolving entities re-keys every recorded frequency by its mapped name.
     */
    public function testResolveEntitiesPartialMap(): void
    {
        $term = new Term('dokuwiki');

        $term->addEntityFrequency(1, 5);
        $term->addEntityFrequency(2, 3);

        // NOTE(review): despite the method name, this map covers every entity that
        // received a frequency, so the handling of unmapped IDs is not exercised.
        // Add an unmapped ID once the intended behaviour (drop vs. keep) is confirmed.
        $map = [
            1 => 'page1',
            2 => 'page2',
        ];
        $term->resolveEntities($map);

        $frequencies = $term->getEntityFrequencies();
        $this->assertEquals(5, $frequencies['page1']);
        $this->assertEquals(3, $frequencies['page2']);
        $this->assertCount(2, $frequencies);
    }

    /**
     * The term's letter case is kept in both the original and the base form.
     */
    public function testCaseSensitiveBase(): void
    {
        $term = new Term('DokuWiki');

        $this->assertEquals('DokuWiki', $term->getOriginal());
        $this->assertEquals('DokuWiki', $term->getBase());
    }

    /**
     * Several different regex metacharacters in one term are all escaped.
     */
    public function testComplexRegexCharacters(): void
    {
        $term = new Term('test[0-9]+.txt');

        $this->assertEquals('test[0-9]+.txt', $term->getOriginal());
        $this->assertEquals('test[0-9]+.txt', $term->getBase());

        // All special characters should be escaped
        $quoted = $term->getQuoted();
        $this->assertStringContainsString('\\[', $quoted);
        $this->assertStringContainsString('\\]', $quoted);
        $this->assertStringContainsString('\\+', $quoted);
        $this->assertStringContainsString('\\.', $quoted);
    }
}
278