xref: /dokuwiki/_test/tests/Search/Collection/CollectionSearchTest.php (revision 6734bb8cef71e8b4af23e627d4db5430304d55a2)
1*6734bb8cSAndreas Gohr<?php
2*6734bb8cSAndreas Gohr
3*6734bb8cSAndreas Gohrnamespace dokuwiki\test\Search\Collection;
4*6734bb8cSAndreas Gohr
5*6734bb8cSAndreas Gohruse dokuwiki\Search\Collection\CollectionSearch;
6*6734bb8cSAndreas Gohruse dokuwiki\Search\Index\MemoryIndex;
7*6734bb8cSAndreas Gohruse dokuwiki\Search\Tokenizer;
8*6734bb8cSAndreas Gohr
/**
 * Tests for CollectionSearch
 *
 * Covers term based searching via addTerm()/execute() and direct value
 * lookups via lookup(). Every test builds its own small mock collection and
 * uses index names with a per-test prefix, so no test shares index names
 * with another one.
 */
class CollectionSearchTest extends \DokuWikiTest
{
    /**
     * An exact term resolves to a single token with per-entity frequencies
     */
    public function testExactTerm()
    {
        // add some content to the indexes
        $collection = new MockFrequencyCollection('et_page', 'et_w', 'et_i', 'et_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki']);
        $collection->addEntity('page2', ['dokuwiki', 'other', 'words']);
        $collection->unlock();

        // add search term
        $search = new CollectionSearch($collection);
        $term = $search->addTerm('dokuwiki');

        // execute search
        $search->execute();

        // inspect the term updates first:

        // exact search should only match one token
        $this->assertEquals(['dokuwiki'], $term->getTokens());
        // that token is 8 chars and should be the first in the index
        $this->assertEquals([0], $term->getTokenIDsByGroup(8));
        // the dokuwiki token is two times on page1 and 1 time on page2
        $this->assertEquals(['page1' => 2, 'page2' => 1], $term->getEntityFrequencies());

        // entity IDs should be available from the search
        $this->assertEquals([0 => 'page1', 1 => 'page2'], $search->getEntities());
    }

    /**
     * Wildcards at the end, the start and on both sides of a term
     */
    public function testWildcardSearch()
    {
        // page1 has: dokuwiki(x2), dokuwikis, doku, wiki
        // page2 has: dokuwiki, other, words
        $collection = new MockFrequencyCollection('wc_page', 'wc_w', 'wc_i', 'wc_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki']);
        $collection->addEntity('page2', ['dokuwiki', 'other', 'words']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $endWild = $search->addTerm('doku*');
        $startWild = $search->addTerm('*wiki');
        $bothWild = $search->addTerm('*kuwi*');
        $search->execute();

        // doku* should match: doku(4), dokuwiki(8), dokuwikis(9)
        // (token order is not part of the expectation, so sort before comparing)
        $endTokens = $endWild->getTokens();
        sort($endTokens);
        $this->assertEquals(['doku', 'dokuwiki', 'dokuwikis'], $endTokens);
        // page1 has doku(1) + dokuwiki(2) + dokuwikis(1) = 4, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 4, 'page2' => 1], $endWild->getEntityFrequencies());

        // *wiki should match: dokuwiki(8), wiki(4)
        $startTokens = $startWild->getTokens();
        sort($startTokens);
        $this->assertEquals(['dokuwiki', 'wiki'], $startTokens);
        // page1 has dokuwiki(2) + wiki(1) = 3, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $startWild->getEntityFrequencies());

        // *kuwi* should match: dokuwiki(8), dokuwikis(9)
        $bothTokens = $bothWild->getTokens();
        sort($bothTokens);
        $this->assertEquals(['dokuwiki', 'dokuwikis'], $bothTokens);
        // page1 has dokuwiki(2) + dokuwikis(1) = 3, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $bothWild->getEntityFrequencies());
    }

    /**
     * Index a real text file via the Tokenizer and search it
     */
    public function testTokenizedPageSearch()
    {
        // fail with a clear message if the fixture is missing instead of
        // tokenizing a failed read
        $fixture = __DIR__ . '/../data/searchtest.txt';
        $this->assertFileExists($fixture);
        $text = file_get_contents($fixture);
        $tokens = Tokenizer::getWords($text);

        $collection = new MockFrequencyCollection('tp_page', 'tp_w', 'tp_i', 'tp_pageword');
        $collection->lock();
        $collection->addEntity('search:test', $tokens);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $exact = $search->addTerm('dokuwiki');
        $wild = $search->addTerm('plugin*');
        $search->execute();

        // "dokuwiki" appears 4 times in the text (case-insensitive tokenization)
        $this->assertEquals(['dokuwiki'], $exact->getTokens());
        $this->assertEquals(['search:test' => 4], $exact->getEntityFrequencies());

        // "plugin*" should match "plugins" (7 chars) and "plugin" would be too if present
        $this->assertContains('plugins', $wild->getTokens());
        // having the key implies the frequencies are not empty
        $wildFrequencies = $wild->getEntityFrequencies();
        $this->assertArrayHasKey('search:test', $wildFrequencies);
    }

    /**
     * A term that is not in the index yields no tokens, frequencies or entities
     */
    public function testNoMatchReturnsEmptyFrequencies()
    {
        $collection = new MockFrequencyCollection('nm_page', 'nm_w', 'nm_i', 'nm_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['alpha', 'beta', 'gamma']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('zzzznotfound');
        $search->execute();

        $this->assertEmpty($term->getTokens());
        $this->assertEmpty($term->getEntityFrequencies());
        $this->assertEmpty($search->getEntities());
    }

    // --- lookup() tests ---

    /**
     * Exact lookup on a non-split LookupCollection
     */
    public function testLookupExact()
    {
        $collection = new MockLookupCollection('le_entity', 'le_token', 'le_freq', 'le_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $result = $search->lookup('wiki:syntax');

        // the result is keyed by the looked up value
        $this->assertCount(1, $result);
        $this->assertArrayHasKey('wiki:syntax', $result);
        $pages = $result['wiki:syntax'];
        sort($pages);
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);
    }

    /**
     * Wildcard lookup on a non-split LookupCollection
     */
    public function testLookupWildcard()
    {
        $collection = new MockLookupCollection('lw_entity', 'lw_token', 'lw_freq', 'lw_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax', 'other:page']);
        $collection->unlock();

        $search = new CollectionSearch($collection);

        // end wildcard: wiki:* matches wiki:syntax and wiki:welcome
        // wiki:start has both tokens, so it appears twice; wiki:other has wiki:syntax once
        $result = $search->lookup('wiki:*');
        $this->assertArrayHasKey('wiki:*', $result);
        $pages = $result['wiki:*'];
        sort($pages);
        $this->assertEquals(['wiki:other', 'wiki:start', 'wiki:start'], $pages);

        // start wildcard: *syntax matches only wiki:syntax
        $search2 = new CollectionSearch($collection);
        $result2 = $search2->lookup('*syntax');
        $this->assertArrayHasKey('*syntax', $result2);
        $pages2 = $result2['*syntax'];
        sort($pages2);
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages2);
    }

    /**
     * Callback lookup on a non-split LookupCollection
     */
    public function testLookupCallback()
    {
        $collection = new MockLookupCollection('lc_entity', 'lc_token', 'lc_freq', 'lc_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['Apple', 'Banana']);
        $collection->addEntity('wiki:other', ['Cherry', 'Apple Pie']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        // case-insensitive substring match; the callback parameters are named
        // so they do not shadow the $search object above
        $result = $search->lookup('apple', static fn($needle, $token) => stripos($token, $needle) !== false);

        $this->assertArrayHasKey('apple', $result);
        $pages = $result['apple'];
        sort($pages);
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);
    }

    /**
     * lookup() on a DirectCollection (title-style 1:1 mapping)
     */
    public function testLookupOnDirectCollection()
    {
        $collection = new MockDirectCollection('ld_entity', 'ld_token');
        $collection->lock();
        $collection->addEntity('wiki:start', ['Welcome to DokuWiki']);
        $collection->addEntity('wiki:syntax', ['Formatting Syntax']);
        $collection->addEntity('wiki:other', ['Other Page']);
        $collection->unlock();

        $search = new CollectionSearch($collection);

        // exact match
        $result = $search->lookup('Welcome to DokuWiki');
        $this->assertArrayHasKey('Welcome to DokuWiki', $result);
        $this->assertEquals(['wiki:start'], $result['Welcome to DokuWiki']);

        // wildcard match
        $search2 = new CollectionSearch($collection);
        $result2 = $search2->lookup('*Syntax');
        $this->assertArrayHasKey('*Syntax', $result2);
        $this->assertEquals(['wiki:syntax'], $result2['*Syntax']);

        // callback match (case-insensitive substring)
        $search3 = new CollectionSearch($collection);
        $result3 = $search3->lookup('wiki', static fn($needle, $token) => stripos($token, $needle) !== false);
        $this->assertArrayHasKey('wiki', $result3);
        $this->assertEquals(['wiki:start'], $result3['wiki']);
    }

    /**
     * lookup() with multiple values
     */
    public function testLookupMultipleValues()
    {
        $collection = new MockLookupCollection('lm_entity', 'lm_token', 'lm_freq', 'lm_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $result = $search->lookup(['wiki:syntax', 'wiki:welcome', 'nonexistent']);

        // every requested value gets its own entry, even when nothing matched
        $this->assertArrayHasKey('wiki:syntax', $result);
        $this->assertArrayHasKey('wiki:welcome', $result);
        $this->assertArrayHasKey('nonexistent', $result);

        $syntax = $result['wiki:syntax'];
        sort($syntax);
        $this->assertEquals(['wiki:other', 'wiki:start'], $syntax);
        $this->assertEquals(['wiki:start'], $result['wiki:welcome']);
        $this->assertEquals([], $result['nonexistent']);
    }

    /**
     * lookup() on a split FrequencyCollection
     */
    public function testLookupOnSplitCollection()
    {
        $collection = new MockFrequencyCollection('ls_page', 'ls_w', 'ls_i', 'ls_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'wiki', 'doku']);
        $collection->addEntity('page2', ['dokuwiki', 'other']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $result = $search->lookup('dokuwiki');

        $this->assertArrayHasKey('dokuwiki', $result);
        $pages = $result['dokuwiki'];
        sort($pages);
        $this->assertEquals(['page1', 'page2'], $pages);
    }
}
265