xref: /dokuwiki/_test/tests/Search/Collection/CollectionSearchTest.php (revision 6734bb8cef71e8b4af23e627d4db5430304d55a2)
1<?php
2
3namespace dokuwiki\test\Search\Collection;
4
5use dokuwiki\Search\Collection\CollectionSearch;
6use dokuwiki\Search\Index\MemoryIndex;
7use dokuwiki\Search\Tokenizer;
8
/**
 * Tests for CollectionSearch
 *
 * Covers term based searching through addTerm()/execute() as well as value
 * lookups through lookup(), each run against one of the mock collections.
 * Every test uses its own index names.
 */
class CollectionSearchTest extends \DokuWikiTest
{
    /**
     * Add the given entities to a collection while holding its lock
     *
     * The lock is released in a finally block, so an exception thrown while
     * adding an entity does not leave the collection locked.
     *
     * @param object $collection any object offering lock(), addEntity() and unlock()
     * @param array $entities list of tokens keyed by entity name, added in array order
     */
    private function fillCollection($collection, array $entities): void
    {
        $collection->lock();
        try {
            foreach ($entities as $name => $tokens) {
                // PHP turns numeric looking array keys into integers, always hand over a string
                $collection->addEntity((string) $name, $tokens);
            }
        } finally {
            $collection->unlock();
        }
    }

    /**
     * Return a sorted, re-indexed copy of the given list
     *
     * @param array $values
     * @return array
     */
    private function sortedValues(array $values): array
    {
        sort($values);
        return $values;
    }

    /**
     * Assert that a lookup result has an entry for the given value and return
     * that entry as a sorted list
     *
     * @param array|\ArrayAccess $result what lookup() returned
     * @param string $key the looked up value
     * @return array
     */
    private function sortedMatches($result, string $key): array
    {
        // fail with a readable assertion message instead of an undefined index warning
        $this->assertArrayHasKey($key, $result);
        return $this->sortedValues($result[$key]);
    }

    /**
     * An exact term only matches the identical token
     */
    public function testExactTerm()
    {
        // add some content to the indexes
        $collection = new MockFrequencyCollection('page', 'w', 'i', 'pageword');
        $this->fillCollection($collection, [
            'page1' => ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki'],
            'page2' => ['dokuwiki', 'other', 'words'],
        ]);

        // add the search term and run the search
        $search = new CollectionSearch($collection);
        $term = $search->addTerm('dokuwiki');
        $search->execute();

        // inspect the term updates first:

        // exact search should only match one token
        $this->assertEquals(['dokuwiki'], $term->getTokens());
        // that token is 8 chars and should be the first in the index
        $this->assertEquals([0], $term->getTokenIDsByGroup(8));
        // the dokuwiki token is two times on page1 and 1 time on page2
        $this->assertEquals(['page1' => 2, 'page2' => 1], $term->getEntityFrequencies());

        // entity IDs should be available from the search
        $this->assertEquals([0 => 'page1', 1 => 'page2'], $search->getEntities());
    }

    /**
     * Wildcards at the end, at the start and on both sides of a term
     */
    public function testWildcardSearch()
    {
        // page1 has: dokuwiki(x2), dokuwikis, doku, wiki
        // page2 has: dokuwiki, other, words
        $collection = new MockFrequencyCollection('wc_page', 'wc_w', 'wc_i', 'wc_pageword');
        $this->fillCollection($collection, [
            'page1' => ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki'],
            'page2' => ['dokuwiki', 'other', 'words'],
        ]);

        $search = new CollectionSearch($collection);
        $endWild = $search->addTerm('doku*');
        $startWild = $search->addTerm('*wiki');
        $bothWild = $search->addTerm('*kuwi*');
        $search->execute();

        // doku* should match: doku(4), dokuwiki(8), dokuwikis(9)
        $this->assertEquals(
            ['doku', 'dokuwiki', 'dokuwikis'],
            $this->sortedValues($endWild->getTokens())
        );
        // page1 has doku(1) + dokuwiki(2) + dokuwikis(1) = 4, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 4, 'page2' => 1], $endWild->getEntityFrequencies());

        // *wiki should match: dokuwiki(8), wiki(4)
        $this->assertEquals(
            ['dokuwiki', 'wiki'],
            $this->sortedValues($startWild->getTokens())
        );
        // page1 has dokuwiki(2) + wiki(1) = 3, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $startWild->getEntityFrequencies());

        // *kuwi* should match: dokuwiki(8), dokuwikis(9)
        $this->assertEquals(
            ['dokuwiki', 'dokuwikis'],
            $this->sortedValues($bothWild->getTokens())
        );
        // page1 has dokuwiki(2) + dokuwikis(1) = 3, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $bothWild->getEntityFrequencies());
    }

    /**
     * Index a real text file via the Tokenizer and search it
     */
    public function testTokenizedPageSearch()
    {
        // make a missing or unreadable fixture fail right here instead of
        // surfacing later as a confusing tokenizer or frequency mismatch
        $fixture = __DIR__ . '/../data/searchtest.txt';
        $this->assertFileExists($fixture);
        $text = file_get_contents($fixture);
        $this->assertNotFalse($text, 'search fixture could not be read');

        $tokens = Tokenizer::getWords($text);

        $collection = new MockFrequencyCollection('tp_page', 'tp_w', 'tp_i', 'tp_pageword');
        $this->fillCollection($collection, ['search:test' => $tokens]);

        $search = new CollectionSearch($collection);
        $exact = $search->addTerm('dokuwiki');
        $wild = $search->addTerm('plugin*');
        $search->execute();

        // "dokuwiki" appears 4 times in the text (case-insensitive tokenization)
        $this->assertEquals(['dokuwiki'], $exact->getTokens());
        $this->assertEquals(['search:test' => 4], $exact->getEntityFrequencies());

        // "plugin*" should match "plugins" (7 chars) and "plugin" would be too if present
        $this->assertContains('plugins', $wild->getTokens());
        $wildFrequencies = $wild->getEntityFrequencies();
        $this->assertNotEmpty($wildFrequencies);
        $this->assertArrayHasKey('search:test', $wildFrequencies);
    }

    /**
     * A term that matches no token yields no tokens, frequencies or entities
     */
    public function testNoMatchReturnsEmptyFrequencies()
    {
        $collection = new MockFrequencyCollection('nm_page', 'nm_w', 'nm_i', 'nm_pageword');
        $this->fillCollection($collection, [
            'page1' => ['alpha', 'beta', 'gamma'],
        ]);

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('zzzznotfound');
        $search->execute();

        $this->assertEmpty($term->getTokens());
        $this->assertEmpty($term->getEntityFrequencies());
        $this->assertEmpty($search->getEntities());
    }

    // --- lookup() tests ---

    /**
     * Exact lookup on a non-split LookupCollection
     */
    public function testLookupExact()
    {
        $collection = new MockLookupCollection('le_entity', 'le_token', 'le_freq', 'le_reverse');
        $this->fillCollection($collection, [
            'wiki:start' => ['wiki:syntax', 'wiki:welcome'],
            'wiki:other' => ['wiki:syntax'],
        ]);

        $search = new CollectionSearch($collection);
        $result = $search->lookup('wiki:syntax');

        // only the looked up value is part of the result
        $this->assertCount(1, $result);
        $this->assertEquals(
            ['wiki:other', 'wiki:start'],
            $this->sortedMatches($result, 'wiki:syntax')
        );
    }

    /**
     * Wildcard lookup on a non-split LookupCollection
     */
    public function testLookupWildcard()
    {
        $collection = new MockLookupCollection('lw_entity', 'lw_token', 'lw_freq', 'lw_reverse');
        $this->fillCollection($collection, [
            'wiki:start' => ['wiki:syntax', 'wiki:welcome'],
            'wiki:other' => ['wiki:syntax', 'other:page'],
        ]);

        // end wildcard: wiki:* matches wiki:syntax and wiki:welcome
        // wiki:start has both tokens, so it appears twice; wiki:other has wiki:syntax once
        $search = new CollectionSearch($collection);
        $result = $search->lookup('wiki:*');
        $this->assertEquals(
            ['wiki:other', 'wiki:start', 'wiki:start'],
            $this->sortedMatches($result, 'wiki:*')
        );

        // start wildcard: *syntax matches only wiki:syntax
        $search2 = new CollectionSearch($collection);
        $result2 = $search2->lookup('*syntax');
        $this->assertEquals(
            ['wiki:other', 'wiki:start'],
            $this->sortedMatches($result2, '*syntax')
        );
    }

    /**
     * Callback lookup on a non-split LookupCollection
     */
    public function testLookupCallback()
    {
        $collection = new MockLookupCollection('lc_entity', 'lc_token', 'lc_freq', 'lc_reverse');
        $this->fillCollection($collection, [
            'wiki:start' => ['Apple', 'Banana'],
            'wiki:other' => ['Cherry', 'Apple Pie'],
        ]);

        // case-insensitive substring match
        $containsIgnoringCase = static fn($needle, $token) => stripos($token, $needle) !== false;

        $search = new CollectionSearch($collection);
        $result = $search->lookup('apple', $containsIgnoringCase);

        $this->assertEquals(
            ['wiki:other', 'wiki:start'],
            $this->sortedMatches($result, 'apple')
        );
    }

    /**
     * lookup() on a DirectCollection (title-style 1:1 mapping)
     */
    public function testLookupOnDirectCollection()
    {
        $collection = new MockDirectCollection('ld_entity', 'ld_token');
        $this->fillCollection($collection, [
            'wiki:start' => ['Welcome to DokuWiki'],
            'wiki:syntax' => ['Formatting Syntax'],
            'wiki:other' => ['Other Page'],
        ]);

        // exact match
        $search = new CollectionSearch($collection);
        $result = $search->lookup('Welcome to DokuWiki');
        $this->assertArrayHasKey('Welcome to DokuWiki', $result);
        $this->assertEquals(['wiki:start'], $result['Welcome to DokuWiki']);

        // wildcard match
        $search2 = new CollectionSearch($collection);
        $result2 = $search2->lookup('*Syntax');
        $this->assertArrayHasKey('*Syntax', $result2);
        $this->assertEquals(['wiki:syntax'], $result2['*Syntax']);

        // callback match (case-insensitive substring)
        $search3 = new CollectionSearch($collection);
        $result3 = $search3->lookup('wiki', static fn($s, $w) => stripos($w, $s) !== false);
        $this->assertArrayHasKey('wiki', $result3);
        $this->assertEquals(['wiki:start'], $result3['wiki']);
    }

    /**
     * lookup() with multiple values
     */
    public function testLookupMultipleValues()
    {
        $collection = new MockLookupCollection('lm_entity', 'lm_token', 'lm_freq', 'lm_reverse');
        $this->fillCollection($collection, [
            'wiki:start' => ['wiki:syntax', 'wiki:welcome'],
            'wiki:other' => ['wiki:syntax'],
        ]);

        $search = new CollectionSearch($collection);
        $result = $search->lookup(['wiki:syntax', 'wiki:welcome', 'nonexistent']);

        $this->assertEquals(
            ['wiki:other', 'wiki:start'],
            $this->sortedMatches($result, 'wiki:syntax')
        );

        $this->assertArrayHasKey('wiki:welcome', $result);
        $this->assertEquals(['wiki:start'], $result['wiki:welcome']);

        // a value without any match is expected to come back as an empty list
        $this->assertEquals([], $result['nonexistent']);
    }

    /**
     * lookup() on a split FrequencyCollection
     */
    public function testLookupOnSplitCollection()
    {
        $collection = new MockFrequencyCollection('ls_page', 'ls_w', 'ls_i', 'ls_pageword');
        $this->fillCollection($collection, [
            'page1' => ['dokuwiki', 'wiki', 'doku'],
            'page2' => ['dokuwiki', 'other'],
        ]);

        $search = new CollectionSearch($collection);
        $result = $search->lookup('dokuwiki');

        $this->assertEquals(
            ['page1', 'page2'],
            $this->sortedMatches($result, 'dokuwiki')
        );
    }
}
265