xref: /dokuwiki/_test/tests/Search/Collection/CollectionSearchTest.php (revision 06053dca2fac9a1da4eb1accf8c2488942da5d2a)
1<?php
2
3namespace dokuwiki\test\Search\Collection;
4
5use dokuwiki\Search\Collection\CollectionSearch;
6use dokuwiki\Search\Index\MemoryIndex;
7use dokuwiki\Search\Tokenizer;
8
/**
 * Tests for CollectionSearch running against the mock collections of this test namespace
 *
 * Each test passes its own index names to the mock collection it builds.
 */
class CollectionSearchTest extends \DokuWikiTest
{
    /**
     * Write the given entities into a collection within a single lock/unlock cycle
     *
     * Entities are added in the iteration order of the passed array.
     *
     * @param object $collection any collection offering lock(), addEntity() and unlock()
     * @param array $entities map of entity name => list of tokens to add for that entity
     * @return object the collection that was passed in
     */
    private function fillCollection($collection, array $entities)
    {
        $collection->lock();
        foreach ($entities as $entity => $tokens) {
            $collection->addEntity($entity, $tokens);
        }
        $collection->unlock();

        return $collection;
    }

    /**
     * Names of all entities a term matched, sorted to be independent of result order
     *
     * @param object $term a term as returned by CollectionSearch::addTerm()
     * @return array
     */
    private function sortedEntities($term)
    {
        $entities = array_keys($term->getEntityFrequencies());
        sort($entities);

        return $entities;
    }

    /**
     * All tokens a term matched, sorted to be independent of result order
     *
     * @param object $term a term as returned by CollectionSearch::addTerm()
     * @return array
     */
    private function sortedTokens($term)
    {
        $tokens = $term->getTokens();
        sort($tokens);

        return $tokens;
    }

    /**
     * Search a frequency collection that never had an entity added and expect no frequencies
     *
     * @param string $prefix prefix used to build the four index names of the collection
     */
    private function assertEmptyCollectionYieldsNothing($prefix)
    {
        $collection = new MockFrequencyCollection(
            $prefix . '_page',
            $prefix . '_w',
            $prefix . '_i',
            $prefix . '_pw'
        );

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('anything');
        $search->execute();

        $this->assertEquals([], $term->getEntityFrequencies());
    }

    /**
     * A term without wildcards matches exactly one token
     */
    public function testExactTerm()
    {
        // add some content to the indexes
        $collection = $this->fillCollection(
            new MockFrequencyCollection('page', 'w', 'i', 'pageword'),
            [
                'page1' => ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki'],
                'page2' => ['dokuwiki', 'other', 'words'],
            ]
        );

        // register the term, then run the search
        $search = new CollectionSearch($collection);
        $term = $search->addTerm('dokuwiki');
        $search->execute();

        // an exact search matches the one token only, not "dokuwikis", "doku" or "wiki"
        $this->assertEquals(['dokuwiki'], $term->getTokens());
        // the dokuwiki token was added twice for page1 and once for page2
        $this->assertEquals(['page1' => 2, 'page2' => 1], $term->getEntityFrequencies());
        // the per-token breakdown is available for each page
        $matches = $term->getMatches();
        $this->assertEquals(['dokuwiki' => 2], $matches['page1']);
        $this->assertEquals(['dokuwiki' => 1], $matches['page2']);
    }

    /**
     * Wildcards at the end, at the start and on both sides of a term
     */
    public function testWildcardSearch()
    {
        // page1 has: dokuwiki(x2), dokuwikis, doku, wiki
        // page2 has: dokuwiki, other, words
        $collection = $this->fillCollection(
            new MockFrequencyCollection('wc_page', 'wc_w', 'wc_i', 'wc_pageword'),
            [
                'page1' => ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki'],
                'page2' => ['dokuwiki', 'other', 'words'],
            ]
        );

        // all three terms are resolved by a single execute() call
        $search = new CollectionSearch($collection);
        $endWild = $search->addTerm('doku*');
        $startWild = $search->addTerm('*wiki');
        $bothWild = $search->addTerm('*kuwi*');
        $search->execute();

        // doku* matches tokens of three different lengths: doku (4), dokuwiki (8), dokuwikis (9)
        $this->assertEquals(['doku', 'dokuwiki', 'dokuwikis'], $this->sortedTokens($endWild));
        // page1: doku(1) + dokuwiki(2) + dokuwikis(1) = 4, page2: dokuwiki(1) = 1
        $this->assertEquals(['page1' => 4, 'page2' => 1], $endWild->getEntityFrequencies());

        // *wiki matches dokuwiki (8) and wiki (4)
        $this->assertEquals(['dokuwiki', 'wiki'], $this->sortedTokens($startWild));
        // page1: dokuwiki(2) + wiki(1) = 3, page2: dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $startWild->getEntityFrequencies());

        // *kuwi* matches dokuwiki (8) and dokuwikis (9)
        $this->assertEquals(['dokuwiki', 'dokuwikis'], $this->sortedTokens($bothWild));
        // page1: dokuwiki(2) + dokuwikis(1) = 3, page2: dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $bothWild->getEntityFrequencies());
    }

    /**
     * Index a real text file via the Tokenizer and search it
     */
    public function testTokenizedPageSearch()
    {
        // fail with a clear message instead of handing a failed read to the tokenizer
        $fixture = __DIR__ . '/../data/searchtest.txt';
        $this->assertFileIsReadable($fixture, 'search fixture text is missing or unreadable');

        $text = file_get_contents($fixture);
        $tokens = Tokenizer::getWords($text);

        $collection = $this->fillCollection(
            new MockFrequencyCollection('tp_page', 'tp_w', 'tp_i', 'tp_pageword'),
            ['search:test' => $tokens]
        );

        $search = new CollectionSearch($collection);
        $exact = $search->addTerm('dokuwiki');
        $wild = $search->addTerm('plugin*');
        $search->execute();

        // "dokuwiki" appears 4 times in the text (case-insensitive tokenization)
        $this->assertEquals(['dokuwiki'], $exact->getTokens());
        $this->assertEquals(['search:test' => 4], $exact->getEntityFrequencies());

        // "plugin*" should match "plugins" (7 chars) and "plugin" would be too if present
        $this->assertContains('plugins', $wild->getTokens());
        $wildFrequencies = $wild->getEntityFrequencies();
        $this->assertNotEmpty($wildFrequencies);
        $this->assertArrayHasKey('search:test', $wildFrequencies);
    }

    /**
     * A term that matches no indexed token yields empty tokens, frequencies and matches
     */
    public function testNoMatchReturnsEmptyFrequencies()
    {
        $collection = $this->fillCollection(
            new MockFrequencyCollection('nm_page', 'nm_w', 'nm_i', 'nm_pageword'),
            ['page1' => ['alpha', 'beta', 'gamma']]
        );

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('zzzznotfound');
        $search->execute();

        $this->assertEmpty($term->getTokens());
        $this->assertEmpty($term->getEntityFrequencies());
        $this->assertEmpty($term->getMatches());
    }

    // --- metadata-style search tests (using addTerm/execute without length restrictions) ---

    /**
     * Exact search on a non-split LookupCollection
     */
    public function testMetadataExact()
    {
        $collection = $this->fillCollection(
            new MockLookupCollection('le_entity', 'le_token', 'le_freq', 'le_reverse'),
            [
                'wiki:start' => ['wiki:syntax', 'wiki:welcome'],
                'wiki:other' => ['wiki:syntax'],
            ]
        );

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('wiki:syntax');
        $search->execute();

        // both pages carry the wiki:syntax token
        $this->assertEquals(['wiki:other', 'wiki:start'], $this->sortedEntities($term));
    }

    /**
     * Wildcard search on a non-split LookupCollection
     */
    public function testMetadataWildcard()
    {
        $collection = $this->fillCollection(
            new MockLookupCollection('lw_entity', 'lw_token', 'lw_freq', 'lw_reverse'),
            [
                'wiki:start' => ['wiki:syntax', 'wiki:welcome'],
                'wiki:other' => ['wiki:syntax', 'other:page'],
            ]
        );

        // end wildcard: wiki:* matches wiki:syntax and wiki:welcome
        $search = new CollectionSearch($collection);
        $term = $search->addTerm('wiki:*');
        $search->execute();

        // wiki:start holds both matching tokens, wiki:other holds wiki:syntax;
        // only the set of matched pages is checked here, not the frequencies
        $this->assertEquals(['wiki:other', 'wiki:start'], $this->sortedEntities($term));

        // start wildcard: of the indexed tokens *syntax matches wiki:syntax only,
        // and that token was added for both pages
        $search2 = new CollectionSearch($collection);
        $term2 = $search2->addTerm('*syntax');
        $search2->execute();

        $this->assertEquals(['wiki:other', 'wiki:start'], $this->sortedEntities($term2));
    }

    /**
     * Case-insensitive search on a non-split LookupCollection
     */
    public function testMetadataCaseInsensitive()
    {
        $collection = $this->fillCollection(
            new MockLookupCollection('lc_entity', 'lc_token', 'lc_freq', 'lc_reverse'),
            [
                'wiki:start' => ['Apple', 'Banana'],
                'wiki:other' => ['Cherry', 'Apple Pie'],
            ]
        );

        // case insensitivity is switched on before the term is added
        $search = new CollectionSearch($collection);
        $search->caseInsensitive();
        $term = $search->addTerm('*apple*');
        $search->execute();

        // lower case "apple" finds "Apple" on wiki:start and "Apple Pie" on wiki:other
        $this->assertEquals(['wiki:other', 'wiki:start'], $this->sortedEntities($term));
    }

    /**
     * Search on a DirectCollection (title-style 1:1 mapping)
     */
    public function testSearchOnDirectCollection()
    {
        $collection = $this->fillCollection(
            new MockDirectCollection('ld_entity', 'ld_token'),
            [
                'wiki:start' => ['Welcome to DokuWiki'],
                'wiki:syntax' => ['Formatting Syntax'],
                'wiki:other' => ['Other Page'],
            ]
        );

        // exact match on the complete value
        $exactSearch = new CollectionSearch($collection);
        $exactTerm = $exactSearch->addTerm('Welcome to DokuWiki');
        $exactSearch->execute();
        $this->assertEquals(['wiki:start'], array_keys($exactTerm->getEntityFrequencies()));

        // wildcard match on the end of the value
        $wildSearch = new CollectionSearch($collection);
        $wildTerm = $wildSearch->addTerm('*Syntax');
        $wildSearch->execute();
        $this->assertEquals(['wiki:syntax'], array_keys($wildTerm->getEntityFrequencies()));

        // case-insensitive substring match
        $looseSearch = new CollectionSearch($collection);
        $looseSearch->caseInsensitive();
        $looseTerm = $looseSearch->addTerm('*wiki*');
        $looseSearch->execute();
        $this->assertEquals(['wiki:start'], array_keys($looseTerm->getEntityFrequencies()));
    }

    /**
     * Multiple terms in a single search
     */
    public function testMultipleTerms()
    {
        $collection = $this->fillCollection(
            new MockLookupCollection('lm_entity', 'lm_token', 'lm_freq', 'lm_reverse'),
            [
                'wiki:start' => ['wiki:syntax', 'wiki:welcome'],
                'wiki:other' => ['wiki:syntax'],
            ]
        );

        $search = new CollectionSearch($collection);
        $syntaxTerm = $search->addTerm('wiki:syntax');
        $welcomeTerm = $search->addTerm('wiki:welcome');
        $missingTerm = $search->addTerm('nonexistent');
        $search->execute();

        // every term keeps its own result set
        $this->assertEquals(['wiki:other', 'wiki:start'], $this->sortedEntities($syntaxTerm));
        $this->assertEquals(['wiki:start'], array_keys($welcomeTerm->getEntityFrequencies()));
        $this->assertEquals([], array_keys($missingTerm->getEntityFrequencies()));
    }

    /**
     * Search on a split FrequencyCollection
     */
    public function testSearchOnSplitCollection()
    {
        $collection = $this->fillCollection(
            new MockFrequencyCollection('ls_page', 'ls_w', 'ls_i', 'ls_pageword'),
            [
                'page1' => ['dokuwiki', 'wiki', 'doku'],
                'page2' => ['dokuwiki', 'other'],
            ]
        );

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('dokuwiki');
        $search->execute();

        $this->assertEquals(['page1', 'page2'], $this->sortedEntities($term));
    }

    /**
     * Searching an empty collection returns no results
     */
    public function testSearchEmptyCollection()
    {
        $this->assertEmptyCollectionYieldsNothing('empty');
    }

    /**
     * Search on an empty collection returns empty frequencies
     *
     * Runs the same check as testSearchEmptyCollection() against a second set of index names.
     */
    public function testSearchEmptyCollection2()
    {
        $this->assertEmptyCollectionYieldsNothing('empty2');
    }
}
297