xref: /dokuwiki/_test/tests/Search/Collection/CollectionSearchTest.php (revision 1148921de6af6909f19cb5b30b698d0f27d7751e)
16734bb8cSAndreas Gohr<?php
26734bb8cSAndreas Gohr
36734bb8cSAndreas Gohrnamespace dokuwiki\test\Search\Collection;
46734bb8cSAndreas Gohr
56734bb8cSAndreas Gohruse dokuwiki\Search\Collection\CollectionSearch;
66734bb8cSAndreas Gohruse dokuwiki\Search\Index\MemoryIndex;
76734bb8cSAndreas Gohruse dokuwiki\Search\Tokenizer;
86734bb8cSAndreas Gohr
96734bb8cSAndreas Gohrclass CollectionSearchTest extends \DokuWikiTest
106734bb8cSAndreas Gohr{
116734bb8cSAndreas Gohr
126734bb8cSAndreas Gohr    public function testExactTerm()
136734bb8cSAndreas Gohr    {
146734bb8cSAndreas Gohr        // add some content to the indexes
156734bb8cSAndreas Gohr        $collection = new MockFrequencyCollection('page', 'w', 'i', 'pageword');
166734bb8cSAndreas Gohr        $collection->lock();
176734bb8cSAndreas Gohr        $collection->addEntity('page1', ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki']);
186734bb8cSAndreas Gohr        $collection->addEntity('page2', ['dokuwiki', 'other', 'words']);
196734bb8cSAndreas Gohr        $collection->unlock();
206734bb8cSAndreas Gohr
216734bb8cSAndreas Gohr        // add search term
226734bb8cSAndreas Gohr        $search = new CollectionSearch($collection);
236734bb8cSAndreas Gohr        $term = $search->addTerm('dokuwiki');
246734bb8cSAndreas Gohr
256734bb8cSAndreas Gohr        // execute search
266734bb8cSAndreas Gohr        $search->execute();
276734bb8cSAndreas Gohr
286734bb8cSAndreas Gohr        // exact search should only match one token
296734bb8cSAndreas Gohr        $this->assertEquals(['dokuwiki'], $term->getTokens());
306734bb8cSAndreas Gohr        // the dokuwiki token is two times on page1 and 1 time on page2
316734bb8cSAndreas Gohr        $this->assertEquals(['page1' => 2, 'page2' => 1], $term->getEntityFrequencies());
32*1148921dSAndreas Gohr        // full detail available
33*1148921dSAndreas Gohr        $this->assertEquals(['dokuwiki' => 2], $term->getMatches()['page1']);
34*1148921dSAndreas Gohr        $this->assertEquals(['dokuwiki' => 1], $term->getMatches()['page2']);
356734bb8cSAndreas Gohr
366734bb8cSAndreas Gohr    }
376734bb8cSAndreas Gohr
386734bb8cSAndreas Gohr    public function testWildcardSearch()
396734bb8cSAndreas Gohr    {
406734bb8cSAndreas Gohr        // page1 has: dokuwiki(x2), dokuwikis, doku, wiki
416734bb8cSAndreas Gohr        // page2 has: dokuwiki, other, words
426734bb8cSAndreas Gohr        $collection = new MockFrequencyCollection('wc_page', 'wc_w', 'wc_i', 'wc_pageword');
436734bb8cSAndreas Gohr        $collection->lock();
446734bb8cSAndreas Gohr        $collection->addEntity('page1', ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki']);
456734bb8cSAndreas Gohr        $collection->addEntity('page2', ['dokuwiki', 'other', 'words']);
466734bb8cSAndreas Gohr        $collection->unlock();
476734bb8cSAndreas Gohr
486734bb8cSAndreas Gohr        $search = new CollectionSearch($collection);
496734bb8cSAndreas Gohr        $endWild = $search->addTerm('doku*');
506734bb8cSAndreas Gohr        $startWild = $search->addTerm('*wiki');
516734bb8cSAndreas Gohr        $bothWild = $search->addTerm('*kuwi*');
526734bb8cSAndreas Gohr        $search->execute();
536734bb8cSAndreas Gohr
546734bb8cSAndreas Gohr        // doku* should match: doku(4), dokuwiki(8), dokuwikis(9)
556734bb8cSAndreas Gohr        $endTokens = $endWild->getTokens();
566734bb8cSAndreas Gohr        sort($endTokens);
576734bb8cSAndreas Gohr        $this->assertEquals(['doku', 'dokuwiki', 'dokuwikis'], $endTokens);
586734bb8cSAndreas Gohr        // page1 has doku(1) + dokuwiki(2) + dokuwikis(1) = 4, page2 has dokuwiki(1) = 1
596734bb8cSAndreas Gohr        $this->assertEquals(['page1' => 4, 'page2' => 1], $endWild->getEntityFrequencies());
606734bb8cSAndreas Gohr
616734bb8cSAndreas Gohr        // *wiki should match: dokuwiki(8), wiki(4)
626734bb8cSAndreas Gohr        $startTokens = $startWild->getTokens();
636734bb8cSAndreas Gohr        sort($startTokens);
646734bb8cSAndreas Gohr        $this->assertEquals(['dokuwiki', 'wiki'], $startTokens);
656734bb8cSAndreas Gohr        // page1 has dokuwiki(2) + wiki(1) = 3, page2 has dokuwiki(1) = 1
666734bb8cSAndreas Gohr        $this->assertEquals(['page1' => 3, 'page2' => 1], $startWild->getEntityFrequencies());
676734bb8cSAndreas Gohr
686734bb8cSAndreas Gohr        // *kuwi* should match: dokuwiki(8), dokuwikis(9)
696734bb8cSAndreas Gohr        $bothTokens = $bothWild->getTokens();
706734bb8cSAndreas Gohr        sort($bothTokens);
716734bb8cSAndreas Gohr        $this->assertEquals(['dokuwiki', 'dokuwikis'], $bothTokens);
726734bb8cSAndreas Gohr        // page1 has dokuwiki(2) + dokuwikis(1) = 3, page2 has dokuwiki(1) = 1
736734bb8cSAndreas Gohr        $this->assertEquals(['page1' => 3, 'page2' => 1], $bothWild->getEntityFrequencies());
746734bb8cSAndreas Gohr    }
756734bb8cSAndreas Gohr
766734bb8cSAndreas Gohr    /**
776734bb8cSAndreas Gohr     * Index a real text file via the Tokenizer and search it
786734bb8cSAndreas Gohr     */
796734bb8cSAndreas Gohr    public function testTokenizedPageSearch()
806734bb8cSAndreas Gohr    {
816734bb8cSAndreas Gohr        $text = file_get_contents(__DIR__ . '/../data/searchtest.txt');
826734bb8cSAndreas Gohr        $tokens = Tokenizer::getWords($text);
836734bb8cSAndreas Gohr
846734bb8cSAndreas Gohr        $collection = new MockFrequencyCollection('tp_page', 'tp_w', 'tp_i', 'tp_pageword');
856734bb8cSAndreas Gohr        $collection->lock();
866734bb8cSAndreas Gohr        $collection->addEntity('search:test', $tokens);
876734bb8cSAndreas Gohr        $collection->unlock();
886734bb8cSAndreas Gohr
896734bb8cSAndreas Gohr        $search = new CollectionSearch($collection);
906734bb8cSAndreas Gohr        $exact = $search->addTerm('dokuwiki');
916734bb8cSAndreas Gohr        $wild = $search->addTerm('plugin*');
926734bb8cSAndreas Gohr        $search->execute();
936734bb8cSAndreas Gohr
946734bb8cSAndreas Gohr        // "dokuwiki" appears 4 times in the text (case-insensitive tokenization)
956734bb8cSAndreas Gohr        $this->assertEquals(['dokuwiki'], $exact->getTokens());
966734bb8cSAndreas Gohr        $this->assertEquals(['search:test' => 4], $exact->getEntityFrequencies());
976734bb8cSAndreas Gohr
986734bb8cSAndreas Gohr        // "plugin*" should match "plugins" (7 chars) and "plugin" would be too if present
996734bb8cSAndreas Gohr        $wildTokens = $wild->getTokens();
1006734bb8cSAndreas Gohr        $this->assertContains('plugins', $wildTokens);
1016734bb8cSAndreas Gohr        $this->assertNotEmpty($wild->getEntityFrequencies());
1026734bb8cSAndreas Gohr        $this->assertArrayHasKey('search:test', $wild->getEntityFrequencies());
1036734bb8cSAndreas Gohr    }
1046734bb8cSAndreas Gohr
1056734bb8cSAndreas Gohr    public function testNoMatchReturnsEmptyFrequencies()
1066734bb8cSAndreas Gohr    {
1076734bb8cSAndreas Gohr        $collection = new MockFrequencyCollection('nm_page', 'nm_w', 'nm_i', 'nm_pageword');
1086734bb8cSAndreas Gohr        $collection->lock();
1096734bb8cSAndreas Gohr        $collection->addEntity('page1', ['alpha', 'beta', 'gamma']);
1106734bb8cSAndreas Gohr        $collection->unlock();
1116734bb8cSAndreas Gohr
1126734bb8cSAndreas Gohr        $search = new CollectionSearch($collection);
1136734bb8cSAndreas Gohr        $term = $search->addTerm('zzzznotfound');
1146734bb8cSAndreas Gohr        $search->execute();
1156734bb8cSAndreas Gohr
1166734bb8cSAndreas Gohr        $this->assertEmpty($term->getTokens());
1176734bb8cSAndreas Gohr        $this->assertEmpty($term->getEntityFrequencies());
118*1148921dSAndreas Gohr        $this->assertEmpty($term->getMatches());
1196734bb8cSAndreas Gohr    }
1206734bb8cSAndreas Gohr
121*1148921dSAndreas Gohr    // --- metadata-style search tests (using addTerm/execute without length restrictions) ---
1226734bb8cSAndreas Gohr
1236734bb8cSAndreas Gohr    /**
124*1148921dSAndreas Gohr     * Exact search on a non-split LookupCollection
1256734bb8cSAndreas Gohr     */
126*1148921dSAndreas Gohr    public function testMetadataExact()
1276734bb8cSAndreas Gohr    {
1286734bb8cSAndreas Gohr        $collection = new MockLookupCollection('le_entity', 'le_token', 'le_freq', 'le_reverse');
1296734bb8cSAndreas Gohr        $collection->lock();
1306734bb8cSAndreas Gohr        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
1316734bb8cSAndreas Gohr        $collection->addEntity('wiki:other', ['wiki:syntax']);
1326734bb8cSAndreas Gohr        $collection->unlock();
1336734bb8cSAndreas Gohr
1346734bb8cSAndreas Gohr        $search = new CollectionSearch($collection);
135*1148921dSAndreas Gohr        $term = $search->addTerm('wiki:syntax');
136*1148921dSAndreas Gohr        $search->execute();
1376734bb8cSAndreas Gohr
138*1148921dSAndreas Gohr        $pages = array_keys($term->getEntityFrequencies());
1396734bb8cSAndreas Gohr        sort($pages);
1406734bb8cSAndreas Gohr        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);
1416734bb8cSAndreas Gohr    }
1426734bb8cSAndreas Gohr
1436734bb8cSAndreas Gohr    /**
144*1148921dSAndreas Gohr     * Wildcard search on a non-split LookupCollection
1456734bb8cSAndreas Gohr     */
146*1148921dSAndreas Gohr    public function testMetadataWildcard()
1476734bb8cSAndreas Gohr    {
1486734bb8cSAndreas Gohr        $collection = new MockLookupCollection('lw_entity', 'lw_token', 'lw_freq', 'lw_reverse');
1496734bb8cSAndreas Gohr        $collection->lock();
1506734bb8cSAndreas Gohr        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
1516734bb8cSAndreas Gohr        $collection->addEntity('wiki:other', ['wiki:syntax', 'other:page']);
1526734bb8cSAndreas Gohr        $collection->unlock();
1536734bb8cSAndreas Gohr
1546734bb8cSAndreas Gohr        // end wildcard: wiki:* matches wiki:syntax and wiki:welcome
155*1148921dSAndreas Gohr        $search = new CollectionSearch($collection);
156*1148921dSAndreas Gohr        $term = $search->addTerm('wiki:*');
157*1148921dSAndreas Gohr        $search->execute();
158*1148921dSAndreas Gohr
159*1148921dSAndreas Gohr        $pages = array_keys($term->getEntityFrequencies());
1606734bb8cSAndreas Gohr        sort($pages);
161*1148921dSAndreas Gohr        // wiki:start has both tokens (freq 2), wiki:other has wiki:syntax (freq 1)
162*1148921dSAndreas Gohr        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);
1636734bb8cSAndreas Gohr
1646734bb8cSAndreas Gohr        // start wildcard: *syntax matches only wiki:syntax
1656734bb8cSAndreas Gohr        $search2 = new CollectionSearch($collection);
166*1148921dSAndreas Gohr        $term2 = $search2->addTerm('*syntax');
167*1148921dSAndreas Gohr        $search2->execute();
168*1148921dSAndreas Gohr
169*1148921dSAndreas Gohr        $pages2 = array_keys($term2->getEntityFrequencies());
1706734bb8cSAndreas Gohr        sort($pages2);
1716734bb8cSAndreas Gohr        $this->assertEquals(['wiki:other', 'wiki:start'], $pages2);
1726734bb8cSAndreas Gohr    }
1736734bb8cSAndreas Gohr
1746734bb8cSAndreas Gohr    /**
175*1148921dSAndreas Gohr     * Case-insensitive search on a non-split LookupCollection
1766734bb8cSAndreas Gohr     */
177*1148921dSAndreas Gohr    public function testMetadataCaseInsensitive()
1786734bb8cSAndreas Gohr    {
1796734bb8cSAndreas Gohr        $collection = new MockLookupCollection('lc_entity', 'lc_token', 'lc_freq', 'lc_reverse');
1806734bb8cSAndreas Gohr        $collection->lock();
1816734bb8cSAndreas Gohr        $collection->addEntity('wiki:start', ['Apple', 'Banana']);
1826734bb8cSAndreas Gohr        $collection->addEntity('wiki:other', ['Cherry', 'Apple Pie']);
1836734bb8cSAndreas Gohr        $collection->unlock();
1846734bb8cSAndreas Gohr
1856734bb8cSAndreas Gohr        $search = new CollectionSearch($collection);
186*1148921dSAndreas Gohr        $search->caseInsensitive();
187*1148921dSAndreas Gohr        $term = $search->addTerm('*apple*');
188*1148921dSAndreas Gohr        $search->execute();
1896734bb8cSAndreas Gohr
190*1148921dSAndreas Gohr        $pages = array_keys($term->getEntityFrequencies());
1916734bb8cSAndreas Gohr        sort($pages);
1926734bb8cSAndreas Gohr        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);
1936734bb8cSAndreas Gohr    }
1946734bb8cSAndreas Gohr
1956734bb8cSAndreas Gohr    /**
196*1148921dSAndreas Gohr     * Search on a DirectCollection (title-style 1:1 mapping)
1976734bb8cSAndreas Gohr     */
198*1148921dSAndreas Gohr    public function testSearchOnDirectCollection()
1996734bb8cSAndreas Gohr    {
2006734bb8cSAndreas Gohr        $collection = new MockDirectCollection('ld_entity', 'ld_token');
2016734bb8cSAndreas Gohr        $collection->lock();
2026734bb8cSAndreas Gohr        $collection->addEntity('wiki:start', ['Welcome to DokuWiki']);
2036734bb8cSAndreas Gohr        $collection->addEntity('wiki:syntax', ['Formatting Syntax']);
2046734bb8cSAndreas Gohr        $collection->addEntity('wiki:other', ['Other Page']);
2056734bb8cSAndreas Gohr        $collection->unlock();
2066734bb8cSAndreas Gohr
2076734bb8cSAndreas Gohr        // exact match
208*1148921dSAndreas Gohr        $search = new CollectionSearch($collection);
209*1148921dSAndreas Gohr        $term = $search->addTerm('Welcome to DokuWiki');
210*1148921dSAndreas Gohr        $search->execute();
211*1148921dSAndreas Gohr        $this->assertEquals(['wiki:start'], array_keys($term->getEntityFrequencies()));
2126734bb8cSAndreas Gohr
2136734bb8cSAndreas Gohr        // wildcard match
2146734bb8cSAndreas Gohr        $search2 = new CollectionSearch($collection);
215*1148921dSAndreas Gohr        $term2 = $search2->addTerm('*Syntax');
216*1148921dSAndreas Gohr        $search2->execute();
217*1148921dSAndreas Gohr        $this->assertEquals(['wiki:syntax'], array_keys($term2->getEntityFrequencies()));
2186734bb8cSAndreas Gohr
219*1148921dSAndreas Gohr        // case-insensitive substring match
2206734bb8cSAndreas Gohr        $search3 = new CollectionSearch($collection);
221*1148921dSAndreas Gohr        $search3->caseInsensitive();
222*1148921dSAndreas Gohr        $term3 = $search3->addTerm('*wiki*');
223*1148921dSAndreas Gohr        $search3->execute();
224*1148921dSAndreas Gohr        $this->assertEquals(['wiki:start'], array_keys($term3->getEntityFrequencies()));
2256734bb8cSAndreas Gohr    }
2266734bb8cSAndreas Gohr
2276734bb8cSAndreas Gohr    /**
228*1148921dSAndreas Gohr     * Multiple terms in a single search
2296734bb8cSAndreas Gohr     */
230*1148921dSAndreas Gohr    public function testMultipleTerms()
2316734bb8cSAndreas Gohr    {
2326734bb8cSAndreas Gohr        $collection = new MockLookupCollection('lm_entity', 'lm_token', 'lm_freq', 'lm_reverse');
2336734bb8cSAndreas Gohr        $collection->lock();
2346734bb8cSAndreas Gohr        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
2356734bb8cSAndreas Gohr        $collection->addEntity('wiki:other', ['wiki:syntax']);
2366734bb8cSAndreas Gohr        $collection->unlock();
2376734bb8cSAndreas Gohr
2386734bb8cSAndreas Gohr        $search = new CollectionSearch($collection);
239*1148921dSAndreas Gohr        $term1 = $search->addTerm('wiki:syntax');
240*1148921dSAndreas Gohr        $term2 = $search->addTerm('wiki:welcome');
241*1148921dSAndreas Gohr        $term3 = $search->addTerm('nonexistent');
242*1148921dSAndreas Gohr        $search->execute();
2436734bb8cSAndreas Gohr
244*1148921dSAndreas Gohr        $syntax = array_keys($term1->getEntityFrequencies());
2456734bb8cSAndreas Gohr        sort($syntax);
2466734bb8cSAndreas Gohr        $this->assertEquals(['wiki:other', 'wiki:start'], $syntax);
247*1148921dSAndreas Gohr        $this->assertEquals(['wiki:start'], array_keys($term2->getEntityFrequencies()));
248*1148921dSAndreas Gohr        $this->assertEquals([], array_keys($term3->getEntityFrequencies()));
2496734bb8cSAndreas Gohr    }
2506734bb8cSAndreas Gohr
2516734bb8cSAndreas Gohr    /**
252*1148921dSAndreas Gohr     * Search on a split FrequencyCollection
2536734bb8cSAndreas Gohr     */
254*1148921dSAndreas Gohr    public function testSearchOnSplitCollection()
2556734bb8cSAndreas Gohr    {
2566734bb8cSAndreas Gohr        $collection = new MockFrequencyCollection('ls_page', 'ls_w', 'ls_i', 'ls_pageword');
2576734bb8cSAndreas Gohr        $collection->lock();
2586734bb8cSAndreas Gohr        $collection->addEntity('page1', ['dokuwiki', 'wiki', 'doku']);
2596734bb8cSAndreas Gohr        $collection->addEntity('page2', ['dokuwiki', 'other']);
2606734bb8cSAndreas Gohr        $collection->unlock();
2616734bb8cSAndreas Gohr
2626734bb8cSAndreas Gohr        $search = new CollectionSearch($collection);
263*1148921dSAndreas Gohr        $term = $search->addTerm('dokuwiki');
264*1148921dSAndreas Gohr        $search->execute();
2656734bb8cSAndreas Gohr
266*1148921dSAndreas Gohr        $pages = array_keys($term->getEntityFrequencies());
2676734bb8cSAndreas Gohr        sort($pages);
2686734bb8cSAndreas Gohr        $this->assertEquals(['page1', 'page2'], $pages);
2696734bb8cSAndreas Gohr    }
27021fbd01bSAndreas Gohr
27121fbd01bSAndreas Gohr    /**
27221fbd01bSAndreas Gohr     * Searching an empty collection returns no results
27321fbd01bSAndreas Gohr     */
27421fbd01bSAndreas Gohr    public function testSearchEmptyCollection()
27521fbd01bSAndreas Gohr    {
27621fbd01bSAndreas Gohr        $collection = new MockFrequencyCollection('empty_page', 'empty_w', 'empty_i', 'empty_pw');
27721fbd01bSAndreas Gohr
27821fbd01bSAndreas Gohr        $search = new CollectionSearch($collection);
27921fbd01bSAndreas Gohr        $term = $search->addTerm('anything');
28021fbd01bSAndreas Gohr        $search->execute();
28121fbd01bSAndreas Gohr        $this->assertEquals([], $term->getEntityFrequencies());
28221fbd01bSAndreas Gohr    }
28321fbd01bSAndreas Gohr
28421fbd01bSAndreas Gohr    /**
285*1148921dSAndreas Gohr     * Search on an empty collection returns empty frequencies
28621fbd01bSAndreas Gohr     */
287*1148921dSAndreas Gohr    public function testSearchEmptyCollection2()
28821fbd01bSAndreas Gohr    {
28921fbd01bSAndreas Gohr        $collection = new MockFrequencyCollection('empty2_page', 'empty2_w', 'empty2_i', 'empty2_pw');
29021fbd01bSAndreas Gohr
29121fbd01bSAndreas Gohr        $search = new CollectionSearch($collection);
292*1148921dSAndreas Gohr        $term = $search->addTerm('anything');
293*1148921dSAndreas Gohr        $search->execute();
294*1148921dSAndreas Gohr        $this->assertEquals([], $term->getEntityFrequencies());
29521fbd01bSAndreas Gohr    }
2966734bb8cSAndreas Gohr}
297