<?php

namespace dokuwiki\test\Search\Collection;

use dokuwiki\Search\Collection\CollectionSearch;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Tokenizer;

/**
 * Tests for CollectionSearch run against the Mock*Collection helpers of this namespace.
 *
 * Every test fills a collection (using index names unique to that test), registers
 * one or more terms via addTerm(), calls execute() and then inspects the returned
 * term objects through getTokens(), getEntityFrequencies() and getMatches().
 */
class CollectionSearchTest extends \DokuWikiTest
{

    /**
     * An exact term matches only the identical token and reports per-entity counts
     */
    public function testExactTerm()
    {
        // add some content to the indexes
        $collection = new MockFrequencyCollection('page', 'w', 'i', 'pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki']);
        $collection->addEntity('page2', ['dokuwiki', 'other', 'words']);
        $collection->unlock();

        // add search term
        $search = new CollectionSearch($collection);
        $term = $search->addTerm('dokuwiki');

        // execute search
        $search->execute();

        // exact search should only match one token
        $this->assertEquals(['dokuwiki'], $term->getTokens());
        // the dokuwiki token is two times on page1 and 1 time on page2
        $this->assertEquals(['page1' => 2, 'page2' => 1], $term->getEntityFrequencies());
        // full detail available
        $this->assertEquals(['dokuwiki' => 2], $term->getMatches()['page1']);
        $this->assertEquals(['dokuwiki' => 1], $term->getMatches()['page2']);
    }

    /**
     * Wildcards at the end, at the start and on both sides of a term
     *
     * Numbers in parentheses in the comments below are the token lengths.
     */
    public function testWildcardSearch()
    {
        // page1 has: dokuwiki(x2), dokuwikis, doku, wiki
        // page2 has: dokuwiki, other, words
        $collection = new MockFrequencyCollection('wc_page', 'wc_w', 'wc_i', 'wc_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki']);
        $collection->addEntity('page2', ['dokuwiki', 'other', 'words']);
        $collection->unlock();

        // all three terms are resolved by a single execute() call
        $search = new CollectionSearch($collection);
        $endWild = $search->addTerm('doku*');
        $startWild = $search->addTerm('*wiki');
        $bothWild = $search->addTerm('*kuwi*');
        $search->execute();

        // doku* should match: doku(4), dokuwiki(8), dokuwikis(9)
        $endTokens = $endWild->getTokens();
        sort($endTokens); // token order is not part of the expectation
        $this->assertEquals(['doku', 'dokuwiki', 'dokuwikis'], $endTokens);
        // page1 has doku(1) + dokuwiki(2) + dokuwikis(1) = 4, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 4, 'page2' => 1], $endWild->getEntityFrequencies());

        // *wiki should match: dokuwiki(8), wiki(4)
        $startTokens = $startWild->getTokens();
        sort($startTokens);
        $this->assertEquals(['dokuwiki', 'wiki'], $startTokens);
        // page1 has dokuwiki(2) + wiki(1) = 3, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $startWild->getEntityFrequencies());

        // *kuwi* should match: dokuwiki(8), dokuwikis(9)
        $bothTokens = $bothWild->getTokens();
        sort($bothTokens);
        $this->assertEquals(['dokuwiki', 'dokuwikis'], $bothTokens);
        // page1 has dokuwiki(2) + dokuwikis(1) = 3, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $bothWild->getEntityFrequencies());
    }

    /**
     * Index a real text file via the Tokenizer and search it
     */
    public function testTokenizedPageSearch()
    {
        // fail with a clear message instead of handing `false` to the tokenizer
        $fixture = __DIR__ . '/../data/searchtest.txt';
        $this->assertFileExists($fixture);

        $text = file_get_contents($fixture);
        $tokens = Tokenizer::getWords($text);

        $collection = new MockFrequencyCollection('tp_page', 'tp_w', 'tp_i', 'tp_pageword');
        $collection->lock();
        $collection->addEntity('search:test', $tokens);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $exact = $search->addTerm('dokuwiki');
        $wild = $search->addTerm('plugin*');
        $search->execute();

        // "dokuwiki" appears 4 times in the text (case-insensitive tokenization)
        $this->assertEquals(['dokuwiki'], $exact->getTokens());
        $this->assertEquals(['search:test' => 4], $exact->getEntityFrequencies());

        // "plugin*" should match "plugins" (7 chars) and "plugin" would be too if present
        $wildTokens = $wild->getTokens();
        $this->assertContains('plugins', $wildTokens);
        $this->assertNotEmpty($wild->getEntityFrequencies());
        $this->assertArrayHasKey('search:test', $wild->getEntityFrequencies());
    }

    /**
     * A term that matches nothing yields empty tokens, frequencies and matches
     */
    public function testNoMatchReturnsEmptyFrequencies()
    {
        $collection = new MockFrequencyCollection('nm_page', 'nm_w', 'nm_i', 'nm_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['alpha', 'beta', 'gamma']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('zzzznotfound');
        $search->execute();

        $this->assertEmpty($term->getTokens());
        $this->assertEmpty($term->getEntityFrequencies());
        $this->assertEmpty($term->getMatches());
    }

    // --- metadata-style search tests (using addTerm/execute without length restrictions) ---

    /**
     * Exact search on a non-split LookupCollection
     */
    public function testMetadataExact()
    {
        $collection = new MockLookupCollection('le_entity', 'le_token', 'le_freq', 'le_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('wiki:syntax');
        $search->execute();

        // only the set of matching entities is checked, so normalise the order
        $pages = array_keys($term->getEntityFrequencies());
        sort($pages);
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);
    }

    /**
     * Wildcard search on a non-split LookupCollection
     */
    public function testMetadataWildcard()
    {
        $collection = new MockLookupCollection('lw_entity', 'lw_token', 'lw_freq', 'lw_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax', 'other:page']);
        $collection->unlock();

        // end wildcard: wiki:* matches wiki:syntax and wiki:welcome
        $search = new CollectionSearch($collection);
        $term = $search->addTerm('wiki:*');
        $search->execute();

        $pages = array_keys($term->getEntityFrequencies());
        sort($pages);
        // wiki:start has both tokens (freq 2), wiki:other has wiki:syntax (freq 1)
        // (only the entity names are asserted here, not the frequency values)
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);

        // start wildcard: *syntax matches only wiki:syntax
        $search2 = new CollectionSearch($collection);
        $term2 = $search2->addTerm('*syntax');
        $search2->execute();

        $pages2 = array_keys($term2->getEntityFrequencies());
        sort($pages2);
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages2);
    }

    /**
     * Case-insensitive search on a non-split LookupCollection
     */
    public function testMetadataCaseInsensitive()
    {
        $collection = new MockLookupCollection('lc_entity', 'lc_token', 'lc_freq', 'lc_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['Apple', 'Banana']);
        $collection->addEntity('wiki:other', ['Cherry', 'Apple Pie']);
        $collection->unlock();

        // lower-case term against capitalised tokens: "Apple" and "Apple Pie" are expected to match
        $search = new CollectionSearch($collection);
        $search->caseInsensitive();
        $term = $search->addTerm('*apple*');
        $search->execute();

        $pages = array_keys($term->getEntityFrequencies());
        sort($pages);
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);
    }

    /**
     * Search on a DirectCollection (title-style 1:1 mapping)
     */
    public function testSearchOnDirectCollection()
    {
        $collection = new MockDirectCollection('ld_entity', 'ld_token');
        $collection->lock();
        $collection->addEntity('wiki:start', ['Welcome to DokuWiki']);
        $collection->addEntity('wiki:syntax', ['Formatting Syntax']);
        $collection->addEntity('wiki:other', ['Other Page']);
        $collection->unlock();

        // exact match
        $search = new CollectionSearch($collection);
        $term = $search->addTerm('Welcome to DokuWiki');
        $search->execute();
        $this->assertEquals(['wiki:start'], array_keys($term->getEntityFrequencies()));

        // wildcard match
        $search2 = new CollectionSearch($collection);
        $term2 = $search2->addTerm('*Syntax');
        $search2->execute();
        $this->assertEquals(['wiki:syntax'], array_keys($term2->getEntityFrequencies()));

        // case-insensitive substring match
        $search3 = new CollectionSearch($collection);
        $search3->caseInsensitive();
        $term3 = $search3->addTerm('*wiki*');
        $search3->execute();
        $this->assertEquals(['wiki:start'], array_keys($term3->getEntityFrequencies()));
    }

    /**
     * Multiple terms in a single search
     */
    public function testMultipleTerms()
    {
        $collection = new MockLookupCollection('lm_entity', 'lm_token', 'lm_freq', 'lm_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $term1 = $search->addTerm('wiki:syntax');
        $term2 = $search->addTerm('wiki:welcome');
        $term3 = $search->addTerm('nonexistent');
        $search->execute();

        // each term keeps its own result set
        $syntax = array_keys($term1->getEntityFrequencies());
        sort($syntax);
        $this->assertEquals(['wiki:other', 'wiki:start'], $syntax);
        $this->assertEquals(['wiki:start'], array_keys($term2->getEntityFrequencies()));
        $this->assertEquals([], array_keys($term3->getEntityFrequencies()));
    }

    /**
     * Search on a split FrequencyCollection
     */
    public function testSearchOnSplitCollection()
    {
        $collection = new MockFrequencyCollection('ls_page', 'ls_w', 'ls_i', 'ls_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'wiki', 'doku']);
        $collection->addEntity('page2', ['dokuwiki', 'other']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('dokuwiki');
        $search->execute();

        $pages = array_keys($term->getEntityFrequencies());
        sort($pages);
        $this->assertEquals(['page1', 'page2'], $pages);
    }

    /**
     * Searching an empty collection returns no results
     */
    public function testSearchEmptyCollection()
    {
        // nothing is added to the collection before searching
        $collection = new MockFrequencyCollection('empty_page', 'empty_w', 'empty_i', 'empty_pw');

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('anything');
        $search->execute();
        $this->assertEquals([], $term->getEntityFrequencies());
    }

    /**
     * Search on an empty collection returns empty frequencies
     *
     * NOTE(review): apart from the index names this repeats testSearchEmptyCollection()
     * step for step - consider merging the two or giving this one a distinct scenario.
     */
    public function testSearchEmptyCollection2()
    {
        $collection = new MockFrequencyCollection('empty2_page', 'empty2_w', 'empty2_i', 'empty2_pw');

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('anything');
        $search->execute();
        $this->assertEquals([], $term->getEntityFrequencies());
    }
}