<?php

namespace dokuwiki\test\Search\Collection;

use dokuwiki\Search\Collection\CollectionSearch;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Tokenizer;

/**
 * Tests for CollectionSearch
 *
 * Covers term based searching (addTerm() + execute()) and value lookups (lookup())
 * against the mock collections used by this test suite. Each test uses its own
 * set of index names.
 */
class CollectionSearchTest extends \DokuWikiTest
{

    /**
     * An exact term matches exactly one token and reports per-entity frequencies
     */
    public function testExactTerm()
    {
        // add some content to the indexes
        $collection = new MockFrequencyCollection('page', 'w', 'i', 'pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki']);
        $collection->addEntity('page2', ['dokuwiki', 'other', 'words']);
        $collection->unlock();

        // add search term
        $search = new CollectionSearch($collection);
        $term = $search->addTerm('dokuwiki');

        // execute search
        $search->execute();

        // inspect the term updates first:

        // exact search should only match one token
        $this->assertEquals(['dokuwiki'], $term->getTokens());
        // that token is 8 chars and should be the first in the index
        $this->assertEquals([0], $term->getTokenIDsByGroup(8));
        // the dokuwiki token is two times on page1 and 1 time on page2
        $this->assertEquals(['page1' => 2, 'page2' => 1], $term->getEntityFrequencies());

        // entity IDs should be available from the search
        $this->assertEquals([0 => 'page1', 1 => 'page2'], $search->getEntities());

    }

    /**
     * Terms with a leading, trailing or surrounding wildcard match all fitting tokens
     * and sum up the frequencies of those tokens per entity
     */
    public function testWildcardSearch()
    {
        // page1 has: dokuwiki(x2), dokuwikis, doku, wiki
        // page2 has: dokuwiki, other, words
        $collection = new MockFrequencyCollection('wc_page', 'wc_w', 'wc_i', 'wc_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki']);
        $collection->addEntity('page2', ['dokuwiki', 'other', 'words']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $endWild = $search->addTerm('doku*');
        $startWild = $search->addTerm('*wiki');
        $bothWild = $search->addTerm('*kuwi*');
        $search->execute();

        // doku* should match: doku(4), dokuwiki(8), dokuwikis(9)
        $endTokens = $endWild->getTokens();
        sort($endTokens); // token order is not part of the expectation
        $this->assertEquals(['doku', 'dokuwiki', 'dokuwikis'], $endTokens);
        // page1 has doku(1) + dokuwiki(2) + dokuwikis(1) = 4, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 4, 'page2' => 1], $endWild->getEntityFrequencies());

        // *wiki should match: dokuwiki(8), wiki(4)
        $startTokens = $startWild->getTokens();
        sort($startTokens);
        $this->assertEquals(['dokuwiki', 'wiki'], $startTokens);
        // page1 has dokuwiki(2) + wiki(1) = 3, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $startWild->getEntityFrequencies());

        // *kuwi* should match: dokuwiki(8), dokuwikis(9)
        $bothTokens = $bothWild->getTokens();
        sort($bothTokens);
        $this->assertEquals(['dokuwiki', 'dokuwikis'], $bothTokens);
        // page1 has dokuwiki(2) + dokuwikis(1) = 3, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $bothWild->getEntityFrequencies());
    }

    /**
     * Index a real text file via the Tokenizer and search it
     */
    public function testTokenizedPageSearch()
    {
        // fail with a clear message when the fixture is missing instead of
        // handing the false returned by file_get_contents() to the tokenizer
        $fixture = __DIR__ . '/../data/searchtest.txt';
        $this->assertFileExists($fixture);
        $text = file_get_contents($fixture);
        $tokens = Tokenizer::getWords($text);

        $collection = new MockFrequencyCollection('tp_page', 'tp_w', 'tp_i', 'tp_pageword');
        $collection->lock();
        $collection->addEntity('search:test', $tokens);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $exact = $search->addTerm('dokuwiki');
        $wild = $search->addTerm('plugin*');
        $search->execute();

        // "dokuwiki" appears 4 times in the text (case-insensitive tokenization)
        $this->assertEquals(['dokuwiki'], $exact->getTokens());
        $this->assertEquals(['search:test' => 4], $exact->getEntityFrequencies());

        // "plugin*" should match "plugins" (7 chars) and "plugin" would be too if present
        $wildTokens = $wild->getTokens();
        $this->assertContains('plugins', $wildTokens);
        $this->assertNotEmpty($wild->getEntityFrequencies());
        $this->assertArrayHasKey('search:test', $wild->getEntityFrequencies());
    }

    /**
     * A term that matches nothing yields no tokens, no frequencies and no entities
     */
    public function testNoMatchReturnsEmptyFrequencies()
    {
        $collection = new MockFrequencyCollection('nm_page', 'nm_w', 'nm_i', 'nm_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['alpha', 'beta', 'gamma']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('zzzznotfound');
        $search->execute();

        $this->assertEmpty($term->getTokens());
        $this->assertEmpty($term->getEntityFrequencies());
        $this->assertEmpty($search->getEntities());
    }

    // --- lookup() tests ---

    /**
     * Exact lookup on a non-split LookupCollection
     */
    public function testLookupExact()
    {
        $collection = new MockLookupCollection('le_entity', 'le_token', 'le_freq', 'le_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $result = $search->lookup('wiki:syntax');

        // the result is keyed by the looked up value
        $this->assertCount(1, $result);
        $this->assertArrayHasKey('wiki:syntax', $result);
        $pages = $result['wiki:syntax'];
        sort($pages); // entity order is not part of the expectation
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);
    }

    /**
     * Wildcard lookup on a non-split LookupCollection
     */
    public function testLookupWildcard()
    {
        $collection = new MockLookupCollection('lw_entity', 'lw_token', 'lw_freq', 'lw_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax', 'other:page']);
        $collection->unlock();

        $search = new CollectionSearch($collection);

        // end wildcard: wiki:* matches wiki:syntax and wiki:welcome
        // wiki:start has both tokens, so it appears twice; wiki:other has wiki:syntax once
        $result = $search->lookup('wiki:*');
        $pages = $result['wiki:*'];
        sort($pages);
        $this->assertEquals(['wiki:other', 'wiki:start', 'wiki:start'], $pages);

        // start wildcard: *syntax matches only wiki:syntax
        $search2 = new CollectionSearch($collection);
        $result2 = $search2->lookup('*syntax');
        $pages2 = $result2['*syntax'];
        sort($pages2);
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages2);
    }

    /**
     * Callback lookup on a non-split LookupCollection
     */
    public function testLookupCallback()
    {
        $collection = new MockLookupCollection('lc_entity', 'lc_token', 'lc_freq', 'lc_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['Apple', 'Banana']);
        $collection->addEntity('wiki:other', ['Cherry', 'Apple Pie']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        // case-insensitive substring match
        $result = $search->lookup('apple', static fn($search, $word) => stripos($word, $search) !== false);

        $pages = $result['apple'];
        sort($pages);
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);
    }

    /**
     * lookup() on a DirectCollection (title-style 1:1 mapping)
     */
    public function testLookupOnDirectCollection()
    {
        $collection = new MockDirectCollection('ld_entity', 'ld_token');
        $collection->lock();
        $collection->addEntity('wiki:start', ['Welcome to DokuWiki']);
        $collection->addEntity('wiki:syntax', ['Formatting Syntax']);
        $collection->addEntity('wiki:other', ['Other Page']);
        $collection->unlock();

        $search = new CollectionSearch($collection);

        // exact match
        $result = $search->lookup('Welcome to DokuWiki');
        $this->assertEquals(['wiki:start'], $result['Welcome to DokuWiki']);

        // wildcard match
        $search2 = new CollectionSearch($collection);
        $result2 = $search2->lookup('*Syntax');
        $this->assertEquals(['wiki:syntax'], $result2['*Syntax']);

        // callback match (case-insensitive substring)
        $search3 = new CollectionSearch($collection);
        $result3 = $search3->lookup('wiki', static fn($s, $w) => stripos($w, $s) !== false);
        $this->assertEquals(['wiki:start'], $result3['wiki']);
    }

    /**
     * lookup() with multiple values
     */
    public function testLookupMultipleValues()
    {
        $collection = new MockLookupCollection('lm_entity', 'lm_token', 'lm_freq', 'lm_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $result = $search->lookup(['wiki:syntax', 'wiki:welcome', 'nonexistent']);

        $syntax = $result['wiki:syntax'];
        sort($syntax);
        $this->assertEquals(['wiki:other', 'wiki:start'], $syntax);
        $this->assertEquals(['wiki:start'], $result['wiki:welcome']);
        // values without any match are still present in the result, with an empty list
        $this->assertEquals([], $result['nonexistent']);
    }

    /**
     * lookup() on a split FrequencyCollection
     */
    public function testLookupOnSplitCollection()
    {
        $collection = new MockFrequencyCollection('ls_page', 'ls_w', 'ls_i', 'ls_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'wiki', 'doku']);
        $collection->addEntity('page2', ['dokuwiki', 'other']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $result = $search->lookup('dokuwiki');

        $pages = $result['dokuwiki'];
        sort($pages);
        $this->assertEquals(['page1', 'page2'], $pages);
    }

    /**
     * Searching an empty collection returns no results
     */
    public function testSearchEmptyCollection()
    {
        // nothing is added to the collection before searching
        $collection = new MockFrequencyCollection('empty_page', 'empty_w', 'empty_i', 'empty_pw');

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('anything');
        $search->execute();
        $this->assertEquals([], $term->getEntityFrequencies());
    }

    /**
     * Lookup on an empty collection returns empty arrays
     */
    public function testLookupEmptyCollection()
    {
        // nothing is added to the collection before the lookup
        $collection = new MockFrequencyCollection('empty2_page', 'empty2_w', 'empty2_i', 'empty2_pw');

        $search = new CollectionSearch($collection);
        $result = $search->lookup('anything');
        $this->assertEquals([], $result['anything']);
    }
}