<?php

namespace dokuwiki\test\Search\Collection;

use dokuwiki\Search\Collection\CollectionSearch;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Tokenizer;

/**
 * Tests for CollectionSearch
 *
 * The first group of tests covers term searches (addTerm() followed by execute()),
 * the second group covers value lookups via lookup().
 */
class CollectionSearchTest extends \DokuWikiTest
{
    /**
     * Search for a single exact term and inspect the resulting term and entity data
     */
    public function testExactTerm()
    {
        // add some content to the indexes
        $collection = new MockFrequencyCollection('page', 'w', 'i', 'pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki']);
        $collection->addEntity('page2', ['dokuwiki', 'other', 'words']);
        $collection->unlock();

        // add search term
        $search = new CollectionSearch($collection);
        $term = $search->addTerm('dokuwiki');

        // execute search
        $search->execute();

        // inspect the term updates first:

        // exact search should only match one token
        $this->assertEquals(['dokuwiki'], $term->getTokens());
        // that token is 8 chars and should be the first in the index
        $this->assertEquals([0], $term->getTokenIDsByGroup(8));
        // the dokuwiki token is two times on page1 and 1 time on page2
        $this->assertEquals(['page1' => 2, 'page2' => 1], $term->getEntityFrequencies());

        // entity IDs should be available from the search
        $this->assertEquals([0 => 'page1', 1 => 'page2'], $search->getEntities());
    }

    /**
     * Search with a trailing, a leading and a surrounding wildcard in one search run
     *
     * Token lists are sorted before comparison, so the order in which the search
     * reports matching tokens does not influence the result.
     */
    public function testWildcardSearch()
    {
        // page1 has: dokuwiki(x2), dokuwikis, doku, wiki
        // page2 has: dokuwiki, other, words
        $collection = new MockFrequencyCollection('wc_page', 'wc_w', 'wc_i', 'wc_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'dokuwiki', 'dokuwikis', 'doku', 'wiki']);
        $collection->addEntity('page2', ['dokuwiki', 'other', 'words']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $endWild = $search->addTerm('doku*');
        $startWild = $search->addTerm('*wiki');
        $bothWild = $search->addTerm('*kuwi*');
        $search->execute();

        // doku* should match: doku(4), dokuwiki(8), dokuwikis(9)
        $endTokens = $endWild->getTokens();
        sort($endTokens);
        $this->assertEquals(['doku', 'dokuwiki', 'dokuwikis'], $endTokens);
        // page1 has doku(1) + dokuwiki(2) + dokuwikis(1) = 4, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 4, 'page2' => 1], $endWild->getEntityFrequencies());

        // *wiki should match: dokuwiki(8), wiki(4)
        $startTokens = $startWild->getTokens();
        sort($startTokens);
        $this->assertEquals(['dokuwiki', 'wiki'], $startTokens);
        // page1 has dokuwiki(2) + wiki(1) = 3, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $startWild->getEntityFrequencies());

        // *kuwi* should match: dokuwiki(8), dokuwikis(9)
        $bothTokens = $bothWild->getTokens();
        sort($bothTokens);
        $this->assertEquals(['dokuwiki', 'dokuwikis'], $bothTokens);
        // page1 has dokuwiki(2) + dokuwikis(1) = 3, page2 has dokuwiki(1) = 1
        $this->assertEquals(['page1' => 3, 'page2' => 1], $bothWild->getEntityFrequencies());
    }

    /**
     * Index a real text file via the Tokenizer and search it
     */
    public function testTokenizedPageSearch()
    {
        // fail with a clear message when the fixture is missing instead of
        // handing a failed read result to the tokenizer
        $fixture = __DIR__ . '/../data/searchtest.txt';
        $this->assertFileIsReadable($fixture, 'search fixture is missing or not readable');

        $text = file_get_contents($fixture);
        $tokens = Tokenizer::getWords($text);

        $collection = new MockFrequencyCollection('tp_page', 'tp_w', 'tp_i', 'tp_pageword');
        $collection->lock();
        $collection->addEntity('search:test', $tokens);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $exact = $search->addTerm('dokuwiki');
        $wild = $search->addTerm('plugin*');
        $search->execute();

        // "dokuwiki" appears 4 times in the text (case-insensitive tokenization)
        $this->assertEquals(['dokuwiki'], $exact->getTokens());
        $this->assertEquals(['search:test' => 4], $exact->getEntityFrequencies());

        // "plugin*" should match "plugins" (7 chars) and "plugin" would be too if present
        $wildTokens = $wild->getTokens();
        $this->assertContains('plugins', $wildTokens);
        $this->assertNotEmpty($wild->getEntityFrequencies());
        $this->assertArrayHasKey('search:test', $wild->getEntityFrequencies());
    }

    /**
     * A term that matches nothing yields no tokens, no frequencies and no entities
     */
    public function testNoMatchReturnsEmptyFrequencies()
    {
        $collection = new MockFrequencyCollection('nm_page', 'nm_w', 'nm_i', 'nm_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['alpha', 'beta', 'gamma']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $term = $search->addTerm('zzzznotfound');
        $search->execute();

        $this->assertEmpty($term->getTokens());
        $this->assertEmpty($term->getEntityFrequencies());
        $this->assertEmpty($search->getEntities());
    }

    // --- lookup() tests ---

    /**
     * Exact lookup on a non-split LookupCollection
     */
    public function testLookupExact()
    {
        $collection = new MockLookupCollection('le_entity', 'le_token', 'le_freq', 'le_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $result = $search->lookup('wiki:syntax');

        // the result is keyed by the looked up value
        $this->assertCount(1, $result);
        $this->assertArrayHasKey('wiki:syntax', $result);
        $pages = $result['wiki:syntax'];
        sort($pages);
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);
    }

    /**
     * Wildcard lookup on a non-split LookupCollection
     */
    public function testLookupWildcard()
    {
        $collection = new MockLookupCollection('lw_entity', 'lw_token', 'lw_freq', 'lw_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax', 'other:page']);
        $collection->unlock();

        $search = new CollectionSearch($collection);

        // end wildcard: wiki:* matches wiki:syntax and wiki:welcome
        // wiki:start has both tokens, so it appears twice; wiki:other has wiki:syntax once
        $result = $search->lookup('wiki:*');
        $pages = $result['wiki:*'];
        sort($pages);
        $this->assertEquals(['wiki:other', 'wiki:start', 'wiki:start'], $pages);

        // start wildcard: *syntax matches only wiki:syntax
        $search2 = new CollectionSearch($collection);
        $result2 = $search2->lookup('*syntax');
        $pages2 = $result2['*syntax'];
        sort($pages2);
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages2);
    }

    /**
     * Callback lookup on a non-split LookupCollection
     */
    public function testLookupCallback()
    {
        $collection = new MockLookupCollection('lc_entity', 'lc_token', 'lc_freq', 'lc_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['Apple', 'Banana']);
        $collection->addEntity('wiki:other', ['Cherry', 'Apple Pie']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        // case-insensitive substring match
        $result = $search->lookup('apple', static fn($search, $word) => stripos($word, $search) !== false);

        $pages = $result['apple'];
        sort($pages);
        $this->assertEquals(['wiki:other', 'wiki:start'], $pages);
    }

    /**
     * lookup() on a DirectCollection (title-style 1:1 mapping)
     */
    public function testLookupOnDirectCollection()
    {
        $collection = new MockDirectCollection('ld_entity', 'ld_token');
        $collection->lock();
        $collection->addEntity('wiki:start', ['Welcome to DokuWiki']);
        $collection->addEntity('wiki:syntax', ['Formatting Syntax']);
        $collection->addEntity('wiki:other', ['Other Page']);
        $collection->unlock();

        $search = new CollectionSearch($collection);

        // exact match
        $result = $search->lookup('Welcome to DokuWiki');
        $this->assertEquals(['wiki:start'], $result['Welcome to DokuWiki']);

        // wildcard match
        $search2 = new CollectionSearch($collection);
        $result2 = $search2->lookup('*Syntax');
        $this->assertEquals(['wiki:syntax'], $result2['*Syntax']);

        // callback match (case-insensitive substring)
        $search3 = new CollectionSearch($collection);
        $result3 = $search3->lookup('wiki', static fn($s, $w) => stripos($w, $s) !== false);
        $this->assertEquals(['wiki:start'], $result3['wiki']);
    }

    /**
     * lookup() with multiple values
     */
    public function testLookupMultipleValues()
    {
        $collection = new MockLookupCollection('lm_entity', 'lm_token', 'lm_freq', 'lm_reverse');
        $collection->lock();
        $collection->addEntity('wiki:start', ['wiki:syntax', 'wiki:welcome']);
        $collection->addEntity('wiki:other', ['wiki:syntax']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $result = $search->lookup(['wiki:syntax', 'wiki:welcome', 'nonexistent']);

        $syntax = $result['wiki:syntax'];
        sort($syntax);
        $this->assertEquals(['wiki:other', 'wiki:start'], $syntax);
        $this->assertEquals(['wiki:start'], $result['wiki:welcome']);
        // values without any match are still present in the result, with an empty list
        $this->assertEquals([], $result['nonexistent']);
    }

    /**
     * lookup() on a split FrequencyCollection
     */
    public function testLookupOnSplitCollection()
    {
        $collection = new MockFrequencyCollection('ls_page', 'ls_w', 'ls_i', 'ls_pageword');
        $collection->lock();
        $collection->addEntity('page1', ['dokuwiki', 'wiki', 'doku']);
        $collection->addEntity('page2', ['dokuwiki', 'other']);
        $collection->unlock();

        $search = new CollectionSearch($collection);
        $result = $search->lookup('dokuwiki');

        $pages = $result['dokuwiki'];
        sort($pages);
        $this->assertEquals(['page1', 'page2'], $pages);
    }
}