<?php

namespace dokuwiki\test\Search\Collection;

use dokuwiki\Search\Collection\FulltextCollection;
use dokuwiki\Search\Collection\FulltextCollectionSearch;
use dokuwiki\Search\Collection\Term;
use dokuwiki\Search\Exception\SearchException;
use dokuwiki\Search\Index\MemoryIndex;
use dokuwiki\Search\Query\QueryParser;
use dokuwiki\Search\Tokenizer;

/**
 * Tests for the search Term class
 *
 * Covers wildcard parsing, regex quoting, length validation and the
 * token / entity frequency bookkeeping exposed by Term.
 */
class TermTest extends \DokuWikiTest
{
    /**
     * A term without wildcards keeps its text for original, base and quoted form
     */
    public function testBasicExact()
    {
        $term = new Term('dokuwiki');

        $this->assertEquals('dokuwiki', $term->getOriginal());
        $this->assertEquals('dokuwiki', $term->getBase());
        $this->assertEquals('dokuwiki', $term->getQuoted());
        $this->assertEquals(8, $term->getLength());
        $this->assertEquals(Term::WILDCARD_NONE, $term->getWildcard());
    }

    /**
     * A leading asterisk is removed from the base and becomes `.*` in the quoted form
     */
    public function testBasicLeftWildcard()
    {
        $term = new Term('*wiki');

        $this->assertEquals('*wiki', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('.*wiki', $term->getQuoted());
        $this->assertEquals(4, $term->getLength());
        $this->assertEquals(Term::WILDCARD_START, $term->getWildcard());
    }

    /**
     * A trailing asterisk is removed from the base and becomes `.*` in the quoted form
     */
    public function testBasicRightWildcard()
    {
        $term = new Term('wiki*');

        $this->assertEquals('wiki*', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('wiki.*', $term->getQuoted());
        $this->assertEquals(4, $term->getLength());
        $this->assertEquals(Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * Asterisks on both ends are reported as the sum of both wildcard flags
     */
    public function testBasicBothWildcard()
    {
        $term = new Term('*wiki*');

        $this->assertEquals('*wiki*', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('.*wiki.*', $term->getQuoted());
        $this->assertEquals(4, $term->getLength());
        $this->assertEquals(Term::WILDCARD_START + Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * An empty string is rejected with a "short" SearchException
     */
    public function testEmptyTerm()
    {
        $this->expectException(SearchException::class);
        $this->expectExceptionMessageMatches('/short/i');
        new Term('');
    }

    /**
     * Tokens added per length are all returned by getTokens() and their IDs per length
     */
    public function testTokenAdding()
    {
        $term = new Term('*wiki*');
        $term->addTokens(8, [0 => 'dokuwiki']);
        $term->addTokens(5, [0 => 'wikis', 134 => 'awiki']);

        $this->assertEquals(['dokuwiki', 'wikis', 'awiki'], $term->getTokens());

        $this->assertEquals([0], $term->getTokenIDsByLength(8));
        $this->assertEquals([0, 134], $term->getTokenIDsByLength(5));
        $this->assertEquals([], $term->getTokenIDsByLength(3));
    }

    /**
     * Frequencies for the same entity add up and can be re-keyed through a map
     */
    public function testFrequencyAdding()
    {
        $term = new Term('dokuwiki');

        $term->addEntityFrequency(7, 7);
        $term->addEntityFrequency(7, 7);
        $term->addEntityFrequency(8, 1);

        $this->assertEquals([7 => 14, 8 => 1], $term->getEntityFrequencies());

        $map = [
            7 => 'page1',
            8 => 'page2'
        ];
        $term->resolveEntities($map);

        $this->assertEquals(['page1' => 14, 'page2' => 1], $term->getEntityFrequencies());
    }

    /**
     * A short numeric term is accepted
     */
    public function testNumericTerm()
    {
        // Numeric terms should be allowed even if they're shorter than minimum word length
        $term = new Term('42');

        $this->assertEquals('42', $term->getOriginal());
        $this->assertEquals('42', $term->getBase());
        $this->assertEquals(2, $term->getLength());
        $this->assertEquals(Term::WILDCARD_NONE, $term->getWildcard());
    }

    /**
     * Regex meta characters in the term are escaped in the quoted form
     */
    public function testSpecialCharactersQuoting()
    {
        $term = new Term('test.doc');

        $this->assertEquals('test.doc', $term->getOriginal());
        $this->assertEquals('test.doc', $term->getBase());
        // The dot should be escaped in the quoted version
        $this->assertEquals('test\\.doc', $term->getQuoted());
    }

    /**
     * Escaping of regex meta characters and wildcard expansion work together
     */
    public function testSpecialCharactersWithWildcard()
    {
        $term = new Term('test.*');

        $this->assertEquals('test.*', $term->getOriginal());
        $this->assertEquals('test.', $term->getBase());
        // The dot should be escaped, but the wildcard * should become .*
        $this->assertEquals('test\\..*', $term->getQuoted());
        $this->assertEquals(Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * Wildcards on both ends are trimmed from the base
     *
     * NOTE(review): the input contains no whitespace, so this only checks
     * asterisk trimming and currently mirrors testBasicBothWildcard().
     */
    public function testWildcardTrimming()
    {
        $term = new Term('*wiki*');

        $this->assertEquals('*wiki*', $term->getOriginal());
        $this->assertEquals('wiki', $term->getBase());
        $this->assertEquals('.*wiki.*', $term->getQuoted());
        $this->assertEquals(Term::WILDCARD_START + Term::WILDCARD_END, $term->getWildcard());
    }

    /**
     * A term one character below the configured minimum word length is rejected
     *
     * Skipped when the minimum word length is 1 or less.
     */
    public function testTooShortTerm()
    {
        $minLength = Tokenizer::getMinWordLength();

        if ($minLength <= 1) {
            // If minimum length is 1 or less, this test doesn't apply.
            // markTestSkipped() aborts the test, nothing below is executed.
            $this->markTestSkipped('Minimum word length is too small for this test');
        }

        $this->expectException(SearchException::class);
        $this->expectExceptionMessageMatches('/short/i');
        // Create a term that's too short (one character less than minimum)
        new Term(str_repeat('a', $minLength - 1));
    }

    /**
     * A term made of asterisks only is rejected with a "short" SearchException
     */
    public function testOnlyWildcards()
    {
        $this->expectException(SearchException::class);
        $this->expectExceptionMessageMatches('/short/i');
        new Term('***');
    }

    /**
     * Tokens registered under several lengths are kept apart per length
     */
    public function testMultipleLengthTokens()
    {
        $term = new Term('*wiki*');

        // Add tokens of various lengths, each filed under its real character count
        $term->addTokens(4, [10 => 'wiki']);
        $term->addTokens(5, [11 => 'mwiki']);
        $term->addTokens(6, [21 => 'pmwiki']);
        $term->addTokens(8, [20 => 'dokuwiki']);
        $term->addTokens(9, [30 => 'mediawiki']);

        // Check we get all tokens
        $allTokens = $term->getTokens();
        $this->assertCount(5, $allTokens);
        $this->assertContains('wiki', $allTokens);
        $this->assertContains('dokuwiki', $allTokens);
        $this->assertContains('mediawiki', $allTokens);

        // Check we can get tokens by specific length
        $this->assertEquals([10], $term->getTokenIDsByLength(4));
        $this->assertEquals([11], $term->getTokenIDsByLength(5));
        $this->assertEquals([21], $term->getTokenIDsByLength(6));
        $this->assertEquals([20], $term->getTokenIDsByLength(8));
        $this->assertEquals([30], $term->getTokenIDsByLength(9));
        // No tokens were registered for this length
        $this->assertEquals([], $term->getTokenIDsByLength(7));
    }

    /**
     * Several frequency additions for one entity are summed up
     */
    public function testFrequencyAggregationAcrossTokens()
    {
        // Simulate a search where term matches multiple tokens on the same entity
        $term = new Term('*wiki*');

        // Entity 1 has multiple matching tokens
        $term->addEntityFrequency(1, 5); // first token appears 5 times
        $term->addEntityFrequency(1, 3); // second token appears 3 times
        $term->addEntityFrequency(1, 2); // third token appears 2 times

        // Entity 2 has one matching token
        $term->addEntityFrequency(2, 7);

        $frequencies = $term->getEntityFrequencies();
        $this->assertEquals(10, $frequencies[1]); // 5 + 3 + 2
        $this->assertEquals(7, $frequencies[2]);
    }

    /**
     * Asking for a length without tokens yields an empty list
     */
    public function testEmptyTokensByLength()
    {
        $term = new Term('dokuwiki');

        // Before adding any tokens, getting by length should return empty
        $this->assertEquals([], $term->getTokenIDsByLength(8));

        // After adding tokens, querying a non-existent length returns empty
        $term->addTokens(4, [10 => 'wiki']);
        $this->assertEquals([], $term->getTokenIDsByLength(8));
    }

    /**
     * A frequency of zero is stored like any other value
     */
    public function testZeroFrequency()
    {
        $term = new Term('dokuwiki');

        $term->addEntityFrequency(1, 5);
        $term->addEntityFrequency(2, 0); // Zero frequency
        $term->addEntityFrequency(3, 3);

        $frequencies = $term->getEntityFrequencies();
        $this->assertEquals(5, $frequencies[1]);
        $this->assertEquals(0, $frequencies[2]); // Zero is stored
        $this->assertEquals(3, $frequencies[3]);
    }

    /**
     * Resolving entities re-keys the frequencies by the mapped names
     *
     * NOTE(review): despite the method name, the map below covers every
     * entity that was added. Handling of entities missing from the map is
     * not exercised here - confirm the intended behaviour before extending.
     */
    public function testResolveEntitiesPartialMap()
    {
        $term = new Term('dokuwiki');

        $term->addEntityFrequency(1, 5);
        $term->addEntityFrequency(2, 3);

        // Both entities added above have an entry in the map
        $map = [
            1 => 'page1',
            2 => 'page2'
        ];
        $term->resolveEntities($map);

        $frequencies = $term->getEntityFrequencies();
        $this->assertEquals(5, $frequencies['page1']);
        $this->assertEquals(3, $frequencies['page2']);
        $this->assertCount(2, $frequencies);
    }

    /**
     * Upper and lower case of the input is preserved in original and base
     */
    public function testCaseSensitiveBase()
    {
        $term = new Term('DokuWiki');

        $this->assertEquals('DokuWiki', $term->getOriginal());
        $this->assertEquals('DokuWiki', $term->getBase());
    }

    /**
     * Multiple different regex meta characters are all escaped in the quoted form
     */
    public function testComplexRegexCharacters()
    {
        $term = new Term('test[0-9]+.txt');

        $this->assertEquals('test[0-9]+.txt', $term->getOriginal());
        $this->assertEquals('test[0-9]+.txt', $term->getBase());
        // All special characters should be escaped
        $quoted = $term->getQuoted();
        $this->assertStringContainsString('\\[', $quoted);
        $this->assertStringContainsString('\\]', $quoted);
        $this->assertStringContainsString('\\+', $quoted);
        $this->assertStringContainsString('\\.', $quoted);
    }
}