<?php

namespace dokuwiki\test\Search\Collection;

use dokuwiki\Search\Exception\IndexIntegrityException;
use dokuwiki\Search\Exception\IndexUsageException;
use dokuwiki\Search\Index\MemoryIndex;

/**
 * Tests for the frequency collection, exercised through MockFrequencyCollection
 *
 * Several tests re-open the underlying indexes with MemoryIndex after the
 * collection has been unlocked to verify what was actually stored.
 */
class FrequencyCollectionTest extends \DokuWikiTest
{

    /**
     * Add data and directly check the underlying indexes for correctness
     */
    public function testDirectly()
    {
        $index = new MockFrequencyCollection('entity', 'token', 'freq', 'reverse');

        $tokens = ['one', 'two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        $idxEntity = new MemoryIndex('entity');
        $this->assertEquals('test', $idxEntity->retrieveRow(0));

        $idxToken = new MemoryIndex('token', '3');
        $this->assertEquals('one', $idxToken->retrieveRow(0));
        $this->assertEquals('two', $idxToken->retrieveRow(1));

        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('0', $idxFreq->retrieveRow(0)); // one is 1x on page 0 (written without *1)
        $this->assertEquals('0*2', $idxFreq->retrieveRow(1)); // two is 2x on page 0

        $idxRev = new MemoryIndex('reverse');
        $this->assertEquals('3*0:3*1:5*0:4*0', $idxRev->retrieveRow(0));

        // remove one of the tokens
        $tokens = ['two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('', $idxFreq->retrieveRow(0)); // one is not on page 0
    }

    /**
     * Test reverse lookup
     *
     * A lookup for the page should return the word frequencies
     */
    public function testReverse()
    {
        $index = new MockFrequencyCollection('page', 'word', 'w', 'pageword');
        $index->lock();
        $index->addEntity('wiki:syntax', ['dokuwiki']);
        $index->unlock();

        $len = strlen('dokuwiki');
        $this->assertEquals([$len => [0 => 0]], $index->getReverseAssignments('wiki:syntax'));
    }

    /**
     * resolveTokens should count frequencies and group by token length
     */
    public function testResolveTokens()
    {
        $index = new MockFrequencyCollection('rt_entity', 'rt_token', 'rt_freq', 'rt_reverse');
        $index->lock();
        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [
                ['one', 'two', 'two', 'three'],
            ]);
        } finally {
            // always release the lock so it cannot leak past this test
            $index->unlock();
        }

        // 'one' and 'two' are 3 chars, 'three' is 5 chars
        $this->assertArrayHasKey(3, $result);
        $this->assertArrayHasKey(5, $result);

        // token IDs are sequential: one=0, two=1, three=0 (in its own length group)
        $this->assertEquals(1, $result[3][0]); // 'one' appears once
        $this->assertEquals(2, $result[3][1]); // 'two' appears twice
        $this->assertEquals(1, $result[5][0]); // 'three' appears once
    }

    /**
     * resolveTokens with empty input should return empty array
     */
    public function testResolveTokensEmpty()
    {
        $index = new MockFrequencyCollection('rte_entity', 'rte_token', 'rte_freq', 'rte_reverse');
        $index->lock();
        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [[]]);
        } finally {
            // always release the lock so it cannot leak past this test
            $index->unlock();
        }

        $this->assertEmpty($result);
    }

    /**
     * countTokens should return occurrence counts
     */
    public function testCountTokens()
    {
        $index = new MockFrequencyCollection();

        $result = $this->callInaccessibleMethod($index, 'countTokens', [
            ['one', 'two', 'two', 'three', 'three', 'three'],
        ]);

        $this->assertEquals([
            'one' => 1,
            'two' => 2,
            'three' => 3,
        ], $result);
    }

    /**
     * Adding a second entity creates new RIDs in the entity index that must be
     * used consistently across frequency and reverse indexes, even when those
     * indexes were originally sized for only the first entity.
     */
    public function testMultipleEntitiesShareTokens()
    {
        $index = new MockFrequencyCollection('me_page', 'me_w', 'me_i', 'me_pw');
        $index->lock();
        $index->addEntity('page1', ['alpha', 'beta']);
        $index->addEntity('page2', ['beta', 'gamma']);
        $index->unlock();

        // entity index: page1=0, page2=1
        $idxEntity = new MemoryIndex('me_page');
        $this->assertEquals('page1', $idxEntity->retrieveRow(0));
        $this->assertEquals('page2', $idxEntity->retrieveRow(1));

        // token index (5-char group): alpha=0, gamma=1
        $idxToken5 = new MemoryIndex('me_w', '5');
        $this->assertEquals('alpha', $idxToken5->retrieveRow(0));
        $this->assertEquals('gamma', $idxToken5->retrieveRow(1));

        // token index (4-char group): beta=0
        $idxToken4 = new MemoryIndex('me_w', '4');
        $this->assertEquals('beta', $idxToken4->retrieveRow(0));

        // frequency index: beta (token 0 in 4-char group) is on both entities
        // the row is split and sorted so the check does not depend on storage order
        $idxFreq4 = new MemoryIndex('me_i', '4');
        $betaFreq = explode(':', $idxFreq4->retrieveRow(0));
        sort($betaFreq);
        $this->assertEquals(['0', '1'], $betaFreq); // beta on page1(0) and page2(1)

        // frequency index (5-char group): alpha (token 0) and gamma (token 1) are on one entity each
        $idxFreq5 = new MemoryIndex('me_i', '5');
        $this->assertEquals('0', $idxFreq5->retrieveRow(0)); // alpha on page1(0) only
        $this->assertEquals('1', $idxFreq5->retrieveRow(1)); // gamma on page2(1) only

        // reverse index: page1 has alpha(5*0) and beta(4*0), page2 has beta(4*0) and gamma(5*1)
        $idxRev = new MemoryIndex('me_pw');
        $rev0 = explode(':', $idxRev->retrieveRow(0));
        sort($rev0);
        $this->assertEquals(['4*0', '5*0'], $rev0);

        $rev1 = explode(':', $idxRev->retrieveRow(1));
        sort($rev1);
        $this->assertEquals(['4*0', '5*1'], $rev1);
    }

    /**
     * getEntitiesWithData on a split FrequencyCollection
     */
    public function testGetEntitiesWithData()
    {
        $index = new MockFrequencyCollection('ewd_page', 'ewd_w', 'ewd_i', 'ewd_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->addEntity('page2', ['other', 'words']);
        $index->unlock();

        $result = $index->getEntitiesWithData();
        sort($result);
        $this->assertEquals(['page1', 'page2'], $result);
    }

    /**
     * getEntitiesWithData on an empty split collection returns empty array
     */
    public function testGetEntitiesWithDataEmpty()
    {
        $index = new MockFrequencyCollection('empty_page', 'empty_w', 'empty_i', 'empty_pw');
        $result = $index->getEntitiesWithData();
        $this->assertEquals([], $result);
    }

    /**
     * checkIntegrity on an empty split collection does not throw
     */
    public function testCheckIntegrityEmpty()
    {
        $index = new MockFrequencyCollection('ci_page', 'ci_w', 'ci_i', 'ci_pw');
        $index->checkIntegrity();
        $this->assertTrue(true); // no exception thrown
    }

    /**
     * checkIntegrity passes on a healthy split collection
     */
    public function testCheckIntegrityHealthy()
    {
        $index = new MockFrequencyCollection('cih_page', 'cih_w', 'cih_i', 'cih_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        $index->checkIntegrity(); // should not throw
        $this->assertTrue(true);
    }

    /**
     * checkIntegrity detects missing frequency index for a group
     */
    public function testCheckIntegrityMissingFreqIndex()
    {
        global $conf;
        $index = new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        // find a group that exists and delete its frequency index
        $max = $index->getTokenIndexMaximum();
        @unlink($conf['indexdir'] . '/cimf_i' . $max . '.idx');

        // the check runs on a fresh collection instance over the same index names
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw'))->checkIntegrity();
    }

    /**
     * checkIntegrity detects missing token index for a group
     */
    public function testCheckIntegrityMissingTokenIndex()
    {
        global $conf;
        $index = new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw');
        $index->lock();
        // use words of different lengths to create multiple groups
        $index->addEntity('page1', ['hi', 'dokuwiki', 'wiki']);
        $index->unlock();

        // delete the token index for the shortest group (not the max)
        @unlink($conf['indexdir'] . '/cimt_w2.idx');

        // the check runs on a fresh collection instance over the same index names
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw'))->checkIntegrity();
    }

    /**
     * groupToSuffix throws on group 0 for split collection
     */
    public function testGroupToSuffixValidationSplit()
    {
        $this->expectException(IndexUsageException::class);

        $index = new MockFrequencyCollection('gs_page', 'gs_w', 'gs_i', 'gs_pw');
        // split collection should reject group 0
        $index->getTokenIndex(0);
    }
}