<?php

namespace dokuwiki\test\Search\Collection;

use dokuwiki\Search\Exception\IndexIntegrityException;
use dokuwiki\Search\Exception\IndexUsageException;
use dokuwiki\Search\Index\MemoryIndex;

/**
 * Tests for the frequency collection, exercised through MockFrequencyCollection
 *
 * Each test uses its own set of index names so that data written by one test
 * is never read back by another.
 */
class FrequencyCollectionTest extends \DokuWikiTest
{
    /**
     * Add data and directly check the underlying indexes for correctness
     */
    public function testDirectly()
    {
        $index = new MockFrequencyCollection('entity', 'token', 'freq', 'reverse');

        $tokens = ['one', 'two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        $idxEntity = new MemoryIndex('entity');
        $this->assertEquals('test', $idxEntity->retrieveRow(0));

        $idxToken = new MemoryIndex('token', '3');
        $this->assertEquals('one', $idxToken->retrieveRow(0));
        $this->assertEquals('two', $idxToken->retrieveRow(1));

        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('0', $idxFreq->retrieveRow(0)); // one is 1x on page 0 (written without *1)
        $this->assertEquals('0*2', $idxFreq->retrieveRow(1)); // two is 2x on page 0

        $idxRev = new MemoryIndex('reverse');
        $this->assertEquals('3*0:3*1:5*0:4*0', $idxRev->retrieveRow(0));

        // remove one of the tokens by re-adding the same entity without it
        $tokens = ['two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        // re-open the frequency index to see the state after the second write
        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('', $idxFreq->retrieveRow(0)); // one is not on page 0
    }

    /**
     * Test reverse lookup
     *
     * A lookup for the page should return the word frequencies
     */
    public function testReverse()
    {
        $index = new MockFrequencyCollection('page', 'word', 'w', 'pageword');
        $index->lock();
        $index->addEntity('wiki:syntax', ['dokuwiki']);
        $index->unlock();

        // the result is expected to be keyed by the token length first
        $len = strlen('dokuwiki');
        $this->assertEquals([$len => [0 => 0]], $index->getReverseAssignments('wiki:syntax'));
    }

    /**
     * resolveTokens should count frequencies and group by token length
     */
    public function testResolveTokens()
    {
        $index = new MockFrequencyCollection('rt_entity', 'rt_token', 'rt_freq', 'rt_reverse');
        $index->lock();
        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [
                ['one', 'two', 'two', 'three'],
            ]);
        } finally {
            // release the lock even when the call fails, like every other test here does
            $index->unlock();
        }

        // 'one' and 'two' are 3 chars, 'three' is 5 chars
        $this->assertArrayHasKey(3, $result);
        $this->assertArrayHasKey(5, $result);

        // token IDs are sequential: one=0, two=1, three=0 (in its own length group)
        $this->assertEquals(1, $result[3][0]); // 'one' appears once
        $this->assertEquals(2, $result[3][1]); // 'two' appears twice
        $this->assertEquals(1, $result[5][0]); // 'three' appears once
    }

    /**
     * resolveTokens with empty input should return empty array
     */
    public function testResolveTokensEmpty()
    {
        $index = new MockFrequencyCollection('rte_entity', 'rte_token', 'rte_freq', 'rte_reverse');
        $index->lock();
        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [[]]);
        } finally {
            // release the lock even when the call fails, like every other test here does
            $index->unlock();
        }

        $this->assertEmpty($result);
    }

    /**
     * countTokens should return occurrence counts
     */
    public function testCountTokens()
    {
        $index = new MockFrequencyCollection();

        $result = $this->callInaccessibleMethod($index, 'countTokens', [
            ['one', 'two', 'two', 'three', 'three', 'three'],
        ]);

        $this->assertEquals([
            'one' => 1,
            'two' => 2,
            'three' => 3,
        ], $result);
    }

    /**
     * Adding a second entity creates new RIDs in the entity index that must be
     * used consistently across frequency and reverse indexes, even when those
     * indexes were originally sized for only the first entity.
     */
    public function testMultipleEntitiesShareTokens()
    {
        $index = new MockFrequencyCollection('me_page', 'me_w', 'me_i', 'me_pw');
        $index->lock();
        $index->addEntity('page1', ['alpha', 'beta']);
        $index->addEntity('page2', ['beta', 'gamma']);
        $index->unlock();

        // entity index: page1=0, page2=1
        $idxEntity = new MemoryIndex('me_page');
        $this->assertEquals('page1', $idxEntity->retrieveRow(0));
        $this->assertEquals('page2', $idxEntity->retrieveRow(1));

        // token index (5-char group): alpha=0, gamma=1
        $idxToken5 = new MemoryIndex('me_w', '5');
        $this->assertEquals('alpha', $idxToken5->retrieveRow(0));
        $this->assertEquals('gamma', $idxToken5->retrieveRow(1));

        // token index (4-char group): beta=0
        $idxToken4 = new MemoryIndex('me_w', '4');
        $this->assertEquals('beta', $idxToken4->retrieveRow(0));

        // frequency index: beta (token 0 in 4-char group) is on both entities
        // the row is split and sorted so the assertion does not depend on record order
        $idxFreq4 = new MemoryIndex('me_i', '4');
        $betaFreq = explode(':', $idxFreq4->retrieveRow(0));
        sort($betaFreq);
        $this->assertEquals(['0', '1'], $betaFreq); // beta on page1(0) and page2(1)

        // frequency index (5-char group): alpha only on page1, gamma only on page2
        $idxFreq5 = new MemoryIndex('me_i', '5');
        $this->assertEquals('0', $idxFreq5->retrieveRow(0)); // alpha on page1(0) only
        $this->assertEquals('1', $idxFreq5->retrieveRow(1)); // gamma on page2(1) only

        // reverse index: page1 has alpha(5*0) and beta(4*0), page2 has beta(4*0) and gamma(5*1)
        $idxRev = new MemoryIndex('me_pw');
        $rev0 = explode(':', $idxRev->retrieveRow(0));
        sort($rev0);
        $this->assertEquals(['4*0', '5*0'], $rev0);

        $rev1 = explode(':', $idxRev->retrieveRow(1));
        sort($rev1);
        $this->assertEquals(['4*0', '5*1'], $rev1);
    }

    /**
     * getEntitiesWithData on a split FrequencyCollection
     */
    public function testGetEntitiesWithData()
    {
        $index = new MockFrequencyCollection('ewd_page', 'ewd_w', 'ewd_i', 'ewd_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->addEntity('page2', ['other', 'words']);
        $index->unlock();

        // sorted so the assertion does not depend on the order of the returned entities
        $result = $index->getEntitiesWithData();
        sort($result);
        $this->assertEquals(['page1', 'page2'], $result);
    }

    /**
     * getEntitiesWithData on an empty split collection returns empty array
     */
    public function testGetEntitiesWithDataEmpty()
    {
        $index = new MockFrequencyCollection('empty_page', 'empty_w', 'empty_i', 'empty_pw');
        $result = $index->getEntitiesWithData();
        $this->assertEquals([], $result);
    }

    /**
     * checkIntegrity on an empty split collection does not throw
     */
    public function testCheckIntegrityEmpty()
    {
        $index = new MockFrequencyCollection('ci_page', 'ci_w', 'ci_i', 'ci_pw');
        $index->checkIntegrity();
        $this->assertTrue(true); // no exception thrown
    }

    /**
     * checkIntegrity passes on a healthy split collection
     */
    public function testCheckIntegrityHealthy()
    {
        $index = new MockFrequencyCollection('cih_page', 'cih_w', 'cih_i', 'cih_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        $index->checkIntegrity(); // should not throw
        $this->assertTrue(true);
    }

    /**
     * checkIntegrity detects missing frequency index for a group
     */
    public function testCheckIntegrityMissingFreqIndex()
    {
        global $conf;
        $index = new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        // find a group that exists and delete its frequency index
        $max = $index->getTokenIndexMaximum();
        @unlink($conf['indexdir'] . '/cimf_i' . $max . '.idx');

        // a fresh collection object is used for the check
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw'))->checkIntegrity();
    }

    /**
     * checkIntegrity detects missing token index for a group
     */
    public function testCheckIntegrityMissingTokenIndex()
    {
        global $conf;
        $index = new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw');
        $index->lock();
        // use words of different lengths to create multiple groups
        $index->addEntity('page1', ['hi', 'dokuwiki', 'wiki']);
        $index->unlock();

        // delete the token index for the shortest group (not the max)
        @unlink($conf['indexdir'] . '/cimt_w2.idx');

        // a fresh collection object is used for the check
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw'))->checkIntegrity();
    }

    /**
     * groupToSuffix throws on group 0 for split collection
     */
    public function testGroupToSuffixValidationSplit()
    {
        $index = new MockFrequencyCollection('gs_page', 'gs_w', 'gs_i', 'gs_pw');

        // declare the expectation only after construction, so that an exception
        // thrown by the constructor cannot satisfy this test
        $this->expectException(IndexUsageException::class);

        // split collection should reject group 0
        $index->getTokenIndex(0);
    }
}