xref: /dokuwiki/_test/tests/Search/Collection/FrequencyCollectionTest.php (revision 06053dca2fac9a1da4eb1accf8c2488942da5d2a)
1<?php
2
3namespace dokuwiki\test\Search\Collection;
4
5use dokuwiki\Search\Exception\IndexIntegrityException;
6use dokuwiki\Search\Index\MemoryIndex;
7
/**
 * Tests for the frequency collection, exercised through MockFrequencyCollection
 *
 * The tests add entities together with their token lists and then verify the
 * result either through the collection's own methods or by opening the
 * underlying indexes with MemoryIndex.
 *
 * Every test uses its own set of index names so the tests do not read each
 * other's data.
 */
class FrequencyCollectionTest extends \DokuWikiTest
{

    /**
     * Add data and directly check the underlying indexes for correctness
     */
    public function testDirectly()
    {
        $index = new MockFrequencyCollection('entity', 'token', 'freq', 'reverse');

        $tokens = ['one', 'two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        $idxEntity = new MemoryIndex('entity');
        $this->assertEquals('test', $idxEntity->retrieveRow(0));

        $idxToken = new MemoryIndex('token', '3');
        $this->assertEquals('one', $idxToken->retrieveRow(0));
        $this->assertEquals('two', $idxToken->retrieveRow(1));

        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('0', $idxFreq->retrieveRow(0)); // one is 1x on page 0 (written without *1)
        $this->assertEquals('0*2', $idxFreq->retrieveRow(1)); // two is 2x on page 0

        $idxRev = new MemoryIndex('reverse');
        $this->assertEquals('3*0:3*1:5*0:4*0', $idxRev->retrieveRow(0));

        // add the same entity again, this time without the token 'one'
        $tokens = ['two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        // re-open the frequency index to see the state after the second write
        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('', $idxFreq->retrieveRow(0)); // one is not on page 0
    }

    /**
     * Test reverse lookup
     *
     * A lookup for the page should return the word frequencies
     */
    public function testReverse()
    {
        $index = new MockFrequencyCollection('page', 'word', 'w', 'pageword');
        $index->lock();
        $index->addEntity('wiki:syntax', ['dokuwiki']);
        $index->unlock();

        // the expected result is keyed by the length of the only token
        $len = strlen('dokuwiki');
        $this->assertEquals([$len => [0 => 0]], $index->getReverseAssignments('wiki:syntax'));
    }

    /**
     * resolveTokens should count frequencies and group by token length
     */
    public function testResolveTokens()
    {
        $index = new MockFrequencyCollection('rt_entity', 'rt_token', 'rt_freq', 'rt_reverse');
        $index->lock();

        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [
                ['one', 'two', 'two', 'three'],
            ]);
        } finally {
            // always release the lock taken above, even when the call throws
            $index->unlock();
        }

        // 'one' and 'two' are 3 chars, 'three' is 5 chars
        $this->assertArrayHasKey(3, $result);
        $this->assertArrayHasKey(5, $result);

        // token IDs are sequential: one=0, two=1, three=0 (in its own length group)
        $this->assertEquals(1, $result[3][0]); // 'one' appears once
        $this->assertEquals(2, $result[3][1]); // 'two' appears twice
        $this->assertEquals(1, $result[5][0]); // 'three' appears once
    }

    /**
     * resolveTokens with empty input should return empty array
     */
    public function testResolveTokensEmpty()
    {
        $index = new MockFrequencyCollection('rte_entity', 'rte_token', 'rte_freq', 'rte_reverse');
        $index->lock();

        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [[]]);
        } finally {
            // always release the lock taken above, even when the call throws
            $index->unlock();
        }

        $this->assertEmpty($result);
    }

    /**
     * countTokens should return occurrence counts
     */
    public function testCountTokens()
    {
        $index = new MockFrequencyCollection();

        $result = $this->callInaccessibleMethod($index, 'countTokens', [
            ['one', 'two', 'two', 'three', 'three', 'three'],
        ]);

        $this->assertEquals([
            'one' => 1,
            'two' => 2,
            'three' => 3,
        ], $result);
    }

    /**
     * Adding a second entity creates new RIDs in the entity index that must be
     * used consistently across frequency and reverse indexes, even when those
     * indexes were originally sized for only the first entity.
     */
    public function testMultipleEntitiesShareTokens()
    {
        $index = new MockFrequencyCollection('me_page', 'me_w', 'me_i', 'me_pw');
        $index->lock();
        $index->addEntity('page1', ['alpha', 'beta']);
        $index->addEntity('page2', ['beta', 'gamma']);
        $index->unlock();

        // entity index: page1=0, page2=1
        $idxEntity = new MemoryIndex('me_page');
        $this->assertEquals('page1', $idxEntity->retrieveRow(0));
        $this->assertEquals('page2', $idxEntity->retrieveRow(1));

        // token index (5-char group): alpha=0, gamma=1
        $idxToken5 = new MemoryIndex('me_w', '5');
        $this->assertEquals('alpha', $idxToken5->retrieveRow(0));
        $this->assertEquals('gamma', $idxToken5->retrieveRow(1));

        // token index (4-char group): beta=0
        $idxToken4 = new MemoryIndex('me_w', '4');
        $this->assertEquals('beta', $idxToken4->retrieveRow(0));

        // frequency index: beta (token 0 in 4-char group) is on both entities
        // the entries are sorted first so their order within the row is irrelevant
        $idxFreq4 = new MemoryIndex('me_i', '4');
        $betaFreq = explode(':', $idxFreq4->retrieveRow(0));
        sort($betaFreq);
        $this->assertEquals(['0', '1'], $betaFreq); // beta on page1(0) and page2(1)

        // frequency index: alpha (token 0 in 5-char group) only on page1
        $idxFreq5 = new MemoryIndex('me_i', '5');
        $this->assertEquals('0', $idxFreq5->retrieveRow(0)); // alpha on page1(0) only
        $this->assertEquals('1', $idxFreq5->retrieveRow(1)); // gamma on page2(1) only

        // reverse index: page1 has alpha(5*0) and beta(4*0), page2 has beta(4*0) and gamma(5*1)
        $idxRev = new MemoryIndex('me_pw');
        $rev0 = explode(':', $idxRev->retrieveRow(0));
        sort($rev0);
        $this->assertEquals(['4*0', '5*0'], $rev0);

        $rev1 = explode(':', $idxRev->retrieveRow(1));
        sort($rev1);
        $this->assertEquals(['4*0', '5*1'], $rev1);
    }

    /**
     * getEntitiesWithData on a split FrequencyCollection
     */
    public function testGetEntitiesWithData()
    {
        $index = new MockFrequencyCollection('ewd_page', 'ewd_w', 'ewd_i', 'ewd_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->addEntity('page2', ['other', 'words']);
        $index->unlock();

        // sort so the assertion does not depend on the order of the result
        $result = $index->getEntitiesWithData();
        sort($result);
        $this->assertEquals(['page1', 'page2'], $result);
    }

    /**
     * getEntitiesWithData on an empty split collection returns empty array
     */
    public function testGetEntitiesWithDataEmpty()
    {
        $index = new MockFrequencyCollection('empty_page', 'empty_w', 'empty_i', 'empty_pw');
        $result = $index->getEntitiesWithData();
        $this->assertEquals([], $result);
    }

    /**
     * checkIntegrity on an empty split collection does not throw
     */
    public function testCheckIntegrityEmpty()
    {
        $index = new MockFrequencyCollection('ci_page', 'ci_w', 'ci_i', 'ci_pw');
        $index->checkIntegrity();
        $this->assertTrue(true); // no exception thrown
    }

    /**
     * checkIntegrity passes on a healthy split collection
     */
    public function testCheckIntegrityHealthy()
    {
        $index = new MockFrequencyCollection('cih_page', 'cih_w', 'cih_i', 'cih_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        $index->checkIntegrity(); // should not throw
        $this->assertTrue(true);
    }

    /**
     * checkIntegrity detects missing frequency index for a group
     */
    public function testCheckIntegrityMissingFreqIndex()
    {
        $index = new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        // find a group that exists and delete its frequency index
        $max = $index->getTokenIndexMaximum();
        $this->deleteExistingIndexFile('cimf_i' . $max);

        // a fresh collection is used for the check
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw'))->checkIntegrity();
    }

    /**
     * checkIntegrity detects missing token index for a group
     */
    public function testCheckIntegrityMissingTokenIndex()
    {
        $index = new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw');
        $index->lock();
        // use words of different lengths to create multiple groups
        $index->addEntity('page1', ['hi', 'dokuwiki', 'wiki']);
        $index->unlock();

        // delete the token index for the shortest group (not the max)
        $this->deleteExistingIndexFile('cimt_w2');

        // a fresh collection is used for the check
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw'))->checkIntegrity();
    }

    /**
     * groupToSuffix throws on group 0 for split collection
     */
    public function testGroupToSuffixValidationSplit()
    {
        $this->expectException(\dokuwiki\Search\Exception\IndexUsageException::class);

        $index = new MockFrequencyCollection('gs_page', 'gs_w', 'gs_i', 'gs_pw');
        // split collection should reject group 0
        $index->getTokenIndex(0);
    }

    /**
     * Delete an index file from the configured index directory
     *
     * The integrity tests only mean something when a file was actually removed,
     * so a missing file or a failed deletion fails the test right here instead
     * of being silently ignored.
     *
     * @param string $name index file name without directory and without the .idx extension
     */
    private function deleteExistingIndexFile($name)
    {
        global $conf;

        $file = $conf['indexdir'] . '/' . $name . '.idx';
        $this->assertFileExists($file, 'index file to delete should have been written');
        $this->assertTrue(unlink($file), 'index file could not be deleted');
    }
}
265