xref: /dokuwiki/_test/tests/Search/Collection/FrequencyCollectionTest.php (revision 1148921de6af6909f19cb5b30b698d0f27d7751e)
1<?php
2
3namespace dokuwiki\test\Search\Collection;
4
5use dokuwiki\Search\Exception\IndexIntegrityException;
6use dokuwiki\Search\Index\MemoryIndex;
7
/**
 * Tests for the frequency collection, exercised through MockFrequencyCollection.
 *
 * Every test uses its own set of index names so the index files written by one
 * test cannot be confused with those of another.
 */
class FrequencyCollectionTest extends \DokuWikiTest
{

    /**
     * Delete a single index file and fail loudly when that is not possible
     *
     * A silently failing delete would leave the collection intact and only surface
     * later as a confusing "expected exception was not thrown" failure.
     *
     * @param string $name index name including the group suffix, without the '.idx' extension
     */
    private function removeIndexFileOrFail($name)
    {
        global $conf;

        $file = $conf['indexdir'] . '/' . $name . '.idx';
        $this->assertFileExists($file, 'index file should have been written before it is removed');
        $this->assertTrue(unlink($file), 'index file could not be removed');
    }

    /**
     * Add data and directly check the underlying indexes for correctness
     */
    public function testDirectly()
    {
        $index = new MockFrequencyCollection('entity', 'token', 'freq', 'reverse');

        // 'two' is listed twice on purpose to get a frequency above 1
        $tokens = ['one', 'two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        $idxEntity = new MemoryIndex('entity');
        $this->assertEquals('test', $idxEntity->retrieveRow(0));

        // the suffix '3' selects the group holding the three letter tokens
        $idxToken = new MemoryIndex('token', '3');
        $this->assertEquals('one', $idxToken->retrieveRow(0));
        $this->assertEquals('two', $idxToken->retrieveRow(1));

        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('0', $idxFreq->retrieveRow(0)); // one is 1x on page 0 (written without *1)
        $this->assertEquals('0*2', $idxFreq->retrieveRow(1)); // two is 2x on page 0

        // NOTE(review): entries look like "<token length>*<token id>" in order of first appearance - confirm
        $idxRev = new MemoryIndex('reverse');
        $this->assertEquals('3*0:3*1:5*0:4*0', $idxRev->retrieveRow(0));

        // remove one of the tokens by adding the same entity again without 'one'
        $tokens = ['two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        // load the frequency index again to see what has been written
        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('', $idxFreq->retrieveRow(0)); // one is not on page 0
    }

    /**
     * Test reverse lookup
     *
     * A lookup for the page should return the word frequencies
     */
    public function testReverse()
    {
        $index = new MockFrequencyCollection('page', 'word', 'w', 'pageword');
        $index->lock();
        $index->addEntity('wiki:syntax', ['dokuwiki']);
        $index->unlock();

        // the expected result is keyed by token length first
        $len = strlen('dokuwiki');
        $this->assertEquals([$len => [0 => 0]], $index->getReverseAssignments('wiki:syntax'));
    }

    /**
     * resolveTokens should count frequencies and group by token length
     */
    public function testResolveTokens()
    {
        $index = new MockFrequencyCollection('rt_entity', 'rt_token', 'rt_freq', 'rt_reverse');
        $index->lock();
        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [
                ['one', 'two', 'two', 'three'],
            ]);
        } finally {
            // release the lock even when resolveTokens throws
            $index->unlock();
        }

        // 'one' and 'two' are 3 chars, 'three' is 5 chars
        $this->assertArrayHasKey(3, $result);
        $this->assertArrayHasKey(5, $result);

        // token IDs are sequential: one=0, two=1, three=0 (in its own length group)
        $this->assertEquals(1, $result[3][0]); // 'one' appears once
        $this->assertEquals(2, $result[3][1]); // 'two' appears twice
        $this->assertEquals(1, $result[5][0]); // 'three' appears once
    }

    /**
     * resolveTokens with empty input should return empty array
     */
    public function testResolveTokensEmpty()
    {
        $index = new MockFrequencyCollection('rte_entity', 'rte_token', 'rte_freq', 'rte_reverse');
        $index->lock();
        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [[]]);
        } finally {
            // release the lock even when resolveTokens throws
            $index->unlock();
        }

        $this->assertEmpty($result);
    }

    /**
     * countTokens should return occurrence counts
     */
    public function testCountTokens()
    {
        $index = new MockFrequencyCollection();

        $result = $this->callInaccessibleMethod($index, 'countTokens', [
            ['one', 'two', 'two', 'three', 'three', 'three'],
        ]);

        // the result maps each distinct token to the number of times it was passed
        $this->assertEquals([
            'one' => 1,
            'two' => 2,
            'three' => 3,
        ], $result);
    }

    /**
     * getEntitiesWithData on a split FrequencyCollection
     */
    public function testGetEntitiesWithData()
    {
        $index = new MockFrequencyCollection('ewd_page', 'ewd_w', 'ewd_i', 'ewd_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->addEntity('page2', ['other', 'words']);
        $index->unlock();

        // sort first, the test does not depend on the order of the returned entities
        $result = $index->getEntitiesWithData();
        sort($result);
        $this->assertEquals(['page1', 'page2'], $result);
    }

    /**
     * getEntitiesWithData on an empty split collection returns empty array
     */
    public function testGetEntitiesWithDataEmpty()
    {
        $index = new MockFrequencyCollection('empty_page', 'empty_w', 'empty_i', 'empty_pw');
        $result = $index->getEntitiesWithData();
        $this->assertEquals([], $result);
    }

    /**
     * checkIntegrity on an empty split collection does not throw
     */
    public function testCheckIntegrityEmpty()
    {
        $index = new MockFrequencyCollection('ci_page', 'ci_w', 'ci_i', 'ci_pw');
        $index->checkIntegrity();
        $this->assertTrue(true); // no exception thrown
    }

    /**
     * checkIntegrity passes on a healthy split collection
     */
    public function testCheckIntegrityHealthy()
    {
        $index = new MockFrequencyCollection('cih_page', 'cih_w', 'cih_i', 'cih_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        $index->checkIntegrity(); // should not throw
        $this->assertTrue(true);
    }

    /**
     * checkIntegrity detects missing frequency index for a group
     */
    public function testCheckIntegrityMissingFreqIndex()
    {
        $index = new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        // find a group that exists and delete its frequency index
        $max = $index->getTokenIndexMaximum();
        $this->removeIndexFileOrFail('cimf_i' . $max);

        // check with a fresh collection instance on the same index names
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw'))->checkIntegrity();
    }

    /**
     * checkIntegrity detects missing token index for a group
     */
    public function testCheckIntegrityMissingTokenIndex()
    {
        $index = new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw');
        $index->lock();
        // use words of different lengths to create multiple groups
        $index->addEntity('page1', ['hi', 'dokuwiki', 'wiki']);
        $index->unlock();

        // delete the token index for the shortest group (not the max)
        $this->removeIndexFileOrFail('cimt_w2');

        // check with a fresh collection instance on the same index names
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw'))->checkIntegrity();
    }

    /**
     * groupToSuffix throws on group 0 for split collection
     */
    public function testGroupToSuffixValidationSplit()
    {
        $this->expectException(\dokuwiki\Search\Exception\IndexUsageException::class);

        $index = new MockFrequencyCollection('gs_page', 'gs_w', 'gs_i', 'gs_pw');
        // split collection should reject group 0
        $index->getTokenIndex(0);
    }
}
216