xref: /dokuwiki/_test/tests/Search/Collection/FrequencyCollectionTest.php (revision 21fbd01b3c3eea88b767376b7b158f31f0f63127)
1ede46466SAndreas Gohr<?php
2ede46466SAndreas Gohr
3ede46466SAndreas Gohrnamespace dokuwiki\test\Search\Collection;
4ede46466SAndreas Gohr
5*21fbd01bSAndreas Gohruse dokuwiki\Search\Exception\IndexIntegrityException;
6ede46466SAndreas Gohruse dokuwiki\Search\Index\MemoryIndex;
7ede46466SAndreas Gohr
/**
 * Tests for the frequency collection, exercised through MockFrequencyCollection.
 *
 * Each test uses its own set of index names so that data written by one test
 * is not picked up by another.
 */
class FrequencyCollectionTest extends \DokuWikiTest
{
    /**
     * Add data and directly check the underlying indexes for correctness
     */
    public function testDirectly()
    {
        $index = new MockFrequencyCollection('entity', 'token', 'freq', 'reverse');

        $tokens = ['one', 'two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        $idxEntity = new MemoryIndex('entity');
        $this->assertEquals('test', $idxEntity->retrieveRow(0));

        // the token and frequency indexes are opened with suffix '3', the length of 'one' and 'two'
        $idxToken = new MemoryIndex('token', '3');
        $this->assertEquals('one', $idxToken->retrieveRow(0));
        $this->assertEquals('two', $idxToken->retrieveRow(1));

        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('0', $idxFreq->retrieveRow(0)); // one is 1x on page 0 (written without *1)
        $this->assertEquals('0*2', $idxFreq->retrieveRow(1)); // two is 2x on page 0

        // NOTE(review): the entries look like "<token length>*<token id>" joined by ':' - confirm
        $idxRev = new MemoryIndex('reverse');
        $this->assertEquals('3*0:3*1:5*0:4*0', $idxRev->retrieveRow(0));

        // remove one of the tokens by re-adding the entity without 'one'
        $tokens = ['two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('', $idxFreq->retrieveRow(0)); // one is not on page 0
    }

    /**
     * Test reverse lookup
     *
     * A lookup for the page should return the word frequencies
     */
    public function testReverse()
    {
        $index = new MockFrequencyCollection('page', 'word', 'w', 'pageword');
        $index->lock();
        $index->addEntity('wiki:syntax', ['dokuwiki']);
        $index->unlock();

        // the result is keyed by the token length first
        $len = strlen('dokuwiki');
        $this->assertEquals([$len => [0 => 0]], $index->getReverseAssignments('wiki:syntax'));
    }

    /**
     * resolveTokens should count frequencies and group by token length
     */
    public function testResolveTokens()
    {
        $index = new MockFrequencyCollection('rt_entity', 'rt_token', 'rt_freq', 'rt_reverse');
        $index->lock();
        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [
                ['one', 'two', 'two', 'three'],
            ]);
        } finally {
            // always release the lock, also when the call throws, so it is not left held
            $index->unlock();
        }

        // 'one' and 'two' are 3 chars, 'three' is 5 chars
        $this->assertArrayHasKey(3, $result);
        $this->assertArrayHasKey(5, $result);

        // token IDs are sequential: one=0, two=1, three=0 (in its own length group)
        $this->assertEquals(1, $result[3][0]); // 'one' appears once
        $this->assertEquals(2, $result[3][1]); // 'two' appears twice
        $this->assertEquals(1, $result[5][0]); // 'three' appears once
    }

    /**
     * resolveTokens with empty input should return empty array
     */
    public function testResolveTokensEmpty()
    {
        $index = new MockFrequencyCollection('rte_entity', 'rte_token', 'rte_freq', 'rte_reverse');
        $index->lock();
        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [[]]);
        } finally {
            // always release the lock, also when the call throws, so it is not left held
            $index->unlock();
        }

        $this->assertEmpty($result);
    }

    /**
     * countTokens should return occurrence counts
     */
    public function testCountTokens()
    {
        // created without index names and never locked
        $index = new MockFrequencyCollection();

        $result = $this->callInaccessibleMethod($index, 'countTokens', [
            ['one', 'two', 'two', 'three', 'three', 'three'],
        ]);

        $this->assertEquals([
            'one' => 1,
            'two' => 2,
            'three' => 3,
        ], $result);
    }

    /**
     * getEntitiesWithData on a split FrequencyCollection
     */
    public function testGetEntitiesWithData()
    {
        $index = new MockFrequencyCollection('ewd_page', 'ewd_w', 'ewd_i', 'ewd_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->addEntity('page2', ['other', 'words']);
        $index->unlock();

        // sort before comparing so the assertion does not depend on the returned order
        $result = $index->getEntitiesWithData();
        sort($result);
        $this->assertEquals(['page1', 'page2'], $result);
    }

    /**
     * getEntitiesWithData on an empty split collection returns empty array
     */
    public function testGetEntitiesWithDataEmpty()
    {
        $index = new MockFrequencyCollection('empty_page', 'empty_w', 'empty_i', 'empty_pw');
        $result = $index->getEntitiesWithData();
        $this->assertEquals([], $result);
    }

    /**
     * checkIntegrity on an empty split collection does not throw
     */
    public function testCheckIntegrityEmpty()
    {
        $index = new MockFrequencyCollection('ci_page', 'ci_w', 'ci_i', 'ci_pw');
        $index->checkIntegrity();
        $this->assertTrue(true); // no exception thrown
    }

    /**
     * checkIntegrity passes on a healthy split collection
     */
    public function testCheckIntegrityHealthy()
    {
        $index = new MockFrequencyCollection('cih_page', 'cih_w', 'cih_i', 'cih_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        $index->checkIntegrity(); // should not throw
        $this->assertTrue(true);
    }

    /**
     * checkIntegrity detects missing frequency index for a group
     */
    public function testCheckIntegrityMissingFreqIndex()
    {
        global $conf;
        $index = new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        // find a group that exists and delete its frequency index
        $max = $index->getTokenIndexMaximum();
        $file = $conf['indexdir'] . '/cimf_i' . $max . '.idx';

        // fail right here with a clear message if the path is wrong, instead of silently deleting nothing
        $this->assertFileExists($file, 'the frequency index to delete was not written');
        unlink($file);

        // a fresh instance is used for the check
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw'))->checkIntegrity();
    }

    /**
     * checkIntegrity detects missing token index for a group
     */
    public function testCheckIntegrityMissingTokenIndex()
    {
        global $conf;
        $index = new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw');
        $index->lock();
        // use words of different lengths to create multiple groups
        $index->addEntity('page1', ['hi', 'dokuwiki', 'wiki']);
        $index->unlock();

        // delete the token index for the shortest group (not the max); 'hi' has 2 chars
        $file = $conf['indexdir'] . '/cimt_w2.idx';

        // fail right here with a clear message if the path is wrong, instead of silently deleting nothing
        $this->assertFileExists($file, 'the token index to delete was not written');
        unlink($file);

        // a fresh instance is used for the check
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw'))->checkIntegrity();
    }

    /**
     * groupToSuffix throws on group 0 for split collection
     */
    public function testGroupToSuffixValidationSplit()
    {
        $this->expectException(\dokuwiki\Search\Exception\IndexUsageException::class);

        $index = new MockFrequencyCollection('gs_page', 'gs_w', 'gs_i', 'gs_pw');
        // split collection should reject group 0
        $index->getTokenIndex(0);
    }
}
216