xref: /dokuwiki/_test/tests/Search/Collection/FrequencyCollectionTest.php (revision 06053dca2fac9a1da4eb1accf8c2488942da5d2a)
1ede46466SAndreas Gohr<?php
2ede46466SAndreas Gohr
3ede46466SAndreas Gohrnamespace dokuwiki\test\Search\Collection;
4ede46466SAndreas Gohr
521fbd01bSAndreas Gohruse dokuwiki\Search\Exception\IndexIntegrityException;
6ede46466SAndreas Gohruse dokuwiki\Search\Index\MemoryIndex;
7ede46466SAndreas Gohr
/**
 * Tests for the frequency collection, exercised through MockFrequencyCollection
 *
 * Every test works on its own set of index names (presumably to keep the
 * tests independent of index files left behind by other tests).
 */
class FrequencyCollectionTest extends \DokuWikiTest
{

    /**
     * Delete a single index file, failing the test when it cannot be removed
     *
     * The integrity tests below only prove something when the file really was
     * there before and really is gone afterwards, so a silent no-op is not
     * acceptable here.
     *
     * @param string $name file name inside the index directory, including the extension
     */
    private function removeIndexFile($name)
    {
        global $conf;

        $file = $conf['indexdir'] . '/' . $name;
        $this->assertFileExists($file, 'index file to be removed should have been created');
        $this->assertTrue(unlink($file), 'index file should be removable');
    }

    /**
     * Add data and directly check the underlying indexes for correctness
     */
    public function testDirectly()
    {
        $index = new MockFrequencyCollection('entity', 'token', 'freq', 'reverse');

        $tokens = ['one', 'two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        $idxEntity = new MemoryIndex('entity');
        $this->assertEquals('test', $idxEntity->retrieveRow(0));

        $idxToken = new MemoryIndex('token', '3');
        $this->assertEquals('one', $idxToken->retrieveRow(0));
        $this->assertEquals('two', $idxToken->retrieveRow(1));

        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('0', $idxFreq->retrieveRow(0)); // one is 1x on page 0 (written without *1)
        $this->assertEquals('0*2', $idxFreq->retrieveRow(1)); // two is 2x on page 0

        $idxRev = new MemoryIndex('reverse');
        $this->assertEquals('3*0:3*1:5*0:4*0', $idxRev->retrieveRow(0));

        // remove one of the tokens
        $tokens = ['two', 'three', 'four', 'two'];
        $index->lock();
        $index->addEntity('test', $tokens);
        $index->unlock();

        $idxFreq = new MemoryIndex('freq', '3');
        $this->assertEquals('', $idxFreq->retrieveRow(0)); // one is not on page 0
    }

    /**
     * Test reverse lookup
     *
     * A lookup for the page should return the word frequencies
     */
    public function testReverse()
    {
        $index = new MockFrequencyCollection('page', 'word', 'w', 'pageword');
        $index->lock();
        $index->addEntity('wiki:syntax', ['dokuwiki']);
        $index->unlock();

        $len = strlen('dokuwiki');
        $this->assertEquals([$len => [0 => 0]], $index->getReverseAssignments('wiki:syntax'));
    }

    /**
     * resolveTokens should count frequencies and group by token length
     */
    public function testResolveTokens()
    {
        $index = new MockFrequencyCollection('rt_entity', 'rt_token', 'rt_freq', 'rt_reverse');
        $index->lock();
        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [
                ['one', 'two', 'two', 'three'],
            ]);
        } finally {
            // always release the lock again, even when the call above throws
            $index->unlock();
        }

        // 'one' and 'two' are 3 chars, 'three' is 5 chars
        $this->assertArrayHasKey(3, $result);
        $this->assertArrayHasKey(5, $result);

        // token IDs are sequential: one=0, two=1, three=0 (in its own length group)
        $this->assertEquals(1, $result[3][0]); // 'one' appears once
        $this->assertEquals(2, $result[3][1]); // 'two' appears twice
        $this->assertEquals(1, $result[5][0]); // 'three' appears once
    }

    /**
     * resolveTokens with empty input should return empty array
     */
    public function testResolveTokensEmpty()
    {
        $index = new MockFrequencyCollection('rte_entity', 'rte_token', 'rte_freq', 'rte_reverse');
        $index->lock();
        try {
            $result = $this->callInaccessibleMethod($index, 'resolveTokens', [[]]);
        } finally {
            // always release the lock again, even when the call above throws
            $index->unlock();
        }

        $this->assertEmpty($result);
    }

    /**
     * countTokens should return occurrence counts
     */
    public function testCountTokens()
    {
        $index = new MockFrequencyCollection();

        $result = $this->callInaccessibleMethod($index, 'countTokens', [
            ['one', 'two', 'two', 'three', 'three', 'three'],
        ]);

        $this->assertEquals([
            'one' => 1,
            'two' => 2,
            'three' => 3,
        ], $result);
    }

    /**
     * Adding a second entity creates new RIDs in the entity index that must be
     * used consistently across frequency and reverse indexes, even when those
     * indexes were originally sized for only the first entity.
     */
    public function testMultipleEntitiesShareTokens()
    {
        $index = new MockFrequencyCollection('me_page', 'me_w', 'me_i', 'me_pw');
        $index->lock();
        $index->addEntity('page1', ['alpha', 'beta']);
        $index->addEntity('page2', ['beta', 'gamma']);
        $index->unlock();

        // entity index: page1=0, page2=1
        $idxEntity = new MemoryIndex('me_page');
        $this->assertEquals('page1', $idxEntity->retrieveRow(0));
        $this->assertEquals('page2', $idxEntity->retrieveRow(1));

        // token index (5-char group): alpha=0, gamma=1
        $idxToken5 = new MemoryIndex('me_w', '5');
        $this->assertEquals('alpha', $idxToken5->retrieveRow(0));
        $this->assertEquals('gamma', $idxToken5->retrieveRow(1));

        // token index (4-char group): beta=0
        $idxToken4 = new MemoryIndex('me_w', '4');
        $this->assertEquals('beta', $idxToken4->retrieveRow(0));

        // frequency index: beta (token 0 in 4-char group) is on both entities
        // the order of the entries within the row is not part of the check, so sort first
        $idxFreq4 = new MemoryIndex('me_i', '4');
        $betaFreq = explode(':', $idxFreq4->retrieveRow(0));
        sort($betaFreq);
        $this->assertEquals(['0', '1'], $betaFreq); // beta on page1(0) and page2(1)

        // frequency index (5-char group): alpha (token 0) and gamma (token 1) are on one entity each
        $idxFreq5 = new MemoryIndex('me_i', '5');
        $this->assertEquals('0', $idxFreq5->retrieveRow(0)); // alpha on page1(0) only
        $this->assertEquals('1', $idxFreq5->retrieveRow(1)); // gamma on page2(1) only

        // reverse index: page1 has alpha(5*0) and beta(4*0), page2 has beta(4*0) and gamma(5*1)
        $idxRev = new MemoryIndex('me_pw');
        $rev0 = explode(':', $idxRev->retrieveRow(0));
        sort($rev0);
        $this->assertEquals(['4*0', '5*0'], $rev0);

        $rev1 = explode(':', $idxRev->retrieveRow(1));
        sort($rev1);
        $this->assertEquals(['4*0', '5*1'], $rev1);
    }

    /**
     * getEntitiesWithData on a split FrequencyCollection
     */
    public function testGetEntitiesWithData()
    {
        $index = new MockFrequencyCollection('ewd_page', 'ewd_w', 'ewd_i', 'ewd_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->addEntity('page2', ['other', 'words']);
        $index->unlock();

        $result = $index->getEntitiesWithData();
        sort($result);
        $this->assertEquals(['page1', 'page2'], $result);
    }

    /**
     * getEntitiesWithData on an empty split collection returns empty array
     */
    public function testGetEntitiesWithDataEmpty()
    {
        $index = new MockFrequencyCollection('empty_page', 'empty_w', 'empty_i', 'empty_pw');
        $result = $index->getEntitiesWithData();
        $this->assertEquals([], $result);
    }

    /**
     * checkIntegrity on an empty split collection does not throw
     */
    public function testCheckIntegrityEmpty()
    {
        $index = new MockFrequencyCollection('ci_page', 'ci_w', 'ci_i', 'ci_pw');
        $index->checkIntegrity();
        $this->assertTrue(true); // no exception thrown
    }

    /**
     * checkIntegrity passes on a healthy split collection
     */
    public function testCheckIntegrityHealthy()
    {
        $index = new MockFrequencyCollection('cih_page', 'cih_w', 'cih_i', 'cih_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        $index->checkIntegrity(); // should not throw
        $this->assertTrue(true);
    }

    /**
     * checkIntegrity detects missing frequency index for a group
     */
    public function testCheckIntegrityMissingFreqIndex()
    {
        $index = new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw');
        $index->lock();
        $index->addEntity('page1', ['dokuwiki', 'wiki']);
        $index->unlock();

        // find a group that exists and delete its frequency index
        $max = $index->getTokenIndexMaximum();
        $this->removeIndexFile('cimf_i' . $max . '.idx');

        // check through a fresh collection object on the same index names
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimf_page', 'cimf_w', 'cimf_i', 'cimf_pw'))->checkIntegrity();
    }

    /**
     * checkIntegrity detects missing token index for a group
     */
    public function testCheckIntegrityMissingTokenIndex()
    {
        $index = new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw');
        $index->lock();
        // use words of different lengths to create multiple groups
        $index->addEntity('page1', ['hi', 'dokuwiki', 'wiki']);
        $index->unlock();

        // delete the token index for the shortest group (not the max)
        $this->removeIndexFile('cimt_w2.idx');

        // check through a fresh collection object on the same index names
        $this->expectException(IndexIntegrityException::class);
        (new MockFrequencyCollection('cimt_page', 'cimt_w', 'cimt_i', 'cimt_pw'))->checkIntegrity();
    }

    /**
     * groupToSuffix throws on group 0 for split collection
     */
    public function testGroupToSuffixValidationSplit()
    {
        $index = new MockFrequencyCollection('gs_page', 'gs_w', 'gs_i', 'gs_pw');

        // register the expectation only now, so that an exception thrown while
        // constructing the collection cannot satisfy this test by accident
        $this->expectException(\dokuwiki\Search\Exception\IndexUsageException::class);

        // split collection should reject group 0
        $index->getTokenIndex(0);
    }
}
265