xref: /plugin/aichat/_test/TextSplitterTest.php (revision 3f61c2bb97ce4f3c020d40dbc8d35c0e6cbdc421)
1*3f61c2bbSAndreas Gohr<?php
2*3f61c2bbSAndreas Gohr
3*3f61c2bbSAndreas Gohrnamespace dokuwiki\plugin\aichat\test;
4*3f61c2bbSAndreas Gohr
5*3f61c2bbSAndreas Gohruse dokuwiki\plugin\aichat\TextSplitter;
6*3f61c2bbSAndreas Gohruse DokuWikiTest;
7*3f61c2bbSAndreas Gohruse TikToken\Encoder;
8*3f61c2bbSAndreas Gohr
9*3f61c2bbSAndreas Gohr/**
10*3f61c2bbSAndreas Gohr * Tests for the TextSplitter class
11*3f61c2bbSAndreas Gohr *
12*3f61c2bbSAndreas Gohr * @group plugin_aichat
13*3f61c2bbSAndreas Gohr * @group plugins
14*3f61c2bbSAndreas Gohr */
15*3f61c2bbSAndreas Gohrclass TextSplitterTest extends DokuWikiTest
16*3f61c2bbSAndreas Gohr{
17*3f61c2bbSAndreas Gohr    protected $pluginsEnabled = ['aichat'];
18*3f61c2bbSAndreas Gohr
19*3f61c2bbSAndreas Gohr    const CHUNKSIZE = 10; // 10 token chunks for testing
20*3f61c2bbSAndreas Gohr    const OVERLAP = 5;  // 2 token overlap for testing
21*3f61c2bbSAndreas Gohr    private TextSplitter $splitter;
22*3f61c2bbSAndreas Gohr    private Encoder $encoder;
23*3f61c2bbSAndreas Gohr
24*3f61c2bbSAndreas Gohr    public function setUp(): void
25*3f61c2bbSAndreas Gohr    {
26*3f61c2bbSAndreas Gohr        parent::setUp();
27*3f61c2bbSAndreas Gohr        $this->encoder = new Encoder();
28*3f61c2bbSAndreas Gohr        $this->splitter = new TextSplitter(self::CHUNKSIZE, $this->encoder, self::OVERLAP);
29*3f61c2bbSAndreas Gohr    }
30*3f61c2bbSAndreas Gohr
31*3f61c2bbSAndreas Gohr    /**
32*3f61c2bbSAndreas Gohr     * Test basic text splitting functionality
33*3f61c2bbSAndreas Gohr     */
34*3f61c2bbSAndreas Gohr    public function testSplitIntoChunks(): void
35*3f61c2bbSAndreas Gohr    {
36*3f61c2bbSAndreas Gohr        $text = "This is the first sentence. This is the second sentence. This is the third sentence.";
37*3f61c2bbSAndreas Gohr        $chunks = $this->splitter->splitIntoChunks($text);
38*3f61c2bbSAndreas Gohr
39*3f61c2bbSAndreas Gohr        $this->assertIsArray($chunks);
40*3f61c2bbSAndreas Gohr        $this->assertNotEmpty($chunks);
41*3f61c2bbSAndreas Gohr
42*3f61c2bbSAndreas Gohr        $this->assertGreaterThan(1, count($chunks)); // Should be split into multiple chunks
43*3f61c2bbSAndreas Gohr
44*3f61c2bbSAndreas Gohr
45*3f61c2bbSAndreas Gohr        foreach ($chunks as $chunk) {
46*3f61c2bbSAndreas Gohr            // Each chunk should be non-empty
47*3f61c2bbSAndreas Gohr            $this->assertNotEmpty(trim($chunk));
48*3f61c2bbSAndreas Gohr
49*3f61c2bbSAndreas Gohr            // Each chunk should be within the token limit
50*3f61c2bbSAndreas Gohr            $tokenCount = count($this->encoder->encode($chunk));
51*3f61c2bbSAndreas Gohr            $this->assertLessThanOrEqual(self::CHUNKSIZE, $tokenCount);
52*3f61c2bbSAndreas Gohr        }
53*3f61c2bbSAndreas Gohr    }
54*3f61c2bbSAndreas Gohr
55*3f61c2bbSAndreas Gohr    /**
56*3f61c2bbSAndreas Gohr     * Test splitting with empty text
57*3f61c2bbSAndreas Gohr     */
58*3f61c2bbSAndreas Gohr    public function testSplitEmptyText(): void
59*3f61c2bbSAndreas Gohr    {
60*3f61c2bbSAndreas Gohr        $chunks = $this->splitter->splitIntoChunks('');
61*3f61c2bbSAndreas Gohr        $this->assertIsArray($chunks);
62*3f61c2bbSAndreas Gohr        $this->assertEmpty($chunks);
63*3f61c2bbSAndreas Gohr    }
64*3f61c2bbSAndreas Gohr
65*3f61c2bbSAndreas Gohr    /**
66*3f61c2bbSAndreas Gohr     * Test splitting with whitespace only
67*3f61c2bbSAndreas Gohr     */
68*3f61c2bbSAndreas Gohr    public function testSplitWhitespaceOnly(): void
69*3f61c2bbSAndreas Gohr    {
70*3f61c2bbSAndreas Gohr        $chunks = $this->splitter->splitIntoChunks('   ');
71*3f61c2bbSAndreas Gohr        $this->assertIsArray($chunks);
72*3f61c2bbSAndreas Gohr        $this->assertEmpty($chunks);
73*3f61c2bbSAndreas Gohr    }
74*3f61c2bbSAndreas Gohr
75*3f61c2bbSAndreas Gohr    /**
76*3f61c2bbSAndreas Gohr     * Test splitting a single short sentence
77*3f61c2bbSAndreas Gohr     */
78*3f61c2bbSAndreas Gohr    public function testSplitSingleShortSentence(): void
79*3f61c2bbSAndreas Gohr    {
80*3f61c2bbSAndreas Gohr        $text = "This is a short sentence.";
81*3f61c2bbSAndreas Gohr        $chunks = $this->splitter->splitIntoChunks($text);
82*3f61c2bbSAndreas Gohr
83*3f61c2bbSAndreas Gohr        $this->assertCount(1, $chunks);
84*3f61c2bbSAndreas Gohr        $this->assertEquals($text, $chunks[0]);
85*3f61c2bbSAndreas Gohr    }
86*3f61c2bbSAndreas Gohr
87*3f61c2bbSAndreas Gohr    /**
88*3f61c2bbSAndreas Gohr     * Test splitting multiple sentences that fit in one chunk
89*3f61c2bbSAndreas Gohr     */
90*3f61c2bbSAndreas Gohr    public function testSplitMultipleSentencesOneChunk(): void
91*3f61c2bbSAndreas Gohr    {
92*3f61c2bbSAndreas Gohr        $text = "First sentence. Second sentence. Third sentence.";
93*3f61c2bbSAndreas Gohr        $chunks = $this->splitter->splitIntoChunks($text);
94*3f61c2bbSAndreas Gohr
95*3f61c2bbSAndreas Gohr        $this->assertCount(1, $chunks);
96*3f61c2bbSAndreas Gohr        $this->assertEquals($text, $chunks[0]);
97*3f61c2bbSAndreas Gohr    }
98*3f61c2bbSAndreas Gohr
99*3f61c2bbSAndreas Gohr    /**
100*3f61c2bbSAndreas Gohr     * Test that chunks have proper overlap
101*3f61c2bbSAndreas Gohr     */
102*3f61c2bbSAndreas Gohr    public function testChunkOverlap(): void
103*3f61c2bbSAndreas Gohr    {
104*3f61c2bbSAndreas Gohr        $text = "First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence.";
105*3f61c2bbSAndreas Gohr
106*3f61c2bbSAndreas Gohr        $chunks = $this->splitter->splitIntoChunks($text);
107*3f61c2bbSAndreas Gohr        $this->assertGreaterThan(1, count($chunks));
108*3f61c2bbSAndreas Gohr
109*3f61c2bbSAndreas Gohr        $this->assertStringEndsWith('Third sentence.', $chunks[0]);
110*3f61c2bbSAndreas Gohr        $this->assertStringStartsWith('Third sentence.', $chunks[1]);
111*3f61c2bbSAndreas Gohr    }
112*3f61c2bbSAndreas Gohr
113*3f61c2bbSAndreas Gohr    /**
114*3f61c2bbSAndreas Gohr     * Test splitLongSentence protected method
115*3f61c2bbSAndreas Gohr     */
116*3f61c2bbSAndreas Gohr    public function testSplitLongSentence(): void
117*3f61c2bbSAndreas Gohr    {
118*3f61c2bbSAndreas Gohr        // Create a very long sentence without periods
119*3f61c2bbSAndreas Gohr        $longSentence = str_repeat("long word is long ", 20);
120*3f61c2bbSAndreas Gohr
121*3f61c2bbSAndreas Gohr        $result = self::callInaccessibleMethod($this->splitter, 'splitLongSentence', [$longSentence]);
122*3f61c2bbSAndreas Gohr
123*3f61c2bbSAndreas Gohr        $this->assertIsArray($result);
124*3f61c2bbSAndreas Gohr        $this->assertGreaterThan(1, count($result));
125*3f61c2bbSAndreas Gohr
126*3f61c2bbSAndreas Gohr        // Each sub-sentence should be shorter than the original
127*3f61c2bbSAndreas Gohr        foreach ($result as $subSentence) {
128*3f61c2bbSAndreas Gohr            $this->assertLessThan(strlen($longSentence), strlen($subSentence));
129*3f61c2bbSAndreas Gohr        }
130*3f61c2bbSAndreas Gohr
131*3f61c2bbSAndreas Gohr        // Verify all pieces together reconstruct the original
132*3f61c2bbSAndreas Gohr        $reconstructed = implode('', $result);
133*3f61c2bbSAndreas Gohr        $this->assertEquals($longSentence, $reconstructed);
134*3f61c2bbSAndreas Gohr    }
135*3f61c2bbSAndreas Gohr
136*3f61c2bbSAndreas Gohr    /**
137*3f61c2bbSAndreas Gohr     * Test splitString protected method
138*3f61c2bbSAndreas Gohr     */
139*3f61c2bbSAndreas Gohr    public function testSplitString(): void
140*3f61c2bbSAndreas Gohr    {
141*3f61c2bbSAndreas Gohr        $text = str_repeat("verylongwordwithoutspaces", 20);
142*3f61c2bbSAndreas Gohr        $tokenLength = count($this->encoder->encode($text));
143*3f61c2bbSAndreas Gohr        $chunkSize = 5;
144*3f61c2bbSAndreas Gohr
145*3f61c2bbSAndreas Gohr        $result = self::callInaccessibleMethod($this->splitter, 'splitString', [$text, $tokenLength, $chunkSize]);
146*3f61c2bbSAndreas Gohr
147*3f61c2bbSAndreas Gohr        $this->assertIsArray($result);
148*3f61c2bbSAndreas Gohr        $this->assertGreaterThan(1, count($result));
149*3f61c2bbSAndreas Gohr
150*3f61c2bbSAndreas Gohr        // Each sub-sentence should be shorter than the original
151*3f61c2bbSAndreas Gohr        foreach ($result as $subSentence) {
152*3f61c2bbSAndreas Gohr            $this->assertLessThan(strlen($text), strlen($subSentence));
153*3f61c2bbSAndreas Gohr        }
154*3f61c2bbSAndreas Gohr
155*3f61c2bbSAndreas Gohr        // Verify all pieces together reconstruct the original
156*3f61c2bbSAndreas Gohr        $reconstructed = implode('', $result);
157*3f61c2bbSAndreas Gohr        $this->assertEquals($text, $reconstructed);
158*3f61c2bbSAndreas Gohr    }
159*3f61c2bbSAndreas Gohr
160*3f61c2bbSAndreas Gohr    /**
161*3f61c2bbSAndreas Gohr     * Test rememberSentence protected method
162*3f61c2bbSAndreas Gohr     */
163*3f61c2bbSAndreas Gohr    public function testRememberSentence(): void
164*3f61c2bbSAndreas Gohr    {
165*3f61c2bbSAndreas Gohr        // Clear the sentence queue first
166*3f61c2bbSAndreas Gohr        self::setInaccessibleProperty($this->splitter, 'sentenceQueue', []);
167*3f61c2bbSAndreas Gohr
168*3f61c2bbSAndreas Gohr        // Sentence queue should be empty now
169*3f61c2bbSAndreas Gohr        $initialQueue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
170*3f61c2bbSAndreas Gohr        $this->assertEmpty($initialQueue);
171*3f61c2bbSAndreas Gohr
172*3f61c2bbSAndreas Gohr        // Add a sentence
173*3f61c2bbSAndreas Gohr        self::callInaccessibleMethod($this->splitter, 'rememberSentence', ['First sentence.']);
174*3f61c2bbSAndreas Gohr        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
175*3f61c2bbSAndreas Gohr        $this->assertGreaterThanOrEqual(1, count($queue));
176*3f61c2bbSAndreas Gohr        $this->assertContains('First sentence.', $queue);
177*3f61c2bbSAndreas Gohr
178*3f61c2bbSAndreas Gohr        // Add another sentence
179*3f61c2bbSAndreas Gohr        self::callInaccessibleMethod($this->splitter, 'rememberSentence', ['Second sentence.']);
180*3f61c2bbSAndreas Gohr        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
181*3f61c2bbSAndreas Gohr        $this->assertGreaterThan(1, $queue);
182*3f61c2bbSAndreas Gohr        $this->assertContains('Second sentence.', $queue);
183*3f61c2bbSAndreas Gohr
184*3f61c2bbSAndreas Gohr        // add a whole bunch of sentences to exceed the overlap limit
185*3f61c2bbSAndreas Gohr        for ($i = 0; $i < 20; $i++) {
186*3f61c2bbSAndreas Gohr            self::callInaccessibleMethod($this->splitter, 'rememberSentence', ["Sentence $i."]);
187*3f61c2bbSAndreas Gohr        }
188*3f61c2bbSAndreas Gohr
189*3f61c2bbSAndreas Gohr        // each of our sentences is at least 2 tokens, our limit is 5, so we should not have more than 2 in queue
190*3f61c2bbSAndreas Gohr        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
191*3f61c2bbSAndreas Gohr        $this->assertLessThanOrEqual(2, count($queue));
192*3f61c2bbSAndreas Gohr    }
193*3f61c2bbSAndreas Gohr}
194