1<?php
2
3namespace dokuwiki\plugin\aichat\test;
4
5use dokuwiki\plugin\aichat\TextSplitter;
6use DokuWikiTest;
7use TikToken\Encoder;
8
9/**
10 * Tests for the TextSplitter class
11 *
12 * @group plugin_aichat
13 * @group plugins
14 */
class TextSplitterTest extends DokuWikiTest
{
    protected $pluginsEnabled = ['aichat'];

    /** Maximum chunk size in tokens handed to the splitter under test */
    public const CHUNKSIZE = 10;

    /** Overlap in tokens handed to the splitter under test */
    public const OVERLAP = 5;

    /** @var TextSplitter The splitter under test, recreated for every test */
    private TextSplitter $splitter;

    /** @var Encoder Tokenizer shared with the splitter; also used to count tokens in assertions */
    private Encoder $encoder;

    /**
     * Create a fresh encoder and splitter before each test
     */
    public function setUp(): void
    {
        parent::setUp();
        $this->encoder = new Encoder();
        $this->splitter = new TextSplitter(self::CHUNKSIZE, $this->encoder, self::OVERLAP);
    }

    /**
     * Test basic text splitting functionality
     *
     * The text must be split into more than one chunk, each chunk non-empty
     * and no longer than CHUNKSIZE tokens.
     */
    public function testSplitIntoChunks(): void
    {
        $text = "This is the first sentence. This is the second sentence. This is the third sentence.";
        $chunks = $this->splitter->splitIntoChunks($text);

        $this->assertIsArray($chunks);
        $this->assertNotEmpty($chunks);

        // Should be split into multiple chunks
        $this->assertGreaterThan(1, count($chunks));

        foreach ($chunks as $chunk) {
            // Each chunk should be non-empty
            $this->assertNotEmpty(trim($chunk));

            // Each chunk should be within the token limit
            $tokenCount = count($this->encoder->encode($chunk));
            $this->assertLessThanOrEqual(self::CHUNKSIZE, $tokenCount);
        }
    }

    /**
     * Test splitting with empty text
     */
    public function testSplitEmptyText(): void
    {
        $chunks = $this->splitter->splitIntoChunks('');
        $this->assertIsArray($chunks);
        $this->assertEmpty($chunks);
    }

    /**
     * Test splitting with whitespace only
     */
    public function testSplitWhitespaceOnly(): void
    {
        $chunks = $this->splitter->splitIntoChunks('   ');
        $this->assertIsArray($chunks);
        $this->assertEmpty($chunks);
    }

    /**
     * Test splitting a single short sentence
     *
     * The sentence is expected back unchanged as the only chunk.
     */
    public function testSplitSingleShortSentence(): void
    {
        $text = "This is a short sentence.";
        $chunks = $this->splitter->splitIntoChunks($text);

        $this->assertCount(1, $chunks);
        $this->assertSame($text, $chunks[0]);
    }

    /**
     * Test splitting multiple sentences that fit in one chunk
     *
     * The whole text is expected back unchanged as the only chunk.
     */
    public function testSplitMultipleSentencesOneChunk(): void
    {
        $text = "First sentence. Second sentence. Third sentence.";
        $chunks = $this->splitter->splitIntoChunks($text);

        $this->assertCount(1, $chunks);
        $this->assertSame($text, $chunks[0]);
    }

    /**
     * Test that chunks have proper overlap
     *
     * The sentence ending the first chunk must be repeated at the start of the second.
     */
    public function testChunkOverlap(): void
    {
        $text = "First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence.";

        $chunks = $this->splitter->splitIntoChunks($text);
        $this->assertGreaterThan(1, count($chunks));

        $this->assertStringEndsWith('Third sentence.', $chunks[0]);
        $this->assertStringStartsWith('Third sentence.', $chunks[1]);
    }

    /**
     * Test splitLongSentence protected method
     *
     * The pieces must each be shorter than the input and concatenate back to it.
     */
    public function testSplitLongSentence(): void
    {
        // Create a very long sentence without periods
        $longSentence = str_repeat("long word is long ", 20);
        $originalLength = strlen($longSentence);

        $result = self::callInaccessibleMethod($this->splitter, 'splitLongSentence', [$longSentence]);

        $this->assertIsArray($result);
        $this->assertGreaterThan(1, count($result));

        // Each sub-sentence should be shorter than the original
        foreach ($result as $subSentence) {
            $this->assertLessThan($originalLength, strlen($subSentence));
        }

        // Verify all pieces together reconstruct the original
        $this->assertSame($longSentence, implode('', $result));
    }

    /**
     * Test splitString protected method
     *
     * The pieces must each be shorter than the input and concatenate back to it.
     */
    public function testSplitString(): void
    {
        $text = str_repeat("verylongwordwithoutspaces", 20);
        $originalLength = strlen($text);
        $tokenLength = count($this->encoder->encode($text));
        $chunkSize = 5;

        $result = self::callInaccessibleMethod($this->splitter, 'splitString', [$text, $tokenLength, $chunkSize]);

        $this->assertIsArray($result);
        $this->assertGreaterThan(1, count($result));

        // Each piece should be shorter than the original
        foreach ($result as $piece) {
            $this->assertLessThan($originalLength, strlen($piece));
        }

        // Verify all pieces together reconstruct the original
        $this->assertSame($text, implode('', $result));
    }

    /**
     * Test rememberSentence protected method
     *
     * Sentences are pushed one by one and the protected sentenceQueue is inspected
     * after each step; after many pushes the queue must stay small.
     */
    public function testRememberSentence(): void
    {
        // Clear the sentence queue first
        self::setInaccessibleProperty($this->splitter, 'sentenceQueue', []);

        // Sentence queue should be empty now
        $initialQueue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        $this->assertEmpty($initialQueue);

        // Add a sentence
        self::callInaccessibleMethod($this->splitter, 'rememberSentence', ['First sentence.']);
        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        $this->assertGreaterThanOrEqual(1, count($queue));
        $this->assertContains('First sentence.', $queue);

        // Add another sentence
        self::callInaccessibleMethod($this->splitter, 'rememberSentence', ['Second sentence.']);
        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        // Assert on the queue's size, not on the array itself: comparing an int with an
        // array is always true in PHP, so such an assertion would verify nothing.
        // The first sentence may already have been evicted by the overlap limit, so only
        // a non-empty queue is required here — TODO confirm against rememberSentence().
        $this->assertGreaterThanOrEqual(1, count($queue));
        $this->assertContains('Second sentence.', $queue);

        // add a whole bunch of sentences to exceed the overlap limit
        for ($i = 0; $i < 20; $i++) {
            self::callInaccessibleMethod($this->splitter, 'rememberSentence', ["Sentence $i."]);
        }

        // each of our sentences is at least 2 tokens, our limit is 5, so we should not have more than 2 in queue
        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        $this->assertLessThanOrEqual(2, count($queue));
    }
}
194