<?php

namespace dokuwiki\plugin\aichat\test;

use dokuwiki\plugin\aichat\TextSplitter;
use DokuWikiTest;
use TikToken\Encoder;

/**
 * Tests for the TextSplitter class
 *
 * The splitter is configured with deliberately tiny limits so that short
 * example texts are enough to trigger chunking and overlap handling.
 *
 * @group plugin_aichat
 * @group plugins
 */
class TextSplitterTest extends DokuWikiTest
{
    protected $pluginsEnabled = ['aichat'];

    /** Chunk size passed to the splitter: 10 token chunks for testing */
    public const CHUNKSIZE = 10;

    /** Overlap passed to the splitter: 5 token overlap for testing */
    public const OVERLAP = 5;

    private TextSplitter $splitter;
    private Encoder $encoder;

    /**
     * Create a fresh encoder and a splitter using the small test limits
     */
    public function setUp(): void
    {
        parent::setUp();
        $this->encoder = new Encoder();
        $this->splitter = new TextSplitter(self::CHUNKSIZE, $this->encoder, self::OVERLAP);
    }

    /**
     * Test basic text splitting functionality
     *
     * A text of three sentences has to be split into several non-empty chunks,
     * none of which exceeds the configured token limit.
     */
    public function testSplitIntoChunks(): void
    {
        $text = "This is the first sentence. This is the second sentence. This is the third sentence.";
        $chunks = $this->splitter->splitIntoChunks($text);

        $this->assertIsArray($chunks);
        $this->assertNotEmpty($chunks);

        $this->assertGreaterThan(1, count($chunks)); // Should be split into multiple chunks

        foreach ($chunks as $chunk) {
            // Each chunk should be non-empty
            $this->assertNotEmpty(trim($chunk));

            // Each chunk should be within the token limit
            $tokenCount = count($this->encoder->encode($chunk));
            $this->assertLessThanOrEqual(self::CHUNKSIZE, $tokenCount);
        }
    }

    /**
     * Test splitting with empty text
     */
    public function testSplitEmptyText(): void
    {
        $chunks = $this->splitter->splitIntoChunks('');
        $this->assertIsArray($chunks);
        $this->assertEmpty($chunks);
    }

    /**
     * Test splitting with whitespace only
     */
    public function testSplitWhitespaceOnly(): void
    {
        $chunks = $this->splitter->splitIntoChunks(' ');
        $this->assertIsArray($chunks);
        $this->assertEmpty($chunks);
    }

    /**
     * Test splitting a single short sentence
     *
     * The sentence has to come back unchanged as the only chunk.
     */
    public function testSplitSingleShortSentence(): void
    {
        $text = "This is a short sentence.";
        $chunks = $this->splitter->splitIntoChunks($text);

        $this->assertCount(1, $chunks);
        $this->assertEquals($text, $chunks[0]);
    }

    /**
     * Test splitting multiple sentences that fit in one chunk
     *
     * The text has to come back unchanged as the only chunk.
     */
    public function testSplitMultipleSentencesOneChunk(): void
    {
        $text = "First sentence. Second sentence. Third sentence.";
        $chunks = $this->splitter->splitIntoChunks($text);

        $this->assertCount(1, $chunks);
        $this->assertEquals($text, $chunks[0]);
    }

    /**
     * Test that chunks have proper overlap
     *
     * The sentence that closes the first chunk has to open the second one.
     */
    public function testChunkOverlap(): void
    {
        $text = "First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence.";

        $chunks = $this->splitter->splitIntoChunks($text);
        $this->assertGreaterThan(1, count($chunks));

        $this->assertStringEndsWith('Third sentence.', $chunks[0]);
        $this->assertStringStartsWith('Third sentence.', $chunks[1]);
    }

    /**
     * Test splitLongSentence protected method
     */
    public function testSplitLongSentence(): void
    {
        // Create a very long sentence without periods
        $longSentence = str_repeat("long word is long ", 20);

        $result = self::callInaccessibleMethod($this->splitter, 'splitLongSentence', [$longSentence]);

        $this->assertIsArray($result);
        $this->assertGreaterThan(1, count($result));

        // Each sub-sentence should be shorter than the original
        foreach ($result as $subSentence) {
            $this->assertLessThan(strlen($longSentence), strlen($subSentence));
        }

        // Verify all pieces together reconstruct the original
        $reconstructed = implode('', $result);
        $this->assertEquals($longSentence, $reconstructed);
    }

    /**
     * Test splitString protected method
     */
    public function testSplitString(): void
    {
        $text = str_repeat("verylongwordwithoutspaces", 20);
        $tokenLength = count($this->encoder->encode($text));
        $chunkSize = 5;

        $result = self::callInaccessibleMethod($this->splitter, 'splitString', [$text, $tokenLength, $chunkSize]);

        $this->assertIsArray($result);
        $this->assertGreaterThan(1, count($result));

        // Each sub-sentence should be shorter than the original
        foreach ($result as $subSentence) {
            $this->assertLessThan(strlen($text), strlen($subSentence));
        }

        // Verify all pieces together reconstruct the original
        $reconstructed = implode('', $result);
        $this->assertEquals($text, $reconstructed);
    }

    /**
     * Test rememberSentence protected method
     *
     * Sentences are pushed into the splitter's sentence queue one by one; the
     * newest sentence has to be present after each call and the queue must
     * stay small once many sentences have been added.
     */
    public function testRememberSentence(): void
    {
        // Clear the sentence queue first
        self::setInaccessibleProperty($this->splitter, 'sentenceQueue', []);

        // Sentence queue should be empty now
        $initialQueue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        $this->assertEmpty($initialQueue);

        // Add a sentence
        self::callInaccessibleMethod($this->splitter, 'rememberSentence', ['First sentence.']);
        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        $this->assertGreaterThanOrEqual(1, count($queue));
        $this->assertContains('First sentence.', $queue);

        // Add another sentence
        self::callInaccessibleMethod($this->splitter, 'rememberSentence', ['Second sentence.']);
        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        // Compare the queue's size, not the array itself: PHP orders an array above any
        // scalar, so asserting on the bare array could never fail. Only a non-empty queue
        // is required because the queue is bounded by the overlap limit (see the final
        // assertion), so the first sentence is not guaranteed to still be in it.
        $this->assertGreaterThanOrEqual(1, count($queue));
        $this->assertContains('Second sentence.', $queue);

        // add a whole bunch of sentences to exceed the overlap limit
        for ($i = 0; $i < 20; $i++) {
            self::callInaccessibleMethod($this->splitter, 'rememberSentence', ["Sentence $i."]);
        }

        // each of our sentences is at least 2 tokens, our limit is 5, so we should not have more than 2 in queue
        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        $this->assertLessThanOrEqual(2, count($queue));
    }
}