<?php

namespace dokuwiki\plugin\aichat\test;

use dokuwiki\plugin\aichat\TextSplitter;
use DokuWikiTest;
use TikToken\Encoder;

/**
 * Tests for the TextSplitter class
 *
 * All tests run against a splitter configured with a deliberately tiny chunk
 * size and overlap (see the class constants) so that short literal strings
 * are enough to trigger splitting.
 *
 * @group plugin_aichat
 * @group plugins
 */
class TextSplitterTest extends DokuWikiTest
{
    protected $pluginsEnabled = ['aichat'];

    public const CHUNKSIZE = 10; // 10 token chunks for testing
    public const OVERLAP = 5; // 5 token overlap for testing

    private TextSplitter $splitter;
    private Encoder $encoder;

    /**
     * Create a fresh encoder and splitter before every test
     */
    public function setUp(): void
    {
        parent::setUp();
        $this->encoder = new Encoder();
        $this->splitter = new TextSplitter(self::CHUNKSIZE, $this->encoder, self::OVERLAP);
    }

    /**
     * Test basic text splitting functionality
     *
     * The input is expected to produce more than one chunk; every chunk must be
     * non-blank and must not exceed CHUNKSIZE tokens when re-encoded.
     */
    public function testSplitIntoChunks(): void
    {
        $text = "This is the first sentence. This is the second sentence. This is the third sentence.";
        $chunks = $this->splitter->splitIntoChunks($text);

        $this->assertIsArray($chunks);
        $this->assertNotEmpty($chunks);

        $this->assertGreaterThan(1, count($chunks)); // Should be split into multiple chunks

        foreach ($chunks as $chunk) {
            // Each chunk should be non-empty
            $this->assertNotEmpty(trim($chunk));

            // Each chunk should be within the token limit
            $tokenCount = count($this->encoder->encode($chunk));
            $this->assertLessThanOrEqual(self::CHUNKSIZE, $tokenCount);
        }
    }

    /**
     * Test splitting with empty text
     *
     * An empty string is expected to yield an empty array of chunks.
     */
    public function testSplitEmptyText(): void
    {
        $chunks = $this->splitter->splitIntoChunks('');
        $this->assertIsArray($chunks);
        $this->assertEmpty($chunks);
    }

    /**
     * Test splitting with whitespace only
     *
     * Whitespace-only input is expected to yield an empty array of chunks.
     */
    public function testSplitWhitespaceOnly(): void
    {
        $chunks = $this->splitter->splitIntoChunks(' ');
        $this->assertIsArray($chunks);
        $this->assertEmpty($chunks);
    }

    /**
     * Test splitting a single short sentence
     *
     * The sentence is expected to come back unchanged as the only chunk.
     */
    public function testSplitSingleShortSentence(): void
    {
        $text = "This is a short sentence.";
        $chunks = $this->splitter->splitIntoChunks($text);

        $this->assertCount(1, $chunks);
        $this->assertEquals($text, $chunks[0]);
    }

    /**
     * Test splitting multiple sentences that fit in one chunk
     *
     * The whole text is expected to come back unchanged as the only chunk.
     */
    public function testSplitMultipleSentencesOneChunk(): void
    {
        $text = "First sentence. Second sentence. Third sentence.";
        $chunks = $this->splitter->splitIntoChunks($text);

        $this->assertCount(1, $chunks);
        $this->assertEquals($text, $chunks[0]);
    }

    /**
     * Test that chunks have proper overlap
     *
     * The sentence that ends the first chunk is expected to be repeated at the
     * start of the second chunk.
     */
    public function testChunkOverlap(): void
    {
        $text = "First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence.";

        $chunks = $this->splitter->splitIntoChunks($text);
        $this->assertGreaterThan(1, count($chunks));

        $this->assertStringEndsWith('Third sentence.', $chunks[0]);
        $this->assertStringStartsWith('Third sentence.', $chunks[1]);
    }

    /**
     * Test splitLongSentence protected method
     *
     * The result must consist of several pieces, each shorter than the input,
     * which concatenate back to the exact original string.
     */
    public function testSplitLongSentence(): void
    {
        // Create a very long sentence without periods
        $longSentence = str_repeat("long word is long ", 20);

        $result = self::callInaccessibleMethod($this->splitter, 'splitLongSentence', [$longSentence]);

        $this->assertIsArray($result);
        $this->assertGreaterThan(1, count($result));

        // Each sub-sentence should be shorter than the original
        foreach ($result as $subSentence) {
            $this->assertLessThan(strlen($longSentence), strlen($subSentence));
        }

        // Verify all pieces together reconstruct the original
        $reconstructed = implode('', $result);
        $this->assertEquals($longSentence, $reconstructed);
    }

    /**
     * Test splitString protected method
     *
     * Uses input without any spaces and passes the encoded token length plus a
     * chunk size of 5. The result must consist of several pieces, each shorter
     * than the input, which concatenate back to the exact original string.
     */
    public function testSplitString(): void
    {
        $text = str_repeat("verylongwordwithoutspaces", 20);
        $tokenLength = count($this->encoder->encode($text));
        $chunkSize = 5;

        $result = self::callInaccessibleMethod($this->splitter, 'splitString', [$text, $tokenLength, $chunkSize]);

        $this->assertIsArray($result);
        $this->assertGreaterThan(1, count($result));

        // Each sub-sentence should be shorter than the original
        foreach ($result as $subSentence) {
            $this->assertLessThan(strlen($text), strlen($subSentence));
        }

        // Verify all pieces together reconstruct the original
        $reconstructed = implode('', $result);
        $this->assertEquals($text, $reconstructed);
    }

    /**
     * Test rememberSentence protected method
     *
     * Checks that a remembered sentence shows up in the sentenceQueue property
     * and that the queue stays small after many sentences have been added.
     */
    public function testRememberSentence(): void
    {
        // Clear the sentence queue first
        self::setInaccessibleProperty($this->splitter, 'sentenceQueue', []);

        // Sentence queue should be empty now
        $initialQueue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        $this->assertEmpty($initialQueue);

        // Add a sentence
        self::callInaccessibleMethod($this->splitter, 'rememberSentence', ['First sentence.']);
        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        $this->assertGreaterThanOrEqual(1, count($queue));
        $this->assertContains('First sentence.', $queue);

        // Add another sentence
        self::callInaccessibleMethod($this->splitter, 'rememberSentence', ['Second sentence.']);
        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        // Compare the element count, not the array itself: an array always compares
        // greater than an int, which would make the assertion meaningless.
        // NOTE(review): only "at least one" is asserted because the first sentence may
        // already have been dropped to respect the overlap limit - confirm against
        // TextSplitter::rememberSentence before tightening this bound.
        $this->assertGreaterThanOrEqual(1, count($queue));
        $this->assertContains('Second sentence.', $queue);

        // add a whole bunch of sentences to exceed the overlap limit
        for ($i = 0; $i < 20; $i++) {
            self::callInaccessibleMethod($this->splitter, 'rememberSentence', ["Sentence $i."]);
        }

        // each of our sentences is at least 2 tokens, our limit is 5, so we should not have more than 2 in queue
        $queue = self::getInaccessibleProperty($this->splitter, 'sentenceQueue');
        $this->assertLessThanOrEqual(2, count($queue));
    }
}