xref: /dokuwiki/_test/tests/Parsing/Lexer/LexerTest.php (revision 504c13e8df88563c11b3720b317991bc38835a35)
1<?php
2
3namespace dokuwiki\test\Parsing\Lexer;
4
5use dokuwiki\Parsing\Lexer\Lexer;
6
/**
 * Tests for the mode-aware regular expression Lexer.
 *
 * Every test feeds a small document to a Lexer wired to a RecordingHandler and
 * then inspects $handler->recorded. As the expectations below show, each
 * recorded entry is a list of four values:
 *
 *   [handler name, token text, DOKU_LEXER_* state, byte offset in the document]
 */
class LexerTest extends \DokuWikiTest
{
    /**
     * A lexer without any registered pattern rejects the input and reports nothing.
     */
    public function testNoPatterns(): void
    {
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler);
        $this->assertFalse($lexer->parse("abcdef"));
        $this->assertSame([], $handler->recorded);
    }

    /**
     * An empty document parses successfully without producing any token.
     */
    public function testEmptyPage(): void
    {
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler);
        $lexer->addPattern("a+");
        $this->assertTrue($lexer->parse(""));
        $this->assertSame([], $handler->recorded);
    }

    /**
     * A single pattern splits the document into alternating matched and
     * unmatched tokens, each reported with its offset.
     */
    public function testSinglePattern(): void
    {
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler);
        $lexer->addPattern("a+");
        $this->assertTrue($lexer->parse("aaaxayyyaxaaaz"));
        $this->assertSame([
            ['accept', 'aaa', \DOKU_LEXER_MATCHED, 0],
            ['accept', 'x', \DOKU_LEXER_UNMATCHED, 3],
            ['accept', 'a', \DOKU_LEXER_MATCHED, 4],
            ['accept', 'yyy', \DOKU_LEXER_UNMATCHED, 5],
            ['accept', 'a', \DOKU_LEXER_MATCHED, 8],
            ['accept', 'x', \DOKU_LEXER_UNMATCHED, 9],
            ['accept', 'aaa', \DOKU_LEXER_MATCHED, 10],
            ['accept', 'z', \DOKU_LEXER_UNMATCHED, 13],
        ], $handler->recorded);
    }

    /**
     * Two patterns in the same mode are both applied; only the token texts
     * (second element of each recorded entry) are compared here.
     */
    public function testMultiplePattern(): void
    {
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler);
        $lexer->addPattern("a+");
        $lexer->addPattern("b+");
        $this->assertTrue($lexer->parse("ababbxbaxxxxxxax"));
        $expected = ['a', 'b', 'a', 'bb', 'x', 'b', 'a', 'xxxxxx', 'a', 'x'];
        $actual = array_column($handler->recorded, 1);
        $this->assertSame($expected, $actual);
    }

    /**
     * A pattern registered for mode "b" is not applied while the lexer stays
     * in its start mode "a": runs of "b" come back as unmatched text.
     */
    public function testIsolatedPattern(): void
    {
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler, "a");
        $lexer->addPattern("a+", "a");
        $lexer->addPattern("b+", "b");
        $this->assertTrue($lexer->parse("abaabxbaaaxaaaax"));
        $this->assertSame([
            ['a', 'a', \DOKU_LEXER_MATCHED, 0],
            ['a', 'b', \DOKU_LEXER_UNMATCHED, 1],
            ['a', 'aa', \DOKU_LEXER_MATCHED, 2],
            ['a', 'bxb', \DOKU_LEXER_UNMATCHED, 4],
            ['a', 'aaa', \DOKU_LEXER_MATCHED, 7],
            ['a', 'x', \DOKU_LEXER_UNMATCHED, 10],
            ['a', 'aaaa', \DOKU_LEXER_MATCHED, 11],
            ['a', 'x', \DOKU_LEXER_UNMATCHED, 15],
        ], $handler->recorded);
    }

    /**
     * The entry pattern ":" switches from mode "a" to mode "b"; from then on
     * the "b" patterns match and "a" is reported as unmatched.
     */
    public function testModeChange(): void
    {
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler, "a");
        $lexer->addPattern("a+", "a");
        $lexer->addEntryPattern(":", "a", "b");
        $lexer->addPattern("b+", "b");
        $this->assertTrue($lexer->parse("abaabaaa:ababbabbba"));
        $this->assertSame([
            ['a', 'a', \DOKU_LEXER_MATCHED, 0],
            ['a', 'b', \DOKU_LEXER_UNMATCHED, 1],
            ['a', 'aa', \DOKU_LEXER_MATCHED, 2],
            ['a', 'b', \DOKU_LEXER_UNMATCHED, 4],
            ['a', 'aaa', \DOKU_LEXER_MATCHED, 5],
            ['b', ':', \DOKU_LEXER_ENTER, 8],
            ['b', 'a', \DOKU_LEXER_UNMATCHED, 9],
            ['b', 'b', \DOKU_LEXER_MATCHED, 10],
            ['b', 'a', \DOKU_LEXER_UNMATCHED, 11],
            ['b', 'bb', \DOKU_LEXER_MATCHED, 12],
            ['b', 'a', \DOKU_LEXER_UNMATCHED, 14],
            ['b', 'bbb', \DOKU_LEXER_MATCHED, 15],
            ['b', 'a', \DOKU_LEXER_UNMATCHED, 18],
        ], $handler->recorded);
    }

    /**
     * "(" enters mode "b" and ")" leaves it again, after which the patterns
     * of mode "a" apply once more.
     */
    public function testNesting(): void
    {
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler, "a");
        $lexer->addPattern("a+", "a");
        $lexer->addEntryPattern("(", "a", "b");
        $lexer->addPattern("b+", "b");
        $lexer->addExitPattern(")", "b");
        $this->assertTrue($lexer->parse("aabaab(bbabb)aab"));
        $this->assertSame([
            ['a', 'aa', \DOKU_LEXER_MATCHED, 0],
            ['a', 'b', \DOKU_LEXER_UNMATCHED, 2],
            ['a', 'aa', \DOKU_LEXER_MATCHED, 3],
            ['a', 'b', \DOKU_LEXER_UNMATCHED, 5],
            ['b', '(', \DOKU_LEXER_ENTER, 6],
            ['b', 'bb', \DOKU_LEXER_MATCHED, 7],
            ['b', 'a', \DOKU_LEXER_UNMATCHED, 9],
            ['b', 'bb', \DOKU_LEXER_MATCHED, 10],
            ['b', ')', \DOKU_LEXER_EXIT, 12],
            ['a', 'aa', \DOKU_LEXER_MATCHED, 13],
            ['a', 'b', \DOKU_LEXER_UNMATCHED, 15],
        ], $handler->recorded);
    }

    /**
     * A special pattern is reported under its own name ("b") with the SPECIAL
     * state, while the surrounding tokens keep being reported for mode "a".
     */
    public function testSingular(): void
    {
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler, "a");
        $lexer->addPattern("a+", "a");
        $lexer->addSpecialPattern("b+", "a", "b");
        $this->assertTrue($lexer->parse("aabaaxxbbbxx"));
        $this->assertSame([
            ['a', 'aa', \DOKU_LEXER_MATCHED, 0],
            ['b', 'b', \DOKU_LEXER_SPECIAL, 2],
            ['a', 'aa', \DOKU_LEXER_MATCHED, 3],
            ['a', 'xx', \DOKU_LEXER_UNMATCHED, 5],
            ['b', 'bbb', \DOKU_LEXER_SPECIAL, 7],
            ['a', 'xx', \DOKU_LEXER_UNMATCHED, 10],
        ], $handler->recorded);
    }

    /**
     * An exit pattern hit in the start mode makes parse() fail: the EXIT token
     * is still recorded, but nothing after it is.
     */
    public function testUnwindTooFar(): void
    {
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler, "a");
        $lexer->addPattern("a+", "a");
        $lexer->addExitPattern(")", "a");
        $this->assertFalse($lexer->parse("aa)aa"));
        $this->assertSame([
            ['a', 'aa', \DOKU_LEXER_MATCHED, 0],
            ['a', ')', \DOKU_LEXER_EXIT, 2],
        ], $handler->recorded);
    }

    /**
     * mapHandler() redirects both "mode_a" and "mode_b" to the handler name
     * "a", so every recorded entry carries that name regardless of the mode.
     */
    public function testModeMapping(): void
    {
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler, "mode_a");
        $lexer->addPattern("a+", "mode_a");
        $lexer->addEntryPattern("(", "mode_a", "mode_b");
        $lexer->addPattern("b+", "mode_b");
        $lexer->addExitPattern(")", "mode_b");
        $lexer->mapHandler("mode_a", "a");
        $lexer->mapHandler("mode_b", "a");
        $this->assertTrue($lexer->parse("aa(bbabb)b"));
        $this->assertSame([
            ['a', 'aa', \DOKU_LEXER_MATCHED, 0],
            ['a', '(', \DOKU_LEXER_ENTER, 2],
            ['a', 'bb', \DOKU_LEXER_MATCHED, 3],
            ['a', 'a', \DOKU_LEXER_UNMATCHED, 5],
            ['a', 'bb', \DOKU_LEXER_MATCHED, 6],
            ['a', ')', \DOKU_LEXER_EXIT, 8],
            ['a', 'b', \DOKU_LEXER_UNMATCHED, 9],
        ], $handler->recorded);
    }

    /**
     * Offsets are reported correctly with plain entry and exit patterns.
     */
    public function testIndex(): void
    {
        $this->assertFileBlockOffsets("<file>", "</file>");
    }

    /**
     * A positive lookahead on the entry pattern is not part of the token and
     * does not shift the reported offsets.
     */
    public function testIndexLookaheadEqual(): void
    {
        $this->assertFileBlockOffsets('<file>(?=.*</file>)', "</file>");
    }

    /**
     * A negative lookahead on the entry pattern is not part of the token and
     * does not shift the reported offsets.
     */
    public function testIndexLookaheadNotEqual(): void
    {
        $this->assertFileBlockOffsets('<file>(?!foo)', "</file>");
    }

    /**
     * A positive lookbehind on the exit pattern is not part of the token and
     * does not shift the reported offsets.
     */
    public function testIndexLookbehindEqual(): void
    {
        $this->assertFileBlockOffsets('<file>', "(?<=d)</file>");
    }

    /**
     * A negative lookbehind on the exit pattern is not part of the token and
     * does not shift the reported offsets.
     */
    public function testIndexLookbehindNotEqual(): void
    {
        $this->assertFileBlockOffsets('<file>', '(?<!c)</file>');
    }

    /**
     * This test is primarily to ensure the correct match is chosen
     * when there are non-captured elements in the pattern.
     */
    public function testIndexSelectCorrectMatch(): void
    {
        $doc = "ALL FOOLS ARE FOO";
        $pattern = '\bFOO\b';
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler, "ignore");
        $lexer->addSpecialPattern($pattern, 'ignore', 'caught');
        $this->assertTrue($lexer->parse($doc));

        $caught = $this->recordedFor($handler, 'caught');

        // Reference result straight from PCRE; make sure it really matched
        // before its offset is used as the expected value below.
        $matches = [];
        $found = preg_match('/' . $pattern . '/', $doc, $matches, PREG_OFFSET_CAPTURE);
        $this->assertSame(1, $found);

        $this->assertCount(1, $caught);
        $this->assertSame('FOO', $caught[0][1]);
        $this->assertSame(\DOKU_LEXER_SPECIAL, $caught[0][2]);
        $this->assertSame($matches[0][1], $caught[0][3]);
    }

    /**
     * Shared body of the testIndex* cases.
     *
     * Lexes "aaa<file>bcd</file>eee" starting in mode "ignore", with the given
     * patterns entering and leaving mode "caught", plus a special pattern "b"
     * (mapped to the "caught" handler) and a plain pattern "c" inside it. It
     * then asserts that every entry recorded for "caught" carries the offset
     * at which its text first occurs in the document.
     *
     * @param string $entryPattern pattern that switches from "ignore" to "caught"
     * @param string $exitPattern pattern that leaves "caught" again
     */
    private function assertFileBlockOffsets(string $entryPattern, string $exitPattern): void
    {
        $doc = "aaa<file>bcd</file>eee";
        $handler = new RecordingHandler();
        $lexer = new Lexer($handler, "ignore");
        $lexer->addEntryPattern($entryPattern, "ignore", "caught");
        $lexer->addExitPattern($exitPattern, "caught");
        $lexer->addSpecialPattern('b', 'caught', 'special');
        $lexer->mapHandler('special', 'caught');
        $lexer->addPattern('c', 'caught');
        $this->assertTrue($lexer->parse($doc));

        $this->assertSame([
            ['caught', '<file>', \DOKU_LEXER_ENTER, strpos($doc, '<file>')],
            ['caught', 'b', \DOKU_LEXER_SPECIAL, strpos($doc, 'b')],
            ['caught', 'c', \DOKU_LEXER_MATCHED, strpos($doc, 'c')],
            ['caught', 'd', \DOKU_LEXER_UNMATCHED, strpos($doc, 'd')],
            ['caught', '</file>', \DOKU_LEXER_EXIT, strpos($doc, '</file>')],
        ], $this->recordedFor($handler, 'caught'));
    }

    /**
     * Return the recorded entries whose first element equals $name.
     *
     * The result is re-indexed from 0 so it can be compared with assertSame()
     * against a plain list.
     *
     * @param RecordingHandler $handler handler that was passed to the lexer
     * @param string $name handler name to keep
     * @return array[] matching entries in their original order
     */
    private function recordedFor(RecordingHandler $handler, string $name): array
    {
        return array_values(array_filter(
            $handler->recorded,
            static fn($entry) => $entry[0] === $name
        ));
    }
}
308