1 <?php
2 
3 /**
4  * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5  * For an intro to the Lexer see:
6  * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7  *
8  * @author Marcus Baker http://www.lastcraft.com
9  */
10 
11 namespace dokuwiki\Parsing\Lexer;
12 
13 /**
14  * Accepts text and breaks it into tokens.
15  *
 * Some optimisation to make sure the content is only scanned by the PHP regex
17  * parser once. Lexer modes must not start with leading underscores.
18  */
class Lexer
{
    /** @var ParallelRegex[] pattern sets, keyed by the mode they apply to */
    protected $regexes = [];
    /** @var \Doku_Handler object whose methods receive the tokens */
    protected $handler;
    /** @var StateStack stack of the currently nested modes */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer; matching is case insensitive by default.
     *
     * @param \Doku_Handler $handler  Handler object whose methods receive the tokens.
     * @param string $start            Starting mode.
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the magic label recognised by isModeEnd()
        $this->regexFor($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the single leading underscore marks a one-token mode, see isSpecialMode()
        $this->regexFor($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the given text into tokens.
     *
     * Fails if no handler is set, if a handler call reports an error, if the current mode has no
     * patterns, or if a match consumes no content. If successful then each unparsed and parsed
     * token invokes a call to the held handler.
     *
     * @param string $raw        Raw text to tokenize.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the consumed part off the front of $raw on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the match = bytes consumed so far minus the match itself
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (!$parsed) {
            return false;
        }
        // whatever is left over is trailing unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched text to the handler, changing the lexer
     * to a new mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no change.
     * @param int $initialPos Byte index at which the unmatched leading portion starts
     * @param int $matchPos Byte index in the raw document at which the match starts
     * @return boolean             False if there was any error from the handler.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        if ($this->isModeEnd($mode)) {
            // the exit token is still reported to the mode that is being left
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // one-token mode: enter, report the token, leave again
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Tests to see if the mode is one that is entered for this token only and automatically
     * left immediately afterwards.
     *
     * Note that the exit marker also starts with an underscore, so callers have to check
     * isModeEnd() first.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is a single token (special) mode.
     */
    protected function isSpecialMode($mode)
    {
        // $mode may be a boolean; do not rely on implicit bool-to-string coercion
        return is_string($mode) && str_starts_with($mode, '_');
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the handler method named after the current mode.
     *
     * Empty content will be ignored. A mapping registered via mapHandler() replaces the method
     * name. Names starting with "plugin_" all go to the method "plugin", which receives the rest
     * of the name as an additional parameter.
     *
     * @param string $content Text parsed.
     * @param mixed $is_match The DOKU_LEXER_* state constant describing the kind of token
     * @param int $pos Byte index in the raw document that is being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (str_starts_with($handler, 'plugin_')) {
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if the current mode has no patterns.
     */
    protected function reduce(&$raw)
    {
        if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
            // the third element is the remaining text; it is written back to the caller's $raw
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * The escaped characters are: \ . + * ? [ ^ ] $ { } = ! < > | :
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // One pass over a character class. The replacement '\\\\$0' is a literal backslash
        // followed by the matched character. A per-character replacement of '\$' must not be
        // used: inside a preg_replace replacement the backslash escapes the dollar sign and is
        // dropped, which would leave "$" unescaped.
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the pattern set of the given mode, creating it on first use.
     *
     * @param string $mode     Mode the patterns apply to.
     * @return ParallelRegex
     */
    private function regexFor($mode)
    {
        return $this->regexes[$mode] ??= new ParallelRegex($this->case);
    }
}
350