xref: /dokuwiki/inc/Parsing/Lexer/Lexer.php (revision dba14ea3c4253d454e478f27d0ae9c47d2fa7aa6)
1<?php
2
3/**
4 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5 * For an intro to the Lexer see:
6 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7 *
8 * @author Marcus Baker http://www.lastcraft.com
9 */
10
11namespace dokuwiki\Parsing\Lexer;
12
13/**
14 * Accepts text and breaks it into tokens.
15 *
16 * Some optimisation to make sure the content is only scanned by the PHP regex
17 * parser once. Lexer modes must not start with leading underscores.
18 */
class Lexer
{
    /** Signal for leaving a mode */
    public const MODE_EXIT = '__exit';
    /** Prefix marking special (enter-and-exit) patterns */
    public const MODE_SPECIAL_PREFIX = '_';

    /** @var ParallelRegex[] compound regexes, keyed by the mode they apply in */
    protected $regexes = [];
    /** @var \Doku_Handler receiver of all token callbacks */
    protected $handler;
    /** @var StateStack stack of the currently nested modes */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler method name */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy by reference.
     * @param string $start            Starting handler.
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Returns the compound regex registered for a mode, creating it on first use.
     *
     * Shared by all add*Pattern() methods so the lazy initialisation lives in one place.
     *
     * @param string $mode    Mode whose pattern collection is wanted.
     * @return ParallelRegex
     */
    protected function regexForMode($mode)
    {
        $this->regexes[$mode] ??= new ParallelRegex($this->case);
        return $this->regexes[$mode];
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, self::MODE_EXIT);
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the prefix is what isSpecialMode()/decodeSpecial() later recognise and strip
        $this->regexForMode($mode)->addPattern($pattern, self::MODE_SPECIAL_PREFIX . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the handled prefix from $raw on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the match = everything consumed so far minus the match itself
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (!$parsed) {
            return false;
        }
        // whatever is left after the last match is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A boolean false mode causes no change.
     * @param int $initialPos Byte index at which the unmatched portion starts
     * @param int $matchPos Current byte index location in raw doc that's being parsed
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // must be tested before isSpecialMode(): MODE_EXIT itself starts with the special prefix
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only, then drop straight back
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        // non-string mode: plain match, the current mode stays as it is
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === self::MODE_EXIT);
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * Non-string modes (plain matches carry a boolean) are never special.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        return is_string($mode) && str_starts_with($mode, self::MODE_SPECIAL_PREFIX);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, strlen(self::MODE_SPECIAL_PREFIX));
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match One of the DOKU_LEXER_* state constants, passed through to the handler
     * @param int $pos Current byte index location in raw doc
     *                             that's being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        // honour remappings registered through mapHandler()
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (str_starts_with($handler, 'plugin_')) {
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            // $split holds: text before the match, the match, the remaining text
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every one of \ . + * ? [ ^ ] $ { } = ! < > | : gets a single backslash put in front of it.
     *
     * This is done in one pass with a character class. In a preg_replace() replacement string
     * a backslash followed by a dollar sign is itself an escape sequence that produces a bare
     * dollar sign, so a per character replacement of '\$' would leave the dollar unescaped.
     * Referring to the match as $0 behind an escaped backslash avoids that trap.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }
}
355