xref: /dokuwiki/inc/Parsing/Lexer/Lexer.php (revision 8c7c53b0321a3cd3116b8d3b2ad27863a38dece7)
1<?php
2/**
3 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4 * For an intro to the Lexer see:
5 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6 *
7 * @author Marcus Baker http://www.lastcraft.com
8 */
9
10namespace dokuwiki\Parsing\Lexer;
11
/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation is done to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a leading
 * underscore is used internally to mark special (single token) modes and the exit action.
 */
class Lexer
{
    /** @var ParallelRegex[] one compound regex per mode, keyed by mode name */
    protected $regexes = [];
    /** @var \Doku_Handler receives one method call per token */
    protected $handler;
    /** @var StateStack stack of the modes currently entered */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy; its methods are called for each token.
     * @param string $start            Starting mode (also the name of the starting handler).
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->modeRegex($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->modeRegex($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the reserved label that isModeEnd() recognises
        $this->modeRegex($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore marks the label as special, see isSpecialMode()/decodeSpecial()
        $this->modeRegex($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the given text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the leading part of $raw on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the matched token within the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (!$parsed) {
            return false;
        }
        // whatever is left over did not match anything and is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A boolean mode causes no change.
     * @param int $initialPos Byte index in the raw document at which $unmatched starts
     * @param int $matchPos Byte index in the raw document at which $matched starts
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // must be tested before isSpecialMode(): "__exit" starts with an underscore as well
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only, then drop back out
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        // non-string mode: plain match inside the current mode
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // only string labels can carry the underscore marker; booleans are never special
        return is_string($mode) && strncmp($mode, "_", 1) === 0;
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     * A mode remapped through mapHandler() calls the mapped method instead.
     *
     * @param string|false $content Text parsed.
     * @param mixed $is_match Lexer state of the token; callers in this class pass
     *                               one of the DOKU_LEXER_* constants.
     * @param int $pos Current byte index location in raw doc
     *                             that is being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) == 'plugin_') {
            // split at the first underscore only: "plugin" becomes the method name,
            // the remainder is handed over as the extra argument
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (! isset($this->regexes[$current])) {
            // no patterns were ever registered for the current mode
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            // the third element is the text following the match; it replaces $raw
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Each of the characters \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // One addcslashes() pass instead of a chain of preg_replace() calls: in a preg_replace()
        // replacement string a backslash in front of "$" acts as an escape and yields a bare "$",
        // so the dollar sign was never actually escaped. A single pass also cannot re-escape
        // backslashes that were inserted for an earlier character.
        return addcslashes($str, '\\.+*?[^]${}=!<>|:');
    }

    /**
     * Returns the compound regex of a mode, creating it on first use.
     *
     * @param string $mode    Mode name.
     * @return ParallelRegex
     */
    private function modeRegex($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}
349