xref: /dokuwiki/inc/Parsing/Lexer/Lexer.php (revision 3465db0c3f7b210754e5c6a080828a54b6dc4e56)
1<?php
2/**
3 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4 * For an intro to the Lexer see:
5 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6 *
7 * @author Marcus Baker http://www.lastcraft.com
8 */
9
10namespace dokuwiki\Parsing\Lexer;
11
12// FIXME move elsewhere
13
// Token states handed to the handler as its second argument. Each constant
// is only defined when missing, so loading this file alongside another
// definition of the same names does not raise "already defined" warnings.

// The matched text entered a new mode.
if (!defined('DOKU_LEXER_ENTER')) {
    define('DOKU_LEXER_ENTER', 1);
}
// The text matched a pattern that does not change the mode.
if (!defined('DOKU_LEXER_MATCHED')) {
    define('DOKU_LEXER_MATCHED', 2);
}
// The text was not matched by any pattern of the current mode.
if (!defined('DOKU_LEXER_UNMATCHED')) {
    define('DOKU_LEXER_UNMATCHED', 3);
}
// The matched text leaves the current mode.
if (!defined('DOKU_LEXER_EXIT')) {
    define('DOKU_LEXER_EXIT', 4);
}
// The matched text is handled in a special mode for this one token only.
if (!defined('DOKU_LEXER_SPECIAL')) {
    define('DOKU_LEXER_SPECIAL', 5);
}
19
/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to mark special (single token) modes.
 */
class Lexer
{
    /** @var ParallelRegex[] patterns of each mode, keyed by mode name */
    protected $regexes;
    /** @var \Doku_Handler object whose methods receive the tokens */
    protected $handler;
    /** @var StateStack stack of the currently nested modes */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler method name to call instead */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Object whose methods are called with the tokens.
     * @param string $start            Starting mode.
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->getRegexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->getRegexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the magic label recognised by isModeEnd()
        $this->getRegexForMode($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore marks the label as special, see isSpecialMode()
        $this->getRegexForMode($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenize.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the processed part of $raw, so the byte positions are
        // derived from how much of the initial text is left over
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength == $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (!$parsed) {
            return false;
        }
        // whatever is left did not match anything and is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no change.
     * @param int $initialPos Byte index of the unmatched leading portion
     * @param int $matchPos Current byte index location in raw doc that's being parsed
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // must be tested before isSpecialMode() as "__exit" starts with an underscore, too
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token and leave it right away
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param string $mode    Mode to test.
     * @return boolean        True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // dispatchTokens() may pass a non-string mode; those are never special
        // and must not be pushed through the string comparison
        return is_string($mode) && (strncmp($mode, "_", 1) === 0);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match Token state, one of the DOKU_LEXER_* constants
     * @param int $pos Current byte index location in raw doc
     *                             that's being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        // honour the remapping registered through mapHandler()
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) === 'plugin_') {
            list($handler, $plugin) = explode('_', $handler, 2);
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            // the third item is the remaining text which replaces the caller's $raw
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every occurrence of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // One pass over the input, so the backslashes added here are never
        // escaped a second time. The replacement '\\\\$0' is a literal backslash
        // followed by the matched character; writing a per-character replacement
        // like '\$' instead would drop the backslash in front of the dollar sign.
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>:|]/', '\\\\$0', $str);
    }

    /**
     * Returns the pattern collection of the given mode, creating it on first use.
     *
     * @param string $mode    Mode the patterns belong to.
     * @return ParallelRegex
     */
    private function getRegexForMode($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}
358