xref: /dokuwiki/inc/Parsing/Lexer/Lexer.php (revision 5aa905e95e0f4ee1de1d93da15dbd388e985c134)
1<?php
2/**
3 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4 * For an intro to the Lexer see:
5 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6 *
7 * @author Marcus Baker http://www.lastcraft.com
8 */
9
10namespace dokuwiki\Parsing\Lexer;
11
12// FIXME move elsewhere
13
// Lexer states handed to the handler callbacks as their second argument.
// They are global (non-namespaced) constants because define() is used.
define('DOKU_LEXER_ENTER', 1);     // the matched token enters a new mode
define('DOKU_LEXER_MATCHED', 2);   // the matched token leaves the mode unchanged
define('DOKU_LEXER_UNMATCHED', 3); // text that no pattern of the current mode matched
define('DOKU_LEXER_EXIT', 4);      // the matched token leaves the current mode
define('DOKU_LEXER_SPECIAL', 5);   // the matched token is handled in a one-off mode
19
/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to mark special (single token) modes
 * and "__exit" marks leaving a mode.
 */
class Lexer
{
    /** @var ParallelRegex[] one set of patterns per mode, keyed by mode name */
    protected $regexes;
    /** @var \Doku_Handler receiver of the token callbacks */
    protected $handler;
    /** @var StateStack stack of the currently nested modes */
    protected $mode;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy by reference.
     * @param string $start            Starting handler.
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->mode = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Returns the pattern set of a mode, creating it on first use.
     *
     * @param string $mode     Mode the patterns belong to.
     * @return ParallelRegex   Pattern set registered for that mode.
     */
    private function modeRegex($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->modeRegex($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->modeRegex($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the marker isModeEnd() looks for
        $this->modeRegex($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore is the marker isSpecialMode() looks for
        $this->modeRegex($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenize.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() returns an array while tokens are found, true once nothing
        // more matches and false when the current mode has no patterns
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            // $raw now only holds the text following the matched token, so the
            // byte offsets are derived from how much it has shrunk
            $currentLength = strlen($raw);
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            if ($currentLength === $length) {
                // nothing was consumed; bail out instead of looping forever
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        if (! $parsed) {
            return false;
        }
        // whatever is left over did not match any pattern
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * All five arguments are required. $mode used to carry a default value, but a
     * default in front of required parameters can never be used and is reported as
     * deprecated by PHP 8.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no change.
     * @param int $initialPos Byte index at which the unmatched portion starts
     * @param int $matchPos Byte index at which the matched token starts
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // must be tested before isSpecialMode(): "__exit" starts with an underscore, too
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->mode->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only
            $this->mode->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->mode->leave();
        }
        if (is_string($mode)) {
            $this->mode->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * Only strings can be special modes; any other value (e.g. a boolean) is not.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if the mode name starts with an underscore.
     */
    protected function isSpecialMode($mode)
    {
        return is_string($mode) && (strncmp($mode, "_", 1) === 0);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     * Modes registered through mapHandler() are redirected to their mapped handler method.
     *
     * @param string|bool $content Text parsed.
     * @param int $is_match        Lexer state, one of the DOKU_LEXER_* constants
     * @param int $pos             Current byte index location in raw doc
     *                             that's being parsed
     * @return bool                Result of the handler call, true if there was nothing to report
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->mode->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) === 'plugin_') {
            // "plugin_foo_bar" calls plugin(..., "foo_bar")
            list($handler, $plugin) = explode('_', $handler, 2);
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->mode->getCurrent();
        if (! isset($this->regexes[$current])) {
            // no patterns were registered for the current mode
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            // the third element replaces $raw in the caller (passed by reference)
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Each of the characters \ . + * ? [ ^ ] $ { } = ! < > | : gets a backslash prepended.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // one pass over the input; the replacement is a literal backslash
        // followed by the matched character
        return preg_replace('/[\\\\.+*?\\[\\]\\^${}=!<>|:]/', '\\\\$0', $str);
    }
}
347}
348