1<?php
2/**
3 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4 * For an intro to the Lexer see:
5 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6 *
7 * @author Marcus Baker http://www.lastcraft.com
8 */
9
10namespace dokuwiki\Parsing\Lexer;
11
/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation is done to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores: a leading
 * underscore is used internally to mark single-token ("special") modes and the
 * reserved label "__exit" marks leaving the current mode.
 */
class Lexer
{
    /** @var ParallelRegex[] one combined regex per mode, keyed by mode name */
    protected $regexes;
    /** @var \Doku_Handler receives a method call for every token */
    protected $handler;
    /** @var StateStack stack of currently active modes */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy by reference.
     * @param string $start            Starting handler.
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = [];
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
        $this->mode_handlers = [];
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the reserved label recognised by isModeEnd()
        $this->regexForMode($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore is what isSpecialMode() looks for; decodeSpecial() strips it again
        $this->regexForMode($mode)->addPattern($pattern, '_' . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (!isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the leading unmatched text plus one match from $raw on every round
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the match = bytes consumed so far minus the match itself
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (!$this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (!$parsed) {
            return false;
        }
        // whatever is left over did not match anything in the current mode
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no mode change.
     * @param int $initialPos Byte index of the unmatched leading portion in the raw doc
     * @param int $matchPos Current byte index location in raw doc that's being parsed
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (!$this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // must be tested before isSpecialMode(): "__exit" starts with an underscore, too
        if ($this->isModeEnd($mode)) {
            if (!$this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this one token only and leave it right away
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (!$this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param string $mode    Mode to test.
     * @return boolean        True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * Note that the exit label "__exit" also starts with an underscore, so callers have to
     * check isModeEnd() first.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean          True if the mode carries the leading underscore of a special mode.
     */
    protected function isSpecialMode($mode)
    {
        // non-string modes (e.g. boolean true for "no mode change") are never special
        return is_string($mode) && strncmp($mode, "_", 1) === 0;
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match One of the DOKU_LEXER_* state constants, passed through to the handler.
     * @param int $pos Current byte index location in raw doc
     *                             that's being parsed
     * @return bool Result of the handler call, true when the content was empty and skipped
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $method = $this->modeStack->getCurrent();
        // apply a remapping registered through mapHandler(), if any
        if (isset($this->mode_handlers[$method])) {
            $method = $this->mode_handlers[$method];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($method, 0, 7) === 'plugin_') {
            // "plugin_foo_bar" => method "plugin", extra argument "foo_bar"
            list($method, $plugin) = explode('_', $method, 2);
            return $this->handler->$method($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$method($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match (or nothing left to parse), false if the
     *                            current mode has no patterns registered.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (!isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        // $split is filled by split(): leading unparsed text, the match, and the remaining text
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            list($unparsed, $match, $raw) = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every occurrence of one of the characters
     * \ . + * ? [ ^ ] $ { } = ! < > | :
     * gets a backslash prepended.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // In a preg_replace() replacement string a backslash has to be doubled to produce a
        // literal backslash (a lone backslash in front of "$" or "\" is swallowed as an escape).
        // '\\\\$0' is the replacement text \\$0, i.e. one literal backslash followed by the match.
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the combined regex for the given mode, creating it on first use.
     *
     * @param string $mode    Mode the patterns belong to.
     * @return ParallelRegex
     */
    private function regexForMode($mode)
    {
        if (!isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}
350