1<?php
2
3/**
4 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5 * For an intro to the Lexer see:
6 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7 *
8 * @author Marcus Baker http://www.lastcraft.com
9 */
10
11namespace dokuwiki\Parsing\Lexer;
12
13/**
14 * Accepts text and breaks it into tokens.
15 *
 * Some optimisation is done to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores.
18 */
class Lexer
{
    /** @var ParallelRegex[] pattern collections, keyed by the mode they apply to */
    protected $regexes = [];
    /** @var \Doku_Handler receiver of the token callbacks */
    protected $handler;
    /** @var StateStack stack of the modes entered so far */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy by reference.
     * @param string $start            Starting handler.
     * @param bool $case               True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the magic label recognised by isModeEnd()
        $this->regexFor($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore marks a single token mode, see isSpecialMode()/decodeSpecial()
        $this->regexFor($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenise.
     * @return bool              True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() shortens $raw (passed by reference) by the text it consumed on every round
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // bytes consumed so far minus the token itself = byte offset where the token starts
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() gave a non-array: false signals an error, true means no further match
        if (!$parsed) {
            return false;
        }
        // whatever is left over is handed to the current mode as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A boolean false mode causes no change.
     * @param int $initialPos Byte index at which the unmatched portion starts
     * @param int $matchPos Current byte index location in raw doc thats being parsed
     * @return bool False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        // leading text always goes to the mode that is current before any switch
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        if ($this->isModeEnd($mode)) {
            // the exit token is still reported to the mode being left
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // single token mode: enter, report the token, leave again
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            // entry pattern: the token is reported to the newly entered mode
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return bool                True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode    Mode to test.
     * @return bool                True if this is a single token (special) mode.
     */
    protected function isSpecialMode($mode)
    {
        // $mode may be a boolean (see dispatchTokens()); only strings can carry the underscore marker
        return is_string($mode) && str_starts_with($mode, '_');
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match One of the DOKU_LEXER_* state constants passed in by the caller.
     * @param int $pos Current byte index location in raw doc
     *                             thats being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        // honour remappings registered through mapHandler()
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (str_starts_with($handler, 'plugin_')) {
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        // a mode without any registered pattern is treated as a parsing error
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            // the third element replaces $raw, shrinking the caller's subject
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every occurrence of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * This is done in a single pass with a character class. Replacing each character on its
     * own with a replacement like '\$' does not work for the dollar sign: inside a preg
     * replacement string a backslash in front of "$" escapes it, so the result is a bare "$".
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // replacement is a literal backslash (written doubled) followed by the matched character
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the pattern collection of the given mode, creating it on first use.
     *
     * @param string $mode    Mode the patterns belong to.
     * @return ParallelRegex
     */
    private function regexFor($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}
350