xref: /dokuwiki/inc/Parsing/Lexer/Lexer.php (revision 13a62f810fbd091d15ab734b467eaec0a6bf829a)
1<?php
2
3/**
4 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5 * For an intro to the Lexer see:
6 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7 *
8 * @author Marcus Baker http://www.lastcraft.com
9 */
10
11namespace dokuwiki\Parsing\Lexer;
12
13use dokuwiki\Parsing\Handler;
14
15/**
16 * Accepts text and breaks it into tokens.
17 *
18 * Some optimisation is done to make sure the content is only scanned by the PHP
19 * regex parser once. Lexer mode names must not start with an underscore.
20 */
class Lexer
{
    /** Signal for leaving a mode */
    public const MODE_EXIT = '__exit';
    /** Prefix marking special (enter-and-exit) patterns */
    public const MODE_SPECIAL_PREFIX = '_';
    /** Characters escape() prefixes with a backslash; "(", ")" and "/" are deliberately absent */
    private const ESCAPED_CHARS = '\\.+*?[^]${}=!<>|:';

    /** @var ParallelRegex[] one compound regex per mode name */
    protected $regexes = [];
    /** @var Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler name to dispatch to instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param Handler $handler  Handling strategy; receives every token.
     * @param string $start     Starting mode, used to seed the mode stack.
     * @param boolean $case     True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexFor($mode)->addPattern($pattern, self::MODE_EXIT);
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        $this->regexFor($mode)->addPattern($pattern, self::MODE_SPECIAL_PREFIX . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held handler.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $offset = 0;
        while (is_array($parsed = $this->reduce($raw, $offset))) {
            [$unmatched, $matched, $mode] = $parsed;
            $matchPos = $offset + strlen($unmatched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $offset, $matchPos)) {
                return false;
            }
            $newOffset = $matchPos + strlen($matched);
            // An empty match with no leading text consumed nothing; bail out
            // rather than loop forever on the same offset.
            if ($newOffset === $offset) {
                return false;
            }
            $offset = $newOffset;
        }
        // reduce() ended the loop with a boolean: false means the current mode has no patterns.
        if (!$parsed) {
            return false;
        }
        // Whatever is left after the last match is plain, unmatched text.
        return $this->invokeHandler(substr($raw, $offset), DOKU_LEXER_UNMATCHED, $offset);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no change.
     * @param int $initialPos Byte index in the raw document where $unmatched starts.
     * @param int $matchPos Byte index in the raw document where $matched starts.
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // The exit check must come first: MODE_EXIT itself starts with the special prefix.
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // Enter the special mode for this single token, then drop straight back out.
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === self::MODE_EXIT);
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // Patterns added without a label reach here with a non-string mode; never treat those as special.
        return is_string($mode) && str_starts_with($mode, self::MODE_SPECIAL_PREFIX);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, strlen(self::MODE_SPECIAL_PREFIX));
    }

    /**
     * Dispatches a token to the handler.
     *
     * Resolves mode name aliases registered through mapHandler() and
     * delegates all dispatch logic to Handler::handleToken().
     *
     * @param string|false $content Text parsed; empty or false content is silently skipped.
     * @param int $is_match Lexer state, one of the DOKU_LEXER_* constants
     *                             passed in by parse() and dispatchTokens().
     * @param int $pos Current byte index location in raw doc
     *                             that is being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $originalName = $this->modeStack->getCurrent();
        $modeName = $this->mode_handlers[$originalName] ?? $originalName;

        return $this->handler->handleToken($modeName, $content, $is_match, $pos, $originalName);
    }

    /**
     * Tries to match the next token starting at `$offset` in `$raw`.
     *
     * The full subject is passed to the regex engine (rather than a
     * truncated tail) so that lookbehind assertions in the registered
     * patterns can see characters before the current offset. Once the
     * offset has reached the end of the subject no match is attempted.
     *
     * @param string $raw     The full subject to parse.
     * @param int    $offset  Byte offset at which to resume matching.
     * @return array|bool     Three item list of unparsed content followed by the
     *                        recognised token and finally the action the parser is to take.
     *                        True if no match, false if there is a parsing error.
     */
    protected function reduce($raw, $offset)
    {
        $current = $this->modeStack->getCurrent();
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($offset >= strlen($raw)) {
            return true;
        }
        // split() fills $split by reference with [leading unparsed text, matched token].
        if ($action = $this->regexes[$current]->split($raw, $split, $offset)) {
            [$unparsed, $match] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Each of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     * The work is done in a single strtr() pass. The earlier preg_replace()
     * based version left "$" unescaped, because "\$" inside a preg_replace()
     * replacement string is itself read as an escaped, literal dollar sign.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        static $map = null;
        if ($map === null) {
            $map = [];
            foreach (str_split(self::ESCAPED_CHARS) as $char) {
                $map[$char] = '\\' . $char;
            }
        }

        return strtr($str, $map);
    }

    /**
     * Returns the compound regex for a mode, creating it on first use.
     *
     * @param string $mode    Mode the patterns belong to.
     * @return ParallelRegex
     */
    private function regexFor($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}
351