xref: /dokuwiki/inc/Parsing/Lexer/Lexer.php (revision 2e43b79909f3bc04928779d886f68c1242b5d436)
1<?php
2
3/**
4 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5 * For an intro to the Lexer see:
6 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7 *
8 * @author Marcus Baker http://www.lastcraft.com
9 */
10
11namespace dokuwiki\Parsing\Lexer;
12
13use dokuwiki\Parsing\Handler;
14
/**
 * Accepts text and breaks it into tokens.
 *
 * Patterns are grouped per parsing mode; each mode owns one ParallelRegex so
 * the content is scanned by the PHP regex engine only once per token.
 *
 * Mode names must not start with an underscore: a leading underscore is used
 * internally to label "special" (enter-and-exit) patterns, and '__exit' is the
 * label that signals leaving a mode.
 */
class Lexer
{
    /** Signal for leaving a mode */
    public const MODE_EXIT = '__exit';
    /** Prefix marking special (enter-and-exit) patterns */
    public const MODE_SPECIAL_PREFIX = '_';

    /** @var ParallelRegex[] one compound regex per mode name */
    protected $regexes = [];
    /** @var Handler receiver of all token events */
    protected $handler;
    /** @var StateStack stack of currently active modes */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler name to use instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer; matching is case insensitive by default.
     *
     * @param Handler $handler  Handling strategy that receives the tokens.
     * @param string $start     Starting mode.
     * @param boolean $case     True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, self::MODE_EXIT);
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // The prefix lets dispatchTokens() tell a special label from a plain entry label.
        $this->regexForMode($mode)->addPattern($pattern, self::MODE_SPECIAL_PREFIX . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the given text into tokens.
     *
     * Fails if no handler is set, if a handler call reports an error, if the
     * current mode has no patterns registered, or if an iteration consumes no
     * input. On success every unmatched and matched piece of text has been
     * passed to the handler, including the trailing unmatched remainder.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $offset = 0;
        while (is_array($parsed = $this->reduce($raw, $offset))) {
            [$unmatched, $matched, $mode] = $parsed;
            // byte position at which the matched token starts
            $matchPos = $offset + strlen($unmatched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $offset, $matchPos)) {
                return false;
            }
            $newOffset = $matchPos + strlen($matched);
            if ($newOffset === $offset) {
                // Nothing was consumed; stop instead of looping forever.
                return false;
            }
            $offset = $newOffset;
        }
        // reduce() returned a non-array: false means error, true means "no more matches".
        if (!$parsed) {
            return false;
        }
        return $this->invokeHandler(substr($raw, $offset), DOKU_LEXER_UNMATCHED, $offset);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched text to the handler,
     * changing the lexer to a new mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Label of the matched pattern: the exit marker, a
     *                          prefixed special mode, a mode name to enter, or a
     *                          non-string value which causes no mode change.
     * @param int $initialPos Byte index in the raw document where $unmatched starts.
     * @param int $matchPos Byte index in the raw document where $matched starts.
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // The exit check must come first: the exit marker also starts with the special prefix.
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // Enter the special mode just for this one token, then drop back out.
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === self::MODE_EXIT);
    }

    /**
     * Tests whether the mode is one that is entered for a single token only
     * and left again immediately afterwards.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // Labels may be non-strings; never hand those to str_starts_with().
        return is_string($mode) && str_starts_with($mode, self::MODE_SPECIAL_PREFIX);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, strlen(self::MODE_SPECIAL_PREFIX));
    }

    /**
     * Dispatches a token to the handler.
     *
     * Resolves the current mode through the map filled by mapHandler() and
     * passes both the resolved and the original mode name to
     * Handler::handleToken().
     *
     * @param string|false $content Text parsed; false is treated as "nothing to do".
     * @param int $state One of the DOKU_LEXER_* constants identifying the
     *                   lexer event (ENTER / MATCHED / UNMATCHED / EXIT /
     *                   SPECIAL).
     * @param int $pos Current byte index location in the raw document
     *                 that is being parsed
     * @return bool
     */
    protected function invokeHandler($content, $state, $pos)
    {
        if ($content === false) {
            return true;
        }
        // Empty content is a no-op for every state EXCEPT EXIT: a zero-width
        // exit pattern (lookahead-only) must still fire the mode's exit
        // handler so cleanup like restoring a buffered call writer happens.
        // Skipping it would pop the mode stack but leave the handler-side
        // state stale.
        if ($content === '' && $state !== DOKU_LEXER_EXIT) {
            return true;
        }
        $originalName = $this->modeStack->getCurrent();
        $modeName = $this->mode_handlers[$originalName] ?? $originalName;

        return $this->handler->handleToken($modeName, $content, $state, $pos, $originalName);
    }

    /**
     * Tries to match the next token starting at `$offset` in `$raw`.
     *
     * The full subject is passed to the regex engine (rather than a
     * truncated tail) so that lookbehind assertions in the registered
     * patterns can see characters before the current offset. Empty
     * subjects (offset past end) will not be matched.
     *
     * @param string $raw     The full subject to parse.
     * @param int    $offset  Byte offset at which to resume matching.
     * @return array|bool     Three item list of unparsed content followed by the
     *                        recognised token and finally the action the parser is to take.
     *                        True if no match, false if there is a parsing error.
     */
    protected function reduce($raw, $offset)
    {
        $current = $this->modeStack->getCurrent();
        if (! isset($this->regexes[$current])) {
            // A mode without any registered pattern is a parsing error.
            return false;
        }
        if ($offset >= strlen($raw)) {
            return true;
        }
        // $split is filled in by split(); it is expected to hold the unparsed
        // text and the match (ParallelRegex is defined outside this file).
        $action = $this->regexes[$current]->split($raw, $split, $offset);
        if ($action) {
            [$unparsed, $match] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Each of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash
     * in a single pass, so backslashes added by the escaping are never
     * escaped again.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // In a preg_replace() replacement "\$" collapses to a bare "$", so the
        // backslash has to be written doubled ("\\") in front of the $0
        // back-reference to really emit backslash + matched character.
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the compound regex of a mode, creating it on first use.
     *
     * @param string $mode Mode name the patterns belong to.
     * @return ParallelRegex
     */
    private function regexForMode($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}
360