xref: /dokuwiki/inc/Parsing/Lexer/Lexer.php (revision 71096e46fcbfaeaa808667aba794e77fe2780169)
1<?php
2
3/**
4 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5 * For an intro to the Lexer see:
6 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7 *
8 * @author Marcus Baker http://www.lastcraft.com
9 */
10
11namespace dokuwiki\Parsing\Lexer;
12
13use dokuwiki\Parsing\Handler;
14
/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores: a leading
 * underscore is how this class marks special patterns (MODE_SPECIAL_PREFIX) and
 * the exit signal (MODE_EXIT).
 */
class Lexer
{
    /** Signal for leaving a mode */
    public const MODE_EXIT = '__exit';
    /** Prefix marking special (enter-and-exit) patterns */
    public const MODE_SPECIAL_PREFIX = '_';

    /** @var ParallelRegex[] one compound regex per mode name */
    protected $regexes = [];
    /** @var Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler name to dispatch to instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param Handler $handler  Handler that receives every token via handleToken().
     * @param string $start     Initial mode handed to the StateStack.
     * @param boolean $case     True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexFor($mode)->addPattern($pattern, self::MODE_EXIT);
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        $this->regexFor($mode)->addPattern($pattern, self::MODE_SPECIAL_PREFIX . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        // byte offset of the first not yet dispatched character
        $pos = 0;
        // reduce() eats the leading unmatched text plus one token from $raw on every successful pass
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // consumed bytes minus the token length gives the offset where the token starts
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this pass: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: no patterns are registered for the current mode
        if (!$parsed) {
            return false;
        }
        // whatever is left over did not match anything and is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no mode change.
     * @param int $initialPos Byte index in the raw document where the unmatched portion starts
     * @param int $matchPos Byte index in the raw document where the matched token starts
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // must be tested before the special check: MODE_EXIT itself starts with the special prefix
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === self::MODE_EXIT);
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * Note that MODE_EXIT also carries the prefix, so callers have to rule it out first.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if the mode is a string starting with the special prefix.
     */
    protected function isSpecialMode($mode)
    {
        // $mode is documented as bool|string by dispatchTokens(); only a string can carry the
        // prefix, so do not rely on implicit scalar-to-string coercion here
        return is_string($mode) && str_starts_with($mode, self::MODE_SPECIAL_PREFIX);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, strlen(self::MODE_SPECIAL_PREFIX));
    }

    /**
     * Dispatches a token to the handler.
     *
     * Resolves mode name aliases (e.g. unformattedalt → unformatted) and
     * delegates all dispatch logic to Handler::handleToken().
     *
     * @param string|false $content Text parsed. Empty strings and false are skipped.
     * @param int $is_match Lexer state of the token; the callers in this class pass one of the
     *                      DOKU_LEXER_* constants
     * @param int $pos Current byte index location in raw doc
     *                             that's being parsed
     * @return bool True if there was nothing to dispatch, otherwise the handler's result
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $originalName = $this->modeStack->getCurrent();
        // a mapping added through mapHandler() wins, otherwise the mode name is used as is
        $modeName = $this->mode_handlers[$originalName] ?? $originalName;

        return $this->handler->handleToken($modeName, $content, $is_match, $pos, $originalName);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if the subject is empty or nothing matched, false if no
     *                            patterns are registered for the current mode.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        $action = $this->regexes[$current]->split($raw, $split);
        if ($action) {
            // $split holds the text before the match, the match itself and the remaining text;
            // the remainder replaces $raw in the caller
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every occurrence of one of \ . + * ? [ ^ ] $ { } = ! < > | : gets a backslash put in front.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // single pass over a character class; '\\\\$0' is a literal backslash followed by the match
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the compound regex of a mode, creating it on first use.
     *
     * @param string $mode    Mode the patterns belong to.
     * @return ParallelRegex
     */
    private function regexFor($mode)
    {
        return $this->regexes[$mode] ??= new ParallelRegex($this->case);
    }
}
349