xref: /dokuwiki/inc/Parsing/Lexer/Lexer.php (revision 71096e46fcbfaeaa808667aba794e77fe2780169)
1be906b56SAndreas Gohr<?php
2d4f83172SAndreas Gohr
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */
10be906b56SAndreas Gohr
11be906b56SAndreas Gohrnamespace dokuwiki\Parsing\Lexer;
12be906b56SAndreas Gohr
13*71096e46SAndreas Gohruse dokuwiki\Parsing\Handler;
14*71096e46SAndreas Gohr
/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is how special (single token) modes are encoded internally.
 */
class Lexer
{
    /** Signal for leaving a mode */
    public const MODE_EXIT = '__exit';
    /** Prefix marking special (enter-and-exit) patterns */
    public const MODE_SPECIAL_PREFIX = '_';

    /** @var ParallelRegex[] one pattern collection per mode, keyed by mode name */
    protected $regexes = [];
    /** @var Handler receives every token through handleToken() */
    protected $handler;
    /** @var StateStack stack of the currently nested modes */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler name to report instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param Handler $handler  Object that receives the tokens via handleToken().
     * @param string $start     Name of the mode the lexer starts in.
     * @param bool $case        True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, self::MODE_EXIT);
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the prefix is what isSpecialMode()/decodeSpecial() look for later on
        $this->regexForMode($mode)->addPattern($pattern, self::MODE_SPECIAL_PREFIX . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * Tokens found while $mode is the current mode are reported to the handler
     * under the name $handler; the original mode name is still passed along.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenise.
     * @return bool              True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the consumed part off the front of $raw on every round
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the match: everything consumed so far minus the match itself
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() signalled an error (false) rather than "no further match" (true)
        if (!$parsed) {
            return false;
        }
        // whatever is left did not match any pattern
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. MODE_EXIT leaves the current mode, a name starting with
     *                          MODE_SPECIAL_PREFIX is entered for this token only, any other string is
     *                          entered as new mode. A non-string value causes no mode change.
     * @param int $initialPos Byte index of the unmatched portion in the raw document
     * @param int $matchPos Byte index of the match in the raw document
     * @return bool             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        if ($this->isModeEnd($mode)) {
            // the exit token is still reported in the mode that is being left
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            // the entry token is reported in the mode that has just been entered
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return bool                True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === self::MODE_EXIT);
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode    Mode to test.
     * @return bool                True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // dispatchTokens() may hand over a non-string action; never feed that to a string function
        return is_string($mode) && str_starts_with($mode, self::MODE_SPECIAL_PREFIX);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, strlen(self::MODE_SPECIAL_PREFIX));
    }

    /**
     * Dispatches a token to the handler.
     *
     * Resolves mode name aliases (e.g. unformattedalt → unformatted) and
     * delegates all dispatch logic to Handler::handleToken().
     *
     * @param string|false $content Text parsed. Empty or false content is silently skipped.
     * @param int $is_match One of the DOKU_LEXER_* state constants describing the token
     *                      (the name is historic, it is not a boolean)
     * @param int $pos Current byte index location in raw doc
     *                             thats being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $originalName = $this->modeStack->getCurrent();
        // a mapping registered through mapHandler() wins over the plain mode name
        $modeName = $this->mode_handlers[$originalName] ?? $originalName;

        return $this->handler->handleToken($modeName, $content, $is_match, $pos, $originalName);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (! isset($this->regexes[$current])) {
            // no patterns were ever registered for the current mode
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            // the third element is the remainder, it replaces the caller's $raw
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every occurrence of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // One pass over a character class instead of one preg_replace per character.
        // The replacement is "\\" (a literal backslash for preg_replace) followed by $0,
        // the matched character. Writing the dollar case as '\$' would not work: in a
        // replacement string a backslash in front of "$" escapes the dollar and is dropped,
        // leaving "$" unescaped.
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the pattern collection of a mode, creating it on first use.
     *
     * @param string $mode
     * @return ParallelRegex
     */
    private function regexForMode($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}
349