xref: /dokuwiki/inc/Parsing/Lexer/Lexer.php (revision 6c16a3a9aa602bb7e269fb6d5d18e1353e17f97f)
1be906b56SAndreas Gohr<?php
2d4f83172SAndreas Gohr
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */
10be906b56SAndreas Gohr
11be906b56SAndreas Gohrnamespace dokuwiki\Parsing\Lexer;
12be906b56SAndreas Gohr
/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores.
 */
class Lexer
{
    /** @var ParallelRegex[] compound regexes, keyed by the mode they apply to */
    protected $regexes = [];
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy; its methods are called for every token.
     * @param string $start           Starting mode, placed at the bottom of the mode stack.
     * @param boolean $case           True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the magic label recognised by isModeEnd()
        $this->regexForMode($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore marks a single-token mode, see isSpecialMode()/decodeSpecial()
        $this->regexForMode($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the given text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held handler.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the processed part of $raw (by reference) on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // bytes consumed so far minus the length of the match = byte offset where the match starts
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: no patterns are registered for the current mode
        if (!$parsed) {
            return false;
        }
        // whatever is left over did not match any pattern and is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. Any non-string value causes no mode change.
     * @param int $initialPos Byte index in the raw doc at which the unmatched portion starts
     * @param int $matchPos Current byte index location in raw doc thats being parsed
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        // the leading unmatched text still belongs to the current mode
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        if ($this->isModeEnd($mode)) {
            // the exit token is handled by the mode being left, only then is the mode popped
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // single token mode: enter, handle the one token, leave again
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            // entry pattern: the entry token is handled by the newly entered mode
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        // plain pattern: the mode stays as it is
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * Must only be called after isModeEnd() has been ruled out, because the exit label
     * starts with an underscore as well.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // $mode is not a string for patterns without a target mode; do not rely on
        // implicit bool-to-string coercion inside str_starts_with()
        return is_string($mode) && str_starts_with($mode, '_');
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string|false $content Text parsed.
     * @param mixed $is_match One of the DOKU_LEXER_* state constants (defined outside this file),
     *                        not a plain boolean despite the parameter name.
     * @param int $pos Current byte index location in raw doc
     *                             thats being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        // apply a remapping registered through mapHandler()
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (str_starts_with($handler, 'plugin_')) {
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            // the third element is the remaining text and replaces the caller's $raw
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Returns the compound regex for the given mode, creating it on first use.
     *
     * @param string $mode    Mode the patterns apply to.
     * @return ParallelRegex
     */
    protected function regexForMode($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * The following characters get a backslash prepended: \ . + * ? [ ^ ] $ { } = ! < > | :
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // single pass: each listed character is replaced by a backslash followed by itself
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }
}
350