xref: /dokuwiki/inc/Parsing/Lexer/Lexer.php (revision bcaec9f47d06126b3e653fea89a86d8b6a6cbef8)
1be906b56SAndreas Gohr<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */
9be906b56SAndreas Gohr
10be906b56SAndreas Gohrnamespace dokuwiki\Parsing\Lexer;
11be906b56SAndreas Gohr
/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to mark exit ("__exit") and
 * single-token ("_<mode>") actions.
 */
class Lexer
{
    /** @var ParallelRegex[] pattern sets, keyed by the mode they apply to */
    protected $regexes = [];
    /** @var \Doku_Handler object whose methods are invoked for every token */
    protected $handler;
    /** @var StateStack stack of the modes currently entered */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy; its methods receive the tokens.
     * @param string $start           Starting mode.
     * @param boolean $case           True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexFor($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore marks the action as a single-token mode, see isSpecialMode()
        $this->regexFor($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenize.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            // reduce() has already shortened $raw to the text following the match,
            // so the consumed byte count minus the match length is the match offset
            $currentLength = strlen($raw);
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            if ($currentLength === $length) {
                // nothing was consumed in this round: give up instead of looping forever
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        if (!$parsed) {
            // reduce() returned false: the current mode has no patterns registered
            return false;
        }
        // whatever is left after the last match is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Action attached to the matched pattern: "__exit" leaves the current mode,
     *                          "_<mode>" enters <mode> for this token only, any other string enters that
     *                          mode. A non-string value causes no mode change.
     * @param int $initialPos Byte index at which the unmatched portion starts
     * @param int $matchPos Byte index of the match in the raw doc that is being parsed
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        if ($this->isModeEnd($mode)) {
            // the exit token is still handled by the mode that is being left
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the mode for this single token, then drop straight back out
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            // the entry token is handled by the mode that is being entered
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is a single-token (special) mode.
     */
    protected function isSpecialMode($mode)
    {
        // actions that are not strings can never carry the underscore marker
        return is_string($mode) && (strncmp($mode, "_", 1) === 0);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string|false $content Text parsed.
     * @param mixed $is_match One of the DOKU_LEXER_* constants (UNMATCHED, MATCHED, ENTER, EXIT,
     *                        SPECIAL) telling the handler what kind of token this is.
     * @param int $pos Current byte index location in raw doc
     *                             that is being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            // mode has been remapped via mapHandler()
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) == 'plugin_') {
            // split at the first underscore only: the handler method name, then the rest
            // (sexplode() is defined outside this file)
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
            // $raw is a reference: the caller's text is replaced by the part after the match
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * The characters \ . + * ? [ ^ ] $ { } = ! < > | : are each prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // Single pass over the input. In the replacement, "\\\\" is one literal backslash
        // and "$0" is the matched character. Writing the replacement for "$" as '\$' would
        // not work: preg_replace treats that backslash as an escape and emits a bare "$".
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the pattern set for a mode, creating it on first use.
     *
     * @param string $mode Mode the patterns belong to.
     * @return ParallelRegex
     */
    private function regexFor($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}
349