<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

// FIXME: the DOKU_LEXER_* state constants should be moved elsewhere

// Lexer states handed to the handler callbacks as their second argument.
// define() is used on purpose: it registers the constants in the global
// namespace, whereas a `const` statement here would namespace them.

// The matched text entered a new (nested) mode.
define('DOKU_LEXER_ENTER', 1);
// The text matched a pattern that does not change the mode.
define('DOKU_LEXER_MATCHED', 2);
// The text was not matched by any pattern of the current mode.
define('DOKU_LEXER_UNMATCHED', 3);
// The matched text leaves the current mode.
define('DOKU_LEXER_EXIT', 4);
// The matched text is handled by a mode entered for this single token only.
define('DOKU_LEXER_SPECIAL', 5);

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation is done to make sure the content is only scanned by the PHP regex
 * parser once.
 *
 * Lexer modes must not start with leading underscores: internally a leading
 * underscore marks a single-token ("special") mode and "__exit" marks leaving
 * the current mode.
 */
class Lexer
{
    /** @var ParallelRegex[] patterns to try, keyed by the mode they apply to */
    protected $regexes;
    /** @var \Doku_Handler object whose methods are called for every token */
    protected $handler;
    /** @var StateStack stack of nested modes, the current mode is on top */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy; its methods receive the tokens.
     * @param string $start           Starting mode.
     * @param boolean $case           True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the reserved label recognised by isModeEnd()
        $this->regexForMode($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore is the marker recognised by isSpecialMode()
        $this->regexForMode($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the leading unmatched text plus the match from $raw on every round
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            // byte offset at which the matched token starts in the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: there are no patterns for the current mode
        if (!$parsed) {
            return false;
        }
        // whatever is left over is trailing unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string value causes no mode change.
     * @param int $initialPos Byte index at which the unmatched portion starts
     * @param int $matchPos Byte index at which the matched token starts
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // must be tested before isSpecialMode(): "__exit" starts with an underscore, too
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the mode for this one token only and leave it right away
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if the mode carries the leading underscore of a special mode.
     */
    protected function isSpecialMode($mode)
    {
        // non-string modes (no mode change) can never be special
        return is_string($mode) && (strncmp($mode, "_", 1) === 0);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match   Lexer state of the token, one of the DOKU_LEXER_* constants
     * @param int $pos        Current byte index location in raw doc
     *                        that's being parsed
     * @return bool           Result of the handler method, true for ignored empty content
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) === 'plugin_') {
            list($handler, $plugin) = explode('_', $handler, 2);
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there are no patterns for the current mode.
     */
    protected function reduce(&$raw)
    {
        $mode = $this->modeStack->getCurrent();
        if (! isset($this->regexes[$mode])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$mode]->split($raw, $split)) {
            // the third element is the remaining text, it replaces the caller's $raw
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Each of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // one pass over the string: "\\\\" is a literal backslash, "$0" the matched character
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the pattern collection of a mode, creating it on first use.
     *
     * @param string $mode     Mode the patterns apply to.
     * @return ParallelRegex
     */
    private function regexForMode($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}