xref: /dokuwiki/inc/Parsing/Lexer/Lexer.php (revision be906b566b9bdfd92c032ee07c4fd077d820a8d1)
1*be906b56SAndreas Gohr<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */
9*be906b56SAndreas Gohr
10*be906b56SAndreas Gohrnamespace dokuwiki\Parsing\Lexer;
11*be906b56SAndreas Gohr
// FIXME move elsewhere

// Token states. The Lexer passes one of these as the second argument of every
// handler call so the handler knows why it received the text.
// Text in front of a match, or trailing text after the last match, is UNMATCHED.
// These are defined with define() so they are global constants, not namespaced ones.
define("DOKU_LEXER_ENTER", 1); // the match pushed a new mode onto the mode stack
define("DOKU_LEXER_MATCHED", 2); // the match did not change the mode
define("DOKU_LEXER_UNMATCHED", 3); // text that no pattern of the current mode matched
define("DOKU_LEXER_EXIT", 4); // the match leaves the current mode
define("DOKU_LEXER_SPECIAL", 5); // the match is handled in a mode entered for this one token only
19*be906b56SAndreas Gohr
/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores: a leading "_"
 * marks special (single token) modes internally and "__exit" labels exit patterns.
 */
class Lexer
{
    /** @var ParallelRegex[] pattern collections, keyed by mode name */
    protected $regexes;
    /** @var \Doku_Handler object whose methods are called for each token */
    protected $handler;
    /** @var StateStack stack of the currently nested modes */
    protected $mode;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy by reference.
     * @param string $start            Starting handler.
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->mode = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Returns the pattern collection of a mode, creating it on first use.
     *
     * Private so that it cannot collide with a method of the same name in a subclass.
     *
     * @param string $mode         Mode name.
     * @return ParallelRegex
     */
    private function regexForMode($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the label isModeEnd() looks for
        $this->regexForMode($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore is the marker isSpecialMode() looks for
        $this->regexForMode($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the front of $raw on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the matched token within the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed: give up rather than loop forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (! $parsed) {
            return false;
        }
        // whatever is left matched no pattern and is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * $mode has no default value: it is followed by required parameters, so a default could never be
     * used and only triggers a deprecation notice on PHP 8.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. Any non-string value causes no mode change.
     * @param int $initialPos Byte index at which the unmatched portion starts
     * @param int $matchPos Current byte index location in raw doc thats being parsed
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // must be tested before isSpecialMode(): "__exit" starts with an underscore as well
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->mode->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token and leave it again right away
            $this->mode->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->mode->leave();
        }
        if (is_string($mode)) {
            $this->mode->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // a non-string mode can never carry the underscore marker
        return is_string($mode) && (strncmp($mode, "_", 1) === 0);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match Token state, one of the DOKU_LEXER_* constants.
     * @param int $pos Current byte index location in raw doc
     *                             thats being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->mode->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) === 'plugin_') {
            // "plugin_foo_bar" calls plugin() with "foo_bar" as the extra argument
            list($handler, $plugin) = explode('_', $handler, 2);
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->mode->getCurrent();
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        // split() reports the parts through $split
        $action = $this->regexes[$current]->split($raw, $split);
        if (! $action) {
            return true;
        }
        // the third part is the remaining text and replaces the caller's $raw
        list($unparsed, $match, $raw) = $split;
        return array($unparsed, $match, $action);
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * The escaped characters are \ . + * ? [ ^ ] $ { } = ! < > | and :
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // One pass over the string, so added backslashes are never escaped again.
        // The replacement is the PCRE replacement string \\$0: one literal backslash followed by
        // the matched character. A replacement of just \$ would produce a bare "$", because a
        // backslash in front of "$" is itself an escape in replacement strings.
        return preg_replace('/[\\\\.+*?\[^\]${}=!<>|:]/', '\\\\$0', $str);
    }
}
348