1<?php
2
3/*
4 * This file is part of Twig.
5 *
6 * (c) Fabien Potencier
7 * (c) Armin Ronacher
8 *
9 * For the full copyright and license information, please view the LICENSE
10 * file that was distributed with this source code.
11 */
12
13namespace Twig;
14
15use Twig\Error\SyntaxError;
16
17/**
18 * Lexes a template string.
19 *
20 * @author Fabien Potencier <fabien@symfony.com>
21 */
class Lexer
{
    /** @var Token[] Tokens produced so far for the source being lexed. */
    private $tokens;

    /** @var string Template code with "\r\n" and "\r" normalized to "\n". */
    private $code;

    /** @var int Byte offset in $code of the next character to lex. */
    private $cursor;

    /** @var int Line number attached to pushed tokens (starts at 1, may be overridden by a "line" tag). */
    private $lineno;

    /** @var int Byte length of $code. */
    private $end;

    /** @var int Current lexing state (one of the STATE_* constants). */
    private $state;

    /** @var int[] Stack of states to restore when the current one is popped. */
    private $states;

    /** @var array[] Stack of [opening delimiter, line number] pairs that still need to be closed. */
    private $brackets;

    /** @var Environment Environment providing the unary and binary operators. */
    private $env;

    /** @var Source Source currently being tokenized. */
    private $source;

    /** @var array Lexer options (tag delimiters, whitespace control markers, interpolation delimiters). */
    private $options;

    /** @var string[] Regular expressions built once from the options, indexed by name. */
    private $regexes;

    /** @var int Index in $positions of the last tag start handed to lexData() (-1 before the first one). */
    private $position;

    /** @var array Result of preg_match_all() with PREG_OFFSET_CAPTURE for all tag starts in $code. */
    private $positions;

    /** @var int Line on which the block or variable tag being lexed was opened (used in "Unclosed" errors). */
    private $currentVarBlockLine;

    public const STATE_DATA = 0;
    public const STATE_BLOCK = 1;
    public const STATE_VAR = 2;
    public const STATE_STRING = 3;
    public const STATE_INTERPOLATION = 4;

    public const REGEX_NAME = '/[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*/A';
    public const REGEX_NUMBER = '/[0-9]+(?:\.[0-9]+)?([Ee][\+\-][0-9]+)?/A';
    public const REGEX_STRING = '/"([^#"\\\\]*(?:\\\\.[^#"\\\\]*)*)"|\'([^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'/As';
    public const REGEX_DQ_STRING_DELIM = '/"/A';
    public const REGEX_DQ_STRING_PART = '/[^#"\\\\]*(?:(?:\\\\.|#(?!\{))[^#"\\\\]*)*/As';
    public const PUNCTUATION = '()[]{}?:.,|';

    /**
     * Merges the given options over the defaults and pre-builds every regular
     * expression that depends on them.
     *
     * @param Environment $env     Environment queried for the operator names
     * @param array       $options Overrides for the default delimiters and markers;
     *                             "whitespace_line_chars" is inserted verbatim inside
     *                             a regex character class
     */
    public function __construct(Environment $env, array $options = [])
    {
        $this->env = $env;

        $this->options = array_merge([
            'tag_comment' => ['{#', '#}'],
            'tag_block' => ['{%', '%}'],
            'tag_variable' => ['{{', '}}'],
            'whitespace_trim' => '-',
            'whitespace_line_trim' => '~',
            'whitespace_line_chars' => ' \t\0\x0B',
            'interpolation' => ['#{', '}'],
        ], $options);

        // when PHP 7.3 is the min version, we will be able to remove the '#' part in preg_quote as it's part of the default
        $this->regexes = [
            // }}
            'lex_var' => '{
                \s*
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_variable'][1], '#').'\s*'. // -}}\s*
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_variable'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~}}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_variable'][1], '#'). // }}
                ')
            }Ax',

            // %}
            'lex_block' => '{
                \s*
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*\n?'. // -%}\s*\n?
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_block'][1], '#').'\n?'. // %}\n?
                ')
            }Ax',

            // {% endverbatim %}
            'lex_raw_data' => '{'.
                preg_quote($this->options['tag_block'][0], '#'). // {%
                '('.
                    // the markers are user-configurable, so they are quoted like everywhere else
                    preg_quote($this->options['whitespace_trim'], '#'). // -
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'], '#'). // ~
                ')?\s*endverbatim\s*'.
                '(?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*'. // -%}
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_block'][1], '#'). // %}
                ')
            }sx',

            'operator' => $this->getOperatorRegex(),

            // #}
            'lex_comment' => '{
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_comment'][1], '#').'\s*\n?'. // -#}\s*\n?
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_comment'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~#}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_comment'][1], '#').'\n?'. // #}\n?
                ')
            }sx',

            // verbatim %}
            'lex_block_raw' => '{
                \s*verbatim\s*
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*'. // -%}\s*
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_block'][1], '#'). // %}
                ')
            }Asx',

            'lex_block_line' => '{\s*line\s+(\d+)\s*'.preg_quote($this->options['tag_block'][1], '#').'}As',

            // {{ or {% or {#
            'lex_tokens_start' => '{
                ('.
                    preg_quote($this->options['tag_variable'][0], '#'). // {{
                    '|'.
                    preg_quote($this->options['tag_block'][0], '#'). // {%
                    '|'.
                    preg_quote($this->options['tag_comment'][0], '#'). // {#
                ')('.
                    preg_quote($this->options['whitespace_trim'], '#'). // -
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'], '#'). // ~
                ')?
            }sx',
            'interpolation_start' => '{'.preg_quote($this->options['interpolation'][0], '#').'\s*}A',
            'interpolation_end' => '{\s*'.preg_quote($this->options['interpolation'][1], '#').'}A',
        ];
    }

    /**
     * Tokenizes a source and returns the resulting token stream.
     *
     * All per-run state is reset first, so the same instance can be reused
     * for several sources.
     *
     * @param Source $source The source to tokenize
     *
     * @return TokenStream The tokens, terminated by an EOF token
     *
     * @throws SyntaxError When a bracket or string is still open at the end of
     *                     the template, or when one of the lexing steps fails
     */
    public function tokenize(Source $source)
    {
        $this->source = $source;
        $this->code = str_replace(["\r\n", "\r"], "\n", $source->getCode());
        $this->cursor = 0;
        $this->lineno = 1;
        $this->end = \strlen($this->code);
        $this->tokens = [];
        $this->state = self::STATE_DATA;
        $this->states = [];
        $this->brackets = [];
        $this->position = -1;

        // find all token starts in one go
        preg_match_all($this->regexes['lex_tokens_start'], $this->code, $matches, \PREG_OFFSET_CAPTURE);
        $this->positions = $matches;

        while ($this->cursor < $this->end) {
            // dispatch to the lexing functions depending
            // on the current state
            switch ($this->state) {
                case self::STATE_DATA:
                    $this->lexData();
                    break;

                case self::STATE_BLOCK:
                    $this->lexBlock();
                    break;

                case self::STATE_VAR:
                    $this->lexVar();
                    break;

                case self::STATE_STRING:
                    $this->lexString();
                    break;

                case self::STATE_INTERPOLATION:
                    $this->lexInterpolation();
                    break;
            }
        }

        $this->pushToken(/* Token::EOF_TYPE */ -1);

        // anything left on the stack was opened but never closed
        if (!empty($this->brackets)) {
            [$expect, $lineno] = array_pop($this->brackets);
            throw new SyntaxError(sprintf('Unclosed "%s".', $expect), $lineno, $this->source);
        }

        return new TokenStream($this->tokens, $this->source);
    }

    /**
     * Lexes plain template text up to the next tag start and then handles
     * that tag start (comment, block or variable).
     */
    private function lexData(): void
    {
        // if no matches are left we return the rest of the template as simple text token
        if ($this->position === \count($this->positions[0]) - 1) {
            $this->pushToken(/* Token::TEXT_TYPE */ 0, substr($this->code, $this->cursor));
            $this->cursor = $this->end;

            return;
        }

        // Find the first token after the current cursor
        // (tag starts located before the cursor were already consumed, e.g. inside a comment or verbatim section)
        $position = $this->positions[0][++$this->position];
        while ($position[1] < $this->cursor) {
            if ($this->position === \count($this->positions[0]) - 1) {
                // nothing usable is left; the next call emits the remaining text
                return;
            }
            $position = $this->positions[0][++$this->position];
        }

        // push the template text first
        $text = $textContent = substr($this->code, $this->cursor, $position[1] - $this->cursor);

        // trim?
        if (isset($this->positions[2][$this->position][0])) {
            if ($this->options['whitespace_trim'] === $this->positions[2][$this->position][0]) {
                // whitespace_trim detected ({%-, {{- or {#-)
                $text = rtrim($text);
            } elseif ($this->options['whitespace_line_trim'] === $this->positions[2][$this->position][0]) {
                // whitespace_line_trim detected ({%~, {{~ or {#~)
                // don't trim \r and \n
                $text = rtrim($text, " \t\0\x0B");
            }
        }
        $this->pushToken(/* Token::TEXT_TYPE */ 0, $text);
        // the cursor advances over the untrimmed text so that offsets and line counts stay exact
        $this->moveCursor($textContent.$position[0]);

        switch ($this->positions[1][$this->position][0]) {
            case $this->options['tag_comment'][0]:
                $this->lexComment();
                break;

            case $this->options['tag_block'][0]:
                // raw data?
                if (preg_match($this->regexes['lex_block_raw'], $this->code, $match, 0, $this->cursor)) {
                    $this->moveCursor($match[0]);
                    $this->lexRawData();
                // {% line \d+ %}
                } elseif (preg_match($this->regexes['lex_block_line'], $this->code, $match, 0, $this->cursor)) {
                    $this->moveCursor($match[0]);
                    $this->lineno = (int) $match[1];
                } else {
                    $this->pushToken(/* Token::BLOCK_START_TYPE */ 1);
                    $this->pushState(self::STATE_BLOCK);
                    $this->currentVarBlockLine = $this->lineno;
                }
                break;

            case $this->options['tag_variable'][0]:
                $this->pushToken(/* Token::VAR_START_TYPE */ 2);
                $this->pushState(self::STATE_VAR);
                $this->currentVarBlockLine = $this->lineno;
                break;
        }
    }

    /**
     * Lexes the inside of a block tag: either its end delimiter (only when no
     * bracket is open) or one expression token.
     */
    private function lexBlock(): void
    {
        if (empty($this->brackets) && preg_match($this->regexes['lex_block'], $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(/* Token::BLOCK_END_TYPE */ 3);
            $this->moveCursor($match[0]);
            $this->popState();
        } else {
            $this->lexExpression();
        }
    }

    /**
     * Lexes the inside of a variable tag: either its end delimiter (only when
     * no bracket is open) or one expression token.
     */
    private function lexVar(): void
    {
        if (empty($this->brackets) && preg_match($this->regexes['lex_var'], $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(/* Token::VAR_END_TYPE */ 4);
            $this->moveCursor($match[0]);
            $this->popState();
        } else {
            $this->lexExpression();
        }
    }

    /**
     * Skips leading whitespace, then lexes exactly one expression token
     * (arrow, operator, name, number, punctuation or string) at the cursor.
     *
     * @throws SyntaxError On an unterminated tag, an unbalanced bracket or a
     *                     character that starts no known token
     */
    private function lexExpression(): void
    {
        // whitespace
        if (preg_match('/\s+/A', $this->code, $match, 0, $this->cursor)) {
            $this->moveCursor($match[0]);

            if ($this->cursor >= $this->end) {
                throw new SyntaxError(sprintf('Unclosed "%s".', self::STATE_BLOCK === $this->state ? 'block' : 'variable'), $this->currentVarBlockLine, $this->source);
            }
        }

        // arrow function
        // (the bounds check keeps the one-character lookahead from reading past the end of the code)
        if ('=' === $this->code[$this->cursor] && $this->cursor + 1 < $this->end && '>' === $this->code[$this->cursor + 1]) {
            $this->pushToken(Token::ARROW_TYPE, '=>');
            $this->moveCursor('=>');
        }
        // operators
        elseif (preg_match($this->regexes['operator'], $this->code, $match, 0, $this->cursor)) {
            // runs of whitespace inside multi-word operators are collapsed to a single space
            $this->pushToken(/* Token::OPERATOR_TYPE */ 8, preg_replace('/\s+/', ' ', $match[0]));
            $this->moveCursor($match[0]);
        }
        // names
        elseif (preg_match(self::REGEX_NAME, $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(/* Token::NAME_TYPE */ 5, $match[0]);
            $this->moveCursor($match[0]);
        }
        // numbers
        elseif (preg_match(self::REGEX_NUMBER, $this->code, $match, 0, $this->cursor)) {
            $number = (float) $match[0];  // floats
            if (ctype_digit($match[0]) && $number <= \PHP_INT_MAX) {
                $number = (int) $match[0]; // integers lower than the maximum
            }
            $this->pushToken(/* Token::NUMBER_TYPE */ 6, $number);
            $this->moveCursor($match[0]);
        }
        // punctuation
        elseif (false !== strpos(self::PUNCTUATION, $this->code[$this->cursor])) {
            // opening bracket
            if (false !== strpos('([{', $this->code[$this->cursor])) {
                $this->brackets[] = [$this->code[$this->cursor], $this->lineno];
            }
            // closing bracket
            elseif (false !== strpos(')]}', $this->code[$this->cursor])) {
                if (empty($this->brackets)) {
                    throw new SyntaxError(sprintf('Unexpected "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
                }

                // the closing character must pair with the most recently opened one
                [$expect, $lineno] = array_pop($this->brackets);
                if ($this->code[$this->cursor] !== strtr($expect, '([{', ')]}')) {
                    throw new SyntaxError(sprintf('Unclosed "%s".', $expect), $lineno, $this->source);
                }
            }

            $this->pushToken(/* Token::PUNCTUATION_TYPE */ 9, $this->code[$this->cursor]);
            ++$this->cursor;
        }
        // strings
        elseif (preg_match(self::REGEX_STRING, $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(/* Token::STRING_TYPE */ 7, stripcslashes(substr($match[0], 1, -1)));
            $this->moveCursor($match[0]);
        }
        // opening double quoted string
        elseif (preg_match(self::REGEX_DQ_STRING_DELIM, $this->code, $match, 0, $this->cursor)) {
            $this->brackets[] = ['"', $this->lineno];
            $this->pushState(self::STATE_STRING);
            $this->moveCursor($match[0]);
        }
        // unlexable
        else {
            throw new SyntaxError(sprintf('Unexpected character "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
        }
    }

    /**
     * Consumes everything up to and including the "endverbatim" tag and
     * pushes the text found before it as a single text token.
     *
     * @throws SyntaxError When no "endverbatim" tag follows the cursor
     */
    private function lexRawData(): void
    {
        if (!preg_match($this->regexes['lex_raw_data'], $this->code, $match, \PREG_OFFSET_CAPTURE, $this->cursor)) {
            throw new SyntaxError('Unexpected end of file: Unclosed "verbatim" block.', $this->lineno, $this->source);
        }

        $text = substr($this->code, $this->cursor, $match[0][1] - $this->cursor);
        $this->moveCursor($text.$match[0][0]);

        // trim?
        if (isset($match[1][0])) {
            if ($this->options['whitespace_trim'] === $match[1][0]) {
                // whitespace_trim detected ({%-, {{- or {#-)
                $text = rtrim($text);
            } else {
                // whitespace_line_trim detected ({%~, {{~ or {#~)
                // don't trim \r and \n
                $text = rtrim($text, " \t\0\x0B");
            }
        }

        $this->pushToken(/* Token::TEXT_TYPE */ 0, $text);
    }

    /**
     * Skips a comment: moves the cursor just past the first comment end
     * delimiter found after it, without pushing any token.
     *
     * @throws SyntaxError When the comment is never closed
     */
    private function lexComment(): void
    {
        if (!preg_match($this->regexes['lex_comment'], $this->code, $match, \PREG_OFFSET_CAPTURE, $this->cursor)) {
            throw new SyntaxError('Unclosed comment.', $this->lineno, $this->source);
        }

        $this->moveCursor(substr($this->code, $this->cursor, $match[0][1] - $this->cursor).$match[0][0]);
    }

    /**
     * Lexes the inside of a double-quoted string: an interpolation start, a
     * literal string part, or the closing quote.
     *
     * @throws SyntaxError When the character at the cursor fits none of these
     */
    private function lexString(): void
    {
        if (preg_match($this->regexes['interpolation_start'], $this->code, $match, 0, $this->cursor)) {
            $this->brackets[] = [$this->options['interpolation'][0], $this->lineno];
            $this->pushToken(/* Token::INTERPOLATION_START_TYPE */ 10);
            $this->moveCursor($match[0]);
            $this->pushState(self::STATE_INTERPOLATION);
        } elseif (preg_match(self::REGEX_DQ_STRING_PART, $this->code, $match, 0, $this->cursor) && \strlen($match[0]) > 0) {
            $this->pushToken(/* Token::STRING_TYPE */ 7, stripcslashes($match[0]));
            $this->moveCursor($match[0]);
        } elseif (preg_match(self::REGEX_DQ_STRING_DELIM, $this->code, $match, 0, $this->cursor)) {
            // the anchored delimiter regex matched, so the character at the cursor is the
            // closing quote: release the bracket entry on top of the stack and leave the string
            array_pop($this->brackets);
            $this->popState();
            ++$this->cursor;
        } else {
            // unlexable
            throw new SyntaxError(sprintf('Unexpected character "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
        }
    }

    /**
     * Lexes the inside of a string interpolation: its end delimiter when the
     * innermost open bracket is the interpolation start, otherwise one
     * expression token.
     */
    private function lexInterpolation(): void
    {
        $bracket = end($this->brackets);
        if ($this->options['interpolation'][0] === $bracket[0] && preg_match($this->regexes['interpolation_end'], $this->code, $match, 0, $this->cursor)) {
            array_pop($this->brackets);
            $this->pushToken(/* Token::INTERPOLATION_END_TYPE */ 11);
            $this->moveCursor($match[0]);
            $this->popState();
        } else {
            $this->lexExpression();
        }
    }

    /**
     * Appends a token carrying the current line number.
     *
     * @param int   $type  Token type
     * @param mixed $value Token value; empty text tokens are dropped
     */
    private function pushToken($type, $value = ''): void
    {
        // do not push empty text tokens
        if (/* Token::TEXT_TYPE */ 0 === $type && '' === $value) {
            return;
        }

        $this->tokens[] = new Token($type, $value, $this->lineno);
    }

    /**
     * Advances the cursor by the byte length of the consumed text and the
     * line counter by the number of newlines it contains.
     *
     * @param string $text Text that was just consumed from the code
     */
    private function moveCursor($text): void
    {
        $this->cursor += \strlen($text);
        $this->lineno += substr_count($text, "\n");
    }

    /**
     * Builds the anchored regex matching "=" and every unary or binary
     * operator of the environment, longest operators first.
     *
     * @return string The operator regular expression
     */
    private function getOperatorRegex(): string
    {
        $operators = array_merge(
            ['='],
            array_keys($this->env->getUnaryOperators()),
            array_keys($this->env->getBinaryOperators())
        );

        // map each operator to its length and sort by descending length so that
        // longer operators come first in the alternation
        $operators = array_combine($operators, array_map('strlen', $operators));
        arsort($operators);

        $regex = [];
        foreach ($operators as $operator => $length) {
            // an operator that ends with a character must be followed by
            // a whitespace, a parenthesis, an opening map [ or sequence {
            $r = preg_quote($operator, '/');
            if (ctype_alpha($operator[$length - 1])) {
                $r .= '(?=[\s()\[{])';
            }

            // an operator that begins with a character must not have a dot or pipe before
            if (ctype_alpha($operator[0])) {
                $r = '(?<![\.\|])'.$r;
            }

            // an operator with a space can be any amount of whitespaces
            $r = preg_replace('/\s+/', '\s+', $r);

            $regex[] = $r;
        }

        return '/'.implode('|', $regex).'/A';
    }

    /**
     * Saves the current state on the stack and switches to the given one.
     *
     * @param int $state One of the STATE_* constants
     */
    private function pushState($state): void
    {
        $this->states[] = $this->state;
        $this->state = $state;
    }

    /**
     * Restores the state that was active before the last pushState() call.
     *
     * @throws \LogicException When the state stack is empty
     */
    private function popState(): void
    {
        if (0 === \count($this->states)) {
            throw new \LogicException('Cannot pop state without a previous state.');
        }

        $this->state = array_pop($this->states);
    }
}
504
// Expose the lexer under its legacy, non-namespaced class name as well.
\class_alias('Twig\Lexer', 'Twig_Lexer');
506