<?php

/*
 * This file is part of Twig.
 *
 * (c) Fabien Potencier
 * (c) Armin Ronacher
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Twig;

use Twig\Error\SyntaxError;

/**
 * Lexes a template string.
 *
 * The lexer is a small state machine: it starts in the "data" state (plain
 * template text) and switches to the block, variable, string or interpolation
 * state whenever the corresponding opening delimiter is found. Each state has
 * a dedicated lex*() method that consumes input at the current cursor and
 * appends tokens to the token list.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 */
class Lexer
{
    /** Tokens produced so far by the current tokenize() call. */
    private $tokens;
    /** Template code being lexed, with "\r\n" and "\r" normalized to "\n". */
    private $code;
    /** Byte offset of the next character of $code to lex. */
    private $cursor;
    /** Line number attached to the tokens being pushed. */
    private $lineno;
    /** Byte length of $code. */
    private $end;
    /** Current lexer state (one of the STATE_* constants). */
    private $state;
    /** Stack of the states that were active before the current one. */
    private $states;
    /** Stack of [opening delimiter, line number] pairs that still have to be closed. */
    private $brackets;
    /** Environment providing the unary and binary operators. */
    private $env;
    /** Source passed to tokenize(); attached to syntax errors and to the token stream. */
    private $source;
    /** Lexer options (delimiters and whitespace control characters). */
    private $options;
    /** Regular expressions compiled from the options, indexed by name. */
    private $regexes;
    /** Index in $positions of the last tag start handled by lexData() (-1 before the first one). */
    private $position;
    /** preg_match_all() result (with offsets) listing every tag start found in $code. */
    private $positions;
    /** Line on which the block or variable tag currently being lexed was opened. */
    private $currentVarBlockLine;

    public const STATE_DATA = 0;
    public const STATE_BLOCK = 1;
    public const STATE_VAR = 2;
    public const STATE_STRING = 3;
    public const STATE_INTERPOLATION = 4;

    public const REGEX_NAME = '/[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*/A';
    public const REGEX_NUMBER = '/[0-9]+(?:\.[0-9]+)?([Ee][\+\-][0-9]+)?/A';
    public const REGEX_STRING = '/"([^#"\\\\]*(?:\\\\.[^#"\\\\]*)*)"|\'([^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'/As';
    public const REGEX_DQ_STRING_DELIM = '/"/A';
    public const REGEX_DQ_STRING_PART = '/[^#"\\\\]*(?:(?:\\\\.|#(?!\{))[^#"\\\\]*)*/As';
    public const PUNCTUATION = '()[]{}?:.,|';

    /**
     * Builds the lexer and pre-compiles the regular expressions derived from the options.
     *
     * @param Environment $env     Environment whose unary/binary operators must be recognized
     * @param array       $options Overrides for the default delimiters and whitespace control
     *                             characters (keys: tag_comment, tag_block, tag_variable,
     *                             whitespace_trim, whitespace_line_trim, whitespace_line_chars,
     *                             interpolation)
     */
    public function __construct(Environment $env, array $options = [])
    {
        $this->env = $env;

        $this->options = array_merge([
            'tag_comment' => ['{#', '#}'],
            'tag_block' => ['{%', '%}'],
            'tag_variable' => ['{{', '}}'],
            'whitespace_trim' => '-',
            'whitespace_line_trim' => '~',
            // kept single-quoted on purpose: the escapes are interpreted by PCRE inside a character class
            'whitespace_line_chars' => ' \t\0\x0B',
            'interpolation' => ['#{', '}'],
        ], $options);

        // when PHP 7.3 is the min version, we will be able to remove the '#' part in preg_quote as it's part of the default
        $this->regexes = [
            // }}
            'lex_var' => '{
                \s*
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_variable'][1], '#').'\s*'. // -}}\s*
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_variable'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~}}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_variable'][1], '#'). // }}
                ')
            }Ax',

            // %}
            'lex_block' => '{
                \s*
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*\n?'. // -%}\s*\n?
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_block'][1], '#').'\n?'. // %}\n?
                ')
            }Ax',

            // {% endverbatim %}
            'lex_raw_data' => '{'.
                preg_quote($this->options['tag_block'][0], '#'). // {%
                '('.
                    // the whitespace control characters are user-configurable: quote them like
                    // everywhere else so that a regex metacharacter (or "#", which starts a
                    // comment in extended mode) cannot corrupt the pattern
                    preg_quote($this->options['whitespace_trim'], '#'). // -
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'], '#'). // ~
                ')?\s*endverbatim\s*'.
                '(?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*'. // -%}
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_block'][1], '#'). // %}
                ')
            }sx',

            'operator' => $this->getOperatorRegex(),

            // #}
            'lex_comment' => '{
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_comment'][1], '#').'\s*\n?'. // -#}\s*\n?
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_comment'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~#}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_comment'][1], '#').'\n?'. // #}\n?
                ')
            }sx',

            // verbatim %}
            'lex_block_raw' => '{
                \s*verbatim\s*
                (?:'.
                    preg_quote($this->options['whitespace_trim'].$this->options['tag_block'][1], '#').'\s*'. // -%}\s*
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'].$this->options['tag_block'][1], '#').'['.$this->options['whitespace_line_chars'].']*'. // ~%}[ \t\0\x0B]*
                    '|'.
                    preg_quote($this->options['tag_block'][1], '#'). // %}
                ')
            }Asx',

            'lex_block_line' => '{\s*line\s+(\d+)\s*'.preg_quote($this->options['tag_block'][1], '#').'}As',

            // {{ or {% or {#
            'lex_tokens_start' => '{
                ('.
                    preg_quote($this->options['tag_variable'][0], '#'). // {{
                    '|'.
                    preg_quote($this->options['tag_block'][0], '#'). // {%
                    '|'.
                    preg_quote($this->options['tag_comment'][0], '#'). // {#
                ')('.
                    preg_quote($this->options['whitespace_trim'], '#'). // -
                    '|'.
                    preg_quote($this->options['whitespace_line_trim'], '#'). // ~
                ')?
            }sx',
            'interpolation_start' => '{'.preg_quote($this->options['interpolation'][0], '#').'\s*}A',
            'interpolation_end' => '{\s*'.preg_quote($this->options['interpolation'][1], '#').'}A',
        ];
    }

    /**
     * Converts the code of a template source into a stream of tokens.
     *
     * All lexer state is reset at the beginning of the call, so the same
     * instance can be reused for several sources.
     *
     * @param Source $source The template source to lex
     *
     * @return TokenStream The tokens, terminated by an EOF token
     *
     * @throws SyntaxError When the code cannot be lexed or a bracket/string is left unclosed
     */
    public function tokenize(Source $source)
    {
        $this->source = $source;
        $this->code = str_replace(["\r\n", "\r"], "\n", $source->getCode());
        $this->cursor = 0;
        $this->lineno = 1;
        $this->end = \strlen($this->code);
        $this->tokens = [];
        $this->state = self::STATE_DATA;
        $this->states = [];
        $this->brackets = [];
        $this->position = -1;

        // find all token starts in one go
        preg_match_all($this->regexes['lex_tokens_start'], $this->code, $matches, \PREG_OFFSET_CAPTURE);
        $this->positions = $matches;

        while ($this->cursor < $this->end) {
            // dispatch to the lexing functions depending
            // on the current state
            switch ($this->state) {
                case self::STATE_DATA:
                    $this->lexData();
                    break;

                case self::STATE_BLOCK:
                    $this->lexBlock();
                    break;

                case self::STATE_VAR:
                    $this->lexVar();
                    break;

                case self::STATE_STRING:
                    $this->lexString();
                    break;

                case self::STATE_INTERPOLATION:
                    $this->lexInterpolation();
                    break;
            }
        }

        $this->pushToken(/* Token::EOF_TYPE */ -1);

        // anything still on the stack was opened but never closed
        if (!empty($this->brackets)) {
            [$expect, $lineno] = array_pop($this->brackets);
            throw new SyntaxError(sprintf('Unclosed "%s".', $expect), $lineno, $this->source);
        }

        return new TokenStream($this->tokens, $this->source);
    }

    /**
     * Lexes plain template text up to the next tag start, then enters that tag.
     *
     * @throws SyntaxError When a comment or a verbatim block opened here is never closed
     */
    private function lexData(): void
    {
        // if no matches are left we return the rest of the template as simple text token
        if ($this->position === \count($this->positions[0]) - 1) {
            $this->pushToken(/* Token::TEXT_TYPE */ 0, substr($this->code, $this->cursor));
            $this->cursor = $this->end;

            return;
        }

        // Find the first token after the current cursor
        $position = $this->positions[0][++$this->position];
        while ($position[1] < $this->cursor) {
            if ($this->position === \count($this->positions[0]) - 1) {
                // every remaining tag start lies behind the cursor; the next call
                // takes the early-return branch above and emits the remaining text
                return;
            }
            $position = $this->positions[0][++$this->position];
        }

        // push the template text first
        $text = $textContent = substr($this->code, $this->cursor, $position[1] - $this->cursor);

        // trim?
        if (isset($this->positions[2][$this->position][0])) {
            if ($this->options['whitespace_trim'] === $this->positions[2][$this->position][0]) {
                // whitespace_trim detected ({%-, {{- or {#-)
                $text = rtrim($text);
            } elseif ($this->options['whitespace_line_trim'] === $this->positions[2][$this->position][0]) {
                // whitespace_line_trim detected ({%~, {{~ or {#~)
                // don't trim \r and \n
                $text = rtrim($text, " \t\0\x0B");
            }
        }
        $this->pushToken(/* Token::TEXT_TYPE */ 0, $text);
        // advance over the untrimmed text so that the cursor and line number stay accurate
        $this->moveCursor($textContent.$position[0]);

        switch ($this->positions[1][$this->position][0]) {
            case $this->options['tag_comment'][0]:
                $this->lexComment();
                break;

            case $this->options['tag_block'][0]:
                // raw data?
                if (preg_match($this->regexes['lex_block_raw'], $this->code, $match, 0, $this->cursor)) {
                    $this->moveCursor($match[0]);
                    $this->lexRawData();
                // {% line \d+ %}
                } elseif (preg_match($this->regexes['lex_block_line'], $this->code, $match, 0, $this->cursor)) {
                    $this->moveCursor($match[0]);
                    $this->lineno = (int) $match[1];
                } else {
                    $this->pushToken(/* Token::BLOCK_START_TYPE */ 1);
                    $this->pushState(self::STATE_BLOCK);
                    $this->currentVarBlockLine = $this->lineno;
                }
                break;

            case $this->options['tag_variable'][0]:
                $this->pushToken(/* Token::VAR_START_TYPE */ 2);
                $this->pushState(self::STATE_VAR);
                $this->currentVarBlockLine = $this->lineno;
                break;
        }
    }

    /**
     * Lexes the inside of a block tag: either its closing delimiter or one expression token.
     *
     * The closing delimiter is only recognized when no bracket is pending.
     *
     * @throws SyntaxError When the expression cannot be lexed
     */
    private function lexBlock(): void
    {
        if (empty($this->brackets) && preg_match($this->regexes['lex_block'], $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(/* Token::BLOCK_END_TYPE */ 3);
            $this->moveCursor($match[0]);
            $this->popState();
        } else {
            $this->lexExpression();
        }
    }

    /**
     * Lexes the inside of a variable tag: either its closing delimiter or one expression token.
     *
     * The closing delimiter is only recognized when no bracket is pending.
     *
     * @throws SyntaxError When the expression cannot be lexed
     */
    private function lexVar(): void
    {
        if (empty($this->brackets) && preg_match($this->regexes['lex_var'], $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(/* Token::VAR_END_TYPE */ 4);
            $this->moveCursor($match[0]);
            $this->popState();
        } else {
            $this->lexExpression();
        }
    }

    /**
     * Lexes a single expression token (after skipping leading whitespace) at the cursor.
     *
     * @throws SyntaxError On end of input after whitespace, on an unexpected or mismatched
     *                     closing bracket, or on a character that cannot start any token
     */
    private function lexExpression(): void
    {
        // whitespace
        if (preg_match('/\s+/A', $this->code, $match, 0, $this->cursor)) {
            $this->moveCursor($match[0]);

            if ($this->cursor >= $this->end) {
                throw new SyntaxError(sprintf('Unclosed "%s".', self::STATE_BLOCK === $this->state ? 'block' : 'variable'), $this->currentVarBlockLine, $this->source);
            }
        }

        // arrow function
        // (the bounds check prevents reading past the end of the code when "=" is its last character)
        if ('=' === $this->code[$this->cursor] && $this->cursor + 1 < $this->end && '>' === $this->code[$this->cursor + 1]) {
            $this->pushToken(Token::ARROW_TYPE, '=>');
            $this->moveCursor('=>');
        }
        // operators
        elseif (preg_match($this->regexes['operator'], $this->code, $match, 0, $this->cursor)) {
            // multi-word operators are normalized to single spaces
            $this->pushToken(/* Token::OPERATOR_TYPE */ 8, preg_replace('/\s+/', ' ', $match[0]));
            $this->moveCursor($match[0]);
        }
        // names
        elseif (preg_match(self::REGEX_NAME, $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(/* Token::NAME_TYPE */ 5, $match[0]);
            $this->moveCursor($match[0]);
        }
        // numbers
        elseif (preg_match(self::REGEX_NUMBER, $this->code, $match, 0, $this->cursor)) {
            $number = (float) $match[0];  // floats
            if (ctype_digit($match[0]) && $number <= \PHP_INT_MAX) {
                $number = (int) $match[0]; // integers lower than the maximum
            }
            $this->pushToken(/* Token::NUMBER_TYPE */ 6, $number);
            $this->moveCursor($match[0]);
        }
        // punctuation
        elseif (false !== strpos(self::PUNCTUATION, $this->code[$this->cursor])) {
            // opening bracket
            if (false !== strpos('([{', $this->code[$this->cursor])) {
                $this->brackets[] = [$this->code[$this->cursor], $this->lineno];
            }
            // closing bracket
            elseif (false !== strpos(')]}', $this->code[$this->cursor])) {
                if (empty($this->brackets)) {
                    throw new SyntaxError(sprintf('Unexpected "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
                }

                [$expect, $lineno] = array_pop($this->brackets);
                if ($this->code[$this->cursor] !== strtr($expect, '([{', ')]}')) {
                    throw new SyntaxError(sprintf('Unclosed "%s".', $expect), $lineno, $this->source);
                }
            }

            $this->pushToken(/* Token::PUNCTUATION_TYPE */ 9, $this->code[$this->cursor]);
            ++$this->cursor;
        }
        // strings
        elseif (preg_match(self::REGEX_STRING, $this->code, $match, 0, $this->cursor)) {
            $this->pushToken(/* Token::STRING_TYPE */ 7, stripcslashes(substr($match[0], 1, -1)));
            $this->moveCursor($match[0]);
        }
        // opening double quoted string
        elseif (preg_match(self::REGEX_DQ_STRING_DELIM, $this->code, $match, 0, $this->cursor)) {
            $this->brackets[] = ['"', $this->lineno];
            $this->pushState(self::STATE_STRING);
            $this->moveCursor($match[0]);
        }
        // unlexable
        else {
            throw new SyntaxError(sprintf('Unexpected character "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
        }
    }

    /**
     * Lexes the content of a verbatim block, up to and including its "endverbatim" tag,
     * as a single text token.
     *
     * @throws SyntaxError When no "endverbatim" tag is found
     */
    private function lexRawData(): void
    {
        if (!preg_match($this->regexes['lex_raw_data'], $this->code, $match, \PREG_OFFSET_CAPTURE, $this->cursor)) {
            throw new SyntaxError('Unexpected end of file: Unclosed "verbatim" block.', $this->lineno, $this->source);
        }

        $text = substr($this->code, $this->cursor, $match[0][1] - $this->cursor);
        $this->moveCursor($text.$match[0][0]);

        // trim?
        if (isset($match[1][0])) {
            if ($this->options['whitespace_trim'] === $match[1][0]) {
                // whitespace_trim detected ({%-, {{- or {#-)
                $text = rtrim($text);
            } else {
                // whitespace_line_trim detected ({%~, {{~ or {#~)
                // don't trim \r and \n
                $text = rtrim($text, " \t\0\x0B");
            }
        }

        $this->pushToken(/* Token::TEXT_TYPE */ 0, $text);
    }

    /**
     * Skips a comment: moves the cursor past its closing delimiter without pushing any token.
     *
     * @throws SyntaxError When the comment is never closed
     */
    private function lexComment(): void
    {
        if (!preg_match($this->regexes['lex_comment'], $this->code, $match, \PREG_OFFSET_CAPTURE, $this->cursor)) {
            throw new SyntaxError('Unclosed comment.', $this->lineno, $this->source);
        }

        $this->moveCursor(substr($this->code, $this->cursor, $match[0][1] - $this->cursor).$match[0][0]);
    }

    /**
     * Lexes the inside of a double-quoted string: an interpolation start, a literal part,
     * or the closing quote.
     *
     * @throws SyntaxError When the character at the cursor cannot be lexed
     */
    private function lexString(): void
    {
        if (preg_match($this->regexes['interpolation_start'], $this->code, $match, 0, $this->cursor)) {
            $this->brackets[] = [$this->options['interpolation'][0], $this->lineno];
            $this->pushToken(/* Token::INTERPOLATION_START_TYPE */ 10);
            $this->moveCursor($match[0]);
            $this->pushState(self::STATE_INTERPOLATION);
        } elseif (preg_match(self::REGEX_DQ_STRING_PART, $this->code, $match, 0, $this->cursor) && \strlen($match[0]) > 0) {
            $this->pushToken(/* Token::STRING_TYPE */ 7, stripcslashes($match[0]));
            $this->moveCursor($match[0]);
        } elseif (preg_match(self::REGEX_DQ_STRING_DELIM, $this->code, $match, 0, $this->cursor)) {
            [$expect, $lineno] = array_pop($this->brackets);
            if ('"' !== $this->code[$this->cursor]) {
                throw new SyntaxError(sprintf('Unclosed "%s".', $expect), $lineno, $this->source);
            }

            $this->popState();
            ++$this->cursor;
        } else {
            // unlexable
            throw new SyntaxError(sprintf('Unexpected character "%s".', $this->code[$this->cursor]), $this->lineno, $this->source);
        }
    }

    /**
     * Lexes the inside of a string interpolation: either its closing delimiter or one
     * expression token.
     *
     * The closing delimiter is only recognized when the innermost pending bracket is the
     * interpolation opener itself.
     *
     * @throws SyntaxError When the expression cannot be lexed
     */
    private function lexInterpolation(): void
    {
        $bracket = end($this->brackets);
        if ($this->options['interpolation'][0] === $bracket[0] && preg_match($this->regexes['interpolation_end'], $this->code, $match, 0, $this->cursor)) {
            array_pop($this->brackets);
            $this->pushToken(/* Token::INTERPOLATION_END_TYPE */ 11);
            $this->moveCursor($match[0]);
            $this->popState();
        } else {
            $this->lexExpression();
        }
    }

    /**
     * Appends a token carrying the current line number to the token list.
     *
     * @param int   $type  Token type
     * @param mixed $value Token value (string, or int/float for numbers)
     */
    private function pushToken($type, $value = ''): void
    {
        // do not push empty text tokens
        if (/* Token::TEXT_TYPE */ 0 === $type && '' === $value) {
            return;
        }

        $this->tokens[] = new Token($type, $value, $this->lineno);
    }

    /**
     * Advances the cursor by the byte length of the given text and the line counter by the
     * number of newlines it contains.
     *
     * @param string $text The text that has just been consumed
     */
    private function moveCursor($text): void
    {
        $this->cursor += \strlen($text);
        $this->lineno += substr_count($text, "\n");
    }

    /**
     * Builds the anchored regular expression matching "=" and every unary/binary operator
     * of the environment, longest operators first.
     */
    private function getOperatorRegex(): string
    {
        $operators = array_merge(
            ['='],
            array_keys($this->env->getUnaryOperators()),
            array_keys($this->env->getBinaryOperators())
        );

        // sort by descending length so that the longest operator wins in the alternation
        $operators = array_combine($operators, array_map('strlen', $operators));
        arsort($operators);

        $regex = [];
        foreach ($operators as $operator => $length) {
            // an operator that ends with a character must be followed by
            // a whitespace, a parenthesis, an opening map [ or sequence {
            $r = preg_quote($operator, '/');
            if (ctype_alpha($operator[$length - 1])) {
                $r .= '(?=[\s()\[{])';
            }

            // an operator that begins with a character must not have a dot or pipe before
            if (ctype_alpha($operator[0])) {
                $r = '(?<![\.\|])'.$r;
            }

            // an operator with a space can be any amount of whitespaces
            $r = preg_replace('/\s+/', '\s+', $r);

            $regex[] = $r;
        }

        return '/'.implode('|', $regex).'/A';
    }

    /**
     * Enters a new lexer state, remembering the current one so that popState() can restore it.
     *
     * @param int $state One of the STATE_* constants
     */
    private function pushState($state): void
    {
        $this->states[] = $this->state;
        $this->state = $state;
    }

    /**
     * Restores the lexer state that was active before the last pushState() call.
     *
     * @throws \LogicException When there is no previous state to restore
     */
    private function popState(): void
    {
        if (0 === \count($this->states)) {
            throw new \LogicException('Cannot pop state without a previous state.');
        }

        $this->state = array_pop($this->states);
    }
}

class_alias('Twig\Lexer', 'Twig_Lexer');