<?php

declare(strict_types=1);

namespace Antlr\Antlr4\Runtime;

use Antlr\Antlr4\Runtime\Atn\LexerATNSimulator;
use Antlr\Antlr4\Runtime\Error\Exceptions\LexerNoViableAltException;
use Antlr\Antlr4\Runtime\Error\Exceptions\RecognitionException;
use Antlr\Antlr4\Runtime\Utils\Pair;

/**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
 * uses simplified match() and error recovery mechanisms in the interest
 * of speed.
 */
abstract class Lexer extends Recognizer implements TokenSource
{
    public const DEFAULT_MODE = 0;

    /** Token type signalling that the current rule's text continues into the next match. */
    public const MORE = -2;

    /** Token type signalling that the matched text must be discarded without emitting a token. */
    public const SKIP = -3;

    public const DEFAULT_TOKEN_CHANNEL = Token::DEFAULT_CHANNEL;
    public const HIDDEN = Token::HIDDEN_CHANNEL;
    public const MIN_CHAR_VALUE = 0x0000;
    public const MAX_CHAR_VALUE = 0x10FFFF;

    /** @var CharStream|null */
    public $input;

    /** @var Pair Pair<TokenSource, CharStream> */
    protected $tokenFactorySourcePair;

    /** @var TokenFactory */
    protected $factory;

    /**
     * The goal of all lexer rules/methods is to create a token object.
     * This is an instance variable as multiple rules may collaborate to
     * create a single token. `nextToken` will return this object after
     * matching lexer rule(s).
     *
     * If you subclass to allow multiple token emissions, then set this
     * to the last token to be matched or something nonnull so that
     * the auto token emit mechanism will not emit another token.
     *
     * @var Token|null
     */
    public $token;

    /**
     * What character index in the stream did the current token start at?
     * Needed, for example, to get the text for current token. Set at
     * the start of nextToken.
     *
     * @var int
     */
    public $tokenStartCharIndex = -1;

    /**
     * The line on which the first character of the token resides.
     *
     * @var int
     */
    public $tokenStartLine = -1;

    /**
     * The character position of first character within the line
     *
     * @var int
     */
    public $tokenStartCharPositionInLine = -1;

    /**
     * Once we see EOF on char stream, next token will be EOF.
     * If you have DONE : EOF ; then you see DONE EOF.
     *
     * @var bool
     */
    public $hitEOF = false;

    /**
     * The channel number for the current token.
     *
     * @var int
     */
    public $channel = Token::DEFAULT_CHANNEL;

    /**
     * The token type for the current token.
     *
     * @var int
     */
    public $type = Token::INVALID_TYPE;

    /** @var array<int> */
    public $modeStack = [];

    /** @var int */
    public $mode = self::DEFAULT_MODE;

    /**
     * You can set the text for the current token to override what is in the
     * input char buffer. Use {@see Lexer::setText()} or can set this instance var.
     *
     * @var string|null
     */
    public $text;

    /** @var LexerATNSimulator|null */
    protected $interp;

    /**
     * @param CharStream|null $input Character stream to tokenize; may be supplied
     *                               later through {@see Lexer::setInputStream()}.
     */
    public function __construct(?CharStream $input = null)
    {
        parent::__construct();

        $this->input = $input;
        $this->factory = CommonTokenFactory::default();
        $this->tokenFactorySourcePair = new Pair($this, $input);

        // @todo remove this property
        $this->interp = null;// child classes must populate this
    }

    /**
     * Rewind the input (when one is attached) and restore every piece of
     * per-token and mode state to its initial value, then reset the
     * interpreter if one has been set.
     */
    public function reset() : void
    {
        // wack Lexer state variables
        if ($this->input !== null) {
            $this->input->seek(0);// rewind the input
        }

        $this->token = null;
        $this->type = Token::INVALID_TYPE;
        $this->channel = Token::DEFAULT_CHANNEL;
        $this->tokenStartCharIndex = -1;
        $this->tokenStartCharPositionInLine = -1;
        $this->tokenStartLine = -1;
        $this->text = null;

        $this->hitEOF = false;
        $this->mode = self::DEFAULT_MODE;
        $this->modeStack = [];

        if ($this->interp !== null) {
            $this->interp->reset();
        }
    }

    /**
     * Return a token from this source; i.e., match a token on the char stream.
     *
     * @throws \RuntimeException When no input stream is attached, or when the
     *                           interpreter is not a LexerATNSimulator.
     */
    public function nextToken() : ?Token
    {
        if ($this->input === null) {
            throw new \RuntimeException('NextToken requires a non-null input stream.');
        }

        // Mark start location in char stream so unbuffered streams are
        // guaranteed to at least have the text of the current token
        $tokenStartMarker = $this->input->mark();

        try {
            while (true) {
                if ($this->hitEOF) {
                    $this->emitEOF();

                    return $this->token;
                }

                if (!($this->interp instanceof LexerATNSimulator)) {
                    throw new \RuntimeException('Unexpected interpreter type.');
                }

                $this->token = null;
                $this->channel = Token::DEFAULT_CHANNEL;
                $this->tokenStartCharIndex = $this->input->getIndex();
                $this->tokenStartCharPositionInLine = $this->interp->getCharPositionInLine();
                $this->tokenStartLine = $this->interp->getLine();
                $this->text = null;

                do {
                    $this->type = Token::INVALID_TYPE;

                    // A failed match is reported, recovered from, and then
                    // treated exactly like a skipped token.
                    $ttype = self::SKIP;

                    try {
                        $ttype = $this->interp->match($this->input, $this->mode);
                    } catch (LexerNoViableAltException $e) {
                        $this->notifyListeners($e); // report error
                        $this->recover($e);
                    }

                    if ($this->input->LA(1) === Token::EOF) {
                        $this->hitEOF = true;
                    }

                    // Only adopt the matched type when the type was not
                    // changed from INVALID_TYPE during the match.
                    if ($this->type === Token::INVALID_TYPE) {
                        $this->type = $ttype;
                    }

                    if ($this->type === self::SKIP) {
                        // Discard what was matched and start over on a fresh token.
                        continue 2;
                    }
                } while ($this->type === self::MORE);

                if ($this->token === null) {
                    $this->emit();
                }

                return $this->token;
            }
        } finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            $this->input->release($tokenStartMarker);
        }
    }

    /**
     * Instruct the lexer to skip creating a token for current lexer rule
     * and look for another token. `nextToken` knows to keep looking when
     * a lexer rule finishes with the token type set to SKIP. Recall that
     * if `token === null` at end of any token rule, it creates one for you
     * and emits it.
     */
    public function skip() : void
    {
        $this->type = self::SKIP;
    }

    /**
     * Set the current token type to MORE, which makes `nextToken` run
     * another match before emitting a token.
     */
    public function more() : void
    {
        $this->type = self::MORE;
    }

    /**
     * Switch the lexer to mode `$m` without touching the mode stack.
     */
    public function mode(int $m) : void
    {
        $this->mode = $m;
    }

    /**
     * Remember the current mode on the mode stack, then switch to `$m`.
     */
    public function pushMode(int $m) : void
    {
        $this->modeStack[] = $this->mode;

        $this->mode($m);
    }

    /**
     * Restore the most recently pushed mode and return it.
     *
     * @throws \RuntimeException When the mode stack is empty.
     */
    public function popMode() : int
    {
        if (\count($this->modeStack) === 0) {
            throw new \RuntimeException('Empty Stack');
        }

        $this->mode(\array_pop($this->modeStack));

        return $this->mode;
    }

    /**
     * Name of the underlying input source, or an empty string when no
     * input stream is attached.
     */
    public function getSourceName() : string
    {
        return $this->input === null ? '' : $this->input->getSourceName();
    }

    public function getInputStream() : ?IntStream
    {
        return $this->input;
    }

    public function getTokenFactory() : TokenFactory
    {
        return $this->factory;
    }

    public function setTokenFactory(TokenFactory $factory) : void
    {
        $this->factory = $factory;
    }

    /**
     * Replace the input stream and reset all lexer state.
     *
     * @param IntStream $input The new input; must be a CharStream.
     *
     * @throws \RuntimeException When `$input` is not a CharStream. The lexer
     *                           is left untouched in that case.
     */
    public function setInputStream(IntStream $input) : void
    {
        // Validate before mutating anything, so that a rejected stream does
        // not leave the lexer reset and without an input.
        if (!$input instanceof CharStream) {
            throw new \RuntimeException('Input must be CharStream.');
        }

        // Detach the previous stream first so that reset() does not rewind it.
        $this->input = null;
        $this->tokenFactorySourcePair = new Pair($this, $this->input);

        $this->reset();

        $this->input = $input;
        $this->tokenFactorySourcePair = new Pair($this, $this->input);
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons. Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    public function emitToken(Token $token) : void
    {
        $this->token = $token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule. The token object should point into the
     * char buffer start..stop. If there is a text override in 'text',
     * use that to set the token's text. Override this method to emit
     * custom Token objects or provide a new factory.
     */
    public function emit() : Token
    {
        $token = $this->factory->createEx(
            $this->tokenFactorySourcePair,
            $this->type,
            $this->text,
            $this->channel,
            $this->tokenStartCharIndex,
            $this->getCharIndex() - 1,
            $this->tokenStartLine,
            $this->tokenStartCharPositionInLine
        );

        $this->emitToken($token);

        return $token;
    }

    /**
     * Create an EOF token on the default channel at the current input
     * position and emit it.
     *
     * @throws \RuntimeException When no input stream is attached, or when the
     *                           interpreter is not a LexerATNSimulator.
     */
    public function emitEOF() : Token
    {
        if ($this->input === null) {
            throw new \RuntimeException('Cannot emit EOF for null stream.');
        }

        $cpos = $this->getCharPositionInLine();
        $lpos = $this->getLine();
        $eof = $this->factory->createEx(
            $this->tokenFactorySourcePair,
            Token::EOF,
            null,
            Token::DEFAULT_CHANNEL,
            $this->input->getIndex(),
            $this->input->getIndex() - 1,
            $lpos,
            $cpos
        );

        $this->emitToken($eof);

        return $eof;
    }

    /**
     * Current line as tracked by the interpreter.
     *
     * @throws \RuntimeException When the interpreter is not a LexerATNSimulator.
     */
    public function getLine() : int
    {
        if (!($this->interp instanceof LexerATNSimulator)) {
            throw new \RuntimeException('Unexpected interpreter type.');
        }

        return $this->interp->getLine();
    }

    /**
     * Overwrite the line tracked by the interpreter.
     *
     * @throws \RuntimeException When the interpreter is not a LexerATNSimulator.
     */
    public function setLine(int $line) : void
    {
        if (!($this->interp instanceof LexerATNSimulator)) {
            throw new \RuntimeException('Unexpected interpreter type.');
        }

        $this->interp->setLine($line);
    }

    /**
     * Current position within the line as tracked by the interpreter.
     *
     * @throws \RuntimeException When the interpreter is not a LexerATNSimulator.
     */
    public function getCharPositionInLine() : int
    {
        if (!($this->interp instanceof LexerATNSimulator)) {
            throw new \RuntimeException('Unexpected interpreter type.');
        }

        return $this->interp->getCharPositionInLine();
    }

    /**
     * Overwrite the in-line position tracked by the interpreter.
     *
     * @throws \RuntimeException When the interpreter is not a LexerATNSimulator.
     */
    public function setCharPositionInLine(int $charPositionInLine) : void
    {
        if (!($this->interp instanceof LexerATNSimulator)) {
            throw new \RuntimeException('Unexpected interpreter type.');
        }

        $this->interp->setCharPositionInLine($charPositionInLine);
    }

    /**
     * What is the index of the current character of lookahead?
     *
     * @throws \RuntimeException When no input stream is attached.
     */
    public function getCharIndex() : int
    {
        if ($this->input === null) {
            throw new \RuntimeException('Cannot know char index for null stream.');
        }

        return $this->input->getIndex();
    }

    /**
     * Return the text matched so far for the current token or any text override.
     *
     * @throws \RuntimeException When there is no text override and the
     *                           interpreter is not a LexerATNSimulator.
     */
    public function getText() : string
    {
        if ($this->text !== null) {
            return $this->text;
        }

        if (!($this->interp instanceof LexerATNSimulator)) {
            throw new \RuntimeException('Unexpected interpreter type.');
        }

        return $this->input === null ? '' : $this->interp->getText($this->input);
    }

    /**
     * Set the complete text of this token; it wipes any previous changes to the text.
     */
    public function setText(string $text) : void
    {
        $this->text = $text;
    }

    public function getToken() : ?Token
    {
        return $this->token;
    }

    /**
     * Override if emitting multiple tokens.
     */
    public function setToken(Token $token) : void
    {
        $this->token = $token;
    }

    public function getType() : int
    {
        return $this->type;
    }

    public function setType(int $type) : void
    {
        $this->type = $type;
    }

    public function getChannel() : int
    {
        return $this->channel;
    }

    public function setChannel(int $channel) : void
    {
        $this->channel = $channel;
    }

    /**
     * @return array<string>|null
     */
    public function getChannelNames() : ?array
    {
        return null;
    }

    /**
     * @return array<string>|null
     */
    public function getModeNames() : ?array
    {
        return null;
    }

    /**
     * Return a list of all Token objects in input char stream.
     * Forces load of all tokens. Does not include EOF token.
     *
     * @return array<Token>
     */
    public function getAllTokens() : array
    {
        $tokens = [];
        $token = $this->nextToken();

        while ($token !== null && $token->getType() !== Token::EOF) {
            $tokens[] = $token;
            $token = $this->nextToken();
        }

        return $tokens;
    }

    /**
     * Lexers can normally match any char in its vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out. You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     *
     * Nothing is consumed when no input is attached or the input is at EOF.
     */
    public function recover(RecognitionException $re) : void
    {
        if ($this->input !== null && $this->input->LA(1) !== Token::EOF) {
            if ($re instanceof LexerNoViableAltException && $this->interp !== null) {
                // skip a char and try again
                $this->interp->consume($this->input);
            } else {
                // TODO: Do we lose character or line position information?
                $this->input->consume();
            }
        }
    }

    /**
     * Report a token recognition error to the registered error listeners.
     *
     * The message quotes the input text from the start of the current token
     * up to the current input index, or an empty string when no input is
     * attached.
     */
    public function notifyListeners(LexerNoViableAltException $e) : void
    {
        $start = $this->tokenStartCharIndex;

        if ($this->input === null) {
            $text = '';
        } else {
            $stop = $this->input->getIndex();
            $text = $this->input->getText($start, $stop);
        }

        $listener = $this->getErrorListenerDispatch();

        $listener->syntaxError(
            $this,
            null,
            $this->tokenStartLine,
            $this->tokenStartCharPositionInLine,
            \sprintf('token recognition error at: \'%s\'', $text),
            $e
        );
    }
}