1 <?php
2 
3 declare(strict_types=1);
4 
5 namespace Antlr\Antlr4\Runtime;
6 
7 use Antlr\Antlr4\Runtime\Atn\LexerATNSimulator;
8 use Antlr\Antlr4\Runtime\Error\Exceptions\LexerNoViableAltException;
9 use Antlr\Antlr4\Runtime\Error\Exceptions\RecognitionException;
10 use Antlr\Antlr4\Runtime\Utils\Pair;
11 
12 /**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
 * uses simplified match() and error recovery mechanisms in the interest
 * of speed.
17  */
abstract class Lexer extends Recognizer implements TokenSource
{
    /** Mode a lexer starts in and falls back to on {@see Lexer::reset()}. */
    public const DEFAULT_MODE = 0;

    /**
     * Pseudo token type set by {@see Lexer::more()}: `nextToken` keeps
     * matching and extends the current token instead of emitting it.
     */
    public const MORE = -2;

    /**
     * Pseudo token type set by {@see Lexer::skip()}: `nextToken` drops the
     * matched text and starts looking for a fresh token.
     */
    public const SKIP = -3;

    public const DEFAULT_TOKEN_CHANNEL = Token::DEFAULT_CHANNEL;
    public const HIDDEN = Token::HIDDEN_CHANNEL;
    public const MIN_CHAR_VALUE = 0x0000;
    public const MAX_CHAR_VALUE = 0x10FFFF;

    /**
     * The character stream tokens are read from; null until one is supplied
     * through the constructor or {@see Lexer::setInputStream()}.
     *
     * @var CharStream|null
     */
    public $input;

    /**
     * The (lexer, input stream) pair handed to the token factory for every
     * created token. Rebuilt whenever the input stream changes.
     *
     * @var Pair Pair<TokenSource, CharStream>
     */
    protected $tokenFactorySourcePair;

    /**
     * Factory used by {@see Lexer::emit()} and {@see Lexer::emitEOF()}.
     *
     * @var TokenFactory
     */
    protected $factory;

    /**
     * The goal of all lexer rules/methods is to create a token object.
     * This is an instance variable as multiple rules may collaborate to
     * create a single token. `nextToken` will return this object after
     * matching lexer rule(s).
     *
     * If you subclass to allow multiple token emissions, then set this
     * to the last token to be matched or something non-null so that
     * the auto token emit mechanism will not emit another token.
     *
     * @var Token|null
     */
    public $token;

    /**
     * What character index in the stream did the current token start at?
     * Needed, for example, to get the text for the current token. Set at
     * the start of `nextToken`.
     *
     * @var int
     */
    public $tokenStartCharIndex = -1;

    /**
     * The line on which the first character of the token resides.
     *
     * @var int
     */
    public $tokenStartLine = -1;

    /**
     * The character position of the first character within the line.
     *
     * @var int
     */
    public $tokenStartCharPositionInLine = -1;

    /**
     * Once we see EOF on the char stream, the next token will be EOF.
     * If you have DONE : EOF ; then you see DONE EOF.
     *
     * @var bool
     */
    public $hitEOF = false;

    /**
     * The channel number for the current token.
     *
     * @var int
     */
    public $channel = Token::DEFAULT_CHANNEL;

    /**
     * The token type for the current token.
     *
     * @var int
     */
    public $type = Token::INVALID_TYPE;

    /**
     * Modes saved by {@see Lexer::pushMode()}, most recent last.
     *
     * @var array<int>
     */
    public $modeStack = [];

    /**
     * The mode passed to the interpreter on each match.
     *
     * @var int
     */
    public $mode = self::DEFAULT_MODE;

    /**
     * You can set the text for the current token to override what is in the
     * input char buffer. Use {@see Lexer::setText()} or set this instance var.
     *
     * @var string|null
     */
    public $text;

    /**
     * The ATN simulator that performs the actual matching. The constructor
     * leaves it null; concrete lexers are expected to assign it.
     *
     * @var LexerATNSimulator|null
     */
    protected $interp;

    /**
     * @param CharStream|null $input Stream to tokenize; may be supplied later
     *                               through {@see Lexer::setInputStream()}.
     */
    public function __construct(?CharStream $input = null)
    {
        parent::__construct();

        $this->input = $input;
        $this->factory = CommonTokenFactory::default();
        $this->tokenFactorySourcePair = new Pair($this, $input);

        // @todo remove this property
        $this->interp = null;// child classes must populate this
    }

    /**
     * Rewinds the input (when there is one) and restores every piece of
     * per-token and mode state to its initial value, then resets the
     * interpreter if one is installed.
     */
    public function reset() : void
    {
        if ($this->input !== null) {
            $this->input->seek(0);// rewind the input
        }

        $this->token = null;
        $this->type = Token::INVALID_TYPE;
        $this->channel = Token::DEFAULT_CHANNEL;
        $this->tokenStartCharIndex = -1;
        $this->tokenStartCharPositionInLine = -1;
        $this->tokenStartLine = -1;
        $this->text = null;

        $this->hitEOF = false;
        $this->mode = self::DEFAULT_MODE;
        $this->modeStack = [];

        if ($this->interp !== null) {
            $this->interp->reset();
        }
    }

    /**
     * Return a token from this source; i.e., match a token on the char stream.
     *
     * Matches marked SKIP are discarded and matching restarts; matches marked
     * MORE keep extending the same token. Once the end of input has been
     * seen, the following call emits the EOF token.
     *
     * @throws \RuntimeException When there is no input stream or no lexer
     *                           ATN simulator has been installed.
     */
    public function nextToken() : ?Token
    {
        $markedInput = $this->input;

        if ($markedInput === null) {
            throw new \RuntimeException('NextToken requires a non-null input stream.');
        }

        // Mark start location in char stream so unbuffered streams are
        // guaranteed to at least have the text of the current token.
        $tokenStartMarker = $markedInput->mark();

        try {
            while (true) {
                if ($this->hitEOF) {
                    $this->emitEOF();

                    return $this->token;
                }

                $interp = $this->requireLexerInterpreter();

                $this->token = null;
                $this->channel = Token::DEFAULT_CHANNEL;
                $this->tokenStartCharIndex = $this->input->getIndex();
                $this->tokenStartCharPositionInLine = $interp->getCharPositionInLine();
                $this->tokenStartLine = $interp->getLine();
                $this->text = null;

                do {
                    $this->type = Token::INVALID_TYPE;

                    // Stays SKIP when matching fails, so the text consumed
                    // by recover() is dropped rather than emitted.
                    $ttype = self::SKIP;

                    try {
                        $ttype = $interp->match($this->input, $this->mode);
                    } catch (LexerNoViableAltException $e) {
                        $this->notifyListeners($e); // report error
                        $this->recover($e);
                    }

                    if ($this->input->LA(1) === Token::EOF) {
                        $this->hitEOF = true;
                    }

                    // A lexer action may already have chosen a type
                    // (setType/skip/more); only fall back to the match result.
                    if ($this->type === Token::INVALID_TYPE) {
                        $this->type = $ttype;
                    }

                    if ($this->type === self::SKIP) {
                        // Discard this match and start over with a new token.
                        continue 2;
                    }
                } while ($this->type === self::MORE);

                if ($this->token === null) {
                    $this->emit();
                }

                return $this->token;
            }
        } finally {
            // Make sure we release the marker after the match or an
            // unbuffered char stream will keep buffering. Release on the
            // stream that issued the marker, even if $this->input was
            // replaced while matching.
            $markedInput->release($tokenStartMarker);
        }
    }

    /**
     * Instruct the lexer to skip creating a token for the current lexer rule
     * and look for another token. `nextToken` knows to keep looking when
     * a lexer rule finishes with the type set to SKIP. Recall that
     * if `token === null` at the end of any token rule, it creates one for you
     * and emits it.
     */
    public function skip() : void
    {
        $this->type = self::SKIP;
    }

    /**
     * Mark the current match as MORE so `nextToken` continues matching
     * into the same token instead of emitting it.
     */
    public function more() : void
    {
        $this->type = self::MORE;
    }

    /**
     * Switch to mode `$m` without touching the mode stack.
     */
    public function mode(int $m) : void
    {
        $this->mode = $m;
    }

    /**
     * Save the current mode on the stack and switch to mode `$m`.
     */
    public function pushMode(int $m) : void
    {
        $this->modeStack[] = $this->mode;

        $this->mode($m);
    }

    /**
     * Restore the most recently pushed mode.
     *
     * @return int The mode that is active after popping.
     *
     * @throws \RuntimeException When the mode stack is empty.
     */
    public function popMode() : int
    {
        if (\count($this->modeStack) === 0) {
            throw new \RuntimeException('Empty Stack');
        }

        $this->mode(\array_pop($this->modeStack));

        return $this->mode;
    }

    /**
     * The source name reported by the input stream, or an empty string
     * when no stream is attached.
     */
    public function getSourceName() : string
    {
        return $this->input === null ? '' : $this->input->getSourceName();
    }

    public function getInputStream() : ?IntStream
    {
        return $this->input;
    }

    public function getTokenFactory() : TokenFactory
    {
        return $this->factory;
    }

    public function setTokenFactory(TokenFactory $factory) : void
    {
        $this->factory = $factory;
    }

    /**
     * Replace the input stream and reset all lexer state.
     *
     * The new stream itself is not rewound: reset() runs while no stream
     * is attached.
     *
     * @throws \RuntimeException When `$input` is not a CharStream; the lexer
     *                           is left untouched in that case.
     */
    public function setInputStream(IntStream $input) : void
    {
        // Validate before mutating anything so a rejected stream does not
        // leave the lexer without input and with wiped state.
        if (!$input instanceof CharStream) {
            throw new \RuntimeException('Input must be CharStream.');
        }

        // Detach the previous stream first so reset() does not rewind it.
        $this->input = null;
        $this->tokenFactorySourcePair = new Pair($this, $this->input);

        $this->reset();

        $this->input = $input;
        $this->tokenFactorySourcePair = new Pair($this, $this->input);
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons. Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    public function emitToken(Token $token) : void
    {
        $this->token = $token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule. The token object should point into the
     * char buffer start..stop. If there is a text override in 'text',
     * use that to set the token's text. Override this method to emit
     * custom Token objects or provide a new factory.
     *
     * @throws \RuntimeException When no input stream is attached.
     */
    public function emit() : Token
    {
        $token = $this->factory->createEx(
            $this->tokenFactorySourcePair,
            $this->type,
            $this->text,
            $this->channel,
            $this->tokenStartCharIndex,
            $this->getCharIndex() - 1, // stop index: the char before the lookahead
            $this->tokenStartLine,
            $this->tokenStartCharPositionInLine
        );

        $this->emitToken($token);

        return $token;
    }

    /**
     * Create and emit the EOF token on the default channel at the current
     * stream index, line and column.
     *
     * @throws \RuntimeException When no input stream is attached or no
     *                           lexer ATN simulator has been installed.
     */
    public function emitEOF() : Token
    {
        if ($this->input === null) {
            throw new \RuntimeException('Cannot emit EOF for null stream.');
        }

        $cpos = $this->getCharPositionInLine();
        $lpos = $this->getLine();
        $eof = $this->factory->createEx(
            $this->tokenFactorySourcePair,
            Token::EOF,
            null,
            Token::DEFAULT_CHANNEL,
            $this->input->getIndex(),
            $this->input->getIndex() - 1,
            $lpos,
            $cpos
        );

        $this->emitToken($eof);

        return $eof;
    }

    /**
     * @throws \RuntimeException When no lexer ATN simulator has been installed.
     */
    public function getLine() : int
    {
        return $this->requireLexerInterpreter()->getLine();
    }

    /**
     * @throws \RuntimeException When no lexer ATN simulator has been installed.
     */
    public function setLine(int $line) : void
    {
        $this->requireLexerInterpreter()->setLine($line);
    }

    /**
     * @throws \RuntimeException When no lexer ATN simulator has been installed.
     */
    public function getCharPositionInLine() : int
    {
        return $this->requireLexerInterpreter()->getCharPositionInLine();
    }

    /**
     * @throws \RuntimeException When no lexer ATN simulator has been installed.
     */
    public function setCharPositionInLine(int $charPositionInLine) : void
    {
        $this->requireLexerInterpreter()->setCharPositionInLine($charPositionInLine);
    }

    /**
     * What is the index of the current character of lookahead?
     *
     * @throws \RuntimeException When no input stream is attached.
     */
    public function getCharIndex() : int
    {
        if ($this->input === null) {
            throw new \RuntimeException('Cannot know char index for null stream.');
        }

        return $this->input->getIndex();
    }

    /**
     * Return the text matched so far for the current token or any text override.
     *
     * @throws \RuntimeException When there is no text override and no lexer
     *                           ATN simulator has been installed.
     */
    public function getText() : string
    {
        if ($this->text !== null) {
            return $this->text;
        }

        $interp = $this->requireLexerInterpreter();

        return $this->input === null ? '' : $interp->getText($this->input);
    }

    /**
     * Set the complete text of this token; it wipes any previous changes to the text.
     */
    public function setText(string $text) : void
    {
        $this->text = $text;
    }

    public function getToken() : ?Token
    {
        return $this->token;
    }

    /**
     * Override if emitting multiple tokens.
     */
    public function setToken(Token $token) : void
    {
        $this->token = $token;
    }

    public function getType() : int
    {
        return $this->type;
    }

    public function setType(int $type) : void
    {
        $this->type = $type;
    }

    public function getChannel() : int
    {
        return $this->channel;
    }

    public function setChannel(int $channel) : void
    {
        $this->channel = $channel;
    }

    /**
     * This base implementation has no channel names and returns null.
     *
     * @return array<string>|null
     */
    public function getChannelNames() : ?array
    {
        return null;
    }

    /**
     * This base implementation has no mode names and returns null.
     *
     * @return array<string>|null
     */
    public function getModeNames() : ?array
    {
        return null;
    }

    /**
     * Return a list of all Token objects in input char stream.
     * Forces load of all tokens. Does not include EOF token.
     *
     * @return array<Token>
     */
    public function getAllTokens() : array
    {
        $tokens = [];

        for (
            $token = $this->nextToken();
            $token !== null && $token->getType() !== Token::EOF;
            $token = $this->nextToken()
        ) {
            $tokens[] = $token;
        }

        return $tokens;
    }

    /**
     * Lexers can normally match any char in its vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out. You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     *
     * Nothing is consumed when there is no input or the lookahead is EOF.
     */
    public function recover(RecognitionException $re) : void
    {
        if ($this->input === null || $this->input->LA(1) === Token::EOF) {
            return;
        }

        if ($re instanceof LexerNoViableAltException && $this->interp !== null) {
            // skip a char and try again
            $this->interp->consume($this->input);

            return;
        }

        // TODO: Do we lose character or line position information?
        $this->input->consume();
    }

    /**
     * Report a token recognition error to the registered error listeners,
     * quoting the input text from the start of the current token up to the
     * current stream index (empty when no stream is attached).
     */
    public function notifyListeners(LexerNoViableAltException $e) : void
    {
        $start = $this->tokenStartCharIndex;

        if ($this->input === null) {
            $text = '';
        } else {
            $stop = $this->input->getIndex();
            $text = $this->input->getText($start, $stop);
        }

        $listener = $this->getErrorListenerDispatch();

        $listener->syntaxError(
            $this,
            null,
            $this->tokenStartLine,
            $this->tokenStartCharPositionInLine,
            \sprintf('token recognition error at: \'%s\'', $text),
            $e
        );
    }

    /**
     * Return the interpreter narrowed to LexerATNSimulator.
     *
     * `instanceof` is false for null, so this single test also covers the
     * "not yet populated" case.
     *
     * @throws \RuntimeException When no lexer ATN simulator has been installed.
     */
    private function requireLexerInterpreter() : LexerATNSimulator
    {
        if (!$this->interp instanceof LexerATNSimulator) {
            throw new \RuntimeException('Unexpected interpreter type.');
        }

        return $this->interp;
    }
}
546