1<?php
2
3declare(strict_types=1);
4
5namespace Antlr\Antlr4\Runtime;
6
7use Antlr\Antlr4\Runtime\Atn\LexerATNSimulator;
8use Antlr\Antlr4\Runtime\Error\Exceptions\LexerNoViableAltException;
9use Antlr\Antlr4\Runtime\Error\Exceptions\RecognitionException;
10use Antlr\Antlr4\Runtime\Utils\Pair;
11
12/**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
15 * uses simplified match() and error recovery mechanisms in the interest
16 * of speed.
17 */
abstract class Lexer extends Recognizer implements TokenSource
{
    public const DEFAULT_MODE = 0;
    public const MORE = -2;
    public const SKIP = -3;

    public const DEFAULT_TOKEN_CHANNEL = Token::DEFAULT_CHANNEL;
    public const HIDDEN = Token::HIDDEN_CHANNEL;
    public const MIN_CHAR_VALUE = 0x0000;
    public const MAX_CHAR_VALUE = 0x10FFFF;

    /**
     * The character stream tokens are matched against; null until one is set.
     *
     * @var CharStream|null
     */
    public $input;

    /**
     * The (lexer, input stream) pair handed to the token factory for every
     * token created by this lexer. The stream half is null while no input
     * is attached.
     *
     * @var Pair Pair<TokenSource, CharStream>
     */
    protected $tokenFactorySourcePair;

    /**
     * The factory used by {@see Lexer::emit()} and {@see Lexer::emitEOF()}.
     *
     * @var TokenFactory
     */
    protected $factory;

    /**
     * The goal of all lexer rules/methods is to create a token object.
     * This is an instance variable as multiple rules may collaborate to
     * create a single token. `nextToken` will return this object after
     * matching lexer rule(s).
     *
     * If you subclass to allow multiple token emissions, then set this
     * to the last token to be matched or something non-null so that
     * the auto token emit mechanism will not emit another token.
     *
     * @var Token|null
     */
    public $token;

    /**
     * What character index in the stream did the current token start at?
     * Needed, for example, to get the text for current token. Set at
     * the start of nextToken.
     *
     * @var int
     */
    public $tokenStartCharIndex = -1;

    /**
     * The line on which the first character of the token resides.
     *
     * @var int
     */
    public $tokenStartLine = -1;

    /**
     * The character position of the first character within the line.
     *
     * @var int
     */
    public $tokenStartCharPositionInLine = -1;

    /**
     * Once we see EOF on char stream, next token will be EOF.
     * If you have DONE : EOF ; then you see DONE EOF.
     *
     * @var bool
     */
    public $hitEOF = false;

    /**
     * The channel number for the current token.
     *
     * @var int
     */
    public $channel = Token::DEFAULT_CHANNEL;

    /**
     * The token type for the current token.
     *
     * @var int
     */
    public $type = Token::INVALID_TYPE;

    /**
     * Modes saved by {@see Lexer::pushMode()}, most recent last.
     *
     * @var array<int>
     */
    public $modeStack = [];

    /**
     * The mode passed to the interpreter when matching the next token.
     *
     * @var int
     */
    public $mode = self::DEFAULT_MODE;

    /**
     * You can set the text for the current token to override what is in the
     * input char buffer. Use {@see Lexer::setText()} or can set this instance var.
     *
     * @var string|null
     */
    public $text;

    /**
     * The simulator that performs the actual matching. The constructor leaves
     * it null; child classes must populate it.
     *
     * @var LexerATNSimulator|null
     */
    protected $interp;

    /**
     * @param CharStream|null $input The stream to tokenize, or null to attach
     *                               one later via {@see Lexer::setInputStream()}.
     */
    public function __construct(?CharStream $input = null)
    {
        parent::__construct();

        $this->input = $input;
        $this->factory = CommonTokenFactory::default();
        $this->tokenFactorySourcePair = new Pair($this, $input);

        // @todo remove this property
        $this->interp = null;// child classes must populate this
    }

    /**
     * Rewinds the input (when one is attached) and restores every piece of
     * per-token and mode state to its initial value, then resets the
     * interpreter if one is set.
     */
    public function reset() : void
    {
        if ($this->input !== null) {
            $this->input->seek(0);// rewind the input
        }

        $this->token = null;
        $this->type = Token::INVALID_TYPE;
        $this->channel = Token::DEFAULT_CHANNEL;
        $this->tokenStartCharIndex = -1;
        $this->tokenStartCharPositionInLine = -1;
        $this->tokenStartLine = -1;
        $this->text = null;

        $this->hitEOF = false;
        $this->mode = self::DEFAULT_MODE;
        $this->modeStack = [];

        if ($this->interp !== null) {
            $this->interp->reset();
        }
    }

    /**
     * Return a token from this source; i.e., match a token on the char stream.
     *
     * Matching repeats while the resulting type is {@see Lexer::MORE}, and
     * starts over with a fresh token start whenever the type is
     * {@see Lexer::SKIP}. Recognition errors are reported to the listeners,
     * recovered from, and then treated like a skipped token.
     *
     * @throws \RuntimeException When no input stream is attached, or when the
     *                           interpreter is missing or of an unexpected type.
     */
    public function nextToken() : ?Token
    {
        if ($this->input === null) {
            throw new \RuntimeException('NextToken requires a non-null input stream.');
        }

        // Mark start location in char stream so unbuffered streams are
        // guaranteed at least have text of current token
        $tokenStartMarker = $this->input->mark();

        try {
            while (true) {
                if ($this->hitEOF) {
                    $this->emitEOF();

                    return $this->token;
                }

                $interp = $this->requireInterpreter();

                // Start of a new token: clear per-token state and remember
                // where in the stream it begins.
                $this->token = null;
                $this->channel = Token::DEFAULT_CHANNEL;
                $this->tokenStartCharIndex = $this->input->getIndex();
                $this->tokenStartCharPositionInLine = $interp->getCharPositionInLine();
                $this->tokenStartLine = $interp->getLine();
                $this->text = null;

                do {
                    $this->type = Token::INVALID_TYPE;

                    // Stays SKIP when matching fails, so a recovered error
                    // restarts the search for a token.
                    $ttype = self::SKIP;

                    try {
                        $ttype = $interp->match($this->input, $this->mode);
                    } catch (LexerNoViableAltException $e) {
                        $this->notifyListeners($e); // report error
                        $this->recover($e);
                    }

                    if ($this->input->LA(1) === Token::EOF) {
                        $this->hitEOF = true;
                    }

                    // Only adopt the matched type when nothing set the type
                    // explicitly during the match (e.g. skip()/more()/setType()).
                    if ($this->type === Token::INVALID_TYPE) {
                        $this->type = $ttype;
                    }

                    if ($this->type === self::SKIP) {
                        // Discard what was matched and look for the next token.
                        continue 2;
                    }
                } while ($this->type === self::MORE);

                if ($this->token === null) {
                    $this->emit();
                }

                return $this->token;
            }
        } finally {
            // make sure we release marker after match or
            // unbuffered char stream will keep buffering
            $this->input->release($tokenStartMarker);
        }
    }

    /**
     * Instruct the lexer to skip creating a token for current lexer rule
     * and look for another token. `nextToken` knows to keep looking when
     * a lexer rule finishes with the type set to {@see Lexer::SKIP}. Recall
     * that if `token === null` at end of any token rule, it creates one for
     * you and emits it.
     */
    public function skip() : void
    {
        $this->type = self::SKIP;
    }

    /**
     * Marks the current match as partial: `nextToken` keeps matching without
     * resetting the token start while the type is {@see Lexer::MORE}.
     */
    public function more() : void
    {
        $this->type = self::MORE;
    }

    /**
     * Switches to mode `$m` without touching the mode stack.
     */
    public function mode(int $m) : void
    {
        $this->mode = $m;
    }

    /**
     * Saves the current mode on the mode stack and switches to mode `$m`.
     */
    public function pushMode(int $m) : void
    {
        $this->modeStack[] = $this->mode;

        $this->mode($m);
    }

    /**
     * Restores the most recently pushed mode.
     *
     * @return int The mode that is current after the pop.
     *
     * @throws \RuntimeException When the mode stack is empty.
     */
    public function popMode() : int
    {
        if (\count($this->modeStack) === 0) {
            throw new \RuntimeException('Empty Stack');
        }

        $this->mode(\array_pop($this->modeStack));

        return $this->mode;
    }

    /**
     * Returns the source name reported by the input stream, or an empty
     * string when no stream is attached.
     */
    public function getSourceName() : string
    {
        return $this->input === null ? '' : $this->input->getSourceName();
    }

    public function getInputStream() : ?IntStream
    {
        return $this->input;
    }

    public function getTokenFactory() : TokenFactory
    {
        return $this->factory;
    }

    public function setTokenFactory(TokenFactory $factory) : void
    {
        $this->factory = $factory;
    }

    /**
     * Attaches a new input stream and resets all lexer state.
     *
     * @param IntStream $input Must be a {@see CharStream}.
     *
     * @throws \RuntimeException When `$input` is not a CharStream; the lexer
     *                           is left unchanged in that case.
     */
    public function setInputStream(IntStream $input) : void
    {
        // Validate before mutating anything, so that a rejected stream does
        // not leave the lexer with its previous input and state wiped.
        if (!$input instanceof CharStream) {
            throw new \RuntimeException('Input must be CharStream.');
        }

        // Detach the previous stream first: reset() only rewinds an attached
        // stream, so neither the old nor the new stream gets seeked here.
        $this->input = null;
        $this->tokenFactorySourcePair = new Pair($this, $this->input);

        $this->reset();

        $this->input = $input;
        $this->tokenFactorySourcePair = new Pair($this, $this->input);
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons. Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    public function emitToken(Token $token) : void
    {
        $this->token = $token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule. The token object should point into the
     * char buffer start..stop. If there is a text override in 'text',
     * use that to set the token's text. Override this method to emit
     * custom Token objects or provide a new factory.
     *
     * The stop index is the current char index minus one, i.e. the last
     * character consumed so far.
     *
     * @return Token The token that was created and emitted.
     *
     * @throws \RuntimeException When no input stream is attached.
     */
    public function emit() : Token
    {
        $token = $this->factory->createEx(
            $this->tokenFactorySourcePair,
            $this->type,
            $this->text,
            $this->channel,
            $this->tokenStartCharIndex,
            $this->getCharIndex() - 1,
            $this->tokenStartLine,
            $this->tokenStartCharPositionInLine
        );

        $this->emitToken($token);

        return $token;
    }

    /**
     * Creates and emits an EOF token on the default channel, positioned at
     * the current input index and the interpreter's current line and column.
     *
     * @return Token The EOF token that was emitted.
     *
     * @throws \RuntimeException When no input stream is attached, or when the
     *                           interpreter is missing or of an unexpected type.
     */
    public function emitEOF() : Token
    {
        if ($this->input === null) {
            throw new \RuntimeException('Cannot emit EOF for null stream.');
        }

        $cpos = $this->getCharPositionInLine();
        $lpos = $this->getLine();
        $eof = $this->factory->createEx(
            $this->tokenFactorySourcePair,
            Token::EOF,
            null,
            Token::DEFAULT_CHANNEL,
            $this->input->getIndex(),
            $this->input->getIndex() - 1, // stop precedes start: the token covers no characters
            $lpos,
            $cpos
        );

        $this->emitToken($eof);

        return $eof;
    }

    /**
     * @return int The line currently tracked by the interpreter.
     *
     * @throws \RuntimeException When the interpreter is missing or of an unexpected type.
     */
    public function getLine() : int
    {
        return $this->requireInterpreter()->getLine();
    }

    /**
     * @throws \RuntimeException When the interpreter is missing or of an unexpected type.
     */
    public function setLine(int $line) : void
    {
        $this->requireInterpreter()->setLine($line);
    }

    /**
     * @return int The position within the line currently tracked by the interpreter.
     *
     * @throws \RuntimeException When the interpreter is missing or of an unexpected type.
     */
    public function getCharPositionInLine() : int
    {
        return $this->requireInterpreter()->getCharPositionInLine();
    }

    /**
     * @throws \RuntimeException When the interpreter is missing or of an unexpected type.
     */
    public function setCharPositionInLine(int $charPositionInLine) : void
    {
        $this->requireInterpreter()->setCharPositionInLine($charPositionInLine);
    }

    /**
     * What is the index of the current character of lookahead?
     *
     * @throws \RuntimeException When no input stream is attached.
     */
    public function getCharIndex() : int
    {
        if ($this->input === null) {
            throw new \RuntimeException('Cannot know char index for null stream.');
        }

        return $this->input->getIndex();
    }

    /**
     * Return the text matched so far for the current token or any text override.
     *
     * An explicit override always wins and is returned without consulting
     * the interpreter. Without an input stream the result is an empty string.
     *
     * @throws \RuntimeException When there is no text override and the
     *                           interpreter is missing or of an unexpected type.
     */
    public function getText() : string
    {
        if ($this->text !== null) {
            return $this->text;
        }

        $interp = $this->requireInterpreter();

        return $this->input === null ? '' : $interp->getText($this->input);
    }

    /**
     * Set the complete text of this token; it wipes any previous changes to the text.
     */
    public function setText(string $text) : void
    {
        $this->text = $text;
    }

    public function getToken() : ?Token
    {
        return $this->token;
    }

    /**
     * Override if emitting multiple tokens.
     */
    public function setToken(Token $token) : void
    {
        $this->token = $token;
    }

    public function getType() : int
    {
        return $this->type;
    }

    public function setType(int $type) : void
    {
        $this->type = $type;
    }

    public function getChannel() : int
    {
        return $this->channel;
    }

    public function setChannel(int $channel) : void
    {
        $this->channel = $channel;
    }

    /**
     * This base implementation returns null.
     *
     * @return array<string>|null
     */
    public function getChannelNames() : ?array
    {
        return null;
    }

    /**
     * This base implementation returns null.
     *
     * @return array<string>|null
     */
    public function getModeNames() : ?array
    {
        return null;
    }

    /**
     * Return a list of all Token objects in input char stream.
     * Forces load of all tokens. Does not include EOF token.
     *
     * @return array<Token>
     */
    public function getAllTokens() : array
    {
        $tokens = [];
        $token = $this->nextToken();

        while ($token !== null && $token->getType() !== Token::EOF) {
            $tokens[] = $token;
            $token = $this->nextToken();
        }

        return $tokens;
    }

    /**
     * Lexers can normally match any char in its vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out. You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     *
     * Nothing is consumed when there is no input or the next symbol is EOF.
     */
    public function recover(RecognitionException $re) : void
    {
        if ($this->input !== null && $this->input->LA(1) !== Token::EOF) {
            if ($re instanceof LexerNoViableAltException && $this->interp !== null) {
                // skip a char and try again
                $this->interp->consume($this->input);
            } else {
                // TODO: Do we lose character or line position information?
                $this->input->consume();
            }
        }
    }

    /**
     * Reports a token recognition error to the error listeners.
     *
     * The reported text runs from the start of the current token to the
     * current input index (empty when no input is attached), and the
     * reported position is the token's start line and column.
     */
    public function notifyListeners(LexerNoViableAltException $e) : void
    {
        $start = $this->tokenStartCharIndex;

        if ($this->input === null) {
            $text = '';
        } else {
            $stop = $this->input->getIndex();
            $text = $this->input->getText($start, $stop);
        }

        $listener = $this->getErrorListenerDispatch();

        $listener->syntaxError(
            $this,
            null,
            $this->tokenStartLine,
            $this->tokenStartCharPositionInLine,
            \sprintf('token recognition error at: \'%s\'', $text),
            $e
        );
    }

    /**
     * Returns the interpreter after checking it is a lexer ATN simulator.
     *
     * `instanceof` is false for null, so this also covers the case where
     * the child class never populated the interpreter.
     *
     * @throws \RuntimeException When the interpreter is missing or of an unexpected type.
     */
    private function requireInterpreter() : LexerATNSimulator
    {
        if (!$this->interp instanceof LexerATNSimulator) {
            throw new \RuntimeException('Unexpected interpreter type.');
        }

        return $this->interp;
    }
}
546