1<?php
2/*
3 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
4 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
5 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
6 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
7 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
8 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
9 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
10 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
11 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
12 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
13 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
14 *
15 * This software consists of voluntary contributions made by many individuals
16 * and is licensed under the MIT license. For more information, see
17 * <http://www.doctrine-project.org>.
18 */
19
20namespace Doctrine\Common\Lexer;
21
22/**
23 * Base class for writing simple lexers, i.e. for creating small DSLs.
24 *
25 * @since  2.0
26 * @author Guilherme Blanco <guilhermeblanco@hotmail.com>
27 * @author Jonathan Wage <jonwage@gmail.com>
28 * @author Roman Borschel <roman@code-factory.org>
29 */
abstract class AbstractLexer
{
    /**
     * Lexer original input string.
     *
     * @var string
     */
    private $input;

    /**
     * Array of scanned tokens.
     *
     * Each token is an associative array containing three items:
     *  - 'value'    : the string value of the token in the input string
     *  - 'type'     : the type of the token, as returned by getType()
     *  - 'position' : the byte offset of the token in the input string
     *
     * @var array
     */
    private $tokens = array();

    /**
     * Index into $tokens of the token that the next moveNext() call will load
     * into $lookahead.
     *
     * @var integer
     */
    private $position = 0;

    /**
     * Offset, relative to $position, of the token the next peek() call returns.
     *
     * @var integer
     */
    private $peek = 0;

    /**
     * The next token in the input.
     *
     * @var array|null
     */
    public $lookahead;

    /**
     * The last matched/seen token.
     *
     * @var array|null
     */
    public $token;

    /**
     * Composed regular expression used by scan() to split the input.
     *
     * Cached per instance: a function-level static variable would be shared
     * between lexer instances (and, since PHP 8.1, between subclasses that
     * inherit scan()), so lexers with different patterns would reuse the
     * wrong expression.
     *
     * @var string|null
     */
    private $regex;

    /**
     * Sets the input data to be tokenized.
     *
     * The Lexer is immediately reset and the new input tokenized.
     * Any unprocessed tokens from any previous input are lost.
     *
     * @param string $input The input to be tokenized.
     *
     * @return void
     */
    public function setInput($input)
    {
        $this->input  = $input;
        $this->tokens = array();

        $this->reset();
        $this->scan($input);
    }

    /**
     * Resets the lexer.
     *
     * Clears the current and lookahead tokens and rewinds both the position
     * and the peek pointer. The scanned tokens themselves are kept.
     *
     * @return void
     */
    public function reset()
    {
        $this->lookahead = null;
        $this->token     = null;
        $this->peek      = 0;
        $this->position  = 0;
    }

    /**
     * Resets the peek pointer to 0.
     *
     * @return void
     */
    public function resetPeek()
    {
        $this->peek = 0;
    }

    /**
     * Resets the lexer position on the input to the given position.
     *
     * @param integer $position Position to place the lexical scanner.
     *
     * @return void
     */
    public function resetPosition($position = 0)
    {
        $this->position = $position;
    }

    /**
     * Retrieve the original lexer's input until a given position.
     *
     * @param integer $position Number of leading bytes of the input to return.
     *
     * @return string
     */
    public function getInputUntilPosition($position)
    {
        return substr($this->input, 0, $position);
    }

    /**
     * Checks whether a given token matches the current lookahead.
     *
     * @param integer|string $token The token type to compare against.
     *
     * @return boolean
     */
    public function isNextToken($token)
    {
        return null !== $this->lookahead && $this->lookahead['type'] === $token;
    }

    /**
     * Checks whether any of the given tokens matches the current lookahead.
     *
     * @param array $tokens The token types to compare against.
     *
     * @return boolean
     */
    public function isNextTokenAny(array $tokens)
    {
        return null !== $this->lookahead && in_array($this->lookahead['type'], $tokens, true);
    }

    /**
     * Moves to the next token in the input string.
     *
     * The previous lookahead becomes the current token and the peek pointer
     * is rewound.
     *
     * @return boolean TRUE if a lookahead token is available, FALSE once the
     *                 end of the token list has been reached.
     */
    public function moveNext()
    {
        $this->peek  = 0;
        $this->token = $this->lookahead;

        if (isset($this->tokens[$this->position])) {
            $this->lookahead = $this->tokens[$this->position];
            $this->position++;
        } else {
            $this->lookahead = null;
        }

        return $this->lookahead !== null;
    }

    /**
     * Tells the lexer to skip input tokens until it sees a token with the given type.
     *
     * Stops early when the end of the token list is reached.
     *
     * @param integer|string $type The token type to skip until.
     *
     * @return void
     */
    public function skipUntil($type)
    {
        while ($this->lookahead !== null && $this->lookahead['type'] !== $type) {
            $this->moveNext();
        }
    }

    /**
     * Checks if the type of the given value is identical to the given token.
     *
     * @param mixed          $value The raw value whose type is resolved through getType().
     * @param integer|string $token The token type to compare against.
     *
     * @return boolean
     */
    public function isA($value, $token)
    {
        return $this->getType($value) === $token;
    }

    /**
     * Moves the lookahead token forward.
     *
     * Each call returns the token one step further ahead of the current
     * position; the lexer position itself is not changed.
     *
     * @return array|null The next token or NULL if there are no more tokens ahead.
     */
    public function peek()
    {
        $index = $this->position + $this->peek;

        if ( ! isset($this->tokens[$index])) {
            return null;
        }

        $this->peek++;

        return $this->tokens[$index];
    }

    /**
     * Peeks at the next token, returns it and immediately resets the peek.
     *
     * @return array|null The next token or NULL if there are no more tokens ahead.
     */
    public function glimpse()
    {
        $peek = $this->peek();
        $this->peek = 0;

        return $peek;
    }

    /**
     * Scans the input string for tokens.
     *
     * The catchable patterns become capture groups (kept as tokens), the
     * non-catchable patterns are plain alternatives (dropped), and any input
     * between matches is kept as a token as well.
     *
     * @param string $input A query string.
     *
     * @return void
     */
    protected function scan($input)
    {
        // Build the expression once per lexer instance (see $regex).
        if ( ! isset($this->regex)) {
            $this->regex = sprintf(
                '/(%s)|%s/%s',
                implode(')|(', $this->getCatchablePatterns()),
                implode('|', $this->getNonCatchablePatterns()),
                $this->getModifiers()
            );
        }

        $flags   = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE;
        $matches = preg_split($this->regex, $input, -1, $flags);

        if (false === $matches) {
            // preg_split() failed (e.g. PCRE limit hit or invalid pattern):
            // keep the whole input as a single token at offset 0 instead of
            // iterating over FALSE and silently dropping the input.
            $matches = array(array($input, 0));
        }

        foreach ($matches as $match) {
            // Must remain before 'value' assignment since it can change content
            $type = $this->getType($match[0]);

            $this->tokens[] = array(
                'value'    => $match[0],
                'type'     => $type,
                'position' => $match[1],
            );
        }
    }

    /**
     * Gets the literal for a given token.
     *
     * Looks the token up among the constants of the concrete lexer class.
     *
     * @param integer|string $token
     *
     * @return integer|string "ClassName::CONSTANT" when a constant with an identical
     *                        value exists, otherwise the token unchanged.
     */
    public function getLiteral($token)
    {
        $className = get_class($this);
        $reflClass = new \ReflectionClass($className);

        foreach ($reflClass->getConstants() as $name => $value) {
            if ($value === $token) {
                return $className . '::' . $name;
            }
        }

        return $token;
    }

    /**
     * Regex modifiers
     *
     * @return string
     */
    protected function getModifiers()
    {
        return 'i';
    }

    /**
     * Lexical catchable patterns.
     *
     * @return array
     */
    abstract protected function getCatchablePatterns();

    /**
     * Lexical non-catchable patterns.
     *
     * @return array
     */
    abstract protected function getNonCatchablePatterns();

    /**
     * Retrieve token type. Also processes the token value if necessary.
     *
     * @param string $value Passed by reference so implementations can rewrite it.
     *
     * @return integer|string
     */
    abstract protected function getType(&$value);
}
328