1<?php
2
3/**
4 * Hoa
5 *
6 *
7 * @license
8 *
9 * New BSD License
10 *
11 * Copyright © 2007-2017, Hoa community. All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions are met:
15 *     * Redistributions of source code must retain the above copyright
16 *       notice, this list of conditions and the following disclaimer.
17 *     * Redistributions in binary form must reproduce the above copyright
18 *       notice, this list of conditions and the following disclaimer in the
19 *       documentation and/or other materials provided with the distribution.
20 *     * Neither the name of the Hoa nor the names of its contributors may be
21 *       used to endorse or promote products derived from this software without
22 *       specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
28 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
35 */
36
37namespace Hoa\Compiler\Llk;
38
39use Hoa\Compiler;
40
/**
 * Class \Hoa\Compiler\Llk\Lexer.
 *
 * Lexical analyser, i.e. splits a string into a sequence of lexemes, i.e.
 * tokens. Lexemes are grouped by namespaces; recognizing a lexeme may move
 * the lexer to another namespace.
 *
 * @copyright  Copyright © 2007-2017 Hoa community
 * @license    New BSD License
 */
class Lexer
{
    /**
     * Lexer state, i.e. the name of the current namespace.
     *
     * @var string
     */
    protected $_lexerState  = null;

    /**
     * Text being tokenized.
     *
     * @var string
     */
    protected $_text        = null;

    /**
     * Tokens, normalized by `lexMe`: for each namespace, a map from lexeme
     * name to a pair `[regex, next namespace or null]`.
     *
     * @var array
     */
    protected $_tokens      = [];

    /**
     * Namespace stack. Only allocated when at least one lexeme targets
     * `__shift__`; `null` otherwise.
     *
     * @var \SplStack
     */
    protected $_nsStack     = null;

    /**
     * PCRE options, appended after the closing delimiter of every regex.
     *
     * @var string
     */
    protected $_pcreOptions = null;



    /**
     * Constructor.
     *
     * The `lexer.unicode` pragma enables the `u` PCRE option. It is enabled
     * when the pragma is absent or strictly equal to `true`; any other value
     * disables it.
     *
     * @param   array  $pragmas    Pragmas.
     */
    public function __construct(array $pragmas = [])
    {
        if (!isset($pragmas['lexer.unicode']) || true === $pragmas['lexer.unicode']) {
            $this->_pcreOptions .= 'u';
        }
    }

    /**
     * Text tokenizer: splits the text in parameter in an ordered sequence of
     * tokens.
     *
     * `$tokens` maps a namespace name to a map `full lexeme => regex`. A full
     * lexeme is either `name` (stay in the current namespace) or
     * `name:namespace` (move to `namespace` once matched, where `namespace`
     * can be `__shift__` or `__shift__ * n` to go back to a previous
     * namespace). Lexing starts in the `default` namespace. Lexemes named
     * `skip` are consumed but not yielded.
     *
     * Each yielded token is an array with the keys `token`, `value`,
     * `length` (in characters), `namespace`, `keep` and `offset` (in bytes).
     * The last yielded token is always `EOF`.
     *
     * This method is a generator: nothing runs before the first iteration,
     * and the state is stored on the instance while iterating.
     *
     * @param   string  $text      Text to tokenize.
     * @param   array   $tokens    Tokens description, by namespace.
     * @return  \Generator
     * @throws  \Hoa\Compiler\Exception\UnrecognizedToken
     * @throws  \Hoa\Compiler\Exception\Lexer
     */
    public function lexMe($text, array $tokens)
    {
        $this->_text       = $text;
        $this->_nsStack    = null;
        $this->_lexerState = 'default';
        $offset            = 0;
        $maxOffset         = strlen($this->_text);
        $stack             = false;
        $normalized        = [];

        // Split every `name:namespace` key into its two parts. The table is
        // rebuilt by value, so no reference to the property outlives the
        // loop.
        foreach ($tokens as $namespaceName => $lexemes) {
            $_tokens = [];

            foreach ($lexemes as $fullLexeme => $regex) {
                if (false === strpos($fullLexeme, ':')) {
                    $_tokens[$fullLexeme] = [$regex, null];

                    continue;
                }

                list($lexeme, $namespace) = explode(':', $fullLexeme, 2);

                // A stack is only needed if some lexeme shifts namespaces.
                if ('__shift__' === substr($namespace, 0, 9)) {
                    $stack = true;
                }

                $_tokens[$lexeme] = [$regex, $namespace];
            }

            $normalized[$namespaceName] = $_tokens;
        }

        $this->_tokens = $normalized;

        if (true === $stack) {
            $this->_nsStack = new \SplStack();
        }

        while ($offset < $maxOffset) {
            $nextToken = $this->nextToken($offset);

            if (null === $nextToken) {
                // The reported column is the byte offset plus one, while the
                // arrow is positioned with a character count.
                throw new Compiler\Exception\UnrecognizedToken(
                    'Unrecognized token "%s" at line 1 and column %d:' .
                    "\n" . '%s' . "\n" .
                    str_repeat(' ', mb_strlen(substr($text, 0, $offset))) . '↑',
                    0,
                    [
                        mb_substr(substr($text, $offset), 0, 1),
                        $offset + 1,
                        $text
                    ],
                    1,
                    $offset
                );
            }

            if (true === $nextToken['keep']) {
                $nextToken['offset'] = $offset;
                yield $nextToken;
            }

            // Offsets are expressed in bytes because `preg_match` expects a
            // byte offset.
            $offset += strlen($nextToken['value']);
        }

        yield [
            'token'     => 'EOF',
            'value'     => 'EOF',
            'length'    => 0,
            'namespace' => 'default',
            'keep'      => true,
            'offset'    => $offset
        ];
    }

    /**
     * Compute the next token recognized at the given offset of the text.
     *
     * Lexemes of the current namespace are tried in declaration order; the
     * first one that matches wins. If this lexeme targets another namespace,
     * the lexer state is updated before returning.
     *
     * @param   int  $offset    Offset, in bytes.
     * @return  array           The token, or `null` if no lexeme matches.
     * @throws  \Hoa\Compiler\Exception\Lexer
     */
    protected function nextToken($offset)
    {
        // Read by value: taking a reference here would create an empty entry
        // for an unknown namespace. An unknown namespace simply has no
        // lexeme, so `null` is returned.
        $tokenArray = isset($this->_tokens[$this->_lexerState])
            ? $this->_tokens[$this->_lexerState]
            : [];

        foreach ($tokenArray as $lexeme => $bucket) {
            list($regex, $nextState) = $bucket;

            if (null === $nextState) {
                $nextState = $this->_lexerState;
            }

            $out = $this->matchLexeme($lexeme, $regex, $offset);

            if (null === $out) {
                continue;
            }

            // The token belongs to the namespace it has been matched in, not
            // to the one it leads to.
            $out['namespace'] = $this->_lexerState;
            $out['keep']      = 'skip' !== $lexeme;

            if ($nextState !== $this->_lexerState) {
                $shift = false;

                if (null !== $this->_nsStack &&
                    0 !== preg_match('#^__shift__(?:\s*\*\s*(\d+))?$#', $nextState, $matches)) {
                    $i = isset($matches[1]) ? intval($matches[1]) : 1;

                    if ($i > ($c = count($this->_nsStack))) {
                        throw new Compiler\Exception\Lexer(
                            'Cannot shift namespace %d-times, from token ' .
                            '%s in namespace %s, because the stack ' .
                            'contains only %d namespaces.',
                            1,
                            [
                                $i,
                                $lexeme,
                                $this->_lexerState,
                                $c
                            ]
                        );
                    }

                    // Shifting zero times keeps the current namespace.
                    $previousNamespace = $this->_lexerState;

                    for (; $i > 0; --$i) {
                        $previousNamespace = $this->_nsStack->pop();
                    }

                    $nextState = $previousNamespace;
                    $shift     = true;
                }

                if (!isset($this->_tokens[$nextState])) {
                    throw new Compiler\Exception\Lexer(
                        'Namespace %s does not exist, called by token %s ' .
                        'in namespace %s.',
                        2,
                        [
                            $nextState,
                            $lexeme,
                            $this->_lexerState
                        ]
                    );
                }

                // Remember where we come from, except when going back.
                if (null !== $this->_nsStack && false === $shift) {
                    $this->_nsStack->push($this->_lexerState);
                }

                $this->_lexerState = $nextState;
            }

            return $out;
        }

        return null;
    }

    /**
     * Check if a given lexeme is matched at the given offset of the text.
     *
     * @param   string  $lexeme    Name of the lexeme.
     * @param   string  $regex     Regular expression describing the lexeme.
     * @param   int     $offset    Offset, in bytes.
     * @return  array              The token (keys `token`, `value` and
     *                             `length`), or `null` if the lexeme does not
     *                             match.
     * @throws  \Hoa\Compiler\Exception\Lexer
     */
    protected function matchLexeme($lexeme, $regex, $offset)
    {
        // `#` is the delimiter, so it is escaped in the lexeme regex. `\G`
        // anchors the match at `$offset`, and `(?|…)` isolates the
        // alternatives of the lexeme regex.
        $_regex = str_replace('#', '\#', $regex);
        $preg   = preg_match(
            '#\G(?|' . $_regex . ')#' . $this->_pcreOptions,
            $this->_text,
            $matches,
            0,
            $offset
        );

        // `false` means that PCRE itself failed, e.g. the text is not valid
        // UTF-8 while the `u` option is set, or the regex does not compile.
        // Without this check, a token with no value would be returned and
        // the caller would never move forward.
        if (false === $preg) {
            throw new Compiler\Exception\Lexer(
                'PCRE failed with the error code %d while matching the ' .
                'lexeme "%s" (%s).',
                4,
                [preg_last_error(), $lexeme, $regex]
            );
        }

        if (0 === $preg) {
            return null;
        }

        if ('' === $matches[0]) {
            throw new Compiler\Exception\Lexer(
                'A lexeme must not match an empty value, which is the ' .
                'case of "%s" (%s).',
                3,
                [$lexeme, $regex]
            );
        }

        return [
            'token'  => $lexeme,
            'value'  => $matches[0],
            'length' => mb_strlen($matches[0])
        ];
    }
}
304