<?php

/**
 * Hoa
 *
 *
 * @license
 *
 * New BSD License
 *
 * Copyright © 2007-2017, Hoa community. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the Hoa nor the names of its contributors may be
 *       used to endorse or promote products derived from this software without
 *       specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

namespace Hoa\Compiler\Llk;

use Hoa\Compiler;

/**
 * Class \Hoa\Compiler\Llk\Lexer.
 *
 * Lexical analyser: splits a string into a sequence of lexemes (tokens).
 *
 * Tokens are grouped by namespace. A lexeme declared as `name:target` moves
 * the lexer to the namespace `target` once matched; a target of the form
 * `__shift__` or `__shift__ * N` pops one or N namespaces from the namespace
 * stack instead.
 *
 * @copyright  Copyright © 2007-2017 Hoa community
 * @license    New BSD License
 */
class Lexer
{
    /**
     * Lexer state, i.e. the name of the namespace currently in use.
     *
     * @var string
     */
    protected $_lexerState  = null;

    /**
     * Text being tokenized.
     *
     * @var string
     */
    protected $_text        = null;

    /**
     * Tokens, as `namespace => [lexeme => [regex, next namespace or null]]`
     * once `lexMe` has normalized them.
     *
     * @var array
     */
    protected $_tokens      = [];

    /**
     * Namespace stack; only allocated when at least one token uses
     * `__shift__`, `null` otherwise.
     *
     * @var \SplStack
     */
    protected $_nsStack     = null;

    /**
     * PCRE options appended after the closing delimiter of every lexeme
     * pattern.
     *
     * @var string
     */
    protected $_pcreOptions = null;



    /**
     * Constructor.
     *
     * The `u` PCRE option is enabled unless the `lexer.unicode` pragma is
     * present and is not strictly `true`.
     *
     * @param   array  $pragmas    Pragmas.
     */
    public function __construct(array $pragmas = [])
    {
        if (!isset($pragmas['lexer.unicode']) ||
            true === $pragmas['lexer.unicode']) {
            $this->_pcreOptions .= 'u';
        }
    }

    /**
     * Text tokenizer: splits the text in parameter in an ordered sequence of
     * tokens.
     *
     * Every yielded token is an array with the keys `token`, `value`,
     * `length`, `namespace`, `keep` and `offset`. Tokens named `skip` are
     * consumed but not yielded. A final `EOF` token is always yielded.
     *
     * @param   string  $text      Text to tokenize.
     * @param   array   $tokens    Tokens, as `namespace => [lexeme => regex]`.
     * @return  \Generator
     * @throws  \Hoa\Compiler\Exception\UnrecognizedToken
     * @throws  \Hoa\Compiler\Exception\Lexer
     */
    public function lexMe($text, array $tokens)
    {
        $this->_text       = $text;
        $this->_lexerState = 'default';
        $offset            = 0;
        $maxOffset         = strlen($this->_text);
        $needsStack        = false;
        $normalized        = [];

        // Build a fresh map instead of rewriting `$this->_tokens` through a
        // by-reference foreach: no dangling reference is left behind and the
        // `$tokens` parameter is not clobbered.
        foreach ($tokens as $namespace => $lexemes) {
            $bucket = [];

            foreach ($lexemes as $fullLexeme => $regex) {
                if (false === strpos($fullLexeme, ':')) {
                    // No target namespace: the lexer stays where it is.
                    $bucket[$fullLexeme] = [$regex, null];

                    continue;
                }

                list($lexeme, $nextNamespace) = explode(':', $fullLexeme, 2);

                if ('__shift__' === substr($nextNamespace, 0, 9)) {
                    $needsStack = true;
                }

                $bucket[$lexeme] = [$regex, $nextNamespace];
            }

            $normalized[$namespace] = $bucket;
        }

        $this->_tokens  = $normalized;
        $this->_nsStack = true === $needsStack ? new \SplStack() : null;

        while ($offset < $maxOffset) {
            $nextToken = $this->nextToken($offset);

            if (null === $nextToken) {
                throw new Compiler\Exception\UnrecognizedToken(
                    'Unrecognized token "%s" at line 1 and column %d:' .
                    "\n" . '%s' . "\n" .
                    str_repeat(' ', mb_strlen(substr($text, 0, $offset))) . '↑',
                    0,
                    [
                        mb_substr(substr($text, $offset), 0, 1),
                        $offset + 1,
                        $text
                    ],
                    1,
                    $offset
                );
            }

            if (true === $nextToken['keep']) {
                $nextToken['offset'] = $offset;

                yield $nextToken;
            }

            // Offsets are expressed in bytes because `preg_match` expects a
            // byte offset.
            $offset += strlen($nextToken['value']);
        }

        yield [
            'token'     => 'EOF',
            'value'     => 'EOF',
            'length'    => 0,
            'namespace' => 'default',
            'keep'      => true,
            'offset'    => $offset
        ];
    }

    /**
     * Compute the next token recognized at the given offset, and move the
     * lexer to the namespace requested by that token, if any.
     *
     * @param   int  $offset    Offset, in bytes.
     * @return  array           The token, or `null` if no lexeme of the
     *                          current namespace matches.
     * @throws  \Hoa\Compiler\Exception\Lexer
     */
    protected function nextToken($offset)
    {
        // Unknown namespace: nothing can match. Reading through `isset`
        // avoids creating a `null` entry in `$this->_tokens` as a side effect.
        if (!isset($this->_tokens[$this->_lexerState])) {
            return null;
        }

        foreach ($this->_tokens[$this->_lexerState] as $lexeme => $bucket) {
            list($regex, $nextState) = $bucket;

            if (null === $nextState) {
                $nextState = $this->_lexerState;
            }

            $out = $this->matchLexeme($lexeme, $regex, $offset);

            if (null === $out) {
                continue;
            }

            // The token belongs to the namespace in which it was matched,
            // not to the one it switches to.
            $out['namespace'] = $this->_lexerState;
            $out['keep']      = 'skip' !== $lexeme;

            if ($nextState === $this->_lexerState) {
                return $out;
            }

            $shift = false;

            if (null !== $this->_nsStack &&
                0 !== preg_match('#^__shift__(?:\s*\*\s*(\d+))?$#', $nextState, $matches)) {
                // `__shift__` pops once, `__shift__ * N` pops N times.
                $i = isset($matches[1]) ? intval($matches[1]) : 1;

                if ($i > ($c = count($this->_nsStack))) {
                    throw new Compiler\Exception\Lexer(
                        'Cannot shift namespace %d-times, from token ' .
                        '%s in namespace %s, because the stack ' .
                        'contains only %d namespaces.',
                        1,
                        [
                            $i,
                            $lexeme,
                            $this->_lexerState,
                            $c
                        ]
                    );
                }

                while (1 <= $i--) {
                    $previousNamespace = $this->_nsStack->pop();
                }

                $nextState = $previousNamespace;
                $shift     = true;
            }

            if (!isset($this->_tokens[$nextState])) {
                throw new Compiler\Exception\Lexer(
                    'Namespace %s does not exist, called by token %s ' .
                    'in namespace %s.',
                    2,
                    [
                        $nextState,
                        $lexeme,
                        $this->_lexerState
                    ]
                );
            }

            // Entering a namespace (as opposed to shifting back) records
            // where we come from, so that a later `__shift__` can return.
            if (null !== $this->_nsStack && false === $shift) {
                $this->_nsStack[] = $this->_lexerState;
            }

            $this->_lexerState = $nextState;

            return $out;
        }

        return null;
    }

    /**
     * Check if a given lexeme is matched at the given offset of the text.
     *
     * @param   string  $lexeme    Name of the lexeme.
     * @param   string  $regex     Regular expression describing the lexeme.
     * @param   int     $offset    Offset, in bytes.
     * @return  array              The `token`, `value` and `length` of the
     *                             match, or `null` if the lexeme does not
     *                             match.
     * @throws  \Hoa\Compiler\Exception\Lexer
     */
    protected function matchLexeme($lexeme, $regex, $offset)
    {
        // `#` is the pattern delimiter, so it is escaped inside the lexeme.
        // NOTE(review): a `#` that is already escaped in `$regex` becomes
        // `\\#`, i.e. an escaped backslash followed by a bare delimiter.
        $_regex = str_replace('#', '\#', $regex);
        $preg   = preg_match(
            '#\G(?|' . $_regex . ')#' . $this->_pcreOptions,
            $this->_text,
            $matches,
            0,
            $offset
        );

        // `preg_match` returns `false` on failure (for instance a subject
        // that is not valid UTF-8 while the `u` option is set, or a pattern
        // that does not compile). Treating that as a match would produce a
        // token without value, and the caller would never advance.
        if (false === $preg) {
            throw new Compiler\Exception\Lexer(
                'PCRE failed while matching the lexeme "%s" (%s) at ' .
                'offset %d (preg_last_error: %d).',
                4,
                [$lexeme, $regex, $offset, preg_last_error()]
            );
        }

        if (0 === $preg) {
            return null;
        }

        if ('' === $matches[0]) {
            throw new Compiler\Exception\Lexer(
                'A lexeme must not match an empty value, which is the ' .
                'case of "%s" (%s).',
                3,
                [$lexeme, $regex]
            );
        }

        return [
            'token'  => $lexeme,
            'value'  => $matches[0],
            'length' => mb_strlen($matches[0])
        ];
    }
}