<?php
/*
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * This software consists of voluntary contributions made by many individuals
 * and is licensed under the MIT license. For more information, see
 * <http://www.doctrine-project.org>.
 */

namespace Doctrine\Common\Lexer;

/**
 * Base class for writing simple lexers, i.e. for creating small DSLs.
 *
 * A concrete lexer supplies the regular-expression fragments to split on
 * (getCatchablePatterns() / getNonCatchablePatterns()) and the classification
 * of each matched fragment (getType()). The whole input is tokenized eagerly
 * by setInput(); moveNext(), peek() and glimpse() then walk the token list.
 *
 * @since  2.0
 * @author Guilherme Blanco <guilhermeblanco@hotmail.com>
 * @author Jonathan Wage <jonwage@gmail.com>
 * @author Roman Borschel <roman@code-factory.org>
 */
abstract class AbstractLexer
{
    /**
     * Lexer original input string.
     *
     * @var string
     */
    private $input;

    /**
     * Array of scanned tokens.
     *
     * Each token is an associative array containing three items:
     *  - 'value'    : the string value of the token in the input string
     *  - 'type'     : the type of the token (identifier, numeric, string, input
     *                 parameter, none)
     *  - 'position' : the position of the token in the input string
     *
     * @var array
     */
    private $tokens = array();

    /**
     * Current lexer position in input string.
     *
     * @var integer
     */
    private $position = 0;

    /**
     * Current peek of current lexer position.
     *
     * @var integer
     */
    private $peek = 0;

    /**
     * The next token in the input.
     *
     * @var array
     */
    public $lookahead;

    /**
     * The last matched/seen token.
     *
     * @var array
     */
    public $token;

    /**
     * Compiled split pattern, built lazily on the first scan.
     *
     * Kept per instance on purpose: a function-level static would be shared
     * between instances (and, since PHP 8.1, between subclasses inheriting
     * scan()), so a lexer could end up tokenizing with another lexer's patterns.
     *
     * @var string|null
     */
    private $regex;

    /**
     * Sets the input data to be tokenized.
     *
     * The Lexer is immediately reset and the new input tokenized.
     * Any unprocessed tokens from any previous input are lost.
     *
     * @param string $input The input to be tokenized.
     *
     * @return void
     */
    public function setInput($input)
    {
        $this->input  = $input;
        $this->tokens = array();

        $this->reset();
        $this->scan($input);
    }

    /**
     * Resets the lexer.
     *
     * Clears the lookahead and current token and rewinds both the position and
     * the peek pointer. The scanned token list itself is left untouched.
     *
     * @return void
     */
    public function reset()
    {
        $this->lookahead = null;
        $this->token     = null;
        $this->peek      = 0;
        $this->position  = 0;
    }

    /**
     * Resets the peek pointer to 0.
     *
     * @return void
     */
    public function resetPeek()
    {
        $this->peek = 0;
    }

    /**
     * Resets the lexer position on the input to the given position.
     *
     * @param integer $position Position to place the lexical scanner.
     *
     * @return void
     */
    public function resetPosition($position = 0)
    {
        $this->position = $position;
    }

    /**
     * Retrieve the original lexer's input until a given position.
     *
     * @param integer $position
     *
     * @return string
     */
    public function getInputUntilPosition($position)
    {
        return substr($this->input, 0, $position);
    }

    /**
     * Checks whether a given token matches the current lookahead.
     *
     * @param integer|string $token The token type to compare against.
     *
     * @return boolean
     */
    public function isNextToken($token)
    {
        return null !== $this->lookahead && $this->lookahead['type'] === $token;
    }

    /**
     * Checks whether any of the given tokens matches the current lookahead.
     *
     * @param array $tokens List of token types; compared strictly.
     *
     * @return boolean
     */
    public function isNextTokenAny(array $tokens)
    {
        return null !== $this->lookahead && in_array($this->lookahead['type'], $tokens, true);
    }

    /**
     * Moves to the next token in the input string.
     *
     * The previous lookahead becomes the current token, the peek pointer is
     * rewound and the lookahead is refilled from the token list.
     *
     * @return boolean TRUE if a lookahead token is available, FALSE at end of input.
     */
    public function moveNext()
    {
        $this->peek      = 0;
        $this->token     = $this->lookahead;
        $this->lookahead = (isset($this->tokens[$this->position]))
            ? $this->tokens[$this->position++]
            : null;

        return $this->lookahead !== null;
    }

    /**
     * Tells the lexer to skip input tokens until it sees a token with the given type.
     *
     * Stops with the matching token as lookahead, or with a NULL lookahead when
     * the input is exhausted first.
     *
     * @param integer|string $type The token type to skip until.
     *
     * @return void
     */
    public function skipUntil($type)
    {
        while ($this->lookahead !== null && $this->lookahead['type'] !== $type) {
            $this->moveNext();
        }
    }

    /**
     * Checks whether the type computed for the given value is identical to the given token type.
     *
     * @param mixed   $value
     * @param integer $token
     *
     * @return boolean
     */
    public function isA($value, $token)
    {
        return $this->getType($value) === $token;
    }

    /**
     * Returns the token at the current peek offset and advances the peek pointer.
     *
     * Neither the lexer position nor the lookahead is changed. The peek pointer
     * is only advanced when a token is actually returned.
     *
     * @return array|null The next token or NULL if there are no more tokens ahead.
     */
    public function peek()
    {
        if (isset($this->tokens[$this->position + $this->peek])) {
            return $this->tokens[$this->position + $this->peek++];
        }

        return null;
    }

    /**
     * Peeks at the next token, returns it and immediately resets the peek.
     *
     * @return array|null The next token or NULL if there are no more tokens ahead.
     */
    public function glimpse()
    {
        $peek       = $this->peek();
        $this->peek = 0;

        return $peek;
    }

    /**
     * Scans the input string for tokens.
     *
     * The input is split on the combined catchable / non-catchable patterns;
     * every non-empty fragment is classified through getType() and appended to
     * the token list together with its offset in the input.
     *
     * @param string $input A query string.
     *
     * @return void
     */
    protected function scan($input)
    {
        if (null === $this->regex) {
            $this->regex = sprintf(
                '/(%s)|%s/%s',
                implode(')|(', $this->getCatchablePatterns()),
                implode('|', $this->getNonCatchablePatterns()),
                $this->getModifiers()
            );
        }

        $flags   = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE;
        $matches = preg_split($this->regex, $input, -1, $flags);

        if (false === $matches) {
            // preg_split() failed (PCRE error). Iterating FALSE would raise a
            // warning and silently yield no tokens, so hand the whole input to
            // getType() as a single token instead.
            $matches = array(array($input, 0));
        }

        foreach ($matches as $match) {
            // Must remain before 'value' assignment since it can change content
            $type = $this->getType($match[0]);

            $this->tokens[] = array(
                'value'    => $match[0],
                'type'     => $type,
                'position' => $match[1],
            );
        }
    }

    /**
     * Gets the literal for a given token.
     *
     * Looks for a class constant of the concrete lexer whose value is identical
     * to the token and returns it as "ClassName::CONSTANT_NAME".
     *
     * @param integer|string $token
     *
     * @return integer|string The qualified constant name, or the token itself
     *                        when no constant matches.
     */
    public function getLiteral($token)
    {
        $className = get_class($this);
        $reflClass = new \ReflectionClass($className);
        $constants = $reflClass->getConstants();

        foreach ($constants as $name => $value) {
            if ($value === $token) {
                return $className . '::' . $name;
            }
        }

        return $token;
    }

    /**
     * Regex modifiers
     *
     * @return string
     */
    protected function getModifiers()
    {
        return 'i';
    }

    /**
     * Lexical catchable patterns.
     *
     * Each pattern is wrapped in its own capturing group, so the matched text
     * is emitted as a token.
     *
     * @return array
     */
    abstract protected function getCatchablePatterns();

    /**
     * Lexical non-catchable patterns.
     *
     * These patterns separate tokens but are not captured, so their matched
     * text is dropped.
     *
     * @return array
     */
    abstract protected function getNonCatchablePatterns();

    /**
     * Retrieve token type. Also processes the token value if necessary.
     *
     * @param string $value Passed by reference so the implementation may rewrite
     *                      the value that ends up stored in the token.
     *
     * @return integer
     */
    abstract protected function getType(&$value);
}