<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

// FIXME move elsewhere

define("DOKU_LEXER_ENTER", 1);
define("DOKU_LEXER_MATCHED", 2);
define("DOKU_LEXER_UNMATCHED", 3);
define("DOKU_LEXER_EXIT", 4);
define("DOKU_LEXER_SPECIAL", 5);

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to mark special (single token) modes
 * and the "__exit" action.
 */
class Lexer
{
    /** @var ParallelRegex[] one compound regex per mode, keyed by mode name */
    protected $regexes;
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler Handling strategy by reference.
     * @param string $start Starting handler.
     * @param boolean $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexFor($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore is what isSpecialMode() / decodeSpecial() look for
        $this->regexFor($mode)->addPattern($pattern, '_' . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Fails if no handler is set, if a handler reports an error, if a match consumes no
     * content, or if the current mode has no patterns registered. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw page text to tokenise.
     * @return boolean True on success, else false.
     */
    public function parse($raw)
    {
        if (!isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the matched chunk (and anything before it) from the front of $raw
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            // byte offset at which the matched token started in the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (!$this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            if ($currentLength === $length) {
                // nothing was consumed; bail out instead of looping forever
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        if (!$parsed) {
            // reduce() returned false: no patterns exist for the current mode
            return false;
        }
        // whatever is left over did not match anything
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A boolean false mode causes no change.
     * @param int $initialPos Byte index in the raw doc at which the unmatched portion starts
     * @param int $matchPos Byte index in the raw doc at which the matched token starts
     * @return boolean False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (!$this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // must be tested before isSpecialMode(): "__exit" also starts with an underscore
        if ($this->isModeEnd($mode)) {
            if (!$this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this one token only, then leave again
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (!$this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * Only strings can be special modes; any other value yields false without
     * being handed to strncmp().
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if the mode name starts with the special-mode underscore.
     */
    protected function isSpecialMode($mode)
    {
        return is_string($mode) && strncmp($mode, "_", 1) === 0;
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match Lexer state for this token; callers in this class pass one of
     *                      the DOKU_LEXER_* constants.
     * @param int $pos Current byte index location in raw doc that is being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) === 'plugin_') {
            // "plugin_foo_bar" => method "plugin", extra argument "foo_bar"
            list($handler, $plugin) = explode('_', $handler, 2);
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will be eaten.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (!isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            // $raw is passed by reference: the caller continues with the remainder only
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Each of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a single backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // one pass over a character class; '\\\\$0' is a literal backslash plus the matched char
        return preg_replace('/[\\\\.+*?\[^\]${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the compound regex for a mode, creating it on first use.
     *
     * @param string $mode Mode the regex belongs to.
     * @return ParallelRegex
     */
    private function regexFor($mode)
    {
        if (!isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}