<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores: a leading
 * underscore is used internally to mark special (single token) modes and the
 * exit action.
 */
class Lexer
{
    /** @var ParallelRegex[] pattern collections, indexed by mode name */
    protected $regexes = [];
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites" (mode name => handler method name) */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy by reference.
     * @param string $start           Starting handler.
     * @param boolean $case           True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern  Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode     Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern   Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode      Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode  Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern  Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode     Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the reserved label recognised by isModeEnd()
        $this->regexFor($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern  Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode     Should only apply this pattern when dealing with this type of input.
     * @param string $special  Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore is what isSpecialMode() looks for
        $this->regexFor($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode     Mode to be remapped.
     * @param string $handler  New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw  Raw text to tokenise.
     * @return boolean     True on success, else false.
     */
    public function parse($raw)
    {
        if (!isset($this->handler)) {
            return false;
        }

        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;

        // reduce() eats the front of $raw (by reference) on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);

            // byte offset of the matched token inside the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (!$this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }

            // nothing was consumed: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }

            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }

        // reduce() returned false: there are no patterns for the current mode
        if (!$parsed) {
            return false;
        }

        // whatever is left over did not match anything
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched  Unmatched leading portion.
     * @param string $matched    Actual token match.
     * @param bool|string $mode  Label of the matched pattern. "__exit" leaves the current mode,
     *                           a label with a leading underscore is a one-token special mode,
     *                           any other string is entered as a new mode and a non-string
     *                           label causes no mode change.
     * @param int $initialPos    Byte index in the raw doc where the unmatched portion starts.
     * @param int $matchPos      Byte index in the raw doc where the matched token starts.
     * @return boolean           False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        // leading unmatched text is always reported in the mode that was active before the match
        if (!$this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }

        if ($this->isModeEnd($mode)) {
            if (!$this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (!$this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }

        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode  Mode to test.
     * @return boolean           True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode  Mode to test.
     * @return boolean           True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // non-string labels are never special; checking first avoids feeding them to strncmp()
        return is_string($mode) && strncmp($mode, "_", 1) === 0;
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode  Mode to decode.
     * @return string       Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content  Text parsed.
     * @param mixed $is_match  Lexer state of the token; the callers in this class pass one of
     *                         the DOKU_LEXER_* constants.
     * @param int $pos         Current byte index location in raw doc that's being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }

        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (strncmp($handler, 'plugin_', 7) === 0) {
            // sexplode() is defined outside this file; presumably an explode() that pads
            // the result to the requested number of elements
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw  The subject to parse. This is the content that will be eaten.
     * @return array|bool  Three item list of unparsed content followed by the
     *                     recognised token and finally the action the parser is to take.
     *                     True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $mode = $this->modeStack->getCurrent();
        if (!isset($this->regexes[$mode])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }

        // split() hands back the label of the matching pattern and fills $split
        $action = $this->regexes[$mode]->split($raw, $split);
        if ($action) {
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }

        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every occurrence of one of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     * This is done in a single pass so backslashes added by the escaping are never escaped again.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // In the replacement "\\" stands for one literal backslash and $0 for the matched character.
        // Note that a replacement of the form "\$" would yield a bare "$" and leave it unescaped.
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the pattern collection of a mode, creating it on first use.
     *
     * @param string $mode  Parsing mode the patterns belong to.
     * @return ParallelRegex
     */
    private function regexFor($mode)
    {
        if (!isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}