<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to mark "special" (single token) modes
 * and "__exit" marks leaving the current mode.
 */
class Lexer
{
    /** @var ParallelRegex[] one compound regex per mode, keyed by mode name */
    protected $regexes = [];
    /** @var \Doku_Handler receiver of the token callbacks */
    protected $handler;
    /** @var StateStack stack of the currently nested modes */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler method name */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler Handling strategy; its methods are called for each token.
     * @param string $start Starting mode (also the name of the starting handler method).
     * @param boolean $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        // create the per-mode regex collection on first use
        $this->regexes[$mode] ??= new ParallelRegex($this->case);
        $this->regexes[$mode]->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexes[$mode] ??= new ParallelRegex($this->case);
        $this->regexes[$mode]->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexes[$mode] ??= new ParallelRegex($this->case);
        // "__exit" is the marker isModeEnd() looks for
        $this->regexes[$mode]->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        $this->regexes[$mode] ??= new ParallelRegex($this->case);
        // the leading underscore is the marker isSpecialMode() looks for
        $this->regexes[$mode]->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw text to tokenise.
     * @return boolean True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the processed part off the front of $raw on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // bytes consumed so far minus the match itself = byte offset where the match starts
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            // the next unmatched chunk starts right after everything consumed so far
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (!$parsed) {
            return false;
        }
        // whatever is left over is trailing unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no change.
     * @param int $initialPos Byte index in the raw document where the unmatched portion starts
     * @param int $matchPos Byte index in the raw document where the matched token starts
     * @return boolean False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        // leading text is always reported in the mode that was active before the match
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        if ($this->isModeEnd($mode)) {
            // the exit token is still handled by the mode being left
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only, then leave it again
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            // the entry token is handled by the newly entered mode
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // the mode may be a boolean when the pattern carries no mode change; only strings can be special
        return is_string($mode) && str_starts_with($mode, '_');
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match Lexer state passed through to the handler (one of the DOKU_LEXER_* constants
     *                      used by the callers in this class)
     * @param int $pos Current byte index location in raw doc that's being parsed
     * @return bool Result of the handler call, true when there was nothing to report
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (str_starts_with($handler, 'plugin_')) {
            // NOTE(review): sexplode() is defined outside this file; presumably an explode() that pads
            // the result to the given size with the given default - confirm
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will be eaten.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        // $split is filled by split(); it is read here as [leading text, match, remaining text]
        if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Each of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // Single pass over a character class. The replacement is a literal backslash ("\\\\" in the
        // replacement syntax) followed by the matched character ($0). A replacement of '\$' must not
        // be used for the dollar sign: preg_replace treats the backslash as escaping the dollar and
        // emits a bare "$".
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }
}