<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is how special (single token) modes and the exit signal
 * are encoded internally.
 */
class Lexer
{
    /** Signal for leaving a mode */
    public const MODE_EXIT = '__exit';
    /** Prefix marking special (enter-and-exit) patterns */
    public const MODE_SPECIAL_PREFIX = '_';

    /** @var ParallelRegex[] one compound regex per mode, keyed by mode name */
    protected $regexes = [];
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler method name */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler Handling strategy by reference.
     * @param string $start Starting handler.
     * @param boolean $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexFor($mode)->addPattern($pattern, self::MODE_EXIT);
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        $this->regexFor($mode)->addPattern($pattern, self::MODE_SPECIAL_PREFIX . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw HTML text.
     * @return boolean True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the consumed prefix from $raw on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the matched token within the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (!$parsed) {
            return false;
        }
        // whatever is left over did not match anything
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no change.
     * @param int $initialPos Byte index in the raw doc where the unmatched portion starts
     * @param int $matchPos Current byte index location in raw doc that's being parsed
     * @return boolean False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // MODE_EXIT itself starts with the special prefix, so it has to be tested first
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === self::MODE_EXIT);
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // plain matches arrive with a non-string label; those are never special modes
        return is_string($mode) && str_starts_with($mode, self::MODE_SPECIAL_PREFIX);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, strlen(self::MODE_SPECIAL_PREFIX));
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string|false $content Text parsed.
     * @param mixed $is_match Lexer state passed through to the handler; callers in this
     *                        class pass one of the DOKU_LEXER_* constants.
     * @param int $pos Current byte index location in raw doc that's being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (str_starts_with($handler, 'plugin_')) {
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will be eaten.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        $action = $this->regexes[$current]->split($raw, $split);
        if (! $action) {
            return true;
        }
        // the third element is the remaining text, which replaces the caller's $raw
        [$unparsed, $match, $raw] = $split;
        return [$unparsed, $match, $action];
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Each of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // Single pass over a character class. The replacement is "\\" (one literal
        // backslash) followed by $0 (the matched character). A replacement of the
        // form '\$' must not be used for the dollar sign: preg_replace reads it as
        // an escaped "$" and would emit the dollar without a backslash.
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the compound regex of a mode, creating it on first use.
     *
     * @param string $mode Mode the patterns belong to.
     * @return ParallelRegex
     */
    private function regexFor($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}