<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

use dokuwiki\Parsing\Handler;

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores.
 */
class Lexer
{
    /** Signal for leaving a mode */
    public const MODE_EXIT = '__exit';
    /** Prefix marking special (enter-and-exit) patterns */
    public const MODE_SPECIAL_PREFIX = '_';

    /** @var ParallelRegex[] one pattern collection per mode name */
    protected $regexes = [];
    /** @var Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler name */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param Handler $handler Handling strategy by reference.
     * @param string $start Starting handler.
     * @param bool $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, self::MODE_EXIT);
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        $this->regexForMode($mode)->addPattern($pattern, self::MODE_SPECIAL_PREFIX . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw text to tokenize.
     * @return bool True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }

        $offset = 0;
        // reduce() yields an array per recognised token, true when nothing more
        // matches and false on a parsing error.
        while (is_array($parsed = $this->reduce($raw, $offset))) {
            [$unmatched, $matched, $mode] = $parsed;
            $matchPos = $offset + strlen($unmatched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $offset, $matchPos)) {
                return false;
            }

            $newOffset = $matchPos + strlen($matched);
            if ($newOffset === $offset) {
                // Nothing was consumed: bail out instead of looping forever.
                return false;
            }
            $offset = $newOffset;
        }

        if (! $parsed) {
            return false;
        }

        // Whatever is left after the last match is plain unmatched text.
        return $this->invokeHandler(substr($raw, $offset), DOKU_LEXER_UNMATCHED, $offset);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A boolean mode causes no change.
     * @param int $initialPos Byte index in the raw document where $unmatched starts.
     * @param int $matchPos Byte index in the raw document where $matched starts.
     * @return bool False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }

        // The exit marker itself starts with the special prefix, so it has to be
        // recognised before the special-mode check below.
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        // Special patterns enter their mode for this single token only.
        if ($this->isSpecialMode($mode)) {
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        // Any other string names a nested mode to enter.
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }

        // Non-string action: a plain match inside the current mode.
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode Mode to test.
     * @return bool True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === self::MODE_EXIT);
    }

    /**
     * Tests to see if the mode is one where this mode is entered for this token only and automatically
     * left immediately afterwards.
     *
     * @param bool|string $mode Mode to test.
     * @return bool True if the mode carries the special prefix.
     */
    protected function isSpecialMode($mode)
    {
        // $mode may be a boolean (see dispatchTokens()); only strings can carry the prefix.
        return is_string($mode) && str_starts_with($mode, self::MODE_SPECIAL_PREFIX);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, strlen(self::MODE_SPECIAL_PREFIX));
    }

    /**
     * Dispatches a token to the handler.
     *
     * Resolves mode name aliases registered through mapHandler() and
     * delegates all dispatch logic to Handler::handleToken().
     *
     * @param string $content Text parsed.
     * @param int $state One of the DOKU_LEXER_* constants identifying the
     *                   lexer event (ENTER / MATCHED / UNMATCHED / EXIT /
     *                   SPECIAL).
     * @param int $pos Current byte index location in the raw doc
     *                 that's being parsed
     * @return bool
     */
    protected function invokeHandler($content, $state, $pos)
    {
        if ($content === false) {
            return true;
        }
        // Empty content is a no-op for every state EXCEPT EXIT: a zero-width
        // exit pattern (lookahead-only) must still fire the mode's exit
        // handler so cleanup like restoring a buffered call writer happens.
        // Skipping it would pop the mode stack but leave the handler-side
        // state stale.
        if ($content === '' && $state !== DOKU_LEXER_EXIT) {
            return true;
        }

        $originalName = $this->modeStack->getCurrent();
        $modeName = $this->mode_handlers[$originalName] ?? $originalName;

        return $this->handler->handleToken($modeName, $content, $state, $pos, $originalName);
    }

    /**
     * Tries to match the next token starting at `$offset` in `$raw`.
     *
     * The full subject is passed to the regex engine (rather than a
     * truncated tail) so that lookbehind assertions in the registered
     * patterns can see characters before the current offset. Empty
     * subjects (offset past end) will not be matched.
     *
     * @param string $raw The full subject to parse.
     * @param int $offset Byte offset at which to resume matching.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce($raw, $offset)
    {
        $current = $this->modeStack->getCurrent();

        // A mode without any registered pattern is a parsing error.
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($offset >= strlen($raw)) {
            return true;
        }

        // split() fills $split by reference with [unparsed, match] and returns the action.
        $action = $this->regexes[$current]->split($raw, $split, $offset);
        if ($action) {
            [$unparsed, $match] = $split;
            return [$unparsed, $match, $action];
        }

        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every one of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a
     * backslash. The work is done in a single pass so backslashes inserted
     * for one character are never escaped a second time.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // The replacement is the PHP string \\$0: a doubled (i.e. literal)
        // backslash followed by the matched character. Writing a per-character
        // replacement such as '\$' would NOT work, because a backslash in front
        // of a dollar sign is itself an escape sequence inside preg_replace()
        // replacement strings and collapses to a bare '$'.
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the pattern collection of a mode, creating it on first use.
     *
     * @param string $mode Mode name the patterns belong to.
     * @return ParallelRegex
     */
    private function regexForMode($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}