<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

use dokuwiki\Parsing\Handler;

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores.
 */
class Lexer
{
    /** Signal for leaving a mode */
    public const MODE_EXIT = '__exit';
    /** Prefix marking special (enter-and-exit) patterns */
    public const MODE_SPECIAL_PREFIX = '_';

    /** @var ParallelRegex[] pattern collections, keyed by mode name */
    protected $regexes = [];
    /** @var Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler name */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param Handler $handler Handling strategy by reference.
     * @param string $start Starting handler.
     * @param bool $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexFor($mode)->addPattern($pattern, self::MODE_EXIT);
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        $this->regexFor($mode)->addPattern($pattern, self::MODE_SPECIAL_PREFIX . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw text to tokenise.
     * @return bool True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the front of $raw by reference; it yields an array per match
        // and a plain boolean once matching stops.
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // Byte offset of the matched token, derived from how much input remains.
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // Nothing was consumed in this round: bail out instead of looping forever.
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: it signalled a parsing error.
        if (!$parsed) {
            return false;
        }
        // Whatever is left over is trailing unmatched text.
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A boolean false mode causes no change.
     * @param int $initialPos Byte index passed to the handler for the unmatched portion.
     * @param int $matchPos Current byte index location in raw doc that's being parsed
     * @return bool False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // The exit check must come first: MODE_EXIT itself starts with the special prefix.
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // Special patterns enter their mode for this single token, then leave again.
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        // Non-string mode: a plain match that leaves the mode stack untouched.
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode Mode to test.
     * @return bool True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === self::MODE_EXIT);
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode Mode to test.
     * @return bool True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // The mode may be a boolean (see dispatchTokens); only strings can carry the prefix.
        return is_string($mode) && str_starts_with($mode, self::MODE_SPECIAL_PREFIX);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, strlen(self::MODE_SPECIAL_PREFIX));
    }

    /**
     * Dispatches a token to the handler.
     *
     * Resolves mode name aliases (e.g. unformattedalt → unformatted) and
     * delegates all dispatch logic to Handler::handleToken().
     *
     * @param string $content Text parsed.
     * @param int|bool $is_match Lexer state forwarded to the handler; callers in this class
     *                           pass one of the DOKU_LEXER_* constants.
     * @param int $pos Current byte index location in raw doc that's being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        // Empty content is never forwarded; it counts as success.
        if (($content === "") || ($content === false)) {
            return true;
        }
        $originalName = $this->modeStack->getCurrent();
        $modeName = $this->mode_handlers[$originalName] ?? $originalName;

        return $this->handler->handleToken($modeName, $content, $is_match, $pos, $originalName);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will be eaten.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        // No patterns were ever registered for the current mode: parsing error.
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        $action = $this->regexes[$current]->split($raw, $split);
        if (! $action) {
            return true;
        }
        // $raw is replaced by the remainder, which is how the input gets consumed.
        [$unparsed, $match, $raw] = $split;
        return [$unparsed, $match, $action];
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every occurrence of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // One pass over a character class; '\\\\$0' is a literal backslash followed by the match.
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the pattern collection for a mode, creating it on first use.
     *
     * @param string $mode Mode whose pattern collection is wanted.
     * @return ParallelRegex
     */
    private function regexFor($mode)
    {
        return $this->regexes[$mode] ??= new ParallelRegex($this->case);
    }
}