<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to mark special (single token) modes
 * and "__exit" marks leaving a mode.
 */
class Lexer
{
    /** @var ParallelRegex[] compound regexes, indexed by mode name */
    protected $regexes = [];
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler Handling strategy by reference.
     * @param string $start Starting handler.
     * @param boolean $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the magic label recognised by isModeEnd()
        $this->regexForMode($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore is the marker recognised by isSpecialMode()
        $this->regexForMode($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the given text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw text to tokenize.
     * @return boolean True on success, else false.
     */
    public function parse($raw)
    {
        if (!isset($this->handler)) {
            return false;
        }

        // all positions are byte offsets into the original input
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;

        // reduce() eats the consumed part off the front of $raw on every match
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            $matchPos = $initialLength - $currentLength - strlen($matched);

            if (!$this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }

            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }

        // reduce() signalled an error (false) rather than "no more matches" (true)
        if (!$parsed) {
            return false;
        }

        // whatever is left over is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A boolean mode causes no change.
     * @param int $initialPos Byte index of the unmatched leading portion in the raw document
     * @param int $matchPos Byte index of the match in the raw document that is being parsed
     * @return boolean False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        // leading text belongs to the mode that is still current
        if (!$this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }

        if ($this->isModeEnd($mode)) {
            if (!$this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (!$this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }

        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if the mode name starts with the special mode marker (an underscore).
     */
    protected function isSpecialMode($mode)
    {
        // $mode may be a boolean for plain patterns; only strings can carry the marker
        return is_string($mode) && strncmp($mode, "_", 1) === 0;
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match Lexer state for this token; callers in this class pass
     *                      one of the DOKU_LEXER_* constants.
     * @param int $pos Current byte index location in raw doc that is being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }

        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) === 'plugin_') {
            // sexplode() is declared outside this file; presumably an explode() that
            // pads missing parts with the given default - verify against its definition
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will be eaten.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();

        // no patterns registered for the current mode is treated as an error
        if (!isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }

        $action = $this->regexes[$current]->split($raw, $split);
        if ($action) {
            // the third part is the remaining text; it replaces the caller's $raw
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }

        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * The characters \ . + * ? [ ^ ] $ { } = ! < > | : each get a backslash prepended.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // Single pass, so backslashes added here are never escaped a second time.
        // The replacement is a literal backslash ('\\\\' in PHP source) followed by
        // the matched character ($0). Spelling the dollar replacement as '\$' would
        // not work: inside a preg_replace() replacement string that sequence is
        // itself an escape for a plain '$' and would leave the dollar unescaped.
        return preg_replace('/[\\\\.+*?\[^\]${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the compound regex for a mode, creating it on first use.
     *
     * @param string $mode Mode name.
     * @return ParallelRegex
     */
    private function regexForMode($mode)
    {
        if (!isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}