<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to mark special (single token) modes
 * and "__exit" marks leaving a mode.
 */
class Lexer
{
    /** @var ParallelRegex[] one pattern collection per mode, keyed by mode name */
    protected $regexes = [];
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler method name */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler Handling strategy by reference.
     * @param string $start Starting handler.
     * @param boolean $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the marker isModeEnd() looks for
        $this->regexForMode($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore is the marker isSpecialMode() looks for
        $this->regexForMode($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw page text.
     * @return boolean True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the consumed part off the front of $raw on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the matched token within the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping on the same input
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false, i.e. a parsing error
        if (!$parsed) {
            return false;
        }
        // whatever is left over did not match any pattern
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A boolean false mode causes no change.
     * @param int $initialPos Byte index of the unmatched leading portion in the raw doc
     * @param int $matchPos Current byte index location in raw doc that's being parsed
     * @return boolean False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        if ($this->isModeEnd($mode)) {
            // the exit token is still handled by the mode that is being left
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            // the entry token is handled by the newly entered mode
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // dispatchTokens() documents $mode as bool|string; only strings can carry the underscore marker
        return is_string($mode) && str_starts_with($mode, '_');
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param mixed $is_match One of the DOKU_LEXER_* constants describing the kind of token.
     * @param int $pos Current byte index location in raw doc that's being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        // apply a remapping registered through mapHandler(), if any
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (str_starts_with($handler, 'plugin_')) {
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will be eaten.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        // no patterns were ever registered for the current mode
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            // the third element is the remaining text; it replaces the caller's $raw
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every one of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // Single pass over the input. The replacement is a literal backslash ("\\\\" in PHP source)
        // followed by the matched character ($0). Writing the dollar replacement as '\$' would not
        // work: in a preg_replace() replacement a backslash in front of $ escapes the dollar and
        // the backslash is dropped from the output.
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the pattern collection of a mode, creating it on first use.
     *
     * @param string $mode Mode the patterns belong to.
     * @return ParallelRegex
     */
    private function regexForMode($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}