<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to mark special modes and the exit signal.
 */
class Lexer
{
    /** Signal for leaving a mode */
    public const MODE_EXIT = '__exit';
    /** Prefix marking special (enter-and-exit) patterns */
    public const MODE_SPECIAL_PREFIX = '_';

    /** @var ParallelRegex[] one combined regex per mode, keyed by mode name */
    protected $regexes = [];
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler method name */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler Handling strategy by reference.
     * @param string $start Starting handler.
     * @param boolean $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        // the regex collection for a mode is created on first use
        $this->regexes[$mode] ??= new ParallelRegex($this->case);
        $this->regexes[$mode]->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexes[$mode] ??= new ParallelRegex($this->case);
        $this->regexes[$mode]->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexes[$mode] ??= new ParallelRegex($this->case);
        $this->regexes[$mode]->addPattern($pattern, self::MODE_EXIT);
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        $this->regexes[$mode] ??= new ParallelRegex($this->case);
        $this->regexes[$mode]->addPattern($pattern, self::MODE_SPECIAL_PREFIX . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw text.
     * @return boolean True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the consumed part of $raw (passed by reference) on every round
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the matched token within the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: no patterns are registered for the current mode
        if (!$parsed) {
            return false;
        }
        // whatever is left over did not match any pattern
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched text to the parser, changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no mode change.
     * @param int $initialPos Byte index at which the unmatched portion starts.
     * @param int $matchPos Current byte index location in raw doc that is being parsed
     * @return boolean False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // the exit signal must be tested before the special prefix, since it starts with that prefix too
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === self::MODE_EXIT);
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // $mode may be a boolean for patterns without a target mode; only strings can carry the prefix
        return is_string($mode) && str_starts_with($mode, self::MODE_SPECIAL_PREFIX);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, strlen(self::MODE_SPECIAL_PREFIX));
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match Lexer state the content was found in (one of the DOKU_LEXER_* constants
     *                      passed in by the callers in this class).
     * @param int $pos Current byte index location in raw doc that is being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (str_starts_with($handler, 'plugin_')) {
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will be eaten.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
            // the third element is the remaining text and replaces the caller's $raw
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * The characters \ . + * ? [ ^ ] $ { } = ! < > | : each get a backslash prepended.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // A single pass with a character class. In the replacement, "\\" (written as four
        // backslashes in PHP source) is a literal backslash and $0 is the matched character.
        // Note that a replacement of '\$' would NOT work for the dollar sign: preg_replace treats
        // it as an escaped dollar and emits a bare "$", leaving the character unescaped.
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }
}