<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

use dokuwiki\Parsing\Handler;

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores.
 */
class Lexer
{
    /** Signal for leaving a mode */
    public const MODE_EXIT = '__exit';
    /** Prefix marking special (enter-and-exit) patterns */
    public const MODE_SPECIAL_PREFIX = '_';

    /** Maps every character handled by escape() to its backslash-escaped form */
    private const ESCAPE_MAP = [
        '\\' => '\\\\',
        '.' => '\\.',
        '+' => '\\+',
        '*' => '\\*',
        '?' => '\\?',
        '[' => '\\[',
        '^' => '\\^',
        ']' => '\\]',
        '$' => '\\$',
        '{' => '\\{',
        '}' => '\\}',
        '=' => '\\=',
        '!' => '\\!',
        '<' => '\\<',
        '>' => '\\>',
        '|' => '\\|',
        ':' => '\\:',
    ];

    /** @var ParallelRegex[] one compound regex per mode name */
    protected $regexes = [];
    /** @var Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name passed to the handler instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param Handler $handler Handler that receives the tokens.
     * @param string $start Starting mode.
     * @param boolean $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexFor($mode)->addPattern($pattern, self::MODE_EXIT);
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the prefix is what isSpecialMode()/decodeSpecial() look for later
        $this->regexFor($mode)->addPattern($pattern, self::MODE_SPECIAL_PREFIX . $special);
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw text to tokenise.
     * @return boolean True on success, else false.
     */
    public function parse($raw)
    {
        if (!isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the consumed prefix off $raw and returns a non-array once it is done
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // byte offset at which the matched token started in the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (!$this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: parsing error
        if (!$parsed) {
            return false;
        }
        // whatever is left over is passed on as trailing unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no mode change.
     * @param int $initialPos Byte index at which the unmatched portion starts
     * @param int $matchPos Current byte index location in raw doc that is being parsed
     * @return boolean False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (!$this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // must be tested before isSpecialMode(): MODE_EXIT itself starts with the special prefix
        if ($this->isModeEnd($mode)) {
            if (!$this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for this single token only
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (!$this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === self::MODE_EXIT);
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // a non-string mode can never carry the prefix; do not rely on implicit string coercion
        return is_string($mode) && str_starts_with($mode, self::MODE_SPECIAL_PREFIX);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, strlen(self::MODE_SPECIAL_PREFIX));
    }

    /**
     * Dispatches a token to the handler.
     *
     * Resolves mode name aliases (e.g. unformattedalt → unformatted) and
     * delegates all dispatch logic to Handler::handleToken().
     *
     * @param string|false $content Text parsed. Empty or false content is silently accepted.
     * @param int $is_match Lexer state, one of the DOKU_LEXER_* constants (presumably integers; they are
     *                      defined outside this file). Not a plain boolean, despite the name.
     * @param int $pos Current byte index location in raw doc that is being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $originalName = $this->modeStack->getCurrent();
        // a mapping registered via mapHandler() wins over the mode's own name
        $modeName = $this->mode_handlers[$originalName] ?? $originalName;

        return $this->handler->handleToken($modeName, $content, $is_match, $pos, $originalName);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will be eaten.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        if (!isset($this->regexes[$current])) {
            // no patterns were ever registered for the current mode
            return false;
        }
        if ($raw === "") {
            return true;
        }

        $action = $this->regexes[$current]->split($raw, $split);
        if (!$action) {
            return true;
        }

        // third element is the remaining text; writing it back "eats" the consumed prefix
        [$unparsed, $match, $raw] = $split;
        return [$unparsed, $match, $action];
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every character listed in ESCAPE_MAP is prefixed with a backslash in a single pass.
     * strtr() is used rather than preg_replace() because preg_replace() treats a backslash
     * in front of "$" in a replacement string as an escape and would emit the "$" without
     * its backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        return strtr($str, self::ESCAPE_MAP);
    }

    /**
     * Returns the compound regex for a mode, creating it on first use.
     *
     * @param string $mode Mode name.
     * @return ParallelRegex
     */
    private function regexFor($mode)
    {
        if (!isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}