<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

// FIXME move elsewhere

// Lexer states handed to the handler callbacks as their second argument.
define("DOKU_LEXER_ENTER", 1);
define("DOKU_LEXER_MATCHED", 2);
define("DOKU_LEXER_UNMATCHED", 3);
define("DOKU_LEXER_EXIT", 4);
define("DOKU_LEXER_SPECIAL", 5);

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to label special and exit patterns.
 */
class Lexer
{
    /** @var ParallelRegex[] one pattern collection per mode name */
    protected $regexes;
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $mode;
    /** @var array mode "rewrites" (mode name => handler method name) */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy by reference.
     * @param string $start            Starting handler.
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->mode = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        $this->regexes[$mode]->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        $this->regexes[$mode]->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        // "__exit" is the reserved label that isModeEnd() recognises
        $this->regexes[$mode]->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        // the leading underscore marks the label as a one-token mode, see isSpecialMode()
        $this->regexes[$mode]->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode         Mode to be remapped.
     * @param string $handler      New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the consumed prefix from $raw on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the matched token within the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: there are no patterns for the current mode
        if (!$parsed) {
            return false;
        }
        // whatever is left over did not match anything
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * $mode deliberately has no default value: it is followed by required
     * parameters, so a default could never be used and only triggers a
     * deprecation notice on PHP 8.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no change.
     * @param int $initialPos Byte index in the raw doc where the unmatched portion starts
     * @param int $matchPos Byte index in the raw doc where the matched token starts
     * @return boolean False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // must be tested before isSpecialMode(): "__exit" starts with an underscore, too
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->mode->leave();
        }
        if ($this->isSpecialMode($mode)) {
            $this->mode->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->mode->leave();
        }
        if (is_string($mode)) {
            $this->mode->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // dispatchTokens() may pass a boolean here; only strings can carry the marker
        return is_string($mode) && (strncmp($mode, "_", 1) === 0);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode       Mode to decode.
     * @return string            Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int $is_match   Lexer state of the token, one of the DOKU_LEXER_* constants.
     * @param int $pos        Current byte index location in raw doc
     *                        that's being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->mode->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) === 'plugin_') {
            list($handler, $plugin) = explode('_', $handler, 2);
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        if (! isset($this->regexes[$this->mode->getCurrent()])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        // $split is filled by reference; its third item becomes the new, shortened $raw
        if ($action = $this->regexes[$this->mode->getCurrent()]->split($raw, $split)) {
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // the backslash has to come first, otherwise the backslashes
        // added by the later replacements would be doubled again
        $chars = array(
            '/\\\\/',
            '/\./',
            '/\+/',
            '/\*/',
            '/\?/',
            '/\[/',
            '/\^/',
            '/\]/',
            '/\$/',
            '/\{/',
            '/\}/',
            '/\=/',
            '/\!/',
            '/\</',
            '/\>/',
            '/\|/',
            '/\:/'
        );

        $escaped = array(
            '\\\\\\\\',
            '\.',
            '\+',
            '\*',
            '\?',
            '\[',
            '\^',
            '\]',
            '\$',
            '\{',
            '\}',
            '\=',
            '\!',
            '\<',
            '\>',
            '\|',
            '\:'
        );
        return preg_replace($chars, $escaped, $str);
    }
}