<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

// FIXME move elsewhere

// Token states handed to the handler as the second argument of every call.
define("DOKU_LEXER_ENTER", 1);
define("DOKU_LEXER_MATCHED", 2);
define("DOKU_LEXER_UNMATCHED", 3);
define("DOKU_LEXER_EXIT", 4);
define("DOKU_LEXER_SPECIAL", 5);

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to mark special (single token) modes
 * and "__exit" marks leaving the current mode.
 */
class Lexer
{
    /** @var ParallelRegex[] one compound regex per mode, keyed by mode name */
    protected $regexes;
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $mode;
    /** @var array mode "rewrites" (mode name => handler method name) */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler  Handling strategy by reference.
     * @param string $start            Starting handler.
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->mode = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern  Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode     Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        $this->regexes[$mode]->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern   Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode      Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode  Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        $this->regexes[$mode]->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern  Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode     Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        // "__exit" is the reserved label recognised by isModeEnd()
        $this->regexes[$mode]->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern  Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode     Should only apply this pattern when dealing with this type of input.
     * @param string $special  Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        // the leading underscore marks the label as a single token mode, see isSpecialMode()
        $this->regexes[$mode]->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode     Mode to be remapped.
     * @param string $handler  New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw  Raw text to tokenise.
     * @return boolean     True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the leading part of $raw on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            // byte offset at which the matched token started in the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (!$parsed) {
            return false;
        }
        // whatever is left did not match anything and is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched   Unmatched leading portion.
     * @param string $matched     Actual token match.
     * @param bool|string $mode   Mode after match: "__exit" leaves the current mode, a label with a
     *                            leading underscore is a single token mode, any other string enters
     *                            that mode, and a non-string value causes no mode change.
     * @param int $initialPos     Byte index in the raw document where the unmatched portion starts.
     * @param int $matchPos       Byte index in the raw document where the matched token starts.
     * @return boolean            False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->mode->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the special mode for exactly this one token, then drop back
            $this->mode->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->mode->leave();
        }
        if (is_string($mode)) {
            $this->mode->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param string $mode  Mode to test.
     * @return boolean      True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode  Mode to test.
     * @return boolean           True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // patterns without a label arrive here as a boolean, which can never be a special mode
        return is_string($mode) && (strncmp($mode, "_", 1) === 0);
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode  Mode to decode.
     * @return string       Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content  Text parsed.
     * @param int $is_match    State of the token, one of the DOKU_LEXER_* constants.
     * @param int $pos         Current byte index location in raw doc that's being parsed.
     * @return bool            True for empty content, otherwise whatever the handler method returns.
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->mode->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (substr($handler, 0, 7) === 'plugin_') {
            list($handler, $plugin) = explode('_', $handler, 2);
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw  The subject to parse. This is the content that will be eaten.
     * @return array|bool  Three item list of unparsed content followed by the
     *                     recognised token and finally the action the parser is to take.
     *                     True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        if (! isset($this->regexes[$this->mode->getCurrent()])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$this->mode->getCurrent()]->split($raw, $split)) {
            // the third element of the split replaces $raw, i.e. the caller's text shrinks
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every occurrence of \ . + * ? [ ] ^ $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // One pass with a character class. The replacement '\\\\$0' reaches PCRE as \\$0, i.e. a
        // literal backslash followed by the matched character. A replacement of the form \$ must
        // not be used here: preg_replace treats it as an escaped dollar and emits a bare "$".
        return preg_replace('/[\\\\.+*?\[\]^${}=!<>|:]/', '\\\\$0', $str);
    }
}