<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 */
class ParallelRegex
{
    /**
     * @var string[] patterns to match. Entries are stored raw by addPattern() and are
     *               rewritten in place to their escaped "(...)" form by getCompoundedRegex()
     */
    protected $patterns;
    /** @var array labels for above patterns (same indexes) */
    protected $labels;
    /** @var string|null the compound regex matching all patterns, null when it needs (re)building */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;
    /** @var int number of leading entries in $patterns that are already in escaped "(...)" form */
    protected $compiledCount = 0;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case True for case sensitive, false
     *                      for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
        $this->patterns = array();
        $this->labels = array();
        $this->regex = null;
        $this->compiledCount = 0;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Invalidates the cached compound regex so it is rebuilt on the next match()/split().
     *
     * @param mixed $pattern Perl style regex. Must be UTF-8
     *                       encoded. If its a string, the (, )
     *                       lose their meaning unless they
     *                       form part of a lookahead or
     *                       lookbehind assertion.
     * @param bool|string $label Label of regex to be returned
     *                           on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject String to match against.
     * @param string $match First matched portion of
     *                      subject.
     * @return bool|string False if no match found, label if label exists, true if not
     */
    public function match($subject, &$match)
    {
        if (count($this->patterns) == 0) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            $match = "";
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        // FIXME this could be made faster by storing the labels as keys in a hashmap
        for ($i = 1; $i < $size; $i++) {
            // Groups that did not take part in the match are reported as empty strings.
            // Compare against '' explicitly: a plain truthiness test would also skip
            // a legitimately matched "0".
            if ($matches[$i] !== '' && isset($this->labels[$i - 1])) {
                return $this->labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * @param string $subject String to match against.
     * @param array $split The split result: array containing, pre-match, match & post-match strings
     * @return bool|string False if nothing matched, otherwise the label of the matching
     *                     pattern (true when that pattern has no label set)
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if (count($this->patterns) == 0) {
            return false;
        }

        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            // preg_match() yields a falsy result both for "no match" and for an engine
            // error; report the latter to the user
            if (function_exists('preg_last_error')) {
                $err = preg_last_error();
                switch ($err) {
                    case PREG_BACKTRACK_LIMIT_ERROR:
                        msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                        break;
                    case PREG_RECURSION_LIMIT_ERROR:
                        msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                        break;
                    case PREG_BAD_UTF8_ERROR:
                        msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                        break;
                    case PREG_INTERNAL_ERROR:
                        msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                        break;
                }
            }

            $split = array($subject, "", "");
            return false;
        }

        // Every pattern contributes one capture group. This relies on preg_match() leaving
        // trailing non-participating groups out of $matches, so the last entry identifies
        // the pattern that matched (entry 0 is the full match, hence -2).
        $idx = count($matches)-2;
        // The stored pattern is already wrapped in "(...)", which doubles as the regex
        // delimiter pair here; only the flags need to be appended.
        list($pre, $post) = preg_split($this->patterns[$idx].$this->getPerlMatchingFlags(), $subject, 2);
        $split = array($pre, $matches[0], $post);

        return isset($this->labels[$idx]) ? $this->labels[$idx] : true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * Patterns are rewritten in place to their escaped "(...)" form. Only patterns added
     * since the previous build are processed, so that already escaped patterns are not
     * escaped a second time when addPattern() is called after the regex has been used.
     *
     * @return null|string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $cnt = count($this->patterns);
            for ($i = $this->compiledCount; $i < $cnt; $i++) {
                /*
                 * decompose the input pattern into "(", "(?", ")",
                 * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
                 * elements.
                 */
                preg_match_all('/\\\\.|' .
                               '\(\?|' .
                               '[()]|' .
                               '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                               '[^[()\\\\]+/', $this->patterns[$i], $elts);

                $pattern = "";
                $level = 0;

                foreach ($elts[0] as $elt) {
                    /*
                     * for "(", ")" remember the nesting level, add "\"
                     * only to the non-"(?" ones.
                     */

                    switch ($elt) {
                        case '(':
                            $pattern .= '\(';
                            break;
                        case ')':
                            if ($level > 0) {
                                $level--; /* closing (? */
                            } else {
                                $pattern .= '\\';
                            }
                            $pattern .= ')';
                            break;
                        case '(?':
                            $level++;
                            $pattern .= '(?';
                            break;
                        default:
                            if (substr($elt, 0, 1) == '\\') {
                                /* escape sequences are passed through untouched */
                                $pattern .= $elt;
                            } else {
                                /* "/" is the delimiter of the compound regex */
                                $pattern .= str_replace('/', '\/', $elt);
                            }
                    }
                }
                $this->patterns[$i] = "($pattern)";
            }
            $this->compiledCount = $cnt;
            $this->regex = "/" . implode("|", $this->patterns) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}