<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 */
class ParallelRegex
{
    /**
     * @var string[] patterns to match. Entries hold the pattern as given to addPattern() until the
     *               compound regex is built; from then on they hold the escaped pattern wrapped in
     *               a capturing group "(...)".
     */
    protected $patterns = [];
    /** @var array labels for above patterns (same indexes) */
    protected $labels = [];
    /** @var string|null the compound regex matching all patterns, null when it needs to be (re)built */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;
    /** @var int number of leading entries in $patterns that are already in their compiled form */
    protected $compiledCount = 0;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case True for case sensitive, false
     *                      for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Invalidates the cached compound regex, so patterns may be added at any
     * time, including after apply() or split() have been used.
     *
     * @param mixed $pattern      Perl style regex. Must be UTF-8
     *                            encoded. If its a string, the (, )
     *                            lose their meaning unless they
     *                            form part of a lookahead or
     *                            lookbehind assertion.
     * @param bool|string $label  Label of regex to be returned
     *                            on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject String to match against.
     * @param string $match   First matched portion of
     *                        subject.
     * @return bool|string False if no match found, label if label exists, true if not
     */
    public function apply($subject, &$match)
    {
        if (count($this->patterns) == 0) {
            return false;
        }
        if (!preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            $match = "";
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        // FIXME this could be made faster by storing the labels as keys in a hashmap
        for ($i = 1; $i < $size; $i++) {
            // compare against the empty string explicitly: a plain truthiness
            // check would treat a matched "0" as "this group did not match"
            if ($matches[$i] !== '' && isset($this->labels[$i - 1])) {
                return $this->labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * @param string $subject String to match against.
     * @param array $split    The split result: array containing, pre-match, match & post-match strings.
     *                        When nothing matched it is set to [$subject, "", ""].
     * @return bool|string False if there are no patterns or nothing matched, otherwise the label of
     *                     the matching pattern (true if that label is not set)
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if (count($this->patterns) == 0) {
            return false;
        }

        if (!preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            // preg_match() yields 0 for "no match" and false for an error; report known errors
            $messages = [
                PREG_BACKTRACK_LIMIT_ERROR =>
                    'A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini',
                PREG_RECURSION_LIMIT_ERROR =>
                    'A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini',
                PREG_BAD_UTF8_ERROR =>
                    'A PCRE UTF-8 error occured. This might be caused by a faulty plugin',
                PREG_INTERNAL_ERROR =>
                    'A PCRE internal error occured. This might be caused by a faulty plugin',
            ];
            $err = preg_last_error();
            if (isset($messages[$err])) {
                msg($messages[$err], -1);
            }

            $split = [$subject, "", ""];
            return false;
        }

        // preg_match() drops trailing groups that did not take part in the match,
        // so the last entry of $matches belongs to the pattern that matched
        $idx = count($matches) - 2;
        // the compiled pattern is wrapped in "(...)", which doubles as the regex delimiter pair here
        [$pre, $post] = preg_split($this->patterns[$idx] . $this->getPerlMatchingFlags(), $subject, 2);
        $split = [$pre, $matches[0], $post];

        return $this->labels[$idx] ?? true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * Only patterns added since the previous build are escaped and wrapped;
     * entries that are already compiled are left untouched so they are never
     * escaped a second time.
     *
     * @return null|string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $cnt = count($this->patterns);
            for ($i = $this->compiledCount; $i < $cnt; $i++) {
                $this->patterns[$i] = '(' . $this->escapePattern($this->patterns[$i]) . ')';
            }
            $this->compiledCount = $cnt;
            $this->regex = "/" . implode("|", $this->patterns) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes a single raw pattern so it can be embedded in the compound regex.
     *
     * @param string $raw pattern as passed to addPattern()
     * @return string the pattern with plain "(" and ")" and all "/" outside of escapes backslash-escaped
     */
    private function escapePattern($raw)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all(
            '/\\\\.|' .
            '\(\?|' .
            '[()]|' .
            '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
            '[^[()\\\\]+/',
            $raw,
            $elts
        );

        $escaped = "";
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            /*
             * for "(", ")" remember the nesting level, add "\"
             * only to the non-"(?" ones.
             */
            if ($elt === '(') {
                $escaped .= '\(';
            } elseif ($elt === ')') {
                if ($level > 0) {
                    $level--; /* closing (? */
                } else {
                    $escaped .= '\\';
                }
                $escaped .= ')';
            } elseif ($elt === '(?') {
                $level++;
                $escaped .= '(?';
            } elseif (substr($elt, 0, 1) === '\\') {
                // already an escape sequence, keep as is
                $escaped .= $elt;
            } else {
                // "/" is the delimiter of the compound regex
                $escaped .= str_replace('/', '\/', $elt);
            }
        }

        return $escaped;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}