<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 */
class ParallelRegex
{
    /** @var string[] patterns to match, exactly as passed to addPattern() */
    protected $patterns = [];
    /** @var (bool|string)[] labels for above patterns, same indexes */
    protected $labels = [];
    /** @var string[] escaped patterns, each wrapped in one capturing group; rebuilt with the compound regex */
    protected $compiled = [];
    /** @var string|null the compound regex matching all patterns, null when it has to be rebuilt */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param bool $case True for case sensitive, false
     *                   for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Invalidates the cached compound regex, so the next apply()/split() rebuilds it.
     *
     * @param mixed $pattern Perl style regex. Must be UTF-8
     *                       encoded. If its a string, the (, )
     *                       lose their meaning unless they
     *                       form part of a lookahead or
     *                       lookbehind assertion.
     * @param bool|string $label Label of regex to be returned
     *                           on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $this->patterns[] = $pattern;
        $this->labels[] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject String to match against.
     * @param string $match First matched portion of
     *                      subject. Empty string when nothing matched.
     * @return bool|string False if no match found, label if label exists, true if not
     */
    public function apply($subject, &$match)
    {
        $match = "";
        if ($this->patterns === []) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        // FIXME this could be made faster by storing the labels as keys in a hashmap
        for ($i = 1; $i < $size; $i++) {
            // Groups of alternatives that did not take part in the match are empty strings.
            // Compare against '' explicitly: a matched "0" is falsy but still a real match.
            if ($matches[$i] !== '' && isset($this->labels[$i - 1])) {
                return $this->labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * @param string $subject String to match against.
     * @param array $split The split result: array containing, pre-match, match & post-match strings.
     *                     On failure it is set to [$subject, "", ""].
     * @return bool|string False if nothing matched, otherwise the label of the
     *                     matching pattern (true if it has none)
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if ($this->patterns === []) {
            $split = [$subject, "", ""];
            return false;
        }

        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            // a failed match may actually be a PCRE error; tell the user about it
            switch (preg_last_error()) {
                case PREG_BACKTRACK_LIMIT_ERROR:
                    msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                    break;
                case PREG_RECURSION_LIMIT_ERROR:
                    msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                    break;
                case PREG_BAD_UTF8_ERROR:
                    msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                    break;
                case PREG_INTERNAL_ERROR:
                    msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                    break;
            }

            $split = [$subject, "", ""];
            return false;
        }

        // preg_match() drops trailing groups that did not participate, so the last
        // entry of $matches belongs to the alternative that matched.
        $idx = count($matches) - 2;
        // The compiled pattern is wrapped in "(" ... ")", which PCRE accepts as a
        // delimiter pair, so appending the flags yields a complete regex.
        [$pre, $post] = preg_split($this->compiled[$idx] . $this->getPerlMatchingFlags(), $subject, 2);
        $split = [$pre, $matches[0], $post];

        return $this->labels[$idx] ?? true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The raw patterns are left untouched; the escaped versions are kept in
     * $compiled so that rebuilding after a later addPattern() call does not
     * escape already processed patterns a second time.
     *
     * @return string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $this->compiled = [];
            foreach ($this->patterns as $i => $pattern) {
                $this->compiled[$i] = $this->compilePattern($pattern);
            }
            $this->regex = "/" . implode("|", $this->compiled) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes a single user pattern and wraps it into one capturing group.
     *
     * Plain "(" and ")" become literals, "(?...)" groups are kept, and "/" is
     * escaped because it is the delimiter of the compound regex.
     *
     * @param string $raw Pattern as passed to addPattern()
     * @return string The escaped pattern enclosed in "(" and ")"
     */
    protected function compilePattern($raw)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $raw, $elts);

        $pattern = "";
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            /*
             * for "(", ")" remember the nesting level, add "\"
             * only to the non-"(?" ones.
             */
            switch ($elt) {
                case '(':
                    $pattern .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $pattern .= '\\';
                    }
                    $pattern .= ')';
                    break;
                case '(?':
                    $level++;
                    $pattern .= '(?';
                    break;
                default:
                    if (str_starts_with($elt, '\\')) {
                        // already an escape sequence, keep as is
                        $pattern .= $elt;
                    } else {
                        $pattern .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return "($pattern)";
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}