<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 */
class ParallelRegex
{
    /** @var string[] patterns to match, kept exactly as passed to addPattern() */
    protected $patterns = [];
    /** @var array labels for above patterns (a string, or true when no label was given) */
    protected $labels = [];
    /** @var string|null the compound regex matching all patterns; null until (re)compiled */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case True for case sensitive, false
     *                      for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Adding a pattern discards the cached compound regex, so it is rebuilt
     * from all registered patterns on the next match attempt.
     *
     * @param mixed $pattern Perl style regex. Must be UTF-8
     *                       encoded. If it is a string, the (, )
     *                       lose their meaning unless they
     *                       form part of a lookahead or
     *                       lookbehind assertion.
     * @param bool|string $label Label of regex to be returned
     *                           on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to split the string against all patterns at once.
     *
     * When `$offset` is non-zero, the match begins at that byte position in
     * `$subject`, but the full subject is still passed to PCRE so any
     * lookbehinds in the patterns can see characters before the offset.
     * This is essential for inline-formatting closers like
     * `(?<=[^\s])\*\*`, whose preceding non-whitespace character may have
     * been consumed as part of a previous token (e.g. a `[[link]]`).
     *
     * If no pattern has been registered, false is returned and `$split` is
     * left untouched. If nothing matches, `$split` receives the remainder of
     * the subject (from `$offset`) followed by two empty strings.
     *
     * @param string $subject String to match against.
     * @param array $split The split result: array containing, pre-match, match & post-match strings
     * @param int $offset Byte offset into `$subject` at which to start matching.
     * @return bool|string The label of the matching pattern (true if it has none), false if nothing matched.
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split, $offset = 0)
    {
        if (count($this->patterns) == 0) {
            return false;
        }

        if (! preg_match($this->getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE, $offset)) {
            // preg_match() yields 0 for "no match" and false for an error;
            // only known PCRE errors are reported, a plain miss stays silent.
            switch (preg_last_error()) {
                case PREG_BACKTRACK_LIMIT_ERROR:
                    msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                    break;
                case PREG_RECURSION_LIMIT_ERROR:
                    msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                    break;
                case PREG_BAD_UTF8_ERROR:
                    msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                    break;
                case PREG_INTERNAL_ERROR:
                    msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                    break;
            }

            $split = [substr($subject, $offset), "", ""];
            return false;
        }

        // Every pattern is wrapped in exactly one capturing group and PCRE
        // omits trailing groups that did not take part in the match, so the
        // last entry of $matches belongs to the alternative that matched.
        // Entry 0 is the overall match, hence the "- 2".
        $idx = count($matches) - 2;
        $matchText = (string) $matches[0][0];
        // Byte offset from PREG_OFFSET_CAPTURE; cast makes the int type
        // obvious to static analysers that don't model the flag.
        $matchStart = (int) $matches[0][1];
        $pre = substr($subject, $offset, $matchStart - $offset);
        $post = substr($subject, $matchStart + strlen($matchText));
        $split = [$pre, $matchText, $post];

        return $this->labels[$idx] ?? true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The registered patterns themselves are left unmodified: the escaped
     * forms are collected in a local list. Writing them back would cause
     * them to be escaped a second time when the regex has to be rebuilt
     * after a later addPattern() call.
     *
     * @return null|string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $compiled = [];
            foreach ($this->patterns as $source) {
                // one capturing group per pattern, split() relies on that
                $compiled[] = '(' . $this->escapePattern($source) . ')';
            }
            $this->regex = "/" . implode("|", $compiled) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes a single user supplied pattern for use inside the compound regex.
     *
     * Bare "(" and ")" are turned into literals, "(?" groups (assertions,
     * non-capturing groups, ...) keep their meaning, and "/" is escaped
     * because it is the delimiter of the compound regex. Escape sequences
     * are passed through untouched; in character classes only "/" is escaped.
     *
     * Nesting is only counted for "(?" groups: a bare "(" is escaped without
     * being tracked, so each ")" is paired with the innermost still open "(?".
     *
     * @param string $source The pattern as given to addPattern()
     * @return string The escaped pattern, without the surrounding group
     */
    private function escapePattern($source)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
            '\(\?|' .
            '[()]|' .
            '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
            '[^[()\\\\]+/', $source, $elts);

        $pattern = "";
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            /*
             * for "(", ")" remember the nesting level, add "\"
             * only to the non-"(?" ones.
             */
            switch ($elt) {
                case '(':
                    $pattern .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $pattern .= '\\';
                    }
                    $pattern .= ')';
                    break;
                case '(?':
                    $level++;
                    $pattern .= '(?';
                    break;
                default:
                    if (str_starts_with($elt, '\\')) {
                        // already an escape sequence, keep as is
                        $pattern .= $elt;
                    } else {
                        $pattern .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return $pattern;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}