1 <?php
2 
3 /**
4  * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5  * For an intro to the Lexer see:
6  * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7  *
8  * @author Marcus Baker http://www.lastcraft.com
9  */
10 
11 namespace dokuwiki\Parsing\Lexer;
12 
13 /**
14  * Compounded regular expression.
15  *
 * Any of the contained patterns could match and when one does, its label is returned.
17  */
class ParallelRegex
{
    /** @var string[] raw patterns exactly as passed to addPattern() */
    protected $patterns = [];
    /** @var array labels for the patterns above (same indexes) */
    protected $labels = [];
    /** @var string|null cached compound regex matching all patterns, null when it has to be (re)built */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Adding a pattern invalidates the cached compound regex; it is rebuilt
     * from the raw patterns on the next apply()/split() call.
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8
     *                             encoded. If it is a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertion.
     * @param bool|string $label   Label of regex to be returned
     *                             on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of
     *                             subject. Set to "" when nothing matched.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    public function apply($subject, &$match)
    {
        if (count($this->patterns) == 0) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            $match = "";
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        // FIXME this could be made faster by storing the labels as keys in a hashmap
        for ($i = 1; $i < $size; $i++) {
            // Groups of alternatives that did not take part in the match are ''.
            // Compare against '' explicitly: a captured "0" is falsy in PHP but
            // is a perfectly valid match.
            if ($matches[$i] !== '' && isset($this->labels[$i - 1])) {
                return $this->labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * On a PCRE error a message is reported through msg() and the call is
     * treated like "no match".
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing, pre-match, match & post-match strings.
     *                             When nothing matched it is [$subject, "", ""].
     * @return bool|string         False if no match found, label of the matching pattern
     *                             if it has one, true if not
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if (count($this->patterns) == 0) {
            return false;
        }

        // PREG_OFFSET_CAPTURE gives the byte offset of the match, so the subject can be
        // cut directly instead of running a second regex over it
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            switch (preg_last_error()) {
                case PREG_BACKTRACK_LIMIT_ERROR:
                    msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                    break;
                case PREG_RECURSION_LIMIT_ERROR:
                    msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                    break;
                case PREG_BAD_UTF8_ERROR:
                    msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                    break;
                case PREG_INTERNAL_ERROR:
                    msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                    break;
            }

            $split = [$subject, "", ""];
            return false;
        }

        // preg_match drops trailing unmatched groups, so the last group present
        // belongs to the alternative that matched ($matches[0] is the full match)
        $idx = count($matches) - 2;

        [$matched, $offset] = $matches[0];
        $split = [
            substr($subject, 0, $offset),
            $matched,
            substr($subject, $offset + strlen($matched)),
        ];

        return $this->labels[$idx] ?? true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The raw patterns are left untouched, so the compound regex can be
     * rebuilt correctly after further addPattern() calls.
     *
     * @return string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $compiled = [];
            foreach ($this->patterns as $pattern) {
                // one capturing group per pattern, used to identify the matching alternative
                $compiled[] = '(' . $this->compilePattern($pattern) . ')';
            }
            $this->regex = "/" . implode("|", $compiled) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes a single raw pattern for use inside the compound regex.
     *
     * Plain "(" and ")" are escaped so they match literally, "(?" groups and
     * their closing ")" are kept, and "/" is escaped because it is the
     * delimiter of the compound regex.
     *
     * @param string $raw pattern as passed to addPattern()
     * @return string the escaped pattern (without surrounding group)
     */
    protected function compilePattern($raw)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $raw, $elts);

        $pattern = "";
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            /*
             * for "(", ")" remember the nesting level, add "\"
             * only to the non-"(?" ones.
             */
            switch ($elt) {
                case '(':
                    $pattern .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $pattern .= '\\';
                    }
                    $pattern .= ')';
                    break;
                case '(?':
                    $level++;
                    $pattern .= '(?';
                    break;
                default:
                    if (str_starts_with($elt, '\\')) {
                        // escape sequences are passed through unchanged
                        $pattern .= $elt;
                    } else {
                        $pattern .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return $pattern;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}
202