1<?php
2
3/**
4 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5 * For an intro to the Lexer see:
6 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7 *
8 * @author Marcus Baker http://www.lastcraft.com
9 */
10
11namespace dokuwiki\Parsing\Lexer;
12
/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does, its label is returned.
 */
class ParallelRegex
{
    /**
     * @var string[] patterns to match. The first $escapedCount entries have already been
     *               rewritten by getCompoundedRegex() into their escaped "(...)" form.
     */
    protected $patterns = [];
    /** @var array<int, bool|string> labels for above patterns */
    protected $labels = [];
    /** @var string|null the compound regex matching all patterns, null if it has to be (re)built */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;
    /** @var int number of leading entries in $patterns that are already in escaped form */
    private $escapedCount = 0;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Invalidates the cached compound regex; it is rebuilt on the next apply() or split().
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8
     *                             encoded. If it's a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertion.
     * @param bool|string $label   Label of regex to be returned
     *                             on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of
     *                             subject. Empty string if nothing matched.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    public function apply($subject, &$match)
    {
        // always hand back a string, even on the early exits below
        $match = "";

        if (count($this->patterns) == 0) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            return false;
        }

        $match = $matches[0];

        /*
         * Every pattern is wrapped in exactly one capturing group and preg_match()
         * leaves out trailing groups that did not take part in the match, so the
         * last reported group belongs to the pattern that matched. Looking at the
         * group count instead of the captured text keeps this working when the
         * matched text is "0" or empty.
         */
        $idx = count($matches) - 2;

        return $this->labels[$idx] ?? true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * On a PCRE failure a message is reported through msg().
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing, pre-match, match & post-match strings.
     *                             Set to [$subject, "", ""] when nothing matched.
     * @return bool|string         False if no match found, label if label exists, true if not
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if (count($this->patterns) == 0) {
            return false;
        }

        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            // preg_match() returns false on errors as well as 0 on "no match", tell them apart
            $errors = [
                PREG_BACKTRACK_LIMIT_ERROR =>
                    'A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini',
                PREG_RECURSION_LIMIT_ERROR =>
                    'A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini',
                PREG_BAD_UTF8_ERROR =>
                    'A PCRE UTF-8 error occured. This might be caused by a faulty plugin',
                PREG_INTERNAL_ERROR =>
                    'A PCRE internal error occured. This might be caused by a faulty plugin',
            ];
            $err = preg_last_error();
            if (isset($errors[$err])) {
                msg($errors[$err], -1);
            }

            $split = [$subject, "", ""];
            return false;
        }

        // the last reported group identifies the pattern that matched (see apply())
        $idx = count($matches) - 2;
        // the stored pattern is already wrapped in "(...)", which doubles as the regex delimiter pair here
        [$pre, $post] = preg_split($this->patterns[$idx] . $this->getPerlMatchingFlags(), $subject, 2);
        $split = [$pre, $matches[0], $post];

        return $this->labels[$idx] ?? true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * Patterns are rewritten in place exactly once; patterns added after an
     * earlier build are the only ones processed on the next build.
     *
     * @return null|string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $cnt = count($this->patterns);
            // skip entries that were escaped by a previous build, escaping them again would corrupt them
            for ($i = $this->escapedCount; $i < $cnt; $i++) {
                $this->patterns[$i] = '(' . $this->escapePattern($this->patterns[$i]) . ')';
            }
            $this->escapedCount = $cnt;
            $this->regex = "/" . implode("|", $this->patterns) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes a single raw pattern so it can be embedded in the compound regex.
     *
     * Plain "(" and ")" are turned into literals, "(?" groups are kept and "/" is
     * escaped outside of backslash sequences. Note that while a "(?" group is open,
     * the next ")" closes it, even when a plain "(" was seen in between.
     *
     * @param string $raw pattern as passed to addPattern()
     * @return string the escaped pattern, without the surrounding group
     */
    private function escapePattern($raw)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all(
            '/\\\\.|' .
            '\(\?|' .
            '[()]|' .
            '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
            '[^[()\\\\]+/',
            $raw,
            $elts
        );

        $pattern = "";
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            switch ($elt) {
                case '(':
                    $pattern .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                        $pattern .= ')';
                    } else {
                        $pattern .= '\)';
                    }
                    break;
                case '(?':
                    $level++;
                    $pattern .= '(?';
                    break;
                default:
                    if (str_starts_with($elt, '\\')) {
                        // backslash sequences are passed through untouched
                        $pattern .= $elt;
                    } else {
                        $pattern .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return $pattern;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}
202