xref: /dokuwiki/inc/Parsing/Lexer/ParallelRegex.php (revision 8c7c53b0321a3cd3116b8d3b2ad27863a38dece7)
1<?php
2/**
3 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4 * For an intro to the Lexer see:
5 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6 *
7 * @author Marcus Baker http://www.lastcraft.com
8 */
9
10namespace dokuwiki\Parsing\Lexer;
11
12/**
13 * Compounded regular expression.
14 *
 * Any of the contained patterns could match and when one does its label is returned.
16 */
class ParallelRegex
{
    /**
     * @var string[] patterns to match, stored exactly as passed to addPattern().
     *               They are never modified, so the compound regex can be rebuilt at any time.
     */
    protected $patterns = [];
    /** @var array labels for above patterns, sharing the same indexes */
    protected $labels = [];
    /** @var string|null the compound regex matching all patterns, null when it has to be (re)built */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Adding a pattern invalidates the cached compound regex; it is rebuilt
     * lazily on the next apply() or split() call.
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8
     *                             encoded. If it's a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertion.
     * @param bool|string $label   Label of regex to be returned
     *                             on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        // use one explicit index so pattern and label always stay aligned
        $index = count($this->patterns);
        $this->patterns[$index] = $pattern;
        $this->labels[$index] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of
     *                             subject. Empty string when nothing matched.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    public function apply($subject, &$match)
    {
        if (count($this->patterns) === 0) {
            $match = "";
            return false;
        }
        if (!preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            $match = "";
            return false;
        }

        $match = $matches[0];

        // Every pattern is wrapped in exactly one capturing group and preg_match()
        // drops trailing groups that did not take part in the match. The last
        // reported group therefore belongs to the pattern that matched. Looking
        // at the index instead of the captured text also works for matches that
        // are falsy strings such as "0".
        return $this->labels[count($matches) - 2] ?? true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing, pre-match, match & post-match strings.
     *                             On failure it is set to [$subject, "", ""].
     * @return bool|string         False if no match found, label if label exists, true if not
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if (count($this->patterns) === 0) {
            $split = [$subject, "", ""];
            return false;
        }

        if (!preg_match($this->getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            // preg_match() returns false on errors and 0 on "no match", report the former
            $this->reportPcreError();

            $split = [$subject, "", ""];
            return false;
        }

        // with PREG_OFFSET_CAPTURE each entry is [matched text, byte offset]
        [$matched, $offset] = $matches[0];
        $split = [
            (string) substr($subject, 0, $offset),
            $matched,
            (string) substr($subject, $offset + strlen($matched)),
        ];

        // the last reported group identifies the pattern that matched
        return $this->labels[count($matches) - 2] ?? true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The stored patterns are left untouched, so rebuilding the regex after
     * another addPattern() call does not escape earlier patterns a second time.
     *
     * @return string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $alternatives = [];
            foreach ($this->patterns as $pattern) {
                // one capturing group per pattern, apply() and split() rely on that
                $alternatives[] = '(' . $this->escapePattern($pattern) . ')';
            }
            $this->regex = '/' . implode('|', $alternatives) . '/' . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }

    /**
     * Prepares a single pattern for use inside the compound regex.
     *
     * Plain "(" and ")" are escaped so they match literally, "(?" groups and
     * their closing ")" are kept, and "/" is escaped because it is the
     * delimiter of the compound regex.
     *
     * @param mixed $pattern the pattern as given to addPattern()
     * @return string        the escaped pattern, without surrounding group
     */
    private function escapePattern($pattern)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all(
            '/\\\\.|' .
            '\(\?|' .
            '[()]|' .
            '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
            '[^[()\\\\]+/',
            (string) $pattern,
            $elts
        );

        $escaped = '';
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            switch ($elt) {
                case '(':
                    $escaped .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        // closes a "(?" group, has to stay functional
                        $level--;
                        $escaped .= ')';
                    } else {
                        $escaped .= '\)';
                    }
                    break;
                case '(?':
                    $level++;
                    $escaped .= '(?';
                    break;
                default:
                    if (substr($elt, 0, 1) === '\\') {
                        // already an escape sequence, copy verbatim
                        $escaped .= $elt;
                    } else {
                        $escaped .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return $escaped;
    }

    /**
     * Reports the last PCRE error, if any, to the user via msg().
     *
     * @return void
     */
    private function reportPcreError()
    {
        switch (preg_last_error()) {
            case PREG_BACKTRACK_LIMIT_ERROR:
                msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                break;
            case PREG_RECURSION_LIMIT_ERROR:
                msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                break;
            case PREG_BAD_UTF8_ERROR:
                msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                break;
            case PREG_INTERNAL_ERROR:
                msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                break;
        }
    }
}
201