xref: /dokuwiki/inc/Parsing/Lexer/ParallelRegex.php (revision 504c13e8df88563c11b3720b317991bc38835a35)
1<?php
2
3/**
4 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5 * For an intro to the Lexer see:
6 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7 *
8 * @author Marcus Baker http://www.lastcraft.com
9 */
10
11namespace dokuwiki\Parsing\Lexer;
12
/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 */
class ParallelRegex
{
    /** @var string[] patterns to match, kept exactly as they were passed to addPattern() */
    protected $patterns = [];
    /** @var array labels for above patterns (bool|string each), stored under the same index */
    protected $labels = [];
    /** @var string|null the compound regex matching all patterns; null means it has to be (re)built */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Adding a pattern discards the cached compound regex, so it is rebuilt
     * from the stored patterns on the next split().
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8
     *                             encoded. If it's a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertion.
     * @param bool|string $label   Label of regex to be returned
     *                             on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * When there are no patterns at all, false is returned and $split is left untouched.
     * When nothing matches, $split becomes [$subject, "", ""] and false is returned.
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing, pre-match, match & post-match strings
     * @return bool|string         Label of the pattern that matched (true if it has none),
     *                             false if nothing matched.
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if (count($this->patterns) === 0) {
            return false;
        }

        if (!preg_match($this->getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            $this->reportPcreError();

            $split = [$subject, "", ""];
            return false;
        }

        // Every pattern sits in its own capture group of the compound regex, so the
        // last group present in $matches identifies the pattern that matched
        // ($matches[0] is the full match, hence the offset of 2).
        $idx = count($matches) - 2;

        // Cut the subject around the match that was actually found instead of
        // running the single pattern a second time.
        [$match, $offset] = $matches[0];
        $split = [
            substr($subject, 0, $offset),
            $match,
            substr($subject, $offset + strlen($match)),
        ];

        return $this->labels[$idx] ?? true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The stored patterns are not modified, so the compound regex can be
     * rebuilt safely after further addPattern() calls.
     *
     * @return string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $alternatives = [];
            foreach ($this->patterns as $pattern) {
                // one capture group per pattern, see split()
                $alternatives[] = '(' . $this->escapePattern($pattern) . ')';
            }
            $this->regex = "/" . implode("|", $alternatives) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }

    /**
     * Escapes a single pattern so it can be embedded in the "/"-delimited compound regex.
     *
     * Plain "(" and ")" are turned into literals, "(?" groups keep their meaning and
     * unescaped "/" gets a backslash. Character classes and backslash sequences are
     * passed through (only "/" is escaped inside character classes).
     *
     * Note: a plain "(" does not change the nesting level, so inside an open "(?" group
     * the next ")" always closes that group, even if a plain "(" came before it.
     *
     * @param string $pattern pattern as passed to addPattern()
     * @return string the escaped pattern, without surrounding group or delimiters
     */
    private function escapePattern($pattern)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $pattern, $elts);

        $escaped = "";
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            if ($elt === '(') {
                // plain parenthesis: always a literal
                $escaped .= '\(';
            } elseif ($elt === ')') {
                if ($level > 0) {
                    // closing a "(?" group
                    $level--;
                    $escaped .= ')';
                } else {
                    $escaped .= '\)';
                }
            } elseif ($elt === '(?') {
                $level++;
                $escaped .= '(?';
            } elseif (str_starts_with($elt, '\\')) {
                // already escaped sequence, keep as is
                $escaped .= $elt;
            } else {
                // "/" is the delimiter of the compound regex
                $escaped .= str_replace('/', '\/', $elt);
            }
        }

        return $escaped;
    }

    /**
     * Reports the last PCRE error through msg(), if there was one worth reporting.
     *
     * @return void
     */
    private function reportPcreError()
    {
        $messages = [
            PREG_BACKTRACK_LIMIT_ERROR =>
                'A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini',
            PREG_RECURSION_LIMIT_ERROR =>
                'A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini',
            PREG_BAD_UTF8_ERROR =>
                'A PCRE UTF-8 error occured. This might be caused by a faulty plugin',
            PREG_INTERNAL_ERROR =>
                'A PCRE internal error occured. This might be caused by a faulty plugin',
        ];

        $err = preg_last_error();
        if (isset($messages[$err])) {
            msg($messages[$err], -1);
        }
    }
}
173