xref: /dokuwiki/inc/Parsing/Lexer/ParallelRegex.php (revision 8ed75a23932353c18b43f67323808e9a662f532a)
1<?php
2
3/**
4 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5 * For an intro to the Lexer see:
6 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7 *
8 * @author Marcus Baker http://www.lastcraft.com
9 */
10
11namespace dokuwiki\Parsing\Lexer;
12
13/**
14 * Compounded regular expression.
15 *
16 * Any of the contained patterns could match, and when one does its label is returned.
17 */
class ParallelRegex
{
    /** @var string[] raw patterns exactly as passed to addPattern(); never rewritten */
    protected $patterns = [];
    /** @var array<int, bool|string> labels, index-aligned with $patterns */
    protected $labels = [];
    /** @var string|null cached compound regex; null whenever it has to be rebuilt */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Invalidates the cached compound regex so the next split() rebuilds it
     * with the new pattern included.
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8
     *                             encoded. If it's a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertion.
     * @param bool|string $label   Label of regex to be returned
     *                             on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $this->patterns[] = $pattern;
        $this->labels[] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to split the string against all patterns at once.
     *
     * When `$offset` is non-zero, the match begins at that byte position in
     * `$subject`, but the full subject is still passed to PCRE so any
     * lookbehinds in the patterns can see characters before the offset.
     * This is essential for inline-formatting closers like
     * `(?<=[^\s])\*\*`, whose preceding non-whitespace character may have
     * been consumed as part of a previous token (e.g. a `[[link]]`).
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing, pre-match, match & post-match strings.
     *                             Left untouched when no patterns are registered; set to
     *                             [rest of subject from $offset, "", ""] when nothing matches.
     * @param int $offset          Byte offset into `$subject` at which to start matching.
     * @return bool|string         Label of the pattern that matched (true if none is recorded
     *                             for it), false when nothing matched.
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split, $offset = 0)
    {
        if (count($this->patterns) == 0) {
            return false;
        }

        if (! preg_match($this->getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE, $offset)) {
            // Both "no match" (0) and "error" (false) end up here; only a
            // known PCRE error code produces a message.
            $error = match (preg_last_error()) {
                PREG_BACKTRACK_LIMIT_ERROR =>
                    'A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini',
                PREG_RECURSION_LIMIT_ERROR =>
                    'A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini',
                PREG_BAD_UTF8_ERROR =>
                    'A PCRE UTF-8 error occured. This might be caused by a faulty plugin',
                PREG_INTERNAL_ERROR =>
                    'A PCRE internal error occured. This might be caused by a faulty plugin',
                default => null,
            };
            if ($error !== null) {
                msg($error, -1);
            }

            $split = [substr($subject, $offset), "", ""];
            return false;
        }

        // Every pattern contributes exactly one capturing group and PCRE drops
        // trailing unmatched groups, so the last entry of $matches belongs to
        // the alternative that matched. Entry 0 is the whole match, hence -2.
        $idx = count($matches) - 2;
        $matchText = (string) $matches[0][0];
        // Byte offset from PREG_OFFSET_CAPTURE; cast makes the int type
        // obvious to static analysers that don't model the flag.
        $matchStart = (int) $matches[0][1];
        $pre = substr($subject, $offset, $matchStart - $offset);
        $post = substr($subject, $matchStart + strlen($matchText));
        $split = [$pre, $matchText, $post];

        return $this->labels[$idx] ?? true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The stored patterns are left untouched: the escaped, group-wrapped
     * variants only live in a local array. Writing them back would make a
     * rebuild (triggered by a later addPattern()) escape the wrapping
     * parentheses as literals and break every previously compiled pattern.
     *
     * @return string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $alternatives = [];
            foreach ($this->patterns as $pattern) {
                $alternatives[] = '(' . $this->compilePattern($pattern) . ')';
            }
            $this->regex = "/" . implode("|", $alternatives) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes a single raw pattern for use inside the compound regex.
     *
     * Plain "(" and ")" become literals, "(?...)" constructs keep their
     * meaning, and unescaped "/" is escaped because it is the delimiter of
     * the compound regex.
     *
     * @param string $pattern raw pattern as given to addPattern()
     * @return string the escaped pattern, without the wrapping capture group
     */
    private function compilePattern($pattern): string
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all(
            '/\\\\.|' .
            '\(\?|' .
            '[()]|' .
            '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
            '[^[()\\\\]+/',
            $pattern,
            $elts
        );

        $compiled = '';
        $level = 0; // number of "(?" constructs that are currently open

        foreach ($elts[0] as $elt) {
            if ($elt === '(') {
                // bare opening parenthesis: always a literal
                $compiled .= '\(';
            } elseif ($elt === ')') {
                if ($level > 0) {
                    $level--; /* closing (? */
                } else {
                    $compiled .= '\\';
                }
                $compiled .= ')';
            } elseif ($elt === '(?') {
                $level++;
                $compiled .= '(?';
            } elseif (str_starts_with($elt, '\\')) {
                // already an escape sequence, keep as is
                $compiled .= $elt;
            } else {
                $compiled .= str_replace('/', '\/', $elt);
            }
        }

        return $compiled;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}
186