xref: /dokuwiki/inc/Parsing/Lexer/ParallelRegex.php (revision 8ab4ec30c7bd174872065a1b719923d1964ac999)
1be906b56SAndreas Gohr<?php
2d4f83172SAndreas Gohr
3be906b56SAndreas Gohr/**
4be906b56SAndreas Gohr * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5be906b56SAndreas Gohr * For an intro to the Lexer see:
6be906b56SAndreas Gohr * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7be906b56SAndreas Gohr *
8be906b56SAndreas Gohr * @author Marcus Baker http://www.lastcraft.com
9be906b56SAndreas Gohr */
10be906b56SAndreas Gohr
11be906b56SAndreas Gohrnamespace dokuwiki\Parsing\Lexer;
12be906b56SAndreas Gohr
/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 */
class ParallelRegex
{
    /**
     * @var string[] patterns to match. Entries are rewritten in place to their
     *               escaped, parenthesised form the first time they are compiled.
     */
    protected $patterns = [];
    /** @var array<int, bool|string> labels for above patterns */
    protected $labels = [];
    /** @var string|null the compound regex matching all patterns, null while it has to be (re)built */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;
    /** @var int number of leading entries of $patterns that are already in compiled form */
    protected $compiled = 0;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param bool $case True for case sensitive, false for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Adding a pattern invalidates the cached compound regex; it is rebuilt
     * lazily on the next call to split().
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8
     *                             encoded. If it's a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertion.
     * @param bool|string $label   Label of regex to be returned
     *                             on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * @param string $subject String to match against.
     * @param array  $split   The split result: array containing pre-match, match & post-match strings.
     *                        When nothing matches it is set to [$subject, "", ""].
     * @return bool|string    The label of the matching pattern (true if it has none),
     *                        false if no pattern matched.
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        // give the by-reference result a defined shape on every exit path
        $split = [$subject, "", ""];

        if (count($this->patterns) == 0) {
            return false;
        }

        // PREG_OFFSET_CAPTURE yields the match position, so the pre- and post-match
        // parts can be cut from the subject without running a second regex.
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            $this->reportPcreError(preg_last_error());
            return false;
        }

        // Each pattern is wrapped in exactly one capturing group and trailing
        // unmatched groups are not reported, so the last group present
        // identifies the pattern that matched.
        $idx = count($matches) - 2;
        [$match, $offset] = $matches[0];

        $split = [
            substr($subject, 0, $offset),
            $match,
            substr($subject, $offset + strlen($match)),
        ];

        return $this->labels[$idx] ?? true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * Only patterns added since the previous compilation are escaped; the
     * earlier entries of $patterns already hold their compiled form and
     * escaping them a second time would turn their wrapping group into
     * literal parentheses.
     *
     * @return string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $cnt = count($this->patterns);
            for ($i = $this->compiled; $i < $cnt; $i++) {
                $this->patterns[$i] = '(' . $this->escapePattern($this->patterns[$i]) . ')';
            }
            $this->compiled = $cnt;
            $this->regex = "/" . implode("|", $this->patterns) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes a single user supplied pattern for use inside the compound regex.
     *
     * Plain "(" and ")" become literals, "(?" groups are kept, and "/" is
     * escaped because it is the delimiter of the compound regex.
     *
     * NOTE(review): group nesting is tracked with a simple counter, so a literal
     * "(...)" pair nested inside a "(?...)" group closes that group early.
     * Behaviour is kept as is for compatibility with existing patterns.
     *
     * @param string $raw the pattern as passed to addPattern()
     * @return string the escaped pattern, without the wrapping capture group
     */
    protected function escapePattern($raw)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $raw, $elts);

        $pattern = "";
        $level = 0;

        foreach ($elts[0] as $elt) {
            /*
             * for "(", ")" remember the nesting level, add "\"
             * only to the non-"(?" ones.
             */
            switch ($elt) {
                case '(':
                    $pattern .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $pattern .= '\\';
                    }
                    $pattern .= ')';
                    break;
                case '(?':
                    $level++;
                    $pattern .= '(?';
                    break;
                default:
                    if (str_starts_with($elt, '\\')) {
                        // escape sequences are passed through untouched
                        $pattern .= $elt;
                    } else {
                        $pattern .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return $pattern;
    }

    /**
     * Reports a PCRE failure to the user. Does nothing for PREG_NO_ERROR
     * (a plain "no match") or for error codes not listed here.
     *
     * @param int $err error code as returned by preg_last_error()
     * @return void
     */
    protected function reportPcreError($err)
    {
        switch ($err) {
            case PREG_BACKTRACK_LIMIT_ERROR:
                msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                break;
            case PREG_RECURSION_LIMIT_ERROR:
                msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                break;
            case PREG_BAD_UTF8_ERROR:
                msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                break;
            case PREG_INTERNAL_ERROR:
                msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                break;
        }
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}
173