xref: /dokuwiki/inc/Parsing/Lexer/ParallelRegex.php (revision d4f83172d9533c4d84f450fe22ef630816b21d75)
1be906b56SAndreas Gohr<?php
2*d4f83172SAndreas Gohr
3be906b56SAndreas Gohr/**
4be906b56SAndreas Gohr * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
5be906b56SAndreas Gohr * For an intro to the Lexer see:
6be906b56SAndreas Gohr * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
7be906b56SAndreas Gohr *
8be906b56SAndreas Gohr * @author Marcus Baker http://www.lastcraft.com
9be906b56SAndreas Gohr */
10be906b56SAndreas Gohr
11be906b56SAndreas Gohrnamespace dokuwiki\Parsing\Lexer;
12be906b56SAndreas Gohr
/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does, its label is returned.
 */
class ParallelRegex
{
    /** @var string[] patterns to match, kept exactly as passed to addPattern() */
    protected $patterns = [];
    /** @var array labels for above patterns (same indexes) */
    protected $labels = [];
    /** @var string|null the compound regex matching all patterns, null while it needs (re)building */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Adding a pattern invalidates the cached compound regex; it is rebuilt
     * from the stored raw patterns on the next apply()/split().
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8
     *                             encoded. If its a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertation.
     * @param bool|string $label   Label of regex to be returned
     *                             on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of
     *                             subject, empty string if nothing matched.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    public function apply($subject, &$match)
    {
        $match = "";
        if (count($this->patterns) == 0) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            return false;
        }

        $match = $matches[0];

        // Every pattern is one capture group of the compound regex and
        // preg_match() drops trailing groups that did not take part in the
        // match, so the last entry belongs to the pattern that matched.
        // Testing the captured text for truthiness instead would miss
        // matches such as "0" or an empty (lookahead only) match.
        return $this->labels[count($matches) - 2] ?? true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing, pre-match, match & post-match strings.
     *                             Without a match it is set to [$subject, "", ""].
     * @return bool|string         False if no match found, label if label exists, true if not
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        $split = [$subject, "", ""];
        if (count($this->patterns) == 0) {
            return false;
        }

        if (! preg_match($this->getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            $this->reportPcreError(preg_last_error());
            return false;
        }

        // Cut the subject around the match using the byte offset reported by
        // PCRE instead of running the single pattern a second time.
        [$match, $offset] = $matches[0];
        $split = [
            substr($subject, 0, $offset),
            $match,
            substr($subject, $offset + strlen($match)),
        ];

        // the last capture group identifies the pattern that matched
        return $this->labels[count($matches) - 2] ?? true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The stored patterns are left untouched, so the compound regex can be
     * rebuilt safely after further patterns have been added.
     *
     * @return string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $groups = [];
            foreach ($this->patterns as $pattern) {
                // one capture group per pattern, in the order they were added
                $groups[] = '(' . $this->escapePattern($pattern) . ')';
            }
            $this->regex = "/" . implode("|", $groups) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }

    /**
     * Escapes a single pattern for use as one alternative of the compound regex.
     *
     * Plain "(" and ")" become literals, "(?" groups keep their meaning and
     * unescaped "/" is escaped because it is the compound regex delimiter.
     *
     * @param string $pattern the pattern as passed to addPattern()
     * @return string the escaped pattern, without surrounding parentheses
     */
    private function escapePattern($pattern)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all(
            '/\\\\.|' .
            '\(\?|' .
            '[()]|' .
            '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
            '[^[()\\\\]+/',
            $pattern,
            $elts
        );

        $escaped = "";
        $level = 0; // number of "(?" groups that are currently open

        foreach ($elts[0] as $elt) {
            switch ($elt) {
                case '(':
                    // a plain "(" is always taken literally
                    $escaped .= '\(';
                    break;
                case ')':
                    /*
                     * Closes the innermost open "(?" group if there is one,
                     * otherwise it is a literal. Note that only "(?" is
                     * counted: a plain "(" inside a "(?" group is escaped
                     * but the next ")" still closes that group.
                     */
                    if ($level > 0) {
                        $level--;
                    } else {
                        $escaped .= '\\';
                    }
                    $escaped .= ')';
                    break;
                case '(?':
                    $level++;
                    $escaped .= '(?';
                    break;
                default:
                    if (substr($elt, 0, 1) == '\\') {
                        // already an escape sequence, keep as is
                        $escaped .= $elt;
                    } else {
                        $escaped .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return $escaped;
    }

    /**
     * Shows a message for the PCRE errors a failed match may have caused.
     *
     * @param int $err error code as returned by preg_last_error()
     */
    private function reportPcreError($err)
    {
        switch ($err) {
            case PREG_BACKTRACK_LIMIT_ERROR:
                msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                break;
            case PREG_RECURSION_LIMIT_ERROR:
                msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                break;
            case PREG_BAD_UTF8_ERROR:
                msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                break;
            case PREG_INTERNAL_ERROR:
                msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                break;
        }
    }
}
202