xref: /dokuwiki/inc/Parsing/Lexer/ParallelRegex.php (revision bcaec9f47d06126b3e653fea89a86d8b6a6cbef8)
<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 */
class ParallelRegex
{
    /** @var string[] patterns to match, kept exactly as they were passed to addPattern() */
    protected $patterns = [];
    /** @var array labels (bool|string) for above patterns, same indexes */
    protected $labels = [];
    /** @var string|null the compound regex matching all patterns, null until it is (re)built */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Adding a pattern invalidates the cached compound regex; it is rebuilt
     * from the unmodified patterns on the next apply()/split() call.
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8
     *                             encoded. If its a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertation.
     * @param bool|string $label   Label of regex to be returned
     *                             on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $this->patterns[] = $pattern;
        $this->labels[] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of
     *                             subject. Set to "" when nothing matched; left
     *                             untouched when no patterns are registered.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    public function apply($subject, &$match)
    {
        if (count($this->patterns) === 0) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            $match = "";
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        // Every pattern is exactly one capturing group, so group $i belongs to label $i - 1.
        // Compare against '' explicitly: a plain truthiness test would skip a matched "0".
        for ($i = 1; $i < $size; $i++) {
            if ($matches[$i] !== '' && isset($this->labels[$i - 1])) {
                return $this->labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing, pre-match, match & post-match strings.
     *                             On failure it is set to [$subject, "", ""]; it is left untouched when
     *                             no patterns are registered.
     * @return bool|string         False if no match found, label of the matching pattern if it has one,
     *                             true otherwise
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if (count($this->patterns) === 0) {
            return false;
        }

        if (! preg_match($this->getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            // distinguish a plain "no match" from a PCRE failure and tell the user about the latter
            switch (preg_last_error()) {
                case PREG_BACKTRACK_LIMIT_ERROR:
                    msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                    break;
                case PREG_RECURSION_LIMIT_ERROR:
                    msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                    break;
                case PREG_BAD_UTF8_ERROR:
                    msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                    break;
                case PREG_INTERNAL_ERROR:
                    msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                    break;
            }

            $split = [$subject, "", ""];
            return false;
        }

        // preg_match() does not report trailing groups that did not take part in the match,
        // so the last reported group is the pattern that matched.
        $idx = count($matches) - 2;

        // Cut the subject around the match using the byte offset reported by the first
        // preg_match() call instead of running the single pattern a second time.
        [$matched, $offset] = $matches[0];
        $split = [
            substr($subject, 0, $offset),
            $matched,
            substr($subject, $offset + strlen($matched)),
        ];

        return $this->labels[$idx] ?? true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The stored patterns are not modified, so the regex can safely be
     * rebuilt after further patterns have been added.
     *
     * @return null|string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $groups = [];
            foreach ($this->patterns as $pattern) {
                // exactly one capturing group per pattern keeps group numbers aligned with labels
                $groups[] = '(' . $this->escapePattern($pattern) . ')';
            }
            $this->regex = "/" . implode("|", $groups) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes a single pattern so it can be embedded in the compound regex.
     *
     * Plain "(" and ")" become literals, "(?" constructs stay intact and
     * unescaped "/" (the delimiter of the compound regex) gets escaped.
     *
     * @param string $raw the pattern as passed to addPattern()
     * @return string the escaped pattern, without surrounding group
     */
    protected function escapePattern($raw)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $raw, $elts);

        $escaped = "";
        $level = 0; // number of currently open "(?" constructs

        foreach ($elts[0] as $elt) {
            /*
             * for "(?", ")" remember the nesting level, add "\"
             * only to the non-"(?" ones.
             * NOTE: a plain "(" does not raise the level, so inside an open "(?"
             * the next ")" closes that construct.
             */
            if ($elt === '(') {
                $escaped .= '\(';
            } elseif ($elt === ')') {
                if ($level > 0) {
                    $level--; /* closing (? */
                } else {
                    $escaped .= '\\';
                }
                $escaped .= ')';
            } elseif ($elt === '(?') {
                $level++;
                $escaped .= '(?';
            } elseif (substr($elt, 0, 1) == '\\') {
                // already an escape sequence, keep as is
                $escaped .= $elt;
            } else {
                $escaped .= str_replace('/', '\/', $elt);
            }
        }

        return $escaped;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}
201