xref: /dokuwiki/inc/Parsing/Lexer/ParallelRegex.php (revision fe2e97f6756a6c569a993bc8d25fb5772749f89f)
1be906b56SAndreas Gohr<?php
2be906b56SAndreas Gohr/**
3be906b56SAndreas Gohr * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4be906b56SAndreas Gohr * For an intro to the Lexer see:
5be906b56SAndreas Gohr * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6be906b56SAndreas Gohr *
7be906b56SAndreas Gohr * @author Marcus Baker http://www.lastcraft.com
8be906b56SAndreas Gohr */
9be906b56SAndreas Gohr
10be906b56SAndreas Gohrnamespace dokuwiki\Parsing\Lexer;
11be906b56SAndreas Gohr
/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does, its label is returned.
 */
class ParallelRegex
{
    /**
     * @var string[] patterns to match. Entries with an index below $compiledCount have
     *               already been escaped and wrapped in a capturing group.
     */
    protected $patterns;
    /** @var array labels for above patterns (same indexes) */
    protected $labels;
    /** @var string|null the compound regex matching all patterns, null until it is (re)built */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;
    /** @var int number of leading entries in $patterns that are already compiled */
    protected $compiledCount = 0;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
        $this->patterns = array();
        $this->labels = array();
        $this->regex = null;
        $this->compiledCount = 0;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Invalidates the cached compound regex; it is rebuilt on the next match attempt.
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8
     *                             encoded. If it is a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertion.
     * @param bool|string $label   Label of regex to be returned
     *                             on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * A pattern whose match is the empty string is not identified by its label;
     * in that case true is returned.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of
     *                             subject. Empty string when nothing matched.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    public function apply($subject, &$match)
    {
        // leave the out-parameter defined on every return path
        $match = "";

        if (count($this->patterns) == 0) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        for ($i = 1; $i < $size; $i++) {
            // compare against '' explicitly: a plain truthiness test would skip a matched "0"
            if ($matches[$i] !== '' && isset($this->labels[$i - 1])) {
                return $this->labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing, pre-match, match & post-match strings.
     *                             When nothing matches it is array($subject, "", "").
     * @return bool|string         False if no match found, otherwise the label of the
     *                             matching pattern (true if that label is not set)
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if (count($this->patterns) == 0) {
            // same "nothing matched" shape as the no-match branch below
            $split = array($subject, "", "");
            return false;
        }

        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            // a failed match may actually be a PCRE error; tell the user about it
            switch (preg_last_error()) {
                case PREG_BACKTRACK_LIMIT_ERROR:
                    msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                    break;
                case PREG_RECURSION_LIMIT_ERROR:
                    msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                    break;
                case PREG_BAD_UTF8_ERROR:
                    msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                    break;
                case PREG_INTERNAL_ERROR:
                    msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                    break;
            }

            $split = array($subject, "", "");
            return false;
        }

        // preg_match() drops trailing unmatched groups, so the last entry of $matches
        // belongs to the alternative that matched; -1 for $matches[0], -1 for zero-based
        $idx = count($matches) - 2;

        // compiled patterns are stored wrapped in "(" ... ")", which PCRE accepts as a
        // bracket style delimiter pair, so the entry can be used as a regex on its own
        list($pre, $post) = preg_split($this->patterns[$idx] . $this->getPerlMatchingFlags(), $subject, 2);
        $split = array($pre, $matches[0], $post);

        return isset($this->labels[$idx]) ? $this->labels[$idx] : true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * Each pattern is escaped exactly once: only entries added since the last
     * build are processed, so adding patterns after a match attempt does not
     * escape the older ones a second time.
     *
     * @return null|string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $cnt = count($this->patterns);
            for ($i = $this->compiledCount; $i < $cnt; $i++) {
                $this->patterns[$i] = '(' . $this->escapePattern($this->patterns[$i]) . ')';
            }
            $this->compiledCount = $cnt;
            $this->regex = "/" . implode("|", $this->patterns) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes a single raw pattern so it can be embedded in the compound regex.
     *
     * Plain "(" and ")" become literals, "(?...)" constructs keep their meaning,
     * and "/" is escaped outside of backslash sequences.
     *
     * @param string $raw the pattern as passed to addPattern()
     * @return string the escaped pattern, without the surrounding group
     */
    private function escapePattern($raw)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $raw, $elts);

        $pattern = "";
        $level = 0; // number of currently open "(?" constructs

        foreach ($elts[0] as $elt) {
            switch ($elt) {
                case '(':
                    // a bare opening parenthesis is always taken literally
                    $pattern .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $pattern .= '\\';
                    }
                    $pattern .= ')';
                    break;
                case '(?':
                    $level++;
                    $pattern .= '(?';
                    break;
                default:
                    if (substr($elt, 0, 1) === '\\') {
                        // backslash sequences are passed through untouched
                        $pattern .= $elt;
                    } else {
                        // "/" is the delimiter of the compound regex
                        $pattern .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return $pattern;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}
204