xref: /dokuwiki/inc/Parsing/Lexer/ParallelRegex.php (revision be906b566b9bdfd92c032ee07c4fd077d820a8d1)
1*be906b56SAndreas Gohr<?php
2*be906b56SAndreas Gohr/**
3*be906b56SAndreas Gohr * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4*be906b56SAndreas Gohr * For an intro to the Lexer see:
5*be906b56SAndreas Gohr * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6*be906b56SAndreas Gohr *
7*be906b56SAndreas Gohr * @author Marcus Baker http://www.lastcraft.com
8*be906b56SAndreas Gohr */
9*be906b56SAndreas Gohr
10*be906b56SAndreas Gohrnamespace dokuwiki\Parsing\Lexer;
11*be906b56SAndreas Gohr
/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 */
class ParallelRegex
{
    /** @var string[] patterns to match, exactly as handed to addPattern() */
    protected $patterns;
    /** @var array labels for above patterns, stored under the same indexes */
    protected $labels;
    /** @var string|null the compound regex matching all patterns, null while not (re)built */
    protected $regex;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
        $this->patterns = array();
        $this->labels = array();
        $this->regex = null;
    }

    /**
     * Adds a pattern with an optional label.
     *
     * The cached compound regex is discarded, so it is rebuilt from all
     * registered patterns on the next match() or split() call.
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8
     *                             encoded. If it is a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertion.
     * @param bool|string $label   Label of regex to be returned
     *                             on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $count = count($this->patterns);
        $this->patterns[$count] = $pattern;
        $this->labels[$count] = $label;
        $this->regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of
     *                             subject; set to "" when nothing matched.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    public function match($subject, &$match)
    {
        // always leave the output parameter in a defined state
        $match = "";

        if (count($this->patterns) == 0) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches)) {
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        // FIXME this could be made faster by storing the labels as keys in a hashmap
        for ($i = 1; $i < $size; $i++) {
            // Group $i belongs to pattern $i - 1. Groups of alternatives that did
            // not take part in the match are "". Compare against "" explicitly:
            // a plain truthiness test would also skip a legitimate match of "0".
            if ($matches[$i] !== "" && isset($this->labels[$i - 1])) {
                return $this->labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * On a PCRE error a message is reported through msg(), which is not
     * defined in this class (presumably DokuWiki's global message helper).
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing, pre-match, match & post-match strings.
     *                             When nothing matched it is array($subject, "", "").
     * @return bool|string         False if no match found, label if label exists, true if not
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public function split($subject, &$split)
    {
        if (count($this->patterns) == 0) {
            $split = array($subject, "", "");
            return false;
        }

        // PREG_OFFSET_CAPTURE turns every entry of $matches into array(text, byte offset),
        // which lets us cut the subject without running a second regex.
        if (! preg_match($this->getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            switch (preg_last_error()) {
                case PREG_BACKTRACK_LIMIT_ERROR:
                    msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                    break;
                case PREG_RECURSION_LIMIT_ERROR:
                    msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                    break;
                case PREG_BAD_UTF8_ERROR:
                    msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                    break;
                case PREG_INTERNAL_ERROR:
                    msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                    break;
            }

            $split = array($subject, "", "");
            return false;
        }

        // preg_match() drops trailing groups that did not take part in the match, so the
        // last group present is the alternative that matched: group N <=> pattern N - 1.
        $idx = count($matches) - 2;

        list($matched, $start) = $matches[0];
        // The (string) casts cover PHP versions in which substr() returns false
        // instead of "" when the requested slice starts at the end of the string.
        $pre = (string) substr($subject, 0, $start);
        $post = (string) substr($subject, $start + strlen($matched));
        $split = array($pre, $matched, $post);

        return isset($this->labels[$idx]) ? $this->labels[$idx] : true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The registered patterns themselves are left untouched, so the compound
     * regex can be rebuilt any number of times (e.g. after another
     * addPattern() call) without escaping a pattern twice.
     *
     * @return string
     */
    protected function getCompoundedRegex()
    {
        if ($this->regex === null) {
            $groups = array();
            foreach ($this->patterns as $pattern) {
                // one capturing group per pattern, in registration order
                $groups[] = "(" . $this->escapePattern($pattern) . ")";
            }
            $this->regex = "/" . implode("|", $groups) . "/" . $this->getPerlMatchingFlags();
        }
        return $this->regex;
    }

    /**
     * Escapes a single pattern for use inside the compound regex.
     *
     * Plain "(" and ")" become literals, "(?...)" constructs are kept as they are
     * and "/" (the delimiter of the compound regex) is escaped outside of
     * backslash sequences.
     *
     * @param string $pattern  Pattern as handed to addPattern().
     * @return string          The escaped pattern, without surrounding group.
     */
    protected function escapePattern($pattern)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $pattern, $elts);

        $escaped = "";
        $level = 0; // number of currently open "(?" constructs

        foreach ($elts[0] as $elt) {
            switch ($elt) {
                case '(':
                    // a plain "(" is always taken literally
                    $escaped .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $escaped .= '\\';
                    }
                    $escaped .= ')';
                    break;
                case '(?':
                    $level++;
                    $escaped .= '(?';
                    break;
                default:
                    if (substr($elt, 0, 1) === '\\') {
                        // backslash sequences are passed through unchanged
                        $escaped .= $elt;
                    } else {
                        $escaped .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return $escaped;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags()
    {
        return ($this->case ? "msS" : "msSi");
    }
}
204