1<?php
2
3namespace AST;
4use \InvalidArgumentException;
5use \RuntimeException;
6
7require_once "exceptions.php";
8
9
/**
 * Splits a PCRE-style delimited pattern into its bare expression and its
 * trailing option letters, e.g. "/rgx/options" --> ["rgx", "options"].
 *
 * The first character is taken as the opening delimiter. For the bracket
 * style delimiters accepted by PCRE ("(", "[", "{", "<") the closing
 * delimiter is the matching closing bracket; for every other character the
 * closing delimiter is the same character.
 *
 * @param string $regex Delimited pattern.
 * @return array Two elements: the expression without delimiters, and the
 *               option string (possibly empty). When the input is shorter
 *               than two characters or has no closing delimiter after the
 *               first character, the input is returned unchanged with an
 *               empty option string.
 */
function mb_preg_adapt_regex($regex) {
    if (strlen($regex) < 2) {
        return array($regex, '');
    }
    $openDelim = $regex[0];
    // Bracket-style delimiters close with their counterpart, not themselves.
    $bracketPairs = array('(' => ')', '[' => ']', '{' => '}', '<' => '>');
    $closeDelim = isset($bracketPairs[$openDelim]) ? $bracketPairs[$openDelim] : $openDelim;
    // The LAST occurrence is the real terminator; anything after it is options.
    $lastDelim = strrpos($regex, $closeDelim);
    if ($lastDelim === false || $lastDelim < 1) {
        return array($regex, '');
    }
    $options = substr($regex, $lastDelim + 1);
    $expression = substr($regex, 1, $lastDelim - 1);
    return array($expression, $options);
}
23
/**
 * Tries to match a delimited pattern exactly at a character position of a
 * text, using the mbstring regex functions.
 *
 * Sets the mbstring regex encoding to UTF-8 as a side effect.
 *
 * NOTE(review): the option letters after the closing delimiter are passed
 * unchanged to mb_ereg_search_pos(); PCRE modifiers and mbstring options are
 * not identical sets -- confirm callers only use letters valid for both.
 *
 * @param string $text       Text to search in.
 * @param string $matchRegex Delimited pattern, e.g. "/rgx/options".
 * @param int    $position   Offset in characters (it is fed to mb_substr).
 * @return string|null The matched text, or null when the search cannot be
 *                     initialised, nothing matches, or the first match does
 *                     not begin exactly at $position.
 */
function mb_preg_match($text, $matchRegex, $position=0) {
    mb_regex_encoding('UTF-8');
    list($pattern, $patternOptions) = mb_preg_adapt_regex($matchRegex);
    // Search only the tail, so a match at $position shows up at byte offset 0.
    $textPiece = mb_substr($text, $position);
    if (mb_ereg_search_init($textPiece) === false) {
        return null;
    }
    $result = mb_ereg_search_pos($pattern, $patternOptions);
    if ($result === false) {
        // No match at all; do not destructure the boolean.
        return null;
    }
    list($matchOfsBytes, $matchLenBytes) = $result;
    // The search is not anchored: any non-zero byte offset means the first
    // match starts after $position.
    if ($matchOfsBytes > 0) {
        return null;
    }
    // The offsets reported by mb_ereg_search_pos are in bytes, so a plain
    // byte slice returns exactly the matched text.
    return substr($textPiece, 0, $matchLenBytes);
}
40
41
/**
 * Tries to match a PCRE pattern exactly at an offset of a text, using
 * preg_match (single-byte code path).
 *
 * @param string $text       Text to search in.
 * @param string $matchRegex Delimited PCRE pattern.
 * @param int    $position   Offset handed to preg_match as search start.
 * @return string|null The matched text, or null when nothing matches or the
 *                     first match begins after $position.
 * @throws InvalidArgumentException When preg_match reports a failure.
 * @throws RuntimeException         When preg_match reports success but
 *                                  yields no capture groups.
 */
function sb_preg_match($text, $matchRegex, $position=0) {
    $captured = null;
    $status = preg_match($matchRegex, $text, $captured, PREG_OFFSET_CAPTURE, $position);

    if ($status === false) {
        throw new InvalidArgumentException('An error occurred in preg_match.');
    }
    if ($status === 0) {
        return null;
    }
    if (count($captured) == 0) {
        throw new RuntimeException('No matches?');
    }

    // With PREG_OFFSET_CAPTURE each entry is a [text, offset] pair.
    $wholeMatch = $captured[0];
    $foundText = $wholeMatch[0];
    $foundAt = $wholeMatch[1];

    // The search is not anchored, so a later match does not count.
    return ($foundAt > $position) ? null : $foundText;
}
58
59
60
/**
 * Describes one kind of token: its representation, a display name and the
 * delimited regex used to recognise it in a text.
 */
class TokenDefinition {
    private $_representation = null;
    private $_name = null;
    private $_matchRegex = null;

    /**
     * @param string      $representation Representation of the token.
     * @param string|null $name           Display name; defaults to the
     *                                    representation.
     * @param string|null $matchRegex     Delimited regex recognising the
     *                                    token; defaults to a pattern that
     *                                    matches the representation
     *                                    literally.
     */
    public function __construct($representation, $name=null, $matchRegex=null) {
        $this->_representation = $representation;
        if ($name === null) {
            $name = $representation;
        }
        $this->_name = $name;
        if ($matchRegex === null) {
            // preg_quote works also for multibyte
            // https://stackoverflow.com/a/31733257/1749822
            // The delimiter must be passed explicitly: preg_quote does not
            // escape "/" by default, and an unescaped "/" inside the
            // representation would terminate the pattern early.
            $matchRegex = '/' . preg_quote($representation, '/') . '/';
        }
        $this->_matchRegex = $matchRegex;
    }

    /** @return string The representation given at construction. */
    public function representation() { return $this->_representation; }

    /** @return string The display name (the representation when none was given). */
    public function name() { return $this->_name; }

    /**
     * Tells whether the mbstring extension is loaded. The answer is computed
     * once and cached in a static local.
     *
     * @return bool
     */
    public static function supportsMultibyte() {
        static $_loaded = null;
        if ($_loaded === null) {
            $_loaded = (extension_loaded('mbstring') === true);
        }
        return $_loaded;
    }

    /**
     * Tries to match this token at the given position of the text, through
     * mb_preg_match when mbstring is available and sb_preg_match otherwise.
     *
     * @param string $text     Text to match against.
     * @param int    $position Offset at which the token must begin.
     * @return string|null Whatever the selected matcher returns.
     */
    public function tryMatch($text, $position) {
        if (self::supportsMultibyte()) {
            return mb_preg_match($text, $this->_matchRegex, $position);
        } else {
            return sb_preg_match($text, $this->_matchRegex, $position);
        }
    }

    /** @return string The name wrapped in angle brackets, e.g. "<name>". */
    public function __toString() {
        return '<' . $this->name() . '>';
    }
}
103
/**
 * One occurrence of a token: the definition that matched, the full text it
 * was found in, and where in that text it sits.
 */
class TokenInstance {
    private $_definition = null;
    private $_text = null;
    private $_position = null;
    private $_length = null;

    /**
     * @param mixed  $definition Definition that produced this instance.
     * @param string $text       Full text the token belongs to.
     * @param int    $position   Offset of the token inside $text.
     * @param int    $length     Length of the token.
     */
    public function __construct($definition, $text, $position, $length) {
        $this->_length = $length;
        $this->_position = $position;
        $this->_text = $text;
        $this->_definition = $definition;
    }

    /** @return mixed The definition given at construction. */
    public function definition() {
        return $this->_definition;
    }

    /** @return string The full text given at construction. */
    public function text() {
        return $this->_text;
    }

    /** @return int The offset given at construction. */
    public function position() {
        return $this->_position;
    }

    /** @return int The length given at construction. */
    public function length() {
        return $this->_length;
    }

    /**
     * Extracts the token's own text from the full text, slicing with
     * mb_substr when mbstring is loaded and substr otherwise.
     *
     * @return string
     */
    public function match() {
        if (!TokenDefinition::supportsMultibyte()) {
            return substr($this->_text, $this->position(), $this->length());
        }
        return mb_substr($this->_text, $this->position(), $this->length());
    }

    /** @return string "<name:matched text>". */
    public function __toString() {
        $definitionName = $this->definition()->name();
        $matchedText = $this->match();
        return '<' . $definitionName . ':' . $matchedText . '>';
    }
}
133
/**
 * Splits a text into a sequence of token instances.
 *
 * At every position the definitions are tried in the order given and the
 * first one producing a non-empty match wins. Positions and lengths are
 * counted with mb_strlen when mbstring is loaded, and with strlen otherwise.
 *
 * @param string $text         Text to tokenize.
 * @param array  $tokDefs      Token definitions, in priority order.
 * @param array  $stripTokDefs Definitions whose instances are recognised but
 *                             left out of the result.
 * @return array List of TokenInstance objects, in text order.
 * @throws UnknownTokenException When no definition yields a non-empty match
 *                               at some position.
 */
function tokenize($text, array $tokDefs, array $stripTokDefs) {
    // The answer cannot change during the call, so ask only once.
    $multibyte = TokenDefinition::supportsMultibyte();
    $textLen = $multibyte ? mb_strlen($text) : strlen($text);

    $tokInsts = array();
    $position = 0;
    while ($position < $textLen) {
        $foundTokInst = null;
        foreach ($tokDefs as $tokDef) {
            $match = $tokDef->tryMatch($text, $position);
            if ($match === null) {
                continue;
            }
            $matchLen = $multibyte ? mb_strlen($match) : strlen($match);
            if ($matchLen < 1) {
                // An empty match cannot advance the position; accepting it
                // would loop forever, so treat it as "no match".
                continue;
            }
            $foundTokInst = new TokenInstance($tokDef, $text, $position, $matchLen);
            break;
        }
        if ($foundTokInst === null) {
            throw new UnknownTokenException($text, $position);
        }
        // Loose comparison on purpose: a definition equal to, but not the
        // same object as, an entry of $stripTokDefs is still stripped.
        if (!in_array($foundTokInst->definition(), $stripTokDefs)) {
            $tokInsts[] = $foundTokInst;
        }
        $position += $foundTokInst->length();
    }
    return $tokInsts;
}
165
166?>