<?php

namespace AST;
use \InvalidArgumentException;
use \RuntimeException;

require_once "exceptions.php";


/**
 * Splits a delimited pattern into its body and its trailing options.
 *
 * Example: "/rgx/options" --> ["rgx", "options"]. The first character is
 * taken as the delimiter and the string is cut at the LAST occurrence of that
 * character. Input shorter than two characters, or without a closing
 * delimiter, is returned unchanged together with an empty options string.
 *
 * NOTE(review): the options are handed to mbstring untouched by the caller;
 * PCRE modifier letters presumably do not all mean the same thing there.
 *
 * @param string $regex Delimited pattern.
 * @return array array($pattern, $options)
 */
function mb_preg_adapt_regex($regex) {
    if (strlen($regex) < 2) {
        return array($regex, '');
    }
    $lastDelim = strrpos($regex, $regex[0]);
    if ($lastDelim === false || $lastDelim < 1) {
        // Only the opening delimiter was found.
        return array($regex, '');
    }
    return array(
        substr($regex, 1, $lastDelim - 1),
        substr($regex, $lastDelim + 1)
    );
}

/**
 * Matches a delimited pattern against $text exactly at character $position,
 * using the mbstring regex functions with UTF-8 as the regex encoding.
 *
 * @param string $text       Text to search.
 * @param string $matchRegex Delimited pattern, e.g. "/abc/".
 * @param int    $position   Offset, in characters, where the match must begin.
 * @return string|null The matched text, or null when the search cannot be
 *                     initialised, nothing matches, or the first match starts
 *                     after $position.
 */
function mb_preg_match($text, $matchRegex, $position=0) {
    mb_regex_encoding('UTF-8');
    list($pattern, $options) = mb_preg_adapt_regex($matchRegex);
    $textPiece = mb_substr($text, $position);
    if (mb_ereg_search_init($textPiece) === false) {
        return null;
    }
    $result = mb_ereg_search_pos($pattern, $options);
    if (!is_array($result)) {
        // mb_ereg_search_pos() yields false when there is no match at all;
        // destructuring that value would leave offset and length undefined.
        return null;
    }
    list($matchOfsBytes, $matchLenBytes) = $result;
    // Offsets are relative to $textPiece, which already starts at $position,
    // so only a match beginning at byte 0 is a match exactly at $position.
    if ($matchOfsBytes > 0) {
        return null;
    }
    return mb_strcut($textPiece, 0, $matchLenBytes);
}


/**
 * Matches a PCRE pattern against $text exactly at byte offset $position.
 *
 * @param string $text       Text to search.
 * @param string $matchRegex Delimited PCRE pattern.
 * @param int    $position   Byte offset where the match must begin.
 * @return string|null The matched text, or null when nothing matches or the
 *                     first match starts after $position.
 * @throws InvalidArgumentException When preg_match() reports an error.
 * @throws RuntimeException When preg_match() reports success without matches.
 */
function sb_preg_match($text, $matchRegex, $position=0) {
    $matches = null;
    $result = preg_match($matchRegex, $text, $matches, PREG_OFFSET_CAPTURE, $position);
    if ($result === false) {
        throw new InvalidArgumentException('An error occurred in preg_match.');
    }
    if ($result === 0) {
        return null;
    }
    if (count($matches) == 0) {
        throw new RuntimeException('No matches?');
    }
    list($matchTxt, $matchOfs) = $matches[0];
    // preg_match() searches forward from $position; reject later matches.
    return ($matchOfs > $position) ? null : $matchTxt;
}



/**
 * Describes one kind of token: a representation, a display name and the
 * regular expression used to recognise it.
 */
class TokenDefinition {
    private $_representation = null;
    private $_name = null;
    private $_matchRegex = null;

    /**
     * @param string      $representation Textual representation of the token.
     * @param string|null $name           Display name; defaults to the
     *                                    representation.
     * @param string|null $matchRegex     Delimited pattern; defaults to a
     *                                    literal match of the representation.
     */
    public function __construct($representation, $name=null, $matchRegex=null) {
        $this->_representation = $representation;
        $this->_name = ($name === null) ? $representation : $name;
        if ($matchRegex === null) {
            // preg_quote works also for multibyte
            // https://stackoverflow.com/a/31733257/1749822
            // The delimiter must be passed explicitly, otherwise a
            // representation containing '/' yields an invalid pattern.
            $matchRegex = '/' . preg_quote($representation, '/') . '/';
        }
        $this->_matchRegex = $matchRegex;
    }

    /** @return string The representation given to the constructor. */
    public function representation() { return $this->_representation; }

    /** @return string The display name of this definition. */
    public function name() { return $this->_name; }

    /**
     * Tells whether the mbstring extension is loaded; the answer is computed
     * once and cached for the rest of the request.
     *
     * @return bool
     */
    public static function supportsMultibyte() {
        static $_loaded = null;
        if ($_loaded === null) {
            $_loaded = (extension_loaded('mbstring') === true);
        }
        return $_loaded;
    }

    /**
     * Tries to match this definition in $text exactly at $position, using the
     * multibyte matcher when mbstring is available and PCRE otherwise.
     *
     * @param string $text
     * @param int    $position Characters (mbstring) or bytes (PCRE).
     * @return string|null The matched text, or null when there is no match.
     */
    public function tryMatch($text, $position) {
        if (self::supportsMultibyte()) {
            return mb_preg_match($text, $this->_matchRegex, $position);
        }
        return sb_preg_match($text, $this->_matchRegex, $position);
    }

    public function __toString() {
        return '<' . $this->name() . '>';
    }
}

/**
 * One occurrence of a token definition inside a text, stored as a position
 * and a length into that text.
 */
class TokenInstance {
    private $_definition = null;
    private $_text = null;
    private $_position = null;
    private $_length = null;

    /**
     * @param TokenDefinition $definition Definition that produced this token.
     * @param string          $text       Full text the token was found in.
     * @param int             $position   Start of the token within $text.
     * @param int             $length     Length of the token.
     */
    public function __construct($definition, $text, $position, $length) {
        $this->_definition = $definition;
        $this->_text = $text;
        $this->_position = $position;
        $this->_length = $length;
    }

    public function definition() { return $this->_definition; }
    public function text() { return $this->_text; }
    public function position() { return $this->_position; }
    public function length() { return $this->_length; }

    /**
     * Extracts the token's text, treating position and length as characters
     * when mbstring is loaded and as bytes otherwise.
     *
     * @return string
     */
    public function match() {
        if (TokenDefinition::supportsMultibyte()) {
            return mb_substr($this->_text, $this->position(), $this->length());
        }
        return substr($this->_text, $this->position(), $this->length());
    }

    public function __toString() {
        return '<' . $this->definition()->name() . ':' . $this->match() . '>';
    }
}

/**
 * Splits $text into token instances.
 *
 * At every position the definitions are tried in the order given and the
 * first one that matches wins. Tokens whose definition appears in
 * $stripTokDefs are consumed but left out of the result.
 *
 * @param string $text         Text to tokenize.
 * @param array  $tokDefs      TokenDefinition objects, in priority order.
 * @param array  $stripTokDefs Definitions whose tokens are dropped.
 * @return array List of TokenInstance objects in order of appearance.
 * @throws UnknownTokenException When no definition matches at some position.
 */
function tokenize($text, array $tokDefs, array $stripTokDefs) {
    $multibyte = TokenDefinition::supportsMultibyte();
    $textLen = $multibyte ? mb_strlen($text) : strlen($text);

    $tokInsts = array();
    $position = 0;
    while ($position < $textLen) {
        $foundTokInst = null;
        foreach ($tokDefs as $tokDef) {
            $match = $tokDef->tryMatch($text, $position);
            if ($match === null) {
                continue;
            }
            $matchLen = $multibyte ? mb_strlen($match) : strlen($match);
            if ($matchLen < 1) {
                // A zero-length match would never advance $position and the
                // loop would spin forever; treat it as "no match".
                continue;
            }
            $foundTokInst = new TokenInstance($tokDef, $text, $position, $matchLen);
            break;
        }
        if ($foundTokInst === null) {
            throw new UnknownTokenException($text, $position);
        }
        // Loose comparison kept on purpose: a strip list built from equal but
        // distinct TokenDefinition objects must keep working.
        if (!in_array($foundTokInst->definition(), $stripTokDefs)) {
            $tokInsts[] = $foundTokInst;
        }
        $position += $foundTokInst->length();
    }
    return $tokInsts;
}