<?php

/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <fabien@symfony.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Symfony\Polyfill\Intl\Normalizer;

/**
 * Normalizer is a PHP fallback implementation of the Normalizer class provided by the intl extension.
 *
 * It has been validated with Unicode 6.3 Normalization Conformance Test.
 * See http://www.unicode.org/reports/tr15/ for detailed info about Unicode normalizations.
 *
 * @author Nicolas Grekas <p@tchwork.com>
 *
 * @internal
 */
class Normalizer
{
    public const FORM_D = \Normalizer::FORM_D;
    public const FORM_KD = \Normalizer::FORM_KD;
    public const FORM_C = \Normalizer::FORM_C;
    public const FORM_KC = \Normalizer::FORM_KC;
    public const NFD = \Normalizer::NFD;
    public const NFKD = \Normalizer::NFKD;
    public const NFC = \Normalizer::NFC;
    public const NFKC = \Normalizer::NFKC;

    /** Canonical composition table, lazily loaded by normalize(). */
    private static $C;
    /** Canonical decomposition table, lazily loaded by normalize(). */
    private static $D;
    /** Compatibility decomposition table, lazily loaded by normalize(). */
    private static $KD;
    /** Combining class table (keyed by UTF-8 character), lazily loaded by normalize(). */
    private static $cC;
    /** Byte length of a multi-byte UTF-8 sequence, keyed by its lead byte masked with 0xF0. */
    private static $ulenMask = ["\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4];
    /** All 128 ASCII bytes, used as the strspn() mask to skip ASCII runs. */
    private static $ASCII = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";

    /**
     * Tells whether $s is already in the requested normalization form.
     *
     * @param string $s    The string to check
     * @param int    $form One of the NFC/NFD/NFKC/NFKD constants
     *
     * @return bool False for an unsupported form, or when normalize() does not return $s unchanged
     *              (which includes the case where $s is not valid UTF-8)
     */
    public static function isNormalized(string $s, int $form = self::FORM_C)
    {
        if (!\in_array($form, [self::NFD, self::NFKD, self::NFC, self::NFKC], true)) {
            return false;
        }

        // Pure ASCII input: strspn() covers the whole string, so there is no byte at that offset.
        if (!isset($s[strspn($s, self::$ASCII)])) {
            return true;
        }

        // Shortcut for NFC: valid UTF-8 made only of code points up to U+02FF is reported as normalized.
        if (self::NFC === $form && preg_match('//u', $s) && !preg_match('/[^\x00-\x{2FF}]/u', $s)) {
            return true;
        }

        return self::normalize($s, $form) === $s;
    }

    /**
     * Normalizes a UTF-8 string to the requested form.
     *
     * @param string $s    The string to normalize
     * @param int    $form One of the NFC/NFD/NFKC/NFKD constants
     *
     * @return string|false The normalized string, or false when $s is not valid UTF-8
     *                      (or, before PHP 8.0, when $form is not supported)
     *
     * @throws \ValueError On PHP 8.0+ when $form is not a supported normalization form
     */
    public static function normalize(string $s, int $form = self::FORM_C)
    {
        if (!preg_match('//u', $s)) {
            return false;
        }

        switch ($form) {
            case self::NFC: $C = true; $K = false; break;
            case self::NFD: $C = false; $K = false; break;
            case self::NFKC: $C = true; $K = true; break;
            case self::NFKD: $C = false; $K = true; break;
            default:
                if (\defined('Normalizer::NONE') && \Normalizer::NONE === $form) {
                    return $s;
                }

                if (80000 > \PHP_VERSION_ID) {
                    return false;
                }

                // NOTE(review): the doubled "a" is kept verbatim; presumably it mirrors the native message - confirm before changing.
                throw new \ValueError('normalizer_normalize(): Argument #2 ($form) must be a a valid normalization form');
        }

        if ('' === $s) {
            return '';
        }

        if ($K && null === self::$KD) {
            self::$KD = self::getData('compatibilityDecomposition');
        }

        if (null === self::$D) {
            self::$D = self::getData('canonicalDecomposition');
            self::$cC = self::getData('combiningClass');
        }

        // When mbstring overloads the string functions, switch to 8bit so that
        // strlen()/substr()/strspn() keep working on bytes.
        $mbEncoding = (2 /* MB_OVERLOAD_STRING */ & (int) ini_get('mbstring.func_overload')) ? mb_internal_encoding() : null;
        if (null !== $mbEncoding) {
            mb_internal_encoding('8bit');
        }

        try {
            $r = self::decompose($s, $K);

            if ($C) {
                if (null === self::$C) {
                    self::$C = self::getData('canonicalComposition');
                }

                $r = self::recompose($r);
            }
        } finally {
            // Always restore the caller's encoding, even if something unexpected is thrown.
            if (null !== $mbEncoding) {
                mb_internal_encoding($mbEncoding);
            }
        }

        return $r;
    }

    /**
     * Composes an already decomposed, non-empty UTF-8 string.
     *
     * Pairs found in the composition table are merged into the current starter;
     * Hangul L+V(+T) jamo sequences are composed arithmetically.
     *
     * @param string $s A non-empty string as returned by decompose()
     *
     * @return string The composed string
     */
    private static function recompose(string $s): string
    {
        $ASCII = self::$ASCII;
        $compMap = self::$C;
        $combClass = self::$cC;
        $ulenMask = self::$ulenMask;

        $result = $tail = '';

        $i = $s[0] < "\x80" ? 1 : $ulenMask[$s[0] & "\xF0"];
        $len = \strlen($s);

        // $lastUchr is the character that following characters may compose with;
        // $tail collects the combining characters that did not compose with it.
        $lastUchr = substr($s, 0, $i);
        $lastUcls = isset($combClass[$lastUchr]) ? 256 : 0;

        while ($i < $len) {
            if ($s[$i] < "\x80") {
                // ASCII chars

                if ('' !== $tail) {
                    $lastUchr .= $tail;
                    $tail = '';
                }

                // Copy the run of ASCII bytes in one go; its last byte becomes the new $lastUchr.
                if ($j = strspn($s, $ASCII, $i + 1)) {
                    $lastUchr .= substr($s, $i, $j);
                    $i += $j;
                }

                $result .= $lastUchr;
                $lastUchr = $s[$i];
                $lastUcls = 0;
                ++$i;
                continue;
            }

            $ulen = $ulenMask[$s[$i] & "\xF0"];
            $uchr = substr($s, $i, $ulen);

            if ($lastUchr < "\xE1\x84\x80" || "\xE1\x84\x92" < $lastUchr
                || $uchr < "\xE1\x85\xA1" || "\xE1\x85\xB5" < $uchr
                || $lastUcls) {
                // Table lookup and combining chars composition

                $ucls = $combClass[$uchr] ?? 0;

                if (isset($compMap[$lastUchr.$uchr]) && (!$lastUcls || $lastUcls < $ucls)) {
                    $lastUchr = $compMap[$lastUchr.$uchr];
                } elseif ($lastUcls = $ucls) {
                    $tail .= $uchr;
                } else {
                    if ('' !== $tail) {
                        $lastUchr .= $tail;
                        $tail = '';
                    }

                    $result .= $lastUchr;
                    $lastUchr = $uchr;
                }
            } else {
                // Hangul chars: $lastUchr is a leading consonant (U+1100..U+1112)
                // and $uchr a vowel (U+1161..U+1175)

                $L = \ord($lastUchr[2]) - 0x80;
                $V = \ord($uchr[2]) - 0xA1;
                $T = 0;

                $uchr = substr($s, $i + $ulen, 3);

                // Trailing consonants are U+11A8..U+11C2. U+11A7 is the base of the T index (index 0),
                // not a trailing consonant, so it must not be consumed here.
                if ("\xE1\x86\xA8" <= $uchr && $uchr <= "\xE1\x87\x82") {
                    $T = \ord($uchr[2]) - 0xA7;
                    if (0 > $T) {
                        // Third byte wrapped around when the second byte went from 0x86 to 0x87
                        $T += 0x40;
                    }
                    $ulen += 3;
                }

                $L = 0xAC00 + ($L * 21 + $V) * 28 + $T;
                $lastUchr = \chr(0xE0 | $L >> 12).\chr(0x80 | $L >> 6 & 0x3F).\chr(0x80 | $L & 0x3F);
            }

            $i += $ulen;
        }

        return $result.$lastUchr.$tail;
    }

    /**
     * Decomposes a UTF-8 string and sorts runs of combining characters by combining class.
     *
     * @param string $s The string to decompose
     * @param bool   $c Whether the compatibility decomposition table is consulted before the canonical one
     *
     * @return string The decomposed string
     */
    private static function decompose(string $s, bool $c): string
    {
        $result = '';

        $ASCII = self::$ASCII;
        $decompMap = self::$D;
        $combClass = self::$cC;
        $ulenMask = self::$ulenMask;
        $compatMap = $c ? self::$KD : null;

        // Pending combining characters, grouped by combining class
        $pending = [];
        $i = 0;
        $len = \strlen($s);

        while ($i < $len) {
            if ($s[$i] < "\x80") {
                // ASCII chars

                if ($pending) {
                    ksort($pending);
                    $result .= implode('', $pending);
                    $pending = [];
                }

                $j = 1 + strspn($s, $ASCII, $i + 1);
                $result .= substr($s, $i, $j);
                $i += $j;
                continue;
            }

            $ulen = $ulenMask[$s[$i] & "\xF0"];
            $uchr = substr($s, $i, $ulen);
            $i += $ulen;

            if ($uchr < "\xEA\xB0\x80" || "\xED\x9E\xA3" < $uchr) {
                // Table lookup

                if ($uchr !== $j = $compatMap[$uchr] ?? ($decompMap[$uchr] ?? $uchr)) {
                    $uchr = $j;

                    $j = \strlen($uchr);
                    $ulen = $uchr[0] < "\x80" ? 1 : $ulenMask[$uchr[0] & "\xF0"];

                    if ($ulen !== $j) {
                        // Put trailing chars in $s

                        $j -= $ulen;
                        $i -= $j;

                        if (0 > $i) {
                            // Not enough consumed bytes to overwrite: grow $s at the front
                            $s = str_repeat(' ', -$i).$s;
                            $len -= $i;
                            $i = 0;
                        }

                        while ($j--) {
                            $s[$i + $j] = $uchr[$ulen + $j];
                        }

                        $uchr = substr($uchr, 0, $ulen);
                    }
                }
                if (isset($combClass[$uchr])) {
                    // Combining chars, for sorting

                    if (!isset($pending[$combClass[$uchr]])) {
                        $pending[$combClass[$uchr]] = '';
                    }
                    $pending[$combClass[$uchr]] .= $uchr;
                    continue;
                }
            } else {
                // Hangul chars

                $uchr = unpack('C*', $uchr);
                // Index of the syllable relative to U+AC00
                $j = (($uchr[1] - 224) << 12) + (($uchr[2] - 128) << 6) + $uchr[3] - 0xAC80;

                $uchr = "\xE1\x84".\chr(0x80 + (int) ($j / 588))
                       ."\xE1\x85".\chr(0xA1 + (int) (($j % 588) / 28));

                if ($j %= 28) {
                    $uchr .= $j < 25
                        ? ("\xE1\x86".\chr(0xA7 + $j))
                        : ("\xE1\x87".\chr(0x67 + $j));
                }
            }
            if ($pending) {
                ksort($pending);
                $result .= implode('', $pending);
                $pending = [];
            }

            $result .= $uchr;
        }

        if ($pending) {
            ksort($pending);
            $result .= implode('', $pending);
        }

        return $result;
    }

    /**
     * Loads one of the data tables shipped in Resources/unidata.
     *
     * @param string $file Base name of the data file, without the ".php" extension
     *
     * @return mixed The value returned by the required file, or false when the file does not exist
     */
    private static function getData($file)
    {
        if (file_exists($file = __DIR__.'/Resources/unidata/'.$file.'.php')) {
            return require $file;
        }

        return false;
    }
}