<?php

/**
 * Hoa
 *
 *
 * @license
 *
 * New BSD License
 *
 * Copyright © 2007-2017, Hoa community. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the Hoa nor the names of its contributors may be
 *       used to endorse or promote products derived from this software without
 *       specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

namespace Hoa\Ustring;

use Hoa\Consistency;

/**
 * Class \Hoa\Ustring.
 *
 * This class represents a UTF-8 string.
 * Please, see:
 *     • http://www.ietf.org/rfc/rfc3454.txt;
 *     • http://unicode.org/reports/tr9/;
 *     • http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt.
 *
 * @copyright  Copyright © 2007-2017 Hoa community
 * @license    New BSD License
 */
class Ustring implements \ArrayAccess, \Countable, \IteratorAggregate
{
    /**
     * Left-To-Right.
     *
     * @const int
     */
    const LTR              = 0;

    /**
     * Right-To-Left.
     *
     * @const int
     */
    const RTL              = 1;

    /**
     * ZERO WIDTH NON-BREAKING SPACE (ZWNBSP, aka byte-order mark, BOM).
     *
     * @const int
     */
    const BOM              = 0xfeff;

    /**
     * LEFT-TO-RIGHT MARK.
     *
     * @const int
     */
    const LRM              = 0x200e;

    /**
     * RIGHT-TO-LEFT MARK.
     *
     * @const int
     */
    const RLM              = 0x200f;

    /**
     * LEFT-TO-RIGHT EMBEDDING.
     *
     * @const int
     */
    const LRE              = 0x202a;

    /**
     * RIGHT-TO-LEFT EMBEDDING.
     *
     * @const int
     */
    const RLE              = 0x202b;

    /**
     * POP DIRECTIONAL FORMATTING.
     *
     * @const int
     */
    const PDF              = 0x202c;

    /**
     * LEFT-TO-RIGHT OVERRIDE.
     *
     * @const int
     */
    const LRO              = 0x202d;

    /**
     * RIGHT-TO-LEFT OVERRIDE.
     *
     * @const int
     */
    const RLO              = 0x202e;

    /**
     * Represent the beginning of the string.
     *
     * @const int
     */
    const BEGINNING        = 1;

    /**
     * Represent the end of the string.
     *
     * @const int
     */
    const END              = 2;

    /**
     * Split: only non-empty pieces are returned.
     *
     * @const int
     */
    const WITHOUT_EMPTY    = PREG_SPLIT_NO_EMPTY;

    /**
     * Split: parenthesized expression in the delimiter pattern will be captured
     * and returned.
     *
     * @const int
     */
    const WITH_DELIMITERS  = PREG_SPLIT_DELIM_CAPTURE;

    /**
     * Split: offsets of captures will be returned.
     *
     * @const int
     */
    const WITH_OFFSET      = 260; //   PREG_OFFSET_CAPTURE
                                  // | PREG_SPLIT_OFFSET_CAPTURE

    /**
     * Group results by patterns.
     *
     * @const int
     */
    const GROUP_BY_PATTERN = PREG_PATTERN_ORDER;

    /**
     * Group results by tuple (set of patterns).
     *
     * @const int
     */
    const GROUP_BY_TUPLE   = PREG_SET_ORDER;

    /**
     * Current string.
     *
     * @var string
     */
    protected $_string          = null;

    /**
     * Direction. Please see self::LTR and self::RTL constants.
     *
     * @var int
     */
    protected $_direction       = null;

    /**
     * Collator.
     *
     * @var \Collator
     */
    protected static $_collator = null;



    /**
     * Construct a UTF-8 string.
     *
     * @param   string  $string    String.
     */
    public function __construct($string = null)
    {
        if (null !== $string) {
            $this->append($string);
        }
    }

    /**
     * Check if ext/mbstring is available.
     *
     * @return  bool
     */
    public static function checkMbString()
    {
        return function_exists('mb_substr');
    }

    /**
     * Check if ext/iconv is available.
     *
     * @return  bool
     */
    public static function checkIconv()
    {
        return function_exists('iconv');
    }

    /**
     * Append a substring to the current string, i.e. add to the end.
     *
     * @param   string  $substring    Substring to append.
     * @return  \Hoa\Ustring
     */
    public function append($substring)
    {
        $this->_string .= $substring;

        return $this;
    }

    /**
     * Prepend a substring to the current string, i.e. add to the start.
     *
     * @param   string  $substring    Substring to prepend.
     * @return  \Hoa\Ustring
     */
    public function prepend($substring)
    {
        $this->_string = $substring . $this->_string;

        return $this;
    }

    /**
     * Pad the current string to a certain length with another string, aka
     * piece. The piece is repeated as many times as needed and truncated so
     * that the final length is exactly $length characters.
     * If the string is already long enough, or if the piece is empty, the
     * string is left untouched.
     *
     * @param   int     $length    Length.
     * @param   string  $piece     Piece.
     * @param   int     $side      Whether we append at the end or the beginning
     *                             of the current string.
     * @return  \Hoa\Ustring
     */
    public function pad($length, $piece, $side = self::END)
    {
        $difference = $length - $this->count();

        if (0 >= $difference) {
            return $this;
        }

        $pieceLength = mb_strlen($piece);

        // Nothing can be built from an empty piece; bailing out here also
        // protects the division below against a zero divisor.
        if (0 === $pieceLength) {
            return $this;
        }

        // As many whole pieces as fit, then the leading part of one more piece
        // to fill the remaining characters.
        $handle  = str_repeat($piece, (int) floor($difference / $pieceLength));
        $handle .= mb_substr($piece, 0, $difference - mb_strlen($handle));

        return
            static::END === $side
                ? $this->append($handle)
                : $this->prepend($handle);
    }

    /**
     * Make a comparison with a string.
     * Return < 0 if current string is less than $string, > 0 if greater and 0
     * if equal.
     * The comparison is locale-aware when the \Collator class is available,
     * and falls back to a byte comparison otherwise.
     *
     * @param   mixed  $string    String.
     * @return  int
     */
    public function compare($string)
    {
        if (null === $collator = static::getCollator()) {
            return strcmp($this->_string, (string) $string);
        }

        return $collator->compare($this->_string, $string);
    }

    /**
     * Get collator.
     * The collator is created once, from the current LC_COLLATE locale, and
     * then shared. Null is returned if the \Collator class does not exist.
     *
     * @return  \Collator
     */
    public static function getCollator()
    {
        if (false === class_exists('Collator')) {
            return null;
        }

        if (null === static::$_collator) {
            static::$_collator = new \Collator(setlocale(LC_COLLATE, null));
        }

        return static::$_collator;
    }

    /**
     * Ensure that the pattern is safe for Unicode: add the “u” option.
     *
     * @param   string  $pattern    Pattern.
     * @return  string
     */
    public static function safePattern($pattern)
    {
        $delimiter = mb_substr($pattern, 0, 1);
        // Bracket-style delimiters are closed by their counterpart, so the
        // modifiers start after the last *closing* delimiter.
        $closing   = strtr($delimiter, '([{<', ')]}>');
        $options   = mb_substr(
            mb_strrchr($pattern, $closing, false),
            mb_strlen($closing)
        );

        if (false === strpos($options, 'u')) {
            $pattern .= 'u';
        }

        return $pattern;
    }

    /**
     * Perform a regular expression (PCRE) match.
     *
     * @param   string  $pattern    Pattern.
     * @param   array   $matches    Matches.
     * @param   int     $flags      Please, see constants self::WITH_OFFSET,
     *                              self::GROUP_BY_PATTERN and
     *                              self::GROUP_BY_TUPLE.
     * @param   int     $offset     Alternate place from which to start the
     *                              search (in characters).
     * @param   bool    $global     Whether the match is global or not.
     * @return  int
     */
    public function match(
        $pattern,
        &$matches = null,
        $flags    = 0,
        $offset   = 0,
        $global   = false
    ) {
        $pattern = static::safePattern($pattern);

        if (0 === $flags) {
            if (true === $global) {
                $flags = static::GROUP_BY_PATTERN;
            }
        } else {
            // self::WITH_OFFSET also carries the split-only flag; drop it.
            $flags &= ~PREG_SPLIT_OFFSET_CAPTURE;
        }

        // preg_* functions expect an offset in bytes, not in characters.
        $offset = strlen(mb_substr($this->_string, 0, $offset));

        if (true === $global) {
            return preg_match_all(
                $pattern,
                $this->_string,
                $matches,
                $flags,
                $offset
            );
        }

        return preg_match($pattern, $this->_string, $matches, $flags, $offset);
    }

    /**
     * Perform a regular expression (PCRE) search and replace.
     *
     * @param   mixed   $pattern        Pattern(s): a string or an array of
     *                                  strings.
     * @param   mixed   $replacement    Replacement(s) (please, see
     *                                  preg_replace() documentation).
     * @param   int     $limit          Maximum of replacements. -1 for unbound.
     * @return  \Hoa\Ustring
     */
    public function replace($pattern, $replacement, $limit = -1)
    {
        // self::safePattern() works on a single pattern: handle each entry
        // separately when several patterns are given.
        if (is_array($pattern)) {
            foreach ($pattern as $key => $subPattern) {
                $pattern[$key] = static::safePattern($subPattern);
            }
        } else {
            $pattern = static::safePattern($pattern);
        }

        // NOTE: any callable is used as a callback, including a string that
        // happens to be the name of an existing function.
        if (false === is_callable($replacement)) {
            $this->_string = preg_replace(
                $pattern,
                $replacement,
                $this->_string,
                $limit
            );
        } else {
            $this->_string = preg_replace_callback(
                $pattern,
                $replacement,
                $this->_string,
                $limit
            );
        }

        return $this;
    }

    /**
     * Split the current string according to a given pattern (PCRE).
     *
     * @param   string  $pattern    Pattern (as a regular expression).
     * @param   int     $limit      Maximum of split. -1 for unbound.
     * @param   int     $flags      Please, see constants self::WITHOUT_EMPTY,
     *                              self::WITH_DELIMITERS, self::WITH_OFFSET.
     * @return  array
     */
    public function split(
        $pattern,
        $limit = -1,
        $flags = self::WITHOUT_EMPTY
    ) {
        return preg_split(
            static::safePattern($pattern),
            $this->_string,
            $limit,
            $flags
        );
    }

    /**
     * Iterator over chars.
     *
     * @return  \ArrayIterator
     */
    #[\ReturnTypeWillChange]
    public function getIterator()
    {
        return new \ArrayIterator(preg_split('#(?<!^)(?!$)#u', $this->_string));
    }

    /**
     * Perform a lowercase folding on the current string.
     *
     * @return  \Hoa\Ustring
     */
    public function toLowerCase()
    {
        $this->_string = mb_strtolower($this->_string);

        return $this;
    }

    /**
     * Perform an uppercase folding on the current string.
     *
     * @return  \Hoa\Ustring
     */
    public function toUpperCase()
    {
        $this->_string = mb_strtoupper($this->_string);

        return $this;
    }

    /**
     * Transform a UTF-8 string into an ASCII one.
     * First, try with a transliterator. If not available, will fallback to a
     * normalizer. If not available, will try something homemade.
     *
     * @param   bool  $try    Try something if \Normalizer is not present.
     * @return  \Hoa\Ustring
     * @throws  \Hoa\Ustring\Exception
     */
    public function toAscii($try = false)
    {
        // Already pure ASCII: nothing to do.
        if (0 === preg_match('#[\x80-\xff]#', $this->_string)) {
            return $this;
        }

        $string  = $this->_string;
        $transId =
            'Any-Latin; ' .
            '[\p{S}] Name; ' .
            'Latin-ASCII';

        if (null !== $transliterator = static::getTransliterator($transId)) {
            // Symbols come out as “\N{NAME}”: rewrite them as “(name)”.
            $this->_string = preg_replace_callback(
                '#\\\N\{([A-Z ]+)\}#u',
                function (array $matches) {
                    return '(' . strtolower($matches[1]) . ')';
                },
                $transliterator->transliterate($string)
            );

            return $this;
        }

        if (false === class_exists('Normalizer')) {
            if (false === $try) {
                throw new Exception(
                    '%s needs the class Normalizer to work properly, ' .
                    'or you can force a try by using %1$s(true).',
                    0,
                    __METHOD__
                );
            }

            $string        = static::transcode($string, 'UTF-8', 'ASCII//IGNORE//TRANSLIT');
            $this->_string = preg_replace('#(?:[\'"`^](\w))#u', '\1', $string);

            return $this;
        }

        // Decompose, then strip the non-spacing marks (accents etc.).
        $string        = \Normalizer::normalize($string, \Normalizer::NFKD);
        $string        = preg_replace('#\p{Mn}+#u', '', $string);
        $this->_string = static::transcode($string, 'UTF-8', 'ASCII//IGNORE//TRANSLIT');

        return $this;
    }

    /**
     * Transliterate the string into another.
     * See self::getTransliterator for more information.
     *
     * @param   string  $identifier    Identifier.
     * @param   int     $start         Start.
     * @param   int     $end           End. Null means up to the end of the
     *                                 string.
     * @return  \Hoa\Ustring
     * @throws  \Hoa\Ustring\Exception
     */
    public function transliterate($identifier, $start = 0, $end = null)
    {
        if (null === $transliterator = static::getTransliterator($identifier)) {
            throw new Exception(
                '%s needs the class Transliterator to work properly.',
                1,
                __METHOD__
            );
        }

        // \Transliterator uses -1 for “until the end”; a null would be coerced
        // to 0 and nothing would be transliterated.
        $this->_string = $transliterator->transliterate(
            $this->_string,
            $start,
            null === $end ? -1 : $end
        );

        return $this;
    }

    /**
     * Get transliterator.
     * See http://userguide.icu-project.org/transforms/general for $identifier.
     * Null is returned if the \Transliterator class does not exist.
     *
     * @param   string  $identifier    Identifier.
     * @return  \Transliterator
     */
    public static function getTransliterator($identifier)
    {
        if (false === class_exists('Transliterator')) {
            return null;
        }

        return \Transliterator::create($identifier);
    }

    /**
     * Strip characters (default \s) of the current string.
     *
     * @param   string  $regex    Characters to remove.
     * @param   int     $side     Whether we trim the beginning, the end or both
     *                            sides, of the current string.
     * @return  \Hoa\Ustring
     */
    public function trim($regex = '\s', $side = 3 /* static::BEGINNING | static::END */)
    {
        $regex  = '(?:' . $regex . ')+';
        $handle = null;

        if (0 !== ($side & static::BEGINNING)) {
            $handle .= '(^' . $regex . ')';
        }

        if (0 !== ($side & static::END)) {
            if (null !== $handle) {
                $handle .= '|';
            }

            $handle .= '(' . $regex . '$)';
        }

        $this->_string    = preg_replace('#' . $handle . '#u', '', $this->_string);
        $this->_direction = null;

        return $this;
    }

    /**
     * Compute offset (negative, unbound etc.).
     * The offset is wrapped around the length of the string. For an empty
     * string, 0 is returned.
     *
     * @param   int        $offset    Offset.
     * @return  int
     */
    protected function computeOffset($offset)
    {
        $length = mb_strlen($this->_string);

        // No offset can be wrapped around an empty string (modulo by zero).
        if (0 === $length) {
            return 0;
        }

        if (0 > $offset) {
            $offset = -$offset % $length;

            if (0 !== $offset) {
                $offset = $length - $offset;
            }
        } elseif ($offset >= $length) {
            $offset %= $length;
        }

        return $offset;
    }

    /**
     * Get a specific chars of the current string.
     *
     * @param   int     $offset    Offset (can be negative and unbound).
     * @return  string
     */
    #[\ReturnTypeWillChange]
    public function offsetGet($offset)
    {
        return mb_substr($this->_string, $this->computeOffset($offset), 1);
    }

    /**
     * Set a specific character of the current string.
     *
     * @param   int     $offset    Offset (can be negative and unbound).
     * @param   string  $value     Value.
     * @return  \Hoa\Ustring
     */
    #[\ReturnTypeWillChange]
    public function offsetSet($offset, $value)
    {
        $head   = null;
        $offset = $this->computeOffset($offset);

        if (0 < $offset) {
            $head = mb_substr($this->_string, 0, $offset);
        }

        $tail             = mb_substr($this->_string, $offset + 1);
        $this->_string    = $head . $value . $tail;
        $this->_direction = null;

        return $this;
    }

    /**
     * Delete a specific character of the current string.
     *
     * @param   int     $offset    Offset (can be negative and unbound).
     * @return  \Hoa\Ustring
     */
    #[\ReturnTypeWillChange]
    public function offsetUnset($offset)
    {
        return $this->offsetSet($offset, null);
    }

    /**
     * Check if a specific offset exists.
     * Offsets are unbound (they wrap around), so any offset exists.
     *
     * @return  bool
     */
    #[\ReturnTypeWillChange]
    public function offsetExists($offset)
    {
        return true;
    }

    /**
     * Reduce the strings.
     *
     * @param   int  $start     Position of first character.
     * @param   int  $length    Maximum number of characters.
     * @return  \Hoa\Ustring
     */
    public function reduce($start, $length = null)
    {
        $this->_string = mb_substr($this->_string, $start, $length);

        return $this;
    }

    /**
     * Count number of characters of the current string.
     *
     * @return  int
     */
    #[\ReturnTypeWillChange]
    public function count()
    {
        return mb_strlen($this->_string);
    }

    /**
     * Get byte (not character) at a specific offset.
     * For an empty string, an empty string is returned.
     *
     * @param   int     $offset    Offset (can be negative and unbound).
     * @return  string
     */
    public function getByteAt($offset)
    {
        $length = strlen($this->_string);

        // No byte to read, and no offset to wrap (modulo by zero).
        if (0 === $length) {
            return '';
        }

        if (0 > $offset) {
            $offset = -$offset % $length;

            if (0 !== $offset) {
                $offset = $length - $offset;
            }
        } elseif ($offset >= $length) {
            $offset %= $length;
        }

        return $this->_string[$offset];
    }

    /**
     * Count number of bytes (not characters) of the current string.
     *
     * @return  int
     */
    public function getBytesLength()
    {
        return strlen($this->_string);
    }

    /**
     * Get the width of the current string.
     * Useful when printing the string in monotype (some character need more
     * than one column to be printed).
     *
     * @return  int
     */
    public function getWidth()
    {
        return mb_strwidth($this->_string);
    }

    /**
     * Get direction of the current string.
     * Please, see the self::LTR and self::RTL constants.
     * It does not yet support embedding directions: the direction is the one
     * of the first character.
     *
     * @return  int
     */
    public function getDirection()
    {
        if (null === $this->_direction) {
            if (null === $this->_string) {
                $this->_direction = static::LTR;
            } else {
                $this->_direction = static::getCharDirection(
                    mb_substr($this->_string, 0, 1)
                );
            }
        }

        return $this->_direction;
    }

    /**
     * Get direction of a specific character.
     * Please, see the self::LTR and self::RTL constants.
     *
     * @param   string  $char    Character.
     * @return  int
     */
    public static function getCharDirection($char)
    {
        $c = static::toCode($char);

        // Outside of this range, no character is considered right-to-left.
        if (!(0x5be <= $c && 0x10b7f >= $c)) {
            return static::LTR;
        }

        if (0x85e >= $c) {
            if (0x5be === $c ||
                0x5c0 === $c ||
                0x5c3 === $c ||
                0x5c6 === $c ||
                (0x5d0 <= $c && 0x5ea >= $c) ||
                (0x5f0 <= $c && 0x5f4 >= $c) ||
                0x608 === $c ||
                0x60b === $c ||
                0x60d === $c ||
                0x61b === $c ||
                (0x61e <= $c && 0x64a >= $c) ||
                (0x66d <= $c && 0x66f >= $c) ||
                (0x671 <= $c && 0x6d5 >= $c) ||
                (0x6e5 <= $c && 0x6e6 >= $c) ||
                (0x6ee <= $c && 0x6ef >= $c) ||
                (0x6fa <= $c && 0x70d >= $c) ||
                0x710 === $c ||
                (0x712 <= $c && 0x72f >= $c) ||
                (0x74d <= $c && 0x7a5 >= $c) ||
                0x7b1 === $c ||
                (0x7c0 <= $c && 0x7ea >= $c) ||
                (0x7f4 <= $c && 0x7f5 >= $c) ||
                0x7fa === $c ||
                (0x800 <= $c && 0x815 >= $c) ||
                0x81a === $c ||
                0x824 === $c ||
                0x828 === $c ||
                (0x830 <= $c && 0x83e >= $c) ||
                (0x840 <= $c && 0x858 >= $c) ||
                0x85e === $c) {
                return static::RTL;
            }
        } elseif (0x200f === $c) {
            return static::RTL;
        } elseif (0xfb1d <= $c) {
            if (0xfb1d === $c ||
                (0xfb1f <= $c && 0xfb28 >= $c) ||
                (0xfb2a <= $c && 0xfb36 >= $c) ||
                (0xfb38 <= $c && 0xfb3c >= $c) ||
                0xfb3e === $c ||
                (0xfb40 <= $c && 0xfb41 >= $c) ||
                (0xfb43 <= $c && 0xfb44 >= $c) ||
                (0xfb46 <= $c && 0xfbc1 >= $c) ||
                (0xfbd3 <= $c && 0xfd3d >= $c) ||
                (0xfd50 <= $c && 0xfd8f >= $c) ||
                (0xfd92 <= $c && 0xfdc7 >= $c) ||
                (0xfdf0 <= $c && 0xfdfc >= $c) ||
                (0xfe70 <= $c && 0xfe74 >= $c) ||
                (0xfe76 <= $c && 0xfefc >= $c) ||
                (0x10800 <= $c && 0x10805 >= $c) ||
                0x10808 === $c ||
                (0x1080a <= $c && 0x10835 >= $c) ||
                (0x10837 <= $c && 0x10838 >= $c) ||
                0x1083c === $c ||
                (0x1083f <= $c && 0x10855 >= $c) ||
                (0x10857 <= $c && 0x1085f >= $c) ||
                (0x10900 <= $c && 0x1091b >= $c) ||
                (0x10920 <= $c && 0x10939 >= $c) ||
                0x1093f === $c ||
                0x10a00 === $c ||
                (0x10a10 <= $c && 0x10a13 >= $c) ||
                (0x10a15 <= $c && 0x10a17 >= $c) ||
                (0x10a19 <= $c && 0x10a33 >= $c) ||
                (0x10a40 <= $c && 0x10a47 >= $c) ||
                (0x10a50 <= $c && 0x10a58 >= $c) ||
                (0x10a60 <= $c && 0x10a7f >= $c) ||
                (0x10b00 <= $c && 0x10b35 >= $c) ||
                (0x10b40 <= $c && 0x10b55 >= $c) ||
                (0x10b58 <= $c && 0x10b72 >= $c) ||
                (0x10b78 <= $c && 0x10b7f >= $c)) {
                return static::RTL;
            }
        }

        return static::LTR;
    }

    /**
     * Get the number of column positions of a wide-character.
     *
     * This is a PHP implementation of wcwidth() and wcswidth() (defined in IEEE
     * Std 1002.1-2001) for Unicode, by Markus Kuhn. Please, see
     * http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c.
     *
     * The wcwidth(wc) function shall either return 0 (if wc is a null
     * wide-character code), or return the number of column positions to be
     * occupied by the wide-character code wc, or return -1 (if wc does not
     * correspond to a printable wide-character code).
     *
     * @param   string  $char    Character.
     * @return  int
     */
    public static function getCharWidth($char)
    {
        $char = (string) $char;
        $c    = static::toCode($char);

        // Test for 8-bit control characters.
        if (0x0 === $c) {
            return 0;
        }

        if (0x20 > $c || (0x7f <= $c && $c < 0xa0)) {
            return -1;
        }

        // Non-spacing characters.
        if (0xad !== $c &&
            0    !== preg_match('#^[\p{Mn}\p{Me}\p{Cf}\x{1160}-\x{11ff}\x{200b}]#u', $char)) {
            return 0;
        }

        // If we arrive here, $c is not a combining C0/C1 control character.
        // The boolean below adds 1 for the wide (two columns) characters.
        return 1 +
            (0x1100 <= $c &&
                (0x115f >= $c ||                        // Hangul Jamo init. consonants
                 0x2329 === $c || 0x232a === $c ||
                     (0x2e80 <= $c && 0xa4cf >= $c &&
                      0x303f !== $c) ||                 // CJK…Yi
                     (0xac00  <= $c && 0xd7a3 >= $c) || // Hangul Syllables
                     (0xf900  <= $c && 0xfaff >= $c) || // CJK Compatibility Ideographs
                     (0xfe10  <= $c && 0xfe19 >= $c) || // Vertical forms
                     (0xfe30  <= $c && 0xfe6f >= $c) || // CJK Compatibility Forms
                     (0xff00  <= $c && 0xff60 >= $c) || // Fullwidth Forms
                     (0xffe0  <= $c && 0xffe6 >= $c) ||
                     (0x20000 <= $c && 0x2fffd >= $c) ||
                     (0x30000 <= $c && 0x3fffd >= $c)));
    }

    /**
     * Check whether the character is printable or not.
     *
     * @param   string  $char    Character.
     * @return  bool
     */
    public static function isCharPrintable($char)
    {
        return 1 <= static::getCharWidth($char);
    }

    /**
     * Get a UTF-8 character from its decimal code representation.
     *
     * @param   int  $code    Code.
     * @return  string
     */
    public static function fromCode($code)
    {
        return mb_convert_encoding(
            '&#x' . dechex($code) . ';',
            'UTF-8',
            'HTML-ENTITIES'
        );
    }

    /**
     * Get a decimal code representation of a specific character.
     * Only the first character of $char is considered. For an empty string,
     * 0 is returned.
     *
     * @param   string  $char    Character.
     * @return  int
     */
    public static function toCode($char)
    {
        $char = (string) $char;

        // Nothing to decode (and no first byte to read).
        if ('' === $char) {
            return 0;
        }

        $code  = ord($char[0]);
        $bytes = 1;

        if (!($code & 0x80)) { // 0xxxxxxx
            return $code;
        }

        // The leading byte gives the length of the sequence; its payload bits
        // are kept as the most significant bits of the code.
        if (($code & 0xe0) === 0xc0) { // 110xxxxx
            $bytes = 2;
            $code  = $code & ~0xc0;
        } elseif (($code & 0xf0) === 0xe0) { // 1110xxxx
            $bytes = 3;
            $code  = $code & ~0xe0;
        } elseif (($code & 0xf8) === 0xf0) { // 11110xxx
            $bytes = 4;
            $code  = $code & ~0xf0;
        }

        // Each continuation byte brings 6 more bits.
        for ($i = 2; $i <= $bytes; $i++) { // 10xxxxxx
            $code = ($code << 6) + (ord($char[$i - 1]) & ~0x80);
        }

        return $code;
    }

    /**
     * Get a binary representation of a specific character: 8 binary digits
     * per byte.
     *
     * @param   string  $char    Character.
     * @return  string
     */
    public static function toBinaryCode($char)
    {
        $char = (string) $char;
        $out  = null;

        for ($i = 0, $max = strlen($char); $i < $max; ++$i) {
            $out .= sprintf('%08b', ord($char[$i]));
        }

        return $out;
    }

    /**
     * Transcode.
     *
     * @param   string  $string    String.
     * @param   string  $from      Original encoding.
     * @param   string  $to        Final encoding.
     * @return  string
     * @throws  \Hoa\Ustring\Exception
     */
    public static function transcode($string, $from, $to = 'UTF-8')
    {
        if (false === static::checkIconv()) {
            throw new Exception(
                '%s needs the iconv extension.',
                2,
                __CLASS__
            );
        }

        return iconv($from, $to, $string);
    }

    /**
     * Check if a string is encoded in UTF-8.
     *
     * @param   string  $string    String.
     * @return  bool
     */
    public static function isUtf8($string)
    {
        // With the “u” option, PCRE refuses to match an invalid UTF-8 subject.
        return (bool) preg_match('##u', $string);
    }

    /**
     * Copy current object string
     *
     * @return  \Hoa\Ustring
     */
    public function copy()
    {
        return clone $this;
    }

    /**
     * Transform the object as a string.
     *
     * @return  string
     */
    public function __toString()
    {
        // The internal string is null until something is appended, and
        // __toString() must return a real string.
        return (string) $this->_string;
    }
}

/**
 * Flex entity.
 */
Consistency::flexEntity('Hoa\Ustring\Ustring');

if (false === Ustring::checkMbString()) {
    throw new Exception(
        '%s needs the mbstring extension.',
        0,
        __NAMESPACE__ . '\Ustring'
    );
}