1<?php
2
3/**
4 * Hoa
5 *
6 *
7 * @license
8 *
9 * New BSD License
10 *
11 * Copyright © 2007-2017, Hoa community. All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions are met:
15 *     * Redistributions of source code must retain the above copyright
16 *       notice, this list of conditions and the following disclaimer.
17 *     * Redistributions in binary form must reproduce the above copyright
18 *       notice, this list of conditions and the following disclaimer in the
19 *       documentation and/or other materials provided with the distribution.
20 *     * Neither the name of the Hoa nor the names of its contributors may be
21 *       used to endorse or promote products derived from this software without
22 *       specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
28 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
35 */
36
37namespace Hoa\Ustring;
38
39use Hoa\Consistency;
40
41/**
42 * Class \Hoa\Ustring.
43 *
44 * This class represents a UTF-8 string.
45 * Please, see:
46 *     • http://www.ietf.org/rfc/rfc3454.txt;
47 *     • http://unicode.org/reports/tr9/;
48 *     • http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt.
49 *
50 * @copyright  Copyright © 2007-2017 Hoa community
51 * @license    New BSD License
52 */
53class Ustring implements \ArrayAccess, \Countable, \IteratorAggregate
54{
55    /**
56     * Left-To-Right.
57     *
58     * @const int
59     */
60    const LTR              = 0;
61
62    /**
63     * Right-To-Left.
64     *
65     * @const int
66     */
67    const RTL              = 1;
68
69    /**
     * ZERO WIDTH NO-BREAK SPACE (ZWNBSP, aka byte-order mark, BOM).
71     *
72     * @const int
73     */
74    const BOM              = 0xfeff;
75
76    /**
77     * LEFT-TO-RIGHT MARK.
78     *
79     * @const int
80     */
81    const LRM              = 0x200e;
82
83    /**
84     * RIGHT-TO-LEFT MARK.
85     *
86     * @const int
87     */
88    const RLM              = 0x200f;
89
90    /**
91     * LEFT-TO-RIGHT EMBEDDING.
92     *
93     * @const int
94     */
95    const LRE              = 0x202a;
96
97    /**
98     * RIGHT-TO-LEFT EMBEDDING.
99     *
100     * @const int
101     */
102    const RLE              = 0x202b;
103
104    /**
105     * POP DIRECTIONAL FORMATTING.
106     *
107     * @const int
108     */
109    const PDF              = 0x202c;
110
111    /**
112     * LEFT-TO-RIGHT OVERRIDE.
113     *
114     * @const int
115     */
116    const LRO              = 0x202d;
117
118    /**
119     * RIGHT-TO-LEFT OVERRIDE.
120     *
121     * @const int
122     */
123    const RLO              = 0x202e;
124
125    /**
126     * Represent the beginning of the string.
127     *
128     * @const int
129     */
130    const BEGINNING        = 1;
131
132    /**
133     * Represent the end of the string.
134     *
135     * @const int
136     */
137    const END              = 2;
138
139    /**
     * Split: only non-empty pieces are returned.
141     *
142     * @const int
143     */
144    const WITHOUT_EMPTY    = PREG_SPLIT_NO_EMPTY;
145
146    /**
147     * Split: parenthesized expression in the delimiter pattern will be captured
148     * and returned.
149     *
150     * @const int
151     */
152    const WITH_DELIMITERS  = PREG_SPLIT_DELIM_CAPTURE;
153
154    /**
155     * Split: offsets of captures will be returned.
156     *
157     * @const int
158     */
159    const WITH_OFFSET      = 260; //   PREG_OFFSET_CAPTURE
160                                  // | PREG_SPLIT_OFFSET_CAPTURE
161
162    /**
163     * Group results by patterns.
164     *
165     * @const int
166     */
167    const GROUP_BY_PATTERN = PREG_PATTERN_ORDER;
168
169    /**
170     * Group results by tuple (set of patterns).
171     *
172     * @const int
173     */
174    const GROUP_BY_TUPLE   = PREG_SET_ORDER;
175
176    /**
177     * Current string.
178     *
179     * @var string
180     */
181    protected $_string          = null;
182
183    /**
184     * Direction. Please see self::LTR and self::RTL constants.
185     *
186     * @var int
187     */
188    protected $_direction       = null;
189
190    /**
191     * Collator.
192     *
193     * @var \Collator
194     */
195    protected static $_collator = null;
196
197
198
199    /**
200     * Construct a UTF-8 string.
201     *
202     * @param   string  $string    String.
203     */
204    public function __construct($string = null)
205    {
206        if (null !== $string) {
207            $this->append($string);
208        }
209
210        return;
211    }
212
213    /**
214     * Check if ext/mbstring is available.
215     *
216     * @return  bool
217     */
218    public static function checkMbString()
219    {
220        return function_exists('mb_substr');
221    }
222
223    /**
224     * Check if ext/iconv is available.
225     *
226     * @return  bool
227     */
228    public static function checkIconv()
229    {
230        return function_exists('iconv');
231    }
232
233    /**
234     * Append a substring to the current string, i.e. add to the end.
235     *
236     * @param   string  $substring    Substring to append.
237     * @return  \Hoa\Ustring
238     */
239    public function append($substring)
240    {
241        $this->_string .= $substring;
242
243        return $this;
244    }
245
246    /**
247     * Prepend a substring to the current string, i.e. add to the start.
248     *
249     * @param   string  $substring    Substring to append.
250     * @return  \Hoa\Ustring
251     */
252    public function prepend($substring)
253    {
254        $this->_string = $substring . $this->_string;
255
256        return $this;
257    }
258
259    /**
260     * Pad the current string to a certain length with another piece, aka piece.
261     *
262     * @param   int     $length    Length.
263     * @param   string  $piece     Piece.
264     * @param   int     $side      Whether we append at the end or the beginning
265     *                             of the current string.
266     * @return  \Hoa\Ustring
267     */
268    public function pad($length, $piece, $side = self::END)
269    {
270        $difference = $length - $this->count();
271
272        if (0 >= $difference) {
273            return $this;
274        }
275
276        $handle = null;
277
278        for ($i = $difference / mb_strlen($piece) - 1; $i >= 0; --$i) {
279            $handle .= $piece;
280        }
281
282        $handle .= mb_substr($piece, 0, $difference - mb_strlen($handle));
283
284        return
285            static::END === $side
286                ? $this->append($handle)
287                : $this->prepend($handle);
288    }
289
290    /**
291     * Make a comparison with a string.
292     * Return < 0 if current string is less than $string, > 0 if greater and 0
293     * if equal.
294     *
295     * @param   mixed  $string    String.
296     * @return  int
297     */
298    public function compare($string)
299    {
300        if (null === $collator = static::getCollator()) {
301            return strcmp($this->_string, (string) $string);
302        }
303
304        return $collator->compare($this->_string, $string);
305    }
306
307    /**
308     * Get collator.
309     *
310     * @return  \Collator
311     */
312    public static function getCollator()
313    {
314        if (false === class_exists('Collator')) {
315            return null;
316        }
317
318        if (null === static::$_collator) {
319            static::$_collator = new \Collator(setlocale(LC_COLLATE, null));
320        }
321
322        return static::$_collator;
323    }
324
325    /**
326     * Ensure that the pattern is safe for Unicode: add the “u” option.
327     *
328     * @param   string  $pattern    Pattern.
329     * @return  string
330     */
331    public static function safePattern($pattern)
332    {
333        $delimiter = mb_substr($pattern, 0, 1);
334        $options   = mb_substr(
335            mb_strrchr($pattern, $delimiter, false),
336            mb_strlen($delimiter)
337        );
338
339        if (false === strpos($options, 'u')) {
340            $pattern .= 'u';
341        }
342
343        return $pattern;
344    }
345
346    /**
347     * Perform a regular expression (PCRE) match.
348     *
349     * @param   string  $pattern    Pattern.
350     * @param   array   $matches    Matches.
351     * @param   int     $flags      Please, see constants self::WITH_OFFSET,
352     *                              self::GROUP_BY_PATTERN and
353     *                              self::GROUP_BY_TUPLE.
354     * @param   int     $offset     Alternate place from which to start the
355     *                              search.
356     * @param   bool    $global     Whether the match is global or not.
357     * @return  int
358     */
359    public function match(
360        $pattern,
361        &$matches = null,
362        $flags    = 0,
363        $offset   = 0,
364        $global   = false
365    ) {
366        $pattern = static::safePattern($pattern);
367
368        if (0 === $flags) {
369            if (true === $global) {
370                $flags = static::GROUP_BY_PATTERN;
371            }
372        } else {
373            $flags &= ~PREG_SPLIT_OFFSET_CAPTURE;
374        }
375
376
377        $offset = strlen(mb_substr($this->_string, 0, $offset));
378
379        if (true === $global) {
380            return preg_match_all(
381                $pattern,
382                $this->_string,
383                $matches,
384                $flags,
385                $offset
386            );
387        }
388
389        return preg_match($pattern, $this->_string, $matches, $flags, $offset);
390    }
391
392    /**
393     * Perform a regular expression (PCRE) search and replace.
394     *
395     * @param   mixed   $pattern        Pattern(s).
396     * @param   mixed   $replacement    Replacement(s) (please, see
397     *                                  preg_replace() documentation).
398     * @param   int     $limit          Maximum of replacements. -1 for unbound.
399     * @return  \Hoa\Ustring
400     */
401    public function replace($pattern, $replacement, $limit = -1)
402    {
403        $pattern = static::safePattern($pattern);
404
405        if (false === is_callable($replacement)) {
406            $this->_string = preg_replace(
407                $pattern,
408                $replacement,
409                $this->_string,
410                $limit
411            );
412        } else {
413            $this->_string = preg_replace_callback(
414                $pattern,
415                $replacement,
416                $this->_string,
417                $limit
418            );
419        }
420
421        return $this;
422    }
423
424    /**
425     * Split the current string according to a given pattern (PCRE).
426     *
427     * @param   string  $pattern    Pattern (as a regular expression).
428     * @param   int     $limit      Maximum of split. -1 for unbound.
429     * @param   int     $flags      Please, see constants self::WITHOUT_EMPTY,
430     *                              self::WITH_DELIMITERS, self::WITH_OFFSET.
431     * @return  array
432     */
433    public function split(
434        $pattern,
435        $limit = -1,
436        $flags = self::WITHOUT_EMPTY
437    ) {
438        return preg_split(
439            static::safePattern($pattern),
440            $this->_string,
441            $limit,
442            $flags
443        );
444    }
445
446    /**
447     * Iterator over chars.
448     *
449     * @return  \ArrayIterator
450     */
451    public function getIterator()
452    {
453        return new \ArrayIterator(preg_split('#(?<!^)(?!$)#u', $this->_string));
454    }
455
456    /**
457     * Perform a lowercase folding on the current string.
458     *
459     * @return  \Hoa\Ustring
460     */
461    public function toLowerCase()
462    {
463        $this->_string = mb_strtolower($this->_string);
464
465        return $this;
466    }
467
468    /**
469     * Perform an uppercase folding on the current string.
470     *
471     * @return  \Hoa\Ustring
472     */
473    public function toUpperCase()
474    {
475        $this->_string = mb_strtoupper($this->_string);
476
477        return $this;
478    }
479
480    /**
481     * Transform a UTF-8 string into an ASCII one.
482     * First, try with a transliterator. If not available, will fallback to a
483     * normalizer. If not available, will try something homemade.
484     *
485     * @param   bool  $try    Try something if \Normalizer is not present.
486     * @return  \Hoa\Ustring
487     * @throws  \Hoa\Ustring\Exception
488     */
489    public function toAscii($try = false)
490    {
491        if (0 === preg_match('#[\x80-\xff]#', $this->_string)) {
492            return $this;
493        }
494
495        $string  = $this->_string;
496        $transId =
497            'Any-Latin; ' .
498            '[\p{S}] Name; ' .
499            'Latin-ASCII';
500
501        if (null !== $transliterator = static::getTransliterator($transId)) {
502            $this->_string = preg_replace_callback(
503                '#\\\N\{([A-Z ]+)\}#u',
504                function (array $matches) {
505                    return '(' . strtolower($matches[1]) . ')';
506                },
507                $transliterator->transliterate($string)
508            );
509
510            return $this;
511        }
512
513        if (false === class_exists('Normalizer')) {
514            if (false === $try) {
515                throw new Exception(
516                    '%s needs the class Normalizer to work properly, ' .
517                    'or you can force a try by using %1$s(true).',
518                    0,
519                    __METHOD__
520                );
521            }
522
523            $string        = static::transcode($string, 'UTF-8', 'ASCII//IGNORE//TRANSLIT');
524            $this->_string = preg_replace('#(?:[\'"`^](\w))#u', '\1', $string);
525
526            return $this;
527        }
528
529        $string        = \Normalizer::normalize($string, \Normalizer::NFKD);
530        $string        = preg_replace('#\p{Mn}+#u', '', $string);
531        $this->_string = static::transcode($string, 'UTF-8', 'ASCII//IGNORE//TRANSLIT');
532
533        return $this;
534    }
535
536    /**
537     * Transliterate the string into another.
538     * See self::getTransliterator for more information.
539     *
540     * @param   string  $identifier    Identifier.
541     * @param   int     $start         Start.
542     * @param   int     $end           End.
543     * @return  \Hoa\Ustring
544     * @throws  \Hoa\Ustring\Exception
545     */
546    public function transliterate($identifier, $start = 0, $end = null)
547    {
548        if (null === $transliterator = static::getTransliterator($identifier)) {
549            throw new Exception(
550                '%s needs the class Transliterator to work properly.',
551                1,
552                __METHOD__
553            );
554        }
555
556        $this->_string = $transliterator->transliterate($this->_string, $start, $end);
557
558        return $this;
559    }
560
561    /**
562     * Get transliterator.
563     * See http://userguide.icu-project.org/transforms/general for $identifier.
564     *
565     * @param   string  $identifier    Identifier.
566     * @return  \Transliterator
567     */
568    public static function getTransliterator($identifier)
569    {
570        if (false === class_exists('Transliterator')) {
571            return null;
572        }
573
574        return \Transliterator::create($identifier);
575    }
576
577    /**
578     * Strip characters (default \s) of the current string.
579     *
580     * @param   string  $regex    Characters to remove.
581     * @param   int     $side     Whether we trim the beginning, the end or both
582     *                            sides, of the current string.
583     * @return  \Hoa\Ustring
584     */
585    public function trim($regex = '\s', $side = 3 /* static::BEGINNING | static::END */)
586    {
587        $regex  = '(?:' . $regex . ')+';
588        $handle = null;
589
590        if (0 !== ($side & static::BEGINNING)) {
591            $handle .= '(^' . $regex . ')';
592        }
593
594        if (0 !== ($side & static::END)) {
595            if (null !== $handle) {
596                $handle .= '|';
597            }
598
599            $handle .= '(' . $regex . '$)';
600        }
601
602        $this->_string    = preg_replace('#' . $handle . '#u', '', $this->_string);
603        $this->_direction = null;
604
605        return $this;
606    }
607
608    /**
609     * Compute offset (negative, unbound etc.).
610     *
611     * @param   int        $offset    Offset.
612     * @return  int
613     */
614    protected function computeOffset($offset)
615    {
616        $length = mb_strlen($this->_string);
617
618        if (0 > $offset) {
619            $offset = -$offset % $length;
620
621            if (0 !== $offset) {
622                $offset = $length - $offset;
623            }
624        } elseif ($offset >= $length) {
625            $offset %= $length;
626        }
627
628        return $offset;
629    }
630
631    /**
632     * Get a specific chars of the current string.
633     *
634     * @param   int     $offset    Offset (can be negative and unbound).
635     * @return  string
636     */
637    public function offsetGet($offset)
638    {
639        return mb_substr($this->_string, $this->computeOffset($offset), 1);
640    }
641
642    /**
643     * Set a specific character of the current string.
644     *
645     * @param   int     $offset    Offset (can be negative and unbound).
646     * @param   string  $value     Value.
647     * @return  \Hoa\Ustring
648     */
649    public function offsetSet($offset, $value)
650    {
651        $head   = null;
652        $offset = $this->computeOffset($offset);
653
654        if (0 < $offset) {
655            $head = mb_substr($this->_string, 0, $offset);
656        }
657
658        $tail             = mb_substr($this->_string, $offset + 1);
659        $this->_string    = $head . $value . $tail;
660        $this->_direction = null;
661
662        return $this;
663    }
664
665    /**
666     * Delete a specific character of the current string.
667     *
668     * @param   int     $offset    Offset (can be negative and unbound).
669     * @return  string
670     */
671    public function offsetUnset($offset)
672    {
673        return $this->offsetSet($offset, null);
674    }
675
676    /**
677     * Check if a specific offset exists.
678     *
679     * @return  bool
680     */
681    public function offsetExists($offset)
682    {
683        return true;
684    }
685
686    /**
687     * Reduce the strings.
688     *
689     * @param   int  $start     Position of first character.
690     * @param   int  $length    Maximum number of characters.
691     * @return  \Hoa\Ustring
692     */
693    public function reduce($start, $length = null)
694    {
695        $this->_string = mb_substr($this->_string, $start, $length);
696
697        return $this;
698    }
699
700    /**
701     * Count number of characters of the current string.
702     *
703     * @return  int
704     */
705    public function count()
706    {
707        return mb_strlen($this->_string);
708    }
709
710    /**
711     * Get byte (not character) at a specific offset.
712     *
713     * @param   int     $offset    Offset (can be negative and unbound).
714     * @return  string
715     */
716    public function getByteAt($offset)
717    {
718        $length = strlen($this->_string);
719
720        if (0 > $offset) {
721            $offset = -$offset % $length;
722
723            if (0 !== $offset) {
724                $offset = $length - $offset;
725            }
726        } elseif ($offset >= $length) {
727            $offset %= $length;
728        }
729
730        return $this->_string[$offset];
731    }
732
733    /**
734     * Count number of bytes (not characters) of the current string.
735     *
736     * @return  int
737     */
738    public function getBytesLength()
739    {
740        return strlen($this->_string);
741    }
742
743    /**
744     * Get the width of the current string.
745     * Useful when printing the string in monotype (some character need more
746     * than one column to be printed).
747     *
748     * @return  int
749     */
750    public function getWidth()
751    {
752        return mb_strwidth($this->_string);
753    }
754
755    /**
756     * Get direction of the current string.
757     * Please, see the self::LTR and self::RTL constants.
758     * It does not yet support embedding directions.
759     *
760     * @return  int
761     */
762    public function getDirection()
763    {
764        if (null === $this->_direction) {
765            if (null === $this->_string) {
766                $this->_direction = static::LTR;
767            } else {
768                $this->_direction = static::getCharDirection(
769                    mb_substr($this->_string, 0, 1)
770                );
771            }
772        }
773
774        return $this->_direction;
775    }
776
777    /**
778     * Get character of a specific character.
779     * Please, see the self::LTR and self::RTL constants.
780     *
781     * @param   string  $char    Character.
782     * @return  int
783     */
784    public static function getCharDirection($char)
785    {
786        $c = static::toCode($char);
787
788        if (!(0x5be <= $c && 0x10b7f >= $c)) {
789            return static::LTR;
790        }
791
792        if (0x85e >= $c) {
793            if (0x5be === $c ||
794                0x5c0 === $c ||
795                0x5c3 === $c ||
796                0x5c6 === $c ||
797                (0x5d0 <= $c && 0x5ea >= $c) ||
798                (0x5f0 <= $c && 0x5f4 >= $c) ||
799                0x608 === $c ||
800                0x60b === $c ||
801                0x60d === $c ||
802                0x61b === $c ||
803                (0x61e <= $c && 0x64a >= $c) ||
804                (0x66d <= $c && 0x66f >= $c) ||
805                (0x671 <= $c && 0x6d5 >= $c) ||
806                (0x6e5 <= $c && 0x6e6 >= $c) ||
807                (0x6ee <= $c && 0x6ef >= $c) ||
808                (0x6fa <= $c && 0x70d >= $c) ||
809                0x710 === $c ||
810                (0x712 <= $c && 0x72f >= $c) ||
811                (0x74d <= $c && 0x7a5 >= $c) ||
812                0x7b1 === $c ||
813                (0x7c0 <= $c && 0x7ea >= $c) ||
814                (0x7f4 <= $c && 0x7f5 >= $c) ||
815                0x7fa === $c ||
816                (0x800 <= $c && 0x815 >= $c) ||
817                0x81a === $c ||
818                0x824 === $c ||
819                0x828 === $c ||
820                (0x830 <= $c && 0x83e >= $c) ||
821                (0x840 <= $c && 0x858 >= $c) ||
822                0x85e === $c) {
823                return static::RTL;
824            }
825        } elseif (0x200f === $c) {
826            return static::RTL;
827        } elseif (0xfb1d <= $c) {
828            if (0xfb1d === $c ||
829                (0xfb1f <= $c && 0xfb28 >= $c) ||
830                (0xfb2a <= $c && 0xfb36 >= $c) ||
831                (0xfb38 <= $c && 0xfb3c >= $c) ||
832                0xfb3e === $c ||
833                (0xfb40 <= $c && 0xfb41 >= $c) ||
834                (0xfb43 <= $c && 0xfb44 >= $c) ||
835                (0xfb46 <= $c && 0xfbc1 >= $c) ||
836                (0xfbd3 <= $c && 0xfd3d >= $c) ||
837                (0xfd50 <= $c && 0xfd8f >= $c) ||
838                (0xfd92 <= $c && 0xfdc7 >= $c) ||
839                (0xfdf0 <= $c && 0xfdfc >= $c) ||
840                (0xfe70 <= $c && 0xfe74 >= $c) ||
841                (0xfe76 <= $c && 0xfefc >= $c) ||
842                (0x10800 <= $c && 0x10805 >= $c) ||
843                0x10808 === $c ||
844                (0x1080a <= $c && 0x10835 >= $c) ||
845                (0x10837 <= $c && 0x10838 >= $c) ||
846                0x1083c === $c ||
847                (0x1083f <= $c && 0x10855 >= $c) ||
848                (0x10857 <= $c && 0x1085f >= $c) ||
849                (0x10900 <= $c && 0x1091b >= $c) ||
850                (0x10920 <= $c && 0x10939 >= $c) ||
851                0x1093f === $c ||
852                0x10a00 === $c ||
853                (0x10a10 <= $c && 0x10a13 >= $c) ||
854                (0x10a15 <= $c && 0x10a17 >= $c) ||
855                (0x10a19 <= $c && 0x10a33 >= $c) ||
856                (0x10a40 <= $c && 0x10a47 >= $c) ||
857                (0x10a50 <= $c && 0x10a58 >= $c) ||
858                (0x10a60 <= $c && 0x10a7f >= $c) ||
859                (0x10b00 <= $c && 0x10b35 >= $c) ||
860                (0x10b40 <= $c && 0x10b55 >= $c) ||
861                (0x10b58 <= $c && 0x10b72 >= $c) ||
862                (0x10b78 <= $c && 0x10b7f >= $c)) {
863                return static::RTL;
864            }
865        }
866
867        return static::LTR;
868    }
869
870    /**
871     * Get the number of column positions of a wide-character.
872     *
873     * This is a PHP implementation of wcwidth() and wcswidth() (defined in IEEE
874     * Std 1002.1-2001) for Unicode, by Markus Kuhn. Please, see
875     * http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c.
876     *
877     * The wcwidth(wc) function shall either return 0 (if wc is a null
878     * wide-character code), or return the number of column positions to be
879     * occupied by the wide-character code wc, or return -1 (if wc does not
880     * correspond to a printable wide-character code).
881     *
882     * @param   string  $char    Character.
883     * @return  int
884     */
885    public static function getCharWidth($char)
886    {
887        $char = (string) $char;
888        $c    = static::toCode($char);
889
890        // Test for 8-bit control characters.
891        if (0x0 === $c) {
892            return 0;
893        }
894
895        if (0x20 > $c || (0x7f <= $c && $c < 0xa0)) {
896            return -1;
897        }
898
899        // Non-spacing characters.
900        if (0xad !== $c &&
901            0    !== preg_match('#^[\p{Mn}\p{Me}\p{Cf}\x{1160}-\x{11ff}\x{200b}]#u', $char)) {
902            return 0;
903        }
904
905        // If we arrive here, $c is not a combining C0/C1 control character.
906        return 1 +
907            (0x1100 <= $c &&
908                (0x115f >= $c ||                        // Hangul Jamo init. consonants
909                 0x2329 === $c || 0x232a === $c ||
910                     (0x2e80 <= $c && 0xa4cf >= $c &&
911                      0x303f !== $c) ||                 // CJK…Yi
912                     (0xac00  <= $c && 0xd7a3 >= $c) || // Hangul Syllables
913                     (0xf900  <= $c && 0xfaff >= $c) || // CJK Compatibility Ideographs
914                     (0xfe10  <= $c && 0xfe19 >= $c) || // Vertical forms
915                     (0xfe30  <= $c && 0xfe6f >= $c) || // CJK Compatibility Forms
916                     (0xff00  <= $c && 0xff60 >= $c) || // Fullwidth Forms
917                     (0xffe0  <= $c && 0xffe6 >= $c) ||
918                     (0x20000 <= $c && 0x2fffd >= $c) ||
919                     (0x30000 <= $c && 0x3fffd >= $c)));
920    }
921
922    /**
923     * Check whether the character is printable or not.
924     *
925     * @param   string  $char    Character.
926     * @return  bool
927     */
928    public static function isCharPrintable($char)
929    {
930        return 1 <= static::getCharWidth($char);
931    }
932
933    /**
934     * Get a UTF-8 character from its decimal code representation.
935     *
936     * @param   int  $code    Code.
937     * @return  string
938     */
939    public static function fromCode($code)
940    {
941        return mb_convert_encoding(
942            '&#x' . dechex($code) . ';',
943            'UTF-8',
944            'HTML-ENTITIES'
945        );
946    }
947
948    /**
949     * Get a decimal code representation of a specific character.
950     *
951     * @param   string  $char    Character.
952     * @return  int
953     */
954    public static function toCode($char)
955    {
956        $char  = (string) $char;
957        $code  = ord($char[0]);
958        $bytes = 1;
959
960        if (!($code & 0x80)) { // 0xxxxxxx
961            return $code;
962        }
963
964        if (($code & 0xe0) === 0xc0) { // 110xxxxx
965            $bytes = 2;
966            $code  = $code & ~0xc0;
967        } elseif (($code & 0xf0) == 0xe0) { // 1110xxxx
968            $bytes = 3;
969            $code  = $code & ~0xe0;
970        } elseif (($code & 0xf8) === 0xf0) { // 11110xxx
971            $bytes = 4;
972            $code  = $code & ~0xf0;
973        }
974
975        for ($i = 2; $i <= $bytes; $i++) { // 10xxxxxx
976            $code = ($code << 6) + (ord($char[$i - 1]) & ~0x80);
977        }
978
979        return $code;
980    }
981
982    /**
983     * Get a binary representation of a specific character.
984     *
985     * @param   string  $char    Character.
986     * @return  string
987     */
988    public static function toBinaryCode($char)
989    {
990        $char = (string) $char;
991        $out  = null;
992
993        for ($i = 0, $max = strlen($char); $i < $max; ++$i) {
994            $out .= vsprintf('%08b', ord($char[$i]));
995        }
996
997        return $out;
998    }
999
1000    /**
1001     * Transcode.
1002     *
1003     * @param   string  $string    String.
1004     * @param   string  $from      Original encoding.
1005     * @param   string  $to        Final encoding.
1006     * @return  string
1007     * @throws  \Hoa\Ustring\Exception
1008     */
1009    public static function transcode($string, $from, $to = 'UTF-8')
1010    {
1011        if (false === static::checkIconv()) {
1012            throw new Exception(
1013                '%s needs the iconv extension.',
1014                2,
1015                __CLASS__
1016            );
1017        }
1018
1019        return iconv($from, $to, $string);
1020    }
1021
1022    /**
1023     * Check if a string is encoded in UTF-8.
1024     *
1025     * @param   string  $string    String.
1026     * @return  bool
1027     */
1028    public static function isUtf8($string)
1029    {
1030        return (bool) preg_match('##u', $string);
1031    }
1032
1033    /**
1034     * Copy current object string
1035     *
1036     * @return \Hoa\Ustring
1037     */
1038    public function copy()
1039    {
1040        return clone $this;
1041    }
1042
1043    /**
1044     * Transform the object as a string.
1045     *
1046     * @return  string
1047     */
1048    public function __toString()
1049    {
1050        return $this->_string;
1051    }
1052}
1053
/**
 * Flex entity: declare the class to \Hoa\Consistency.
 */
Consistency::flexEntity('Hoa\Ustring\Ustring');

// The class calls mb_* functions throughout: refuse to load without them.
if (true !== Ustring::checkMbString()) {
    throw new Exception(
        '%s needs the mbstring extension.',
        0,
        __NAMESPACE__ . '\Ustring'
    );
}
1066