1 <?php
2 
3 namespace dokuwiki\Utf8;
4 
5 /**
6  * Methods to convert from and to UTF-8 strings
7  */
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII characters are passed through unchanged unless $all is set.
     * Codepoints below 0x100 that are encoded become decimal entities
     * (&#NNN;), everything above becomes a hexadecimal entity (&#xHHHH;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @param bool $all Encode ASCII characters (below 0x80) as entities as well
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80 && !$all) {
                // plain ASCII is copied verbatim
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass a true value and named entities, including &amp; &lt; etc.
     * are handled as well. Avoids the problem that would occur if you
     * had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * Anything that merely looks like an entity but is not a valid one
     * (unknown name, non-numeric "number") is left in the text unchanged.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string $str UTF-8 encoded string
     * @param  boolean $entities decode named entities in addition to numeric ones
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // groups: 1 = whole entity, 2 = optional hex marker, 3 = digits
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [self::class, 'decodeNumericEntity'],
                $str
            );
        }

        // groups: 1 = optional '#', 2 = optional hex marker, 3 = digits or name
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [self::class, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Callback for the "named and numeric" pattern used in fromHtml().
     * Unknown named entities are returned unchanged.
     *
     * @param string[] $ent Regex match: [0] full entity, [1] '#' or '', [2] hex marker or '', [3] digits/name
     * @return string
     */
    protected static function decodeAnyEntity($ent)
    {
        // create the named entity lookup table (entity => UTF-8 character) once
        static $table = null;
        if ($table === null) {
            // Ask for UTF-8 explicitly: the table then already contains the
            // correctly encoded characters. Taking ord() of a table character
            // would only look at its first byte and yield a wrong character.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        if (array_key_exists($ent[0], $table)) {
            return $table[$ent[0]];
        }

        // not a known entity, leave the text alone
        return $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Callback for the patterns used in fromHtml(). Because those patterns
     * accept any alphanumeric run, the digits are validated here; a malformed
     * reference such as "&#zz;" or "&#12ab;" is returned unchanged instead of
     * being decoded to an unrelated character.
     *
     * @param string[] $ent Regex match: [0] full entity, [2] hex marker or '', [3] digits
     * @return string|false
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];
        $isHex = ($ent[2] === 'x' || $ent[2] === 'X');
        $allowed = $isHex ? '0123456789abcdefABCDEF' : '0123456789';

        if (strspn($digits, $allowed) !== strlen($digits)) {
            // not actually a number in the announced base
            return $ent[0];
        }

        $cp = $isHex ? hexdec($digits) : (int) $digits;
        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mbstring when UTF8_MBSTRING is set. The pure PHP fallback encodes
     * codepoints above U+FFFF as surrogate pairs.
     *
     * @param string $str UTF-8 encoded string
     * @param bool $bom Prepend the big endian byte order mark
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        $uni = Unicode::fromUtf8($str);
        foreach ($uni as $cp) {
            if ($cp > 0xFFFF) {
                // does not fit in one 16 bit unit: split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * Surrogate pairs are combined into a single codepoint before encoding.
     * Unpaired surrogates are handed to the encoder as they are.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if (!is_array($units)) {
            // nothing to combine, let the encoder deal with the value
            return Unicode::toUtf8($units);
        }

        $codepoints = [];
        $high = null; // pending high surrogate waiting for its partner
        foreach ($units as $unit) {
            if ($high !== null) {
                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    $codepoints[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high = null;
                    continue;
                }
                // high surrogate without a matching low one
                $codepoints[] = $high;
                $high = null;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $high = $unit;
                continue;
            }
            $codepoints[] = $unit;
        }
        if ($high !== null) {
            $codepoints[] = $high;
        }

        return Unicode::toUtf8($codepoints);
    }

    /**
     * Converts a string from ISO-8859-1 to UTF-8
     *
     * This is a replacement for the deprecated utf8_encode function.
     * The first available of mbstring, iconv, UConverter and utf8_encode is
     * used; if none is present a pure PHP implementation takes over.
     *
     * @param string $string ISO-8859-1 encoded string
     * @return string
     * @author <p@tchwork.com> Nicolas Grekas - pure PHP implementation
     * @link https://github.com/tchwork/utf8/blob/master/src/Patchwork/PHP/Shim/Xml.php
     */
    public static function fromLatin1($string)
    {
        if (UTF8_MBSTRING) {
            return mb_convert_encoding($string, 'UTF-8', 'ISO-8859-1');
        }
        if (function_exists('iconv')) {
            return iconv('ISO-8859-1', 'UTF-8', $string);
        }
        if (class_exists('UConverter')) {
            return \UConverter::transcode($string, 'UTF8', 'ISO-8859-1');
        }
        if (function_exists('utf8_encode')) {
            // deprecated
            return utf8_encode($string);
        }

        // fallback to pure PHP
        // The input is doubled so the result can be written in place: the
        // second half ($i) is read while the first half ($j) is overwritten.
        // Each input byte yields at most two output bytes, so $j never
        // overtakes $i.
        $string .= $string;
        $len = strlen($string);
        for ($i = $len >> 1, $j = 0; $i < $len; ++$i, ++$j) {
            switch (true) {
                case $string[$i] < "\x80":
                    // ASCII: copied as is
                    $string[$j] = $string[$i];
                    break;
                case $string[$i] < "\xC0":
                    // 0x80-0xBF: lead byte C2, same trailing byte
                    $string[$j] = "\xC2";
                    $string[++$j] = $string[$i];
                    break;
                default:
                    // 0xC0-0xFF: lead byte C3, trailing byte shifted down by 64
                    $string[$j] = "\xC3";
                    $string[++$j] = chr(ord($string[$i]) - 64);
                    break;
            }
        }
        return substr($string, 0, $j);
    }
}
206