<?php

namespace dokuwiki\Utf8;

/**
 * Methods to convert from and to UTF-8 strings
 */
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are copied through unchanged unless $all is set.
     * Code points below 0x100 that are encoded use a decimal reference (&#233;),
     * everything else uses a hexadecimal reference (&#x20ac;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str
     * @param bool $all Also encode ASCII characters (code points below 0x80) as entities
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80 && !$all) {
                // plain ASCII is kept literally
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default only numeric character references (decimal &#NNN; and
     * hexadecimal &#xHHH;) are decoded. When $entities is true, named
     * entities known to PHP's HTML_ENTITIES translation table are decoded in
     * the same single pass, so text produced by decoding one entity is never
     * scanned again for further entities.
     *
     * Anything that looks like an entity but is not a well-formed numeric
     * reference or a known named entity is left in the string unchanged.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $str UTF-8 encoded string
     * @param boolean $entities decode named entities in addition to numeric ones
     * @return string UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // groups: 1 = whole entity, 2 = optional hex marker, 3 = digits
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [self::class, 'decodeNumericEntity'],
                $str
            );
        }

        // groups: 1 = optional '#', 2 = optional hex marker, 3 = digits or name
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [self::class, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Callback for the "named and numeric" pattern used in fromHtml().
     *
     * @param string[] $ent Match array: [0] full entity, [1] '#' or '', [2] hex marker or '', [3] digits/name
     * @return string|false The decoded character(s), or the entity unchanged when it is unknown
     */
    protected static function decodeAnyEntity($ent)
    {
        // create the named entity lookup table ('&name;' => UTF-8 character)
        static $table = null;
        if ($table === null) {
            // The table is requested in UTF-8, so its characters can be used
            // as they are. Running them through ord() would only look at the
            // first byte and corrupt every multi-byte character.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        // unknown names are returned untouched
        return $table[$ent[0]] ?? $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * The patterns in fromHtml() accept any alphanumeric run as "digits", so
     * the digits are validated here. A reference that is not purely decimal
     * (or purely hexadecimal after an x/X marker) is returned unchanged
     * instead of being decoded to a bogus character.
     *
     * @param string[] $ent Match array: [0] full entity, [2] hex marker or '', [3] digits
     * @return string|false
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];

        if ($ent[2] === 'x' || $ent[2] === 'X') {
            if (strspn($digits, '0123456789abcdefABCDEF') !== strlen($digits)) {
                return $ent[0];
            }
            $cp = hexdec($digits);
        } else {
            if (strspn($digits, '0123456789') !== strlen($digits)) {
                return $ent[0];
            }
            $cp = (int) $digits;
        }

        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mbstring when UTF8_MBSTRING is set, otherwise a pure PHP fallback
     * that emits surrogate pairs for code points above U+FFFF.
     *
     * @param string $str
     * @param bool $bom Prepend the big endian byte order mark (FE FF)
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        $uni = Unicode::fromUtf8($str);
        foreach ($uni as $cp) {
            if ($cp > 0xFFFF) {
                // does not fit into one 16-bit unit: split into a surrogate pair
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as big endian 16-bit units. A high surrogate that is
     * directly followed by a low surrogate is combined into a single code
     * point; unpaired surrogates are handed to Unicode::toUtf8() as they are.
     *
     * @param string $str
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if (!is_array($units)) {
            return false;
        }

        $uni = [];
        $high = null; // high surrogate still waiting for its low partner
        foreach ($units as $unit) {
            if ($high !== null) {
                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high = null;
                    continue;
                }
                // no low surrogate followed: keep the lone unit
                $uni[] = $high;
                $high = null;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $high = $unit;
                continue;
            }

            $uni[] = $unit;
        }
        if ($high !== null) {
            $uni[] = $high;
        }

        return Unicode::toUtf8($uni);
    }

    /**
     * Converts a string from ISO-8859-1 to UTF-8
     *
     * This is a replacement for the deprecated utf8_encode function.
     * The first available of mbstring, iconv, UConverter and utf8_encode is
     * used, with a pure PHP implementation as last resort.
     *
     * @param $string
     * @return string
     * @author <p@tchwork.com> Nicolas Grekas - pure PHP implementation
     * @link https://github.com/tchwork/utf8/blob/master/src/Patchwork/PHP/Shim/Xml.php
     */
    public static function fromLatin1($string)
    {
        if (UTF8_MBSTRING) {
            return mb_convert_encoding($string, 'UTF-8', 'ISO-8859-1');
        }
        if (function_exists('iconv')) {
            return iconv('ISO-8859-1', 'UTF-8', $string);
        }
        if (class_exists('UConverter')) {
            return \UConverter::transcode($string, 'UTF8', 'ISO-8859-1');
        }
        if (function_exists('utf8_encode')) {
            // deprecated
            return utf8_encode($string);
        }

        // fallback to pure PHP
        // The string is doubled: the second half is read as input ($i) while
        // the output is written over the front ($j). Every input byte yields
        // at most two output bytes, so the writer never overtakes the reader.
        $string .= $string;
        $len = strlen($string);
        for ($i = $len >> 1, $j = 0; $i < $len; ++$i, ++$j) {
            switch (true) {
                case $string[$i] < "\x80":
                    // ASCII: copied as is
                    $string[$j] = $string[$i];
                    break;
                case $string[$i] < "\xC0":
                    // 0x80-0xBF: lead byte C2, trail byte is the input byte
                    $string[$j] = "\xC2";
                    $string[++$j] = $string[$i];
                    break;
                default:
                    // 0xC0-0xFF: lead byte C3, trail byte is the input byte minus 0x40
                    $string[$j] = "\xC3";
                    $string[++$j] = chr(ord($string[$i]) - 64);
                    break;
            }
        }
        return substr($string, 0, $j);
    }
}