<?php

namespace dokuwiki\Utf8;

/**
 * Methods to convert from and to UTF-8 strings
 */
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are copied verbatim unless $all is set. Code points
     * below 0x100 that are encoded become decimal entities (&#233;), everything
     * above becomes a hexadecimal entity (&#x20ac;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @param bool $all Encode ASCII characters (below 0x80) as entities as well
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80 && !$all) {
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default only numeric entities (decimal &#38; and hexadecimal &#x26;) are
     * decoded. When $entities is true, named entities such as &amp; or &lt; are
     * decoded as well.
     *
     * All entities are replaced in one single pass, so text that results from
     * decoding one entity is never re-read as (part of) another entity. Decoding
     * "&amp;#38;" with named entities enabled therefore yields the literal text
     * "&#38;" and not "&".
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $str UTF-8 encoded string
     * @param boolean $entities decode named entities in addition to numeric ones
     * @return string UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // groups: 1 = whole entity, 2 = optional hex marker, 3 = digits
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [self::class, 'decodeNumericEntity'],
                $str
            );
        }

        // groups: 1 = optional '#', 2 = optional hex marker, 3 = digits or name
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [self::class, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Numeric entities are handed to decodeNumericEntity(). Named entities are
     * looked up in PHP's HTML 4.01 entity table; unknown names are returned
     * unchanged.
     *
     * @param string[] $ent Match array as passed by preg_replace_callback() in fromHtml()
     * @return string|false
     */
    protected static function decodeAnyEntity($ent)
    {
        // named entity lookup table ('&name;' => UTF-8 character), built once
        static $table = null;
        if ($table === null) {
            // Request the table in UTF-8 explicitly: its characters can then be used
            // as-is. Taking ord() of them would only look at the first byte of a
            // multi-byte character and produce a wrong code point.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        if (array_key_exists($ent[0], $table)) {
            return $table[$ent[0]];
        }

        return $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * The patterns in fromHtml() accept any alphanumeric run after "&#", so the
     * digits are validated here. Malformed entities (e.g. "&#abc;" or "&#xzz;")
     * are returned unchanged instead of being turned into code point 0.
     *
     * @param string[] $ent Match array: [0] whole entity, [2] hex marker or '', [3] digits
     * @return string|false
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];

        if ($ent[2] === 'x' || $ent[2] === 'X') {
            if (strspn($digits, '0123456789abcdefABCDEF') !== strlen($digits)) {
                return $ent[0];
            }
            $cp = hexdec($digits);
        } else {
            if (strspn($digits, '0123456789') !== strlen($digits)) {
                return $ent[0];
            }
            $cp = (int) $digits;
        }

        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mbstring when UTF8_MBSTRING is set. Otherwise the code points are
     * packed manually; code points above U+FFFF are written as surrogate pairs.
     *
     * @param string $str UTF-8 encoded string
     * @param bool $bom Prepend the UTF-16BE byte order mark
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF) {
                // does not fit into one 16-bit unit: split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as big-endian 16-bit units. A high surrogate directly
     * followed by a low surrogate is combined into a single code point; unpaired
     * surrogates are passed on to Unicode::toUtf8() unchanged.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if ($units === false) {
            return false;
        }

        $codepoints = [];
        $high = null; // high surrogate waiting for its low counterpart
        foreach ($units as $unit) {
            if ($high !== null) {
                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    $codepoints[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high = null;
                    continue;
                }
                // no low surrogate followed: keep the lone high surrogate as it was
                $codepoints[] = $high;
                $high = null;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $high = $unit;
                continue;
            }

            $codepoints[] = $unit;
        }
        if ($high !== null) {
            $codepoints[] = $high;
        }

        return Unicode::toUtf8($codepoints);
    }

    /**
     * Converts a string from ISO-8859-1 to UTF-8
     *
     * This is a replacement for the deprecated utf8_encode function. The first
     * available of mbstring, iconv, UConverter and utf8_encode is used; if none
     * of them exists a pure PHP implementation takes over.
     *
     * @param string $string ISO-8859-1 encoded string
     * @return string
     * @author <p@tchwork.com> Nicolas Grekas - pure PHP implementation
     * @link   https://github.com/tchwork/utf8/blob/master/src/Patchwork/PHP/Shim/Xml.php
     */
    public static function fromLatin1($string)
    {
        if (UTF8_MBSTRING) {
            return mb_convert_encoding($string, 'UTF-8', 'ISO-8859-1');
        }
        if (function_exists('iconv')) {
            return iconv('ISO-8859-1', 'UTF-8', $string);
        }
        if (class_exists('UConverter')) {
            return \UConverter::transcode($string, 'UTF8', 'ISO-8859-1');
        }
        if (function_exists('utf8_encode')) {
            // deprecated
            return utf8_encode($string);
        }

        // fallback to pure PHP
        // The string is doubled so the second half can serve as read-only input ($i)
        // while the result is written in place from the start ($j). The output needs
        // at most two bytes per input byte, so the writer never overtakes the reader.
        $string .= $string;
        $len = strlen($string);
        for ($i = $len >> 1, $j = 0; $i < $len; ++$i, ++$j) {
            switch (true) {
                case $string[$i] < "\x80":
                    // ASCII is copied as-is
                    $string[$j] = $string[$i];
                    break;
                case $string[$i] < "\xC0":
                    // 0x80-0xBF: lead byte C2, trail byte unchanged
                    $string[$j] = "\xC2";
                    $string[++$j] = $string[$i];
                    break;
                default:
                    // 0xC0-0xFF: lead byte C3, trail byte shifted down by 0x40
                    $string[$j] = "\xC3";
                    $string[++$j] = chr(ord($string[$i]) - 64);
                    break;
            }
        }
        return substr($string, 0, $j);
    }
}