<?php

namespace dokuwiki\Utf8;

/**
 * Methods to convert from and to UTF-8 strings
 */
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII code points (below 0x80) are emitted as-is, code points below 0x100
     * become decimal references (&#NNN;) and everything else becomes a
     * hexadecimal reference (&#xHHHH;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @return string
     */
    public static function toHtml($str)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80) {
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Converts any numeric character reference (&#NNN; or &#xHHHH;) to its
     * character. By default only numeric references are decoded. Pass a truthy
     * $entities flag (e.g. HTML_ENTITIES) and named entities such as &amp; or
     * &lt; are decoded as well.
     *
     * Everything is decoded in a single pass, so text produced by decoding one
     * entity is never decoded a second time. This avoids the problem that
     * would occur if you had to decode "&#38;&amp;#38;" in two steps:
     *
     *   unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&&"
     *   \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&#38;"
     *   what it should be                                           -> "&&#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string  $str      UTF-8 encoded string
     * @param boolean $entities decode named entities in addition to numeric ones
     * @return string UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // capture groups: 1 = whole reference, 2 = hex marker, 3 = digits
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [__CLASS__, 'decodeNumericEntity'],
                $str
            );
        }

        // capture groups: 1 = '#' for numeric references, 2 = hex marker, 3 = digits or name
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [__CLASS__, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Numeric references are delegated to decodeNumericEntity(). Named
     * entities are looked up in PHP's HTML 4.01 entity table; unknown names
     * are returned unchanged.
     *
     * @param string[] $ent preg match array as produced by the pattern in fromHtml()
     * @return string|false
     */
    protected static function decodeAnyEntity($ent)
    {
        // Build the "&name;" => UTF-8 character lookup table once.
        // The table is requested in UTF-8 explicitly, so its characters are
        // already correctly encoded. They must NOT be passed through ord(),
        // which only looks at the first byte of a multi-byte character.
        static $table = null;
        if ($table === null) {
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        if (array_key_exists($ent[0], $table)) {
            return $table[$ent[0]];
        }

        // not a known named entity, leave the text as it was
        return $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * The patterns in fromHtml() accept any alphanumeric run after "&#", so the
     * digits are validated here. Malformed references (e.g. "&#abc;" or
     * "&#xzz;") are returned unchanged instead of being decoded as code point 0.
     *
     * @param string[] $ent preg match array: [0] whole match, [2] hex marker, [3] digits
     * @return string|false the decoded character as returned by Unicode::toUtf8(),
     *                      or the untouched match when it is not a valid number
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];
        $isHex = ($ent[2] === 'x' || $ent[2] === 'X');

        $allowed = $isHex ? '0123456789abcdefABCDEF' : '0123456789';
        if (strspn($digits, $allowed) !== strlen($digits)) {
            return $ent[0];
        }

        $cp = $isHex ? hexdec($digits) : intval($digits);
        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the
     * string is converted code point by code point; code points outside the
     * Basic Multilingual Plane are written as surrogate pairs.
     *
     * @param string $str UTF-8 encoded string
     * @param bool   $bom prepend the big endian byte order mark (FE FF)
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF && $cp <= 0x10FFFF) {
                // pack('n') only keeps the low 16 bits, so supplementary
                // characters have to be split into a high/low surrogate pair
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as big endian 16 bit units. A high surrogate directly
     * followed by a low surrogate is combined into a single code point; all
     * other units are handed to Unicode::toUtf8() unchanged.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if (!is_array($units)) {
            // nothing could be unpacked, let Unicode::toUtf8() deal with it as before
            return Unicode::toUtf8($units);
        }

        // unpack() returns a 1-based array; re-index it for look-ahead
        $units = array_values($units);
        $count = count($units);

        $codepoints = [];
        for ($i = 0; $i < $count; $i++) {
            $unit = $units[$i];
            $isHigh = ($unit >= 0xD800 && $unit <= 0xDBFF);
            if ($isHigh && $i + 1 < $count && $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF) {
                $codepoints[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++; // the low surrogate has been consumed
                continue;
            }
            $codepoints[] = $unit;
        }

        return Unicode::toUtf8($codepoints);
    }
}