<?php

namespace dokuwiki\Utf8;

/**
 * Methods to convert from and to UTF-8 strings
 */
class Conversion
{

    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII code points (< 0x80) are copied verbatim, code points below 0x100
     * become decimal entities (&#NNN;) and everything else becomes a
     * hexadecimal entity (&#xHHHH;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str
     * @return string
     */
    public static function toHtml($str)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80) {
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Converts any numeric entity (decimal or hexadecimal) to the character
     * for its code point. The entities flag defaults to only decoding numeric
     * entities. Pass HTML_ENTITIES (or any other truthy value) and named
     * entities such as the ones for ampersand or less-than are handled as well.
     *
     * All entities are decoded in one single pass, so text that is produced by
     * decoding one entity is never decoded a second time. This avoids the
     * double decoding that occurs when a numeric decoder and a named decoder
     * are chained one after the other.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $str UTF-8 encoded string
     * @param bool|int $entities decode named entities in addition to numeric ones
     * @return string UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // groups: 1 = whole entity, 2 = optional hex marker, 3 = payload
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [__CLASS__, 'decodeNumericEntity'],
                $str
            );
        }

        // groups: 1 = optional '#', 2 = optional hex marker, 3 = payload
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [__CLASS__, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Numeric entities are delegated to decodeNumericEntity(). Named entities
     * are looked up in the HTML 4.01 entity table; unknown names are returned
     * unchanged.
     *
     * @param string[] $ent Regex match as produced by fromHtml():
     *                      0 = whole entity, 1 = '#' or '', 2 = hex marker or '', 3 = payload
     * @return string|false
     */
    protected static function decodeAnyEntity($ent)
    {
        // lazily create the named entity lookup table ("&name;" => character)
        static $table = null;
        if ($table === null) {
            // Ask for the table in UTF-8 explicitly. The characters are then
            // already complete UTF-8 sequences; converting them through ord()
            // would only keep the first byte of every multi-byte character.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        if (array_key_exists($ent[0], $table)) {
            return $table[$ent[0]];
        }

        // not a known named entity, leave the text untouched
        return $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * The regular expressions in fromHtml() accept any alphanumeric payload,
     * so the payload is validated here. Entities whose payload is not a valid
     * decimal (or, with the hex marker, hexadecimal) number are returned
     * unchanged instead of being decoded to code point 0.
     *
     * @param string[] $ent Regex match as produced by fromHtml():
     *                      0 = whole entity, 2 = hex marker or '', 3 = payload
     * @return string|false
     */
    protected static function decodeNumericEntity($ent)
    {
        $payload = $ent[3];

        if ($ent[2] === 'X' || $ent[2] === 'x') {
            if (strspn($payload, '0123456789abcdefABCDEF') !== strlen($payload)) {
                return $ent[0];
            }
            $cp = hexdec($payload);
        } else {
            if (strspn($payload, '0123456789') !== strlen($payload)) {
                return $ent[0];
            }
            $cp = intval($payload);
        }

        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mbstring when UTF8_MBSTRING is set. Otherwise the string is
     * converted code point by code point; code points above 0xFFFF are written
     * as a surrogate pair.
     *
     * @param string $str UTF-8 encoded string
     * @param bool $bom prepend the big endian byte order mark (FE FF)
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF) {
                // pack('n') would silently truncate to 16 bits, so split the
                // code point into a high and a low surrogate instead
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as a sequence of big endian 16 bit units. A high
     * surrogate that is directly followed by a low surrogate is combined into
     * the code point it encodes; all other units are passed on as they are.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if ($units === false) {
            return false;
        }

        // unpack() returns a 1-based array, reindex for simple look-ahead
        $units = array_values($units);
        $count = count($units);

        $uni = [];
        for ($i = 0; $i < $count; $i++) {
            $unit = $units[$i];
            if (
                $unit >= 0xD800 && $unit <= 0xDBFF
                && $i + 1 < $count
                && $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF
            ) {
                $unit = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++; // the low surrogate has been consumed
            }
            $uni[] = $unit;
        }

        return Unicode::toUtf8($uni);
    }

}