xref: /dokuwiki/inc/Utf8/Conversion.php (revision f41bbe4cad0871728891d9ffb45bd6fd79ab1024)
1*f41bbe4cSAndreas Gohr<?php
2*f41bbe4cSAndreas Gohr
3*f41bbe4cSAndreas Gohrnamespace dokuwiki\Utf8;
4*f41bbe4cSAndreas Gohr
5*f41bbe4cSAndreas Gohr/**
6*f41bbe4cSAndreas Gohr * Methods to convert from and to UTF-8 strings
7*f41bbe4cSAndreas Gohr */
class Conversion
{

    /**
     * Encodes non-ASCII UTF-8 characters as numeric HTML entities
     *
     * Code points below 0x80 are emitted as the plain byte, code points
     * below 0x100 as decimal entities (&#233;) and everything else as
     * hexadecimal entities (&#x20ac;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @return string
     */
    public static function toHtml($str)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80) {
                // plain ASCII is passed through untouched
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Converts any &#..; entity to its character. By default only numeric
     * entities are decoded. Pass a true value for $entities and named
     * entities, including &amp; &lt; etc., are handled as well. Doing both
     * in a single pass avoids the problem that would occur if you had to
     * decode "&amp;#38;&#38;amp;#38;" in two steps:
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * Text that looks like an entity but is not valid (unknown name,
     * non-digit characters in a numeric entity) is left unchanged.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string $str UTF-8 encoded string
     * @param  boolean $entities decode named entities in addition to numeric ones
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // groups: 1 = whole entity, 2 = optional hex marker, 3 = digits
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [__CLASS__, 'decodeNumericEntity'],
                $str
            );
        }

        // groups: 1 = optional '#', 2 = optional hex marker, 3 = digits or name
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [__CLASS__, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Numeric entities are delegated to decodeNumericEntity(); named
     * entities are looked up in the HTML 4.01 entity table. Unknown names
     * are returned unchanged.
     *
     * @param string[] $ent preg match array as produced by the named-entity pattern in fromHtml()
     * @return string|false
     */
    protected static function decodeAnyEntity($ent)
    {
        // create the named entity lookup table (entity => UTF-8 character) once
        static $table = null;
        if ($table === null) {
            // Request the table explicitly in UTF-8: its keys are then complete
            // (possibly multi-byte) characters that can be used as-is. Running
            // ord() over them would only see the first byte and corrupt every
            // character outside of ASCII.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        if (array_key_exists($ent[0], $table)) {
            return $table[$ent[0]];
        }

        // not a known entity, keep the original text
        return $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * The patterns in fromHtml() accept any alphanumeric run as "digits", so
     * the digits are validated here. Malformed entities such as &#xzz; or
     * &#12ab; are returned unchanged instead of being turned into a NUL or
     * another unrelated control character.
     *
     * @param string[] $ent preg match array: [0] whole match, [2] hex marker or '', [3] digits
     * @return string|false the decoded character, the unchanged match for malformed input,
     *                      or whatever Unicode::toUtf8() reports for the code point
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];

        if ($ent[2] === 'x' || $ent[2] === 'X') {
            if (!preg_match('/^[0-9A-Fa-f]+$/D', $digits)) {
                return $ent[0];
            }
            $cp = hexdec($digits);
        } else {
            if (!preg_match('/^[0-9]+$/D', $digits)) {
                return $ent[0];
            }
            $cp = intval($digits);
        }

        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. The fallback
     * packs the code points returned by Unicode::fromUtf8() itself; code
     * points above U+FFFF are written as surrogate pairs.
     *
     * @param string $str UTF-8 encoded string
     * @param bool $bom prepend the big endian byte order mark?
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF) {
                // pack('n') keeps the low 16 bits only, so supplementary
                // characters have to be split into a high and a low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as big endian 16-bit units. A high surrogate that is
     * directly followed by a low surrogate is combined into one code point;
     * any other unit (including unpaired surrogates) is handed to
     * Unicode::toUtf8() as it is.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $codepoints = [];
        $high = null; // pending high surrogate waiting for its partner

        foreach (unpack('n*', $str) as $unit) {
            if ($high !== null) {
                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    $codepoints[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high = null;
                    continue;
                }
                // no low surrogate followed: keep the lone unit
                $codepoints[] = $high;
                $high = null;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $high = $unit;
            } else {
                $codepoints[] = $unit;
            }
        }

        if ($high !== null) {
            $codepoints[] = $high;
        }

        return Unicode::toUtf8($codepoints);
    }

}
162