xref: /dokuwiki/inc/Utf8/Conversion.php (revision c0c77cd20b23921c9e893bb70b99f38be153875a)
1f41bbe4cSAndreas Gohr<?php
2f41bbe4cSAndreas Gohr
3f41bbe4cSAndreas Gohrnamespace dokuwiki\Utf8;
4f41bbe4cSAndreas Gohr
/**
 * Methods to convert from and to UTF-8 strings
 *
 * All methods are static. Code point level work is delegated to the sibling
 * Unicode class (Unicode::fromUtf8() / Unicode::toUtf8()).
 */
class Conversion
{

    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x100 are written as decimal entities (&#252;), all
     * higher code points as hexadecimal entities (&#x4e01;). ASCII characters
     * are passed through unchanged unless $all is set.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @param bool $all Also encode ASCII characters (below 0x80) as numeric entities
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80 && !$all) {
                // plain ASCII is kept literally
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
     * are handled as well. Avoids the problem that would occur if you
     * had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * Entities that cannot be decoded (unknown names, malformed numbers) are
     * left in the string unchanged.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string $str UTF-8 encoded string
     * @param  boolean $entities decode named entities in addition to numeric ones
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        // Both patterns expose the hex marker as group 2 and the digits as
        // group 3, which is the layout decodeNumericEntity() relies on.
        if (!$entities) {
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [__CLASS__, 'decodeNumericEntity'],
                $str
            );
        }

        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [__CLASS__, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Callback for fromHtml(). Numeric entities are forwarded to
     * decodeNumericEntity(); named entities are looked up in PHP's HTML 4.01
     * entity table. Unknown names are returned unchanged.
     *
     * @param string[] $ent Regex match: [0] full entity, [1] '#' for numeric
     *                      entities, [2] hex marker, [3] digits or name
     * @return string|false
     */
    protected static function decodeAnyEntity($ent)
    {
        // Named entity lookup table (entity => UTF-8 character), built once.
        // The table is requested in UTF-8 explicitly, so its characters can be
        // used as-is. Running ord() on them would only look at the first byte
        // of a multi-byte character and yield the wrong code point.
        static $table = null;
        if ($table === null) {
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        if (array_key_exists($ent[0], $table)) {
            return $table[$ent[0]];
        }

        return $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Callback for fromHtml(). The patterns used there accept any alphanumeric
     * run after "&#", so the digits are validated here; a malformed entity such
     * as "&#abc;" or "&#xzz;" is returned unchanged instead of being decoded
     * to a NUL character.
     *
     * @param string[] $ent Regex match: [0] full entity, [2] 'x'/'X' for
     *                      hexadecimal entities, [3] the digits
     * @return string|false
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];

        if ($ent[2] === 'x' || $ent[2] === 'X') {
            if (strspn($digits, '0123456789abcdefABCDEF') !== strlen($digits)) {
                return $ent[0];
            }
            $cp = hexdec($digits);
        } else {
            if (strspn($digits, '0123456789') !== strlen($digits)) {
                return $ent[0];
            }
            $cp = intval($digits);
        }

        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the
     * string is converted code point by code point; code points above U+FFFF
     * are written as surrogate pairs.
     *
     * @param string $str UTF-8 encoded string
     * @param bool $bom Prepend the big endian byte order mark (FE FF)
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF) {
                // supplementary plane: split into high and low surrogate,
                // pack('n') alone would silently truncate to 16 bit
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as a sequence of big endian 16 bit units. A high
     * surrogate directly followed by a low surrogate is combined into a single
     * code point; all other units are passed on to Unicode::toUtf8() as they
     * are.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        // unpack() keys start at 1 and it may return false on failure
        $units = is_array($units) ? array_values($units) : [];
        $count = count($units);

        $codepoints = [];
        for ($i = 0; $i < $count; $i++) {
            $unit = $units[$i];
            if (
                $unit >= 0xD800 && $unit <= 0xDBFF &&
                $i + 1 < $count &&
                $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF
            ) {
                $codepoints[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++; // the low surrogate has been consumed
            } else {
                $codepoints[] = $unit;
            }
        }

        return Unicode::toUtf8($codepoints);
    }

}
163