xref: /dokuwiki/inc/Utf8/Conversion.php (revision 093fe67e98c0cdb4b73fd46938e49b64971483c2)
<?php

namespace dokuwiki\Utf8;

/**
 * Methods to convert from and to UTF-8 strings
 */
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are copied verbatim unless $all is set. Encoded
     * code points below 0x100 become decimal entities (e.g. &#233;), all
     * higher code points become hexadecimal entities (e.g. &#x20ac;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @param bool $all Also encode the characters below 0x80 (plain ASCII) as entities
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80 && !$all) {
                // plain ASCII is passed through untouched
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass true and named entities, including &amp; &lt; etc.
     * are handled as well. Everything is decoded in a single pass, which
     * avoids the problem that would occur if you had to decode
     * "&amp;#38;&#38;amp;#38;" in two separate steps:
     *
     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp;#38;"
     * what it should be                                           -> "&#38;&amp;#38;"
     *
     * Entities that cannot be decoded (unknown names, malformed numbers) are
     * left in the string as they are.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string $str UTF-8 encoded string
     * @param  bool $entities decode named entities in addition to numeric ones
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // groups: 1 = whole entity, 2 = hex marker, 3 = digits
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                self::decodeNumericEntity(...),
                $str
            );
        }

        // groups: 1 = '#' for numeric entities, 2 = hex marker, 3 = digits or name
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            self::decodeAnyEntity(...),
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Callback for the second pattern in fromHtml(). Numeric entities are
     * forwarded to decodeNumericEntity(), named ones are looked up in PHP's
     * HTML 4.01 entity table. Unknown names are returned unchanged.
     *
     * @param string[] $ent Regex match: [0] full entity, [1] '#' when numeric,
     *                      [2] 'x'/'X' hex marker or '', [3] digits or name
     * @return string|false
     */
    protected static function decodeAnyEntity($ent)
    {
        // create the named entity lookup table (entity => UTF-8 character)
        static $table = null;
        if ($table === null) {
            // The table is requested in UTF-8, so the characters can be used
            // as they are. Re-encoding only their first byte (as was needed
            // for the old single byte ISO-8859-1 table) corrupts every
            // non-ASCII character, e.g. &eacute; would become "Ã".
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        return $table[$ent[0]] ?? $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * The patterns in fromHtml() accept any alphanumeric sequence after the
     * '#'. Sequences that are not a valid decimal or hexadecimal number are
     * returned unchanged instead of being decoded to an unrelated character.
     *
     * @param string[] $ent Regex match: [0] full entity, [2] 'x'/'X' hex marker
     *                      or '', [3] the digits
     * @return string|false whatever Unicode::toUtf8() returns for the code point
     */
    protected static function decodeNumericEntity($ent)
    {
        $isHex = $ent[2] === 'X' || $ent[2] === 'x';
        $valid = $isHex ? '/^[0-9A-Fa-f]+$/D' : '/^[0-9]+$/D';
        if (preg_match($valid, $ent[3]) !== 1) {
            // e.g. "&#foo;" or "&#x1G;" - not a number, so nothing to decode
            return $ent[0];
        }

        $cp = $isHex ? hexdec($ent[3]) : (int) $ent[3];
        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mbstring when available. The fallback encodes code points above
     * U+FFFF as surrogate pairs.
     *
     * @param string $str UTF-8 encoded string
     * @param bool $bom Prepend the big endian byte order mark (FE FF)
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF && $cp <= 0x10FFFF) {
                // does not fit into one 16 bit unit: split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('n2', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as a sequence of big endian 16 bit units. Valid
     * surrogate pairs are combined into a single code point before the
     * conversion; unpaired surrogates are handed to Unicode::toUtf8() as they
     * are. A leading byte order mark is not treated specially.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if (!is_array($units)) {
            // nothing usable was unpacked, let toUtf8() deal with it as before
            return Unicode::toUtf8($units);
        }

        $codepoints = [];
        $high = null; // pending high surrogate waiting for its low partner
        foreach ($units as $unit) {
            if ($high !== null) {
                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    $codepoints[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high = null;
                    continue;
                }
                // the high surrogate was not followed by a low one
                $codepoints[] = $high;
                $high = null;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $high = $unit;
                continue;
            }
            $codepoints[] = $unit;
        }
        if ($high !== null) {
            $codepoints[] = $high;
        }

        return Unicode::toUtf8($codepoints);
    }

    /**
     * Converts a string from ISO-8859-1 to UTF-8
     *
     * This is a replacement for the deprecated utf8_encode function. The first
     * available mechanism is used: mbstring, iconv, UConverter, utf8_encode and
     * finally a pure PHP implementation.
     *
     * @param string $string ISO-8859-1 encoded string
     * @return string
     * @author <p@tchwork.com> Nicolas Grekas - pure PHP implementation
     * @link https://github.com/tchwork/utf8/blob/master/src/Patchwork/PHP/Shim/Xml.php
     */
    public static function fromLatin1($string)
    {
        if (UTF8_MBSTRING) {
            return mb_convert_encoding($string, 'UTF-8', 'ISO-8859-1');
        }
        if (function_exists('iconv')) {
            return iconv('ISO-8859-1', 'UTF-8', $string);
        }
        if (class_exists('UConverter')) {
            return \UConverter::transcode($string, 'UTF8', 'ISO-8859-1');
        }
        if (function_exists('utf8_encode')) {
            // deprecated
            return utf8_encode($string);
        }

        // fallback to pure PHP
        // The string is doubled: the second half is the source that is read
        // ($i), the first half is overwritten with the result ($j). A source
        // byte yields at most two bytes, so writing never overtakes reading.
        $string .= $string;
        $len = strlen($string);
        for ($i = $len >> 1, $j = 0; $i < $len; ++$i, ++$j) {
            switch (true) {
                case $string[$i] < "\x80":
                    // ASCII stays a single byte
                    $string[$j] = $string[$i];
                    break;
                case $string[$i] < "\xC0":
                    // 0x80-0xBF: lead byte C2, trail byte is the value itself
                    $string[$j] = "\xC2";
                    $string[++$j] = $string[$i];
                    break;
                default:
                    // 0xC0-0xFF: lead byte C3, trail byte is the value minus 0x40
                    $string[$j] = "\xC3";
                    $string[++$j] = chr(ord($string[$i]) - 64);
                    break;
            }
        }
        return substr($string, 0, $j);
    }
}
201