xref: /dokuwiki/inc/Utf8/Conversion.php (revision 093fe67e98c0cdb4b73fd46938e49b64971483c2)
1<?php
2
3namespace dokuwiki\Utf8;
4
5/**
6 * Methods to convert from and to UTF-8 strings
7 */
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * By default plain ASCII characters (below 0x80) are copied through
     * unchanged and everything else is written as a numeric entity:
     * decimal (&#NNN;) for code points below 0x100, hexadecimal (&#xHHH;)
     * for everything above.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @param bool $all Encode ASCII characters as (decimal) entities as well
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80 && !$all) {
                // plain ASCII is passed through unless everything is to be encoded
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Converts any &#..; entity to its code point. By default only numeric
     * entities (decimal and hexadecimal) are decoded. Pass true for $entities
     * and named entities, including &amp; &lt; etc., are handled as well.
     *
     * Everything is decoded in a single pass, which avoids the problem that
     * would occur if you had to decode "&amp;#38;&#38;amp;#38;" in two steps:
     *
     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * Entities that cannot be decoded (unknown names, malformed numbers) are
     * left in the string unchanged.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string $str UTF-8 encoded string
     * @param  boolean $entities decode named entities in addition to numeric ones
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        // Both patterns are built so that the callbacks find the hex marker
        // in group 2 and the digits/name in group 3.
        if (!$entities) {
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                self::decodeNumericEntity(...),
                $str
            );
        }

        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            self::decodeAnyEntity(...),
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Numeric entities are delegated to decodeNumericEntity(), named ones
     * are looked up in PHP's HTML entity translation table. Unknown named
     * entities are returned unchanged.
     *
     * @param string[] $ent Regex match as produced by fromHtml():
     *                      [0] full entity, [1] '#' for numeric entities,
     *                      [2] hex marker, [3] digits or entity name
     * @return string
     */
    protected static function decodeAnyEntity($ent)
    {
        // create the named entity lookup table (entity => UTF-8 character)
        static $table = null;
        if ($table === null) {
            // The translation table already contains UTF-8 encoded characters,
            // so it only needs to be flipped. Re-encoding the first byte of each
            // character would corrupt everything outside of ASCII.
            $table = array_flip(get_html_translation_table(HTML_ENTITIES, encoding: 'UTF-8'));
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        return $table[$ent[0]] ?? $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Malformed entities (e.g. &#abc; or &#xzz;) are returned unchanged
     * instead of being turned into a NUL byte.
     *
     * @param string[] $ent Regex match as produced by fromHtml():
     *                      [0] full entity, [2] hex marker ('x', 'X' or ''),
     *                      [3] the digits
     * @return string|false
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];
        $isHex = ($ent[2] === 'X' || $ent[2] === 'x');

        // the matching regex accepts any alphanumerics, so validate the digits here
        $valid = $isHex ? '/^[0-9A-Fa-f]+$/D' : '/^[0-9]+$/D';
        if (preg_match($valid, $digits) !== 1) {
            return $ent[0];
        }

        $cp = $isHex ? hexdec($digits) : (int) $digits;
        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mbstring when available. The pure PHP fallback encodes code points
     * above U+FFFF as surrogate pairs.
     *
     * @param string $str UTF-8 encoded string
     * @param bool $bom Prepend the UTF-16BE byte order mark
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        $uni = Unicode::fromUtf8($str);
        foreach ($uni as $cp) {
            if ($cp > 0xFFFF) {
                // does not fit in one 16 bit unit: split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('n2', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * Valid surrogate pairs are combined into a single code point before
     * encoding. Unpaired surrogates are handed on as they are.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if ($units === false) {
            return false;
        }

        $uni = [];
        $high = null; // pending high surrogate waiting for its low counterpart
        foreach ($units as $unit) {
            if ($high !== null) {
                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high = null;
                    continue;
                }
                // not followed by a low surrogate, keep it as is
                $uni[] = $high;
                $high = null;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $high = $unit;
                continue;
            }
            $uni[] = $unit;
        }
        if ($high !== null) {
            $uni[] = $high;
        }

        return Unicode::toUtf8($uni);
    }

    /**
     * Converts a string from ISO-8859-1 to UTF-8
     *
     * This is a replacement for the deprecated utf8_encode function. The
     * first available of mbstring, iconv, UConverter and utf8_encode is used
     * before falling back to a pure PHP implementation.
     *
     * @param string $string ISO-8859-1 encoded string
     * @return string
     * @author <p@tchwork.com> Nicolas Grekas - pure PHP implementation
     * @link https://github.com/tchwork/utf8/blob/master/src/Patchwork/PHP/Shim/Xml.php
     */
    public static function fromLatin1($string)
    {
        if (UTF8_MBSTRING) {
            return mb_convert_encoding($string, 'UTF-8', 'ISO-8859-1');
        }
        if (function_exists('iconv')) {
            return iconv('ISO-8859-1', 'UTF-8', $string);
        }
        if (class_exists('UConverter')) {
            return \UConverter::transcode($string, 'UTF8', 'ISO-8859-1');
        }
        if (function_exists('utf8_encode')) {
            // deprecated
            return utf8_encode($string);
        }

        // fallback to pure PHP
        // The string is doubled: the appended copy ($i) is the source being read,
        // while the front of the buffer ($j) is overwritten with the UTF-8 output.
        $string .= $string;
        $len = strlen($string);
        for ($i = $len >> 1, $j = 0; $i < $len; ++$i, ++$j) {
            switch (true) {
                case $string[$i] < "\x80":
                    // ASCII: one byte, copied as is
                    $string[$j] = $string[$i];
                    break;
                case $string[$i] < "\xC0":
                    // 0x80-0xBF: lead byte 0xC2, trail byte unchanged
                    $string[$j] = "\xC2";
                    $string[++$j] = $string[$i];
                    break;
                default:
                    // 0xC0-0xFF: lead byte 0xC3, trail byte shifted down by 64
                    $string[$j] = "\xC3";
                    $string[++$j] = chr(ord($string[$i]) - 64);
                    break;
            }
        }
        return substr($string, 0, $j);
    }
}
201