xref: /dokuwiki/inc/Utf8/Conversion.php (revision a19c9aa0217112e3ab7ebc160354c7e9fbabe8eb)
1<?php
2
3namespace dokuwiki\Utf8;
4
5/**
6 * Methods to convert from and to UTF-8 strings
7 */
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are copied verbatim unless $all is set. Code points
     * below 0x100 that are encoded use a decimal entity (&#233;), everything
     * above uses a hexadecimal entity (&#x20ac;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @param bool $all Encode ASCII characters (code points below 0x80) as entities as well
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80 && !$all) {
                // plain ASCII is passed through unless the caller asked for everything
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
     * are handled as well. Avoids the problem that would occur if you
     * had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * Entities that cannot be decoded (unknown names, malformed or out of range
     * numbers) are left in the string unchanged.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string $str UTF-8 encoded string
     * @param  boolean $entities decode named entities in addition to numeric ones
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // groups: 1 = whole entity, 2 = optional hex marker, 3 = digits
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [self::class, 'decodeNumericEntity'],
                $str
            );
        }

        // groups: 1 = optional '#', 2 = optional hex marker, 3 = digits or name
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [self::class, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Numeric entities are delegated to decodeNumericEntity(), named entities
     * are looked up in PHP's HTML 4.01 entity table. Unknown names are returned
     * unchanged.
     *
     * @param string[] $ent Match array as produced by the entity pattern in fromHtml()
     * @return string|false
     */
    protected static function decodeAnyEntity($ent)
    {
        // create the named entity lookup table (entity => UTF-8 character) once
        static $table = null;
        if ($table === null) {
            // Request the table explicitly as UTF-8: its characters are then complete
            // (possibly multi-byte) UTF-8 sequences that can be returned as they are.
            // Inspecting them byte-wise, e.g. with ord(), would only see the lead byte.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        return $table[$ent[0]] ?? $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * The entity patterns in fromHtml() accept any alphanumeric run after "&#",
     * so the digits are validated here. An entity whose digits do not fit its
     * notation (e.g. "&#foo;", "&#xZZ;") or whose value is above U+10FFFF is
     * returned unchanged instead of being turned into a bogus character.
     *
     * @param string[] $ent Match array: [0] whole entity, [2] 'x'/'X' or '', [3] digits
     * @return string|false
     */
    protected static function decodeNumericEntity($ent)
    {
        $isHex = ($ent[2] === 'x' || $ent[2] === 'X');
        $digits = $ent[3];

        // strspn() counts the leading run of allowed characters; the digits are
        // only valid when that run covers the whole string
        $allowed = $isHex ? '0123456789abcdefABCDEF' : '0123456789';
        if (strspn($digits, $allowed) !== strlen($digits)) {
            return $ent[0];
        }

        // hexdec() yields a float and (int) saturates for oversized input, both of
        // which are caught by the range check below
        $cp = $isHex ? hexdec($digits) : (int) $digits;
        if ($cp > 0x10FFFF) {
            return $ent[0];
        }

        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mbstring when UTF8_MBSTRING is set. Otherwise the string is converted
     * code point by code point; code points above U+FFFF are written as a
     * surrogate pair.
     *
     * @param string $str UTF-8 encoded string
     * @param bool $bom Prepend the big endian byte order mark (FE FF)
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF) {
                // does not fit into one 16 bit unit: split the 20 bit offset
                // into a high and a low surrogate
                $offset = $cp - 0x10000;
                $out .= pack('nn', 0xD800 | ($offset >> 10), 0xDC00 | ($offset & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as big endian 16 bit units. A high surrogate that is
     * directly followed by a low surrogate is combined into one code point,
     * all other units are handed to Unicode::toUtf8() as they are.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $unpacked = unpack('n*', $str);
        if ($unpacked === false) {
            return false;
        }

        // unpack() numbers its result from 1, reindex for simple look-ahead
        $units = array_values($unpacked);
        $count = count($units);

        $codepoints = [];
        for ($i = 0; $i < $count; $i++) {
            $unit = $units[$i];
            $isHigh = ($unit >= 0xD800 && $unit <= 0xDBFF);
            if ($isHigh && $i + 1 < $count && $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF) {
                // valid surrogate pair: consume the low surrogate as well
                $low = $units[++$i];
                $codepoints[] = 0x10000 + (($unit - 0xD800) << 10) + ($low - 0xDC00);
            } else {
                $codepoints[] = $unit;
            }
        }

        return Unicode::toUtf8($codepoints);
    }
}
160