xref: /dokuwiki/inc/Utf8/Conversion.php (revision 8cbc5ee84fe788597ede5266255a74af6da47555)
1<?php
2
3namespace dokuwiki\Utf8;
4
5/**
6 * Methods to convert from and to UTF-8 strings
7 */
class Conversion
{

    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII characters (code points below 0x80) are copied through unchanged,
     * which means that characters such as & < > are NOT escaped by this method.
     * Code points below 0x100 become decimal references (&#233;), all others
     * become hexadecimal references (&#x20ac;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @return string the input with every non-ASCII character replaced by a numeric entity
     */
    public static function toHtml($str)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80) {
                // plain ASCII is emitted verbatim
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Converts any &#..; entity to the character it references. By default only
     * numeric entities are decoded. Pass a true value as $entities and named
     * entities, including &amp; &lt; etc., are handled as well.
     *
     * Doing both in a single pass avoids the problem that would occur if you
     * had to decode "&amp;#38;&#38;amp;#38;" in two steps:
     *
     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * Entities that cannot be decoded (unknown names, malformed numbers) are
     * left in the string untouched.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string $str UTF-8 encoded string
     * @param  bool $entities decode named entities in addition to numeric ones
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // capture groups: 1 = whole entity, 2 = optional hex marker, 3 = digits
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [__CLASS__, 'decodeNumericEntity'],
                $str
            );
        }

        // capture groups: 1 = optional '#', 2 = optional hex marker, 3 = digits or name
        // (groups 2 and 3 line up with the numeric-only pattern above so that
        // both callbacks can share decodeNumericEntity())
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [__CLASS__, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Callback for preg_replace_callback() as used in fromHtml().
     *
     * @param string[] $ent regex match: [0] whole entity, [1] '#' for numeric entities,
     *                      [2] optional hex marker, [3] digits or entity name
     * @return string the decoded character or the unchanged entity when it is unknown
     */
    protected static function decodeAnyEntity($ent)
    {
        // create the named entity lookup table (entity => UTF-8 character) once
        static $table = null;
        if ($table === null) {
            // Request UTF-8 explicitly: the characters in the table are then already
            // correctly encoded multi-byte strings. Running them through ord() would
            // only look at their first byte and produce the wrong character for
            // everything outside of ASCII (e.g. &nbsp; or &euro;).
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        if (array_key_exists($ent[0], $table)) {
            return $table[$ent[0]];
        }

        // unknown named entity: keep it as it is
        return $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Callback for preg_replace_callback() as used in fromHtml(). The patterns
     * used there accept any alphanumeric string as "number", so the digits are
     * validated here. Malformed or out of range entities are returned unchanged
     * instead of being turned into a NUL byte or being dropped from the text.
     *
     * @param string[] $ent regex match: [0] whole entity, [2] 'x' or 'X' for
     *                      hexadecimal entities, [3] the digits
     * @return string the decoded character or the unchanged entity
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];

        if ($ent[2] === 'x' || $ent[2] === 'X') {
            // hexdec() silently skips invalid characters (and is deprecated to do so)
            if (!preg_match('/^[0-9A-Fa-f]+$/D', $digits)) {
                return $ent[0];
            }
            $cp = hexdec($digits);
        } else {
            // intval() would turn "abc" into 0 and "12abc" into 12
            if (!preg_match('/^[0-9]+$/D', $digits)) {
                return $ent[0];
            }
            $cp = intval($digits);
        }

        // beyond the Unicode range (also catches hexdec() overflowing into a float)
        if ($cp > 0x10FFFF) {
            return $ent[0];
        }

        $char = Unicode::toUtf8([(int) $cp]);

        // a false result would be cast to an empty string by preg_replace_callback()
        // and silently delete the entity, so keep the original text in that case
        return ($char === false) ? $ent[0] : $char;
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the
     * string is converted code point by code point, code points above U+FFFF
     * are written as surrogate pairs.
     *
     * @param string $str UTF-8 encoded string
     * @param bool $bom prepend the big endian byte order mark (FE FF)
     * @return string UTF-16BE encoded string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF) {
                // pack('n') keeps the lower 16 bits only, so characters outside
                // of the BMP have to be split into a high and a low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The string is read as a sequence of 16 bit big endian code units. A high
     * surrogate directly followed by a low surrogate is combined into a single
     * code point, everything else (including unpaired surrogates) is handed to
     * Unicode::toUtf8() as it is.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string UTF-8 encoded string, whatever Unicode::toUtf8() returns
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if (!is_array($units)) {
            // nothing could be unpacked, let toUtf8() handle it as before
            return Unicode::toUtf8($units);
        }

        // unpack() returns a 1-based array, reindex for the look-ahead below
        $units = array_values($units);
        $count = count($units);

        $codepoints = [];
        for ($i = 0; $i < $count; $i++) {
            $unit = $units[$i];

            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 1 < $count) {
                $next = $units[$i + 1];
                if ($next >= 0xDC00 && $next <= 0xDFFF) {
                    // valid surrogate pair: merge both units into one code point
                    $codepoints[] = 0x10000 + (($unit - 0xD800) << 10) + ($next - 0xDC00);
                    $i++;
                    continue;
                }
            }

            $codepoints[] = $unit;
        }

        return Unicode::toUtf8($codepoints);
    }

}
162