xref: /dokuwiki/inc/SafeFN.class.php (revision f03fd957525a714da1cde7e2957939046bd51bd5)
1<?php
2
/**
 *  Class to safely store UTF-8 in a filename
 *
 *  Encodes a UTF-8 string using only the characters 0-9 a-z _ - / . %
 *
 *  Characters listed in $plain (and the post indicator '.') are kept
 *  as they are; these are the "plain" characters. Every other
 *  character is "converted": it is written as a substring that starts
 *  with '%' followed by a base36 number.
 *  The transition from a converted substring back to plain characters
 *  is marked with a '.'
 *
 *  All methods are static: the class keeps no per-instance state and
 *  every internal call goes through self::, so both SafeFN::encode($x)
 *  and $obj->encode($x) work.
 *
 *  @author   Christopher Smith
 *  @date     2010-04-02
 */
class SafeFN {

    // characters that are copied to the output unchanged
    private static $plain = '/_-0123456789abcdefghijklmnopqrstuvwxyz';
    // every character of this string may introduce a converted codepoint
    private static $pre_indicator = '%';
    // ends a converted run; it is also treated as a plain character (see isPlain())
    private static $post_indicator = '.';
    // sorted list of all reserved characters; lazily filled, always read via getAdjustments()
    private static $adjustments = array();

    /**
     * Convert a UTF-8 string to a safe ASCII string
     *
     *  conversion process
     *    - if codepoint is a plain character,
     *      - if previous character was "converted", append post_indicator
     *        to output
     *      - append ascii byte for character to output (continue to
     *        next character)
     *
     *    - reduce codepoint value to fill the holes left by "plain"
     *    - choose marker character for conversion by taking modulus
     *      (number of possible pre_indicators) of modified codepoint
     *    - calculate value for conversion to base36 by integer division
     *      (number of possible pre_indicators) of modified codepoint
     *    - convert above value to a base36 string
     *    - append marker character followed by base36 string to
     *      output (continue to next character)
     *
     * @param string $utf8 the UTF-8 string to encode
     * @return string the encoded string
     */
    public static function encode($utf8) {
        return self::unicode_safe(self::utf8_unicode($utf8));
    }

    /**
     * Convert a safe ASCII string back to UTF-8
     *
     *  decoding process
     *    - split the string into substrings at marker characters,
     *      discarding post_indicator character but keeping
     *      pre_indicator characters (along with their following
     *      base36 string)
     *    - check the first character of the substring
     *      - if it is not a pre_indicator character, convert each
     *        character in the substring into its codepoint value
     *        and append to output (continue to next substring)
     *      - if it is a pre_indicator character, get its position in the
     *        pre_indicator string (order is important)
     *    - convert the remainder of the string from base36 to base10
     *      and then to an (int).
     *    - multiply the converted int by the number of pre_indicator
     *      characters and add the pre_indicator position
     *    - reverse the conversion adjustment for codepoint holes left by
     *      "plain" characters
     *    - append resulting codepoint value to output (continue to next
     *      substring)
     *
     * The input is lower-cased before decoding.
     *
     * @param string $safe a string produced by encode()
     * @return string the decoded UTF-8 string
     */
    public static function decode($safe) {
        return self::unicode_utf8(self::safe_unicode(strtolower($safe)));
    }

    /**
     * Check that a string contains no bytes in the range 0x01-0x1f
     *
     * @param string $printable_utf8 the string to check
     * @return bool true when no such byte is present
     */
    public static function validate_printable_utf8($printable_utf8) {
        return !preg_match('/[\x01-\x1f]/',$printable_utf8);
    }

    /**
     * Check that a string consists only of characters encode() can emit
     *
     * The allowed characters are quoted with preg_quote(): $plain
     * contains the regex delimiter '/' and a '-' that would otherwise be
     * read as a character-class range, which made the unquoted pattern
     * fail to compile.
     *
     * @param string $safe the string to check
     * @return bool true when every character is plain or an indicator
     */
    public static function validate_safe($safe) {
        $allowed = preg_quote(self::$plain.self::$post_indicator.self::$pre_indicator, '/');
        return !preg_match('/[^'.$allowed.']/',$safe);
    }

    /**
     * Wrapper around the global utf8_to_unicode() helper, which is
     * defined outside this file
     *
     * @param string $utf8
     * @return array list of codepoints (as consumed by unicode_safe())
     */
    private static function utf8_unicode($utf8) {
        return utf8_to_unicode($utf8);
    }

    /**
     * Wrapper around the global unicode_to_utf8() helper, which is
     * defined outside this file
     *
     * @param array $unicode list of codepoints
     * @return string
     */
    private static function unicode_utf8($unicode) {
        return unicode_to_utf8($unicode);
    }

    /**
     * Turn a list of codepoints into the safe ASCII representation
     *
     * @param array $unicode list of integer codepoints
     * @return string
     */
    private static function unicode_safe($unicode) {

        $safe = '';
        $converted = false;   // true while the previous output was a converted codepoint
        $markers = strlen(self::$pre_indicator);

        foreach ($unicode as $codepoint) {
            if (self::isPlain($codepoint)) {
                if ($converted) {
                    // close the converted run so the plain characters are
                    // not read as part of the base36 number
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } else if (self::isPreIndicator($codepoint)) {
                // a literal pre_indicator is written bare (no base36 digits)
                $converted = true;
                $safe .= chr($codepoint);

            } else {
                $converted = true;
                $adjusted = self::adjustForPlain($codepoint);

                $marker = $adjusted % $markers;
                $base = (int) ($adjusted / $markers);

                $safe .= self::$pre_indicator[$marker];
                $safe .= base_convert((string)$base,10,36);
            }
        }
        return $safe;
    }

    /**
     * Turn a safe ASCII string back into a list of codepoints
     *
     * @param string $safe lower-case safe string
     * @return array list of integer codepoints
     */
    private static function safe_unicode($safe) {
        $unicode = array();
        $markers = strlen(self::$pre_indicator);

        // split in front of every indicator character; the indicator stays
        // at the start of the substring it introduces
        $indicators = preg_quote(self::$post_indicator.self::$pre_indicator, '/');
        $split = preg_split('/(?=['.$indicators.'])/',$safe,-1,PREG_SPLIT_NO_EMPTY);

        $converted = false;
        foreach ($split as $sub) {
            if (($marker = strpos(self::$pre_indicator,$sub[0])) === false) {
                if ($converted) {
                    // strip post_indicator
                    $sub = substr($sub,1);
                    $converted = false;
                }
                $length = strlen($sub);
                for ($i=0; $i < $length; $i++) {
                    $unicode[] = ord($sub[$i]);
                }
            } else if (strlen($sub)==1) {
                // a bare pre_indicator stands for itself
                $converted = true;
                $unicode[] = ord($sub);
            } else {
                // a single codepoint in our base
                $converted = true;
                $base = (int)base_convert(substr($sub,1),36,10);
                $adjusted = ($base*$markers) + $marker;

                $unicode[] = self::reverseForPlain($adjusted);
            }
        }

        return $unicode;
    }

    /**
     * Is the codepoint copied to the output unchanged?
     *
     * @param int $codepoint
     * @return bool true for characters of $plain and for the post indicator
     */
    private static function isPlain($codepoint) {
        return ($codepoint < 127 && (strpos(self::$plain.self::$post_indicator,chr($codepoint))!==false));
    }

    /**
     * Is the codepoint one of the pre indicator characters?
     *
     * @param int $codepoint
     * @return bool
     */
    private static function isPreIndicator($codepoint) {
        return ($codepoint < 127 && (strpos(self::$pre_indicator,chr($codepoint)) !== false));
    }

    /**
     * adjust for plain and non-printable (ascii 0-31)
     * this makes SPACE (0x20) the first character we allow
     *
     * The codepoint is lowered by the number of reserved characters
     * (plain and indicators) that sort below it, then by ord(' ').
     * Codepoints below SPACE give a negative result; presumably callers
     * are expected to reject those beforehand (validate_printable_utf8()
     * exists for that) - confirm against the callers.
     *
     * @param int $codepoint a codepoint that is neither plain nor a pre indicator
     * @return int the adjusted value
     */
    private static function adjustForPlain($codepoint) {
        $adjustment = self::getAdjustments();
        $total = count($adjustment);

        // codepoint is higher than that of the plain character with the highest codepoint
        if ($codepoint > ord($adjustment[$total-1])) {
            $adjusted = $codepoint - $total;
        } else if ($codepoint > ord($adjustment[0])) {
            // $i ends up as the number of reserved characters below $codepoint
            for ($i=1; $i < $total; $i++) {
                if ($codepoint < ord($adjustment[$i])) {
                    break;
                }
            }
            $adjusted = $codepoint - $i;
        } else {
            $adjusted = $codepoint;
        }

        // subtract number of non-printable characters and return
        return $adjusted - ord(' ');
    }

    /**
     * Inverse of adjustForPlain()
     *
     * @param int $adjusted a value produced by adjustForPlain()
     * @return int the original codepoint
     */
    private static function reverseForPlain($adjusted) {
        $adjustment = self::getAdjustments();
        $total = count($adjustment);

        // reverse adjustment for non-printable characters
        $adjusted += ord(' ');

        if ($adjusted + $total > ord($adjustment[$total-1])) {
            $adjusted += $total;
        } else if ($adjusted >= ord($adjustment[0])) {
            // ">=" rather than ">": the first codepoint above the lowest
            // reserved character is shifted down onto that character's
            // value, so equality already means "shifted"
            for ($i=1; $i < $total; $i++) {
                if ($adjusted + $i < ord($adjustment[$i])) {
                    break;
                }
            }
            $adjusted += $i;
        }

        return $adjusted;
    }

    /**
     * Get the sorted list of reserved characters (plain characters and
     * both indicators), building it on first use
     *
     * @return array list of single-character strings in ascending order
     */
    private static function getAdjustments() {
        if (empty(self::$adjustments)) {
            self::$adjustments = str_split(self::$plain.self::$pre_indicator.self::$post_indicator);
            sort(self::$adjustments);
        }

        return self::$adjustments;
    }
}
211