xref: /dokuwiki/inc/SafeFN.class.php (revision 70e083cec45ef4a738ae0b3f20af8e4f288a7dfa)
1*70e083ceSChristopher Smith<?php
2*70e083ceSChristopher Smith
/**
 * Class to safely store UTF-8 in a filename
 *
 * Encodes a UTF-8 string using only the characters 0-9 a-z / _ - . %
 *
 *  - "plain" characters (0-9, a-z, '/', '_', '-' and '.') are copied to the
 *    output unchanged.
 *  - a literal pre-indicator ('%') in the input is copied as a lone '%'.
 *  - every other character is "converted": it is written as the
 *    pre-indicator ('%') followed by a base36 number derived from its
 *    codepoint.
 *  - the transition from a converted character back to plain characters is
 *    marked with the post-indicator ('.').
 *
 * Example: "a&b" is stored as "a%5.b".
 *
 * All methods are static. The class always invoked its helpers through
 * self::, which requires static methods on PHP 8; calling the public methods
 * on an instance keeps working.
 *
 * @author   Christopher Smith
 * @date     2010-04-02
 */
class SafeFN {

    private static $plain = '/_-0123456789abcdefghijklmnopqrstuvwxyz'; // these characters aren't converted
    private static $pre_indicator = '%';                              // starts a converted character
    private static $post_indicator = '.';                             // this character can be included in "plain" set
    private static $adjustments = array();                            // must be initialized, use getAdjustments()

    /**
     * Convert a UTF-8 string to a safe ASCII string
     *
     * Conversion process, per codepoint:
     *    - if the codepoint is a plain character,
     *      - if the previous character was "converted", append the
     *        post_indicator to the output
     *      - append the ASCII byte for the character to the output
     *        (continue with the next character)
     *
     *    - reduce the codepoint value to fill the holes left by "plain"
     *    - choose the marker character by taking the modulus
     *      (number of possible pre_indicators) of the modified codepoint
     *    - calculate the value for conversion to base36 by integer division
     *      (number of possible pre_indicators) of the modified codepoint
     *    - convert that value to a base36 string
     *    - append the marker character followed by the base36 string to
     *      the output (continue with the next character)
     *
     * Codepoints below 0x20 are not representable; callers can check the
     * input with validate_printable_utf8() first.
     *
     * @param string $utf8  UTF-8 encoded input
     * @return string       encoded string
     */
    public static function encode($utf8) {
        return self::unicode_safe(self::utf8_unicode($utf8));
    }

    /**
     * Convert a safe ASCII string back to UTF-8
     *
     * Decoding process:
     *    - split the string into substrings at marker characters,
     *      discarding the post_indicator character but keeping
     *      pre_indicator characters (along with their following
     *      base36 string)
     *    - check the first character of each substring
     *      - if it is not a pre_indicator character, convert each
     *        character in the substring into its codepoint value
     *        and append to the output (continue with the next substring)
     *      - if it is a pre_indicator character, get its position in the
     *        pre_indicator string (order is important)
     *    - convert the remainder of the substring from base36 to an int
     *    - multiply that int by the number of pre_indicator characters
     *      and add the pre_indicator position
     *    - reverse the adjustment for the codepoint holes left by
     *      "plain" characters
     *    - append the resulting codepoint to the output (continue with
     *      the next substring)
     *
     * The input is lower-cased before decoding.
     *
     * @param string $safe  string produced by encode()
     * @return string       decoded UTF-8 string
     */
    public static function decode($safe) {
        return self::unicode_utf8(self::safe_unicode(strtolower($safe)));
    }

    /**
     * Check that a string contains no bytes in the range 0x01-0x1f
     *
     * NOTE(review): NUL (0x00) is not rejected here although adjustForPlain()
     * treats everything below 0x20 as non-printable - confirm whether that is
     * intentional.
     *
     * @param string $printable_utf8
     * @return bool  true when no byte in 0x01-0x1f was found
     */
    public static function validate_printable_utf8($printable_utf8) {
        return !preg_match('/[\x01-\x1f]/',$printable_utf8);
    }

    /**
     * Check that a string consists only of characters encode() can emit
     *
     * The character set is passed through preg_quote(): it contains the
     * regex delimiter '/' and a '-' that would otherwise be read as a range
     * operator, either of which makes the unquoted pattern invalid.
     *
     * @param string $safe
     * @return bool  true when every character is plain, '.' or '%'
     */
    public static function validate_safe($safe) {
        $allowed = preg_quote(self::$plain.self::$post_indicator.self::$pre_indicator, '/');
        return !preg_match('/[^'.$allowed.']/',$safe);
    }

    /**
     * Wrapper around the global utf8_to_unicode() helper
     *
     * @param string $utf8
     * @return array  whatever utf8_to_unicode() returns; used as a list of codepoints
     */
    private static function utf8_unicode($utf8) {
        return utf8_to_unicode($utf8);
    }

    /**
     * Wrapper around the global unicode_to_utf8() helper
     *
     * @param array $unicode  list of codepoints
     * @return string         whatever unicode_to_utf8() returns
     */
    private static function unicode_utf8($unicode) {
        return unicode_to_utf8($unicode);
    }

    /**
     * Build the safe string from a list of codepoints
     *
     * @param array $unicode  list of integer codepoints
     * @return string
     */
    private static function unicode_safe($unicode) {

        $safe = '';
        $converted = false;                       // was the previous character written in converted form?
        $markers = strlen(self::$pre_indicator);  // number of available pre_indicator characters

        foreach ($unicode as $codepoint) {
            if (self::isPlain($codepoint)) {
                if ($converted) {
                    // close the converted run so the decoder knows where the number ends
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } else if (self::isPreIndicator($codepoint)) {
                // a literal pre_indicator is stored as itself, without a number
                $converted = true;
                $safe .= chr($codepoint);

            } else {
                $converted = true;
                $adjusted = self::adjustForPlain($codepoint);

                $marker = $adjusted % $markers;
                $base = (int) ($adjusted / $markers);

                $safe .= self::$pre_indicator[$marker];
                $safe .= base_convert((string)$base,10,36);
            }
        }
        return $safe;
    }

    /**
     * Turn a safe string back into a list of codepoints
     *
     * @param string $safe  lower-cased safe string
     * @return array        list of integer codepoints
     */
    private static function safe_unicode($safe) {
        $unicode = array();

        // split in front of every indicator character; each piece then starts
        // with either a plain character, a post_indicator or a pre_indicator
        $indicators = preg_quote(self::$post_indicator.self::$pre_indicator, '/');
        $split = preg_split('/(?=['.$indicators.'])/',$safe,-1,PREG_SPLIT_NO_EMPTY);

        $converted = false;
        foreach ($split as $sub) {
            if (($marker = strpos(self::$pre_indicator,$sub[0])) === false) {
                if ($converted) {
                    // strip post_indicator
                    $sub = substr($sub,1);
                    $converted = false;
                }
                for ($i=0; $i < strlen($sub); $i++) {
                    $unicode[] = ord($sub[$i]);
                }
            } else if (strlen($sub)==1) {
                // a lone pre_indicator stands for itself
                $converted = true;
                $unicode[] = ord($sub);
            } else {
                // a single codepoint in our base
                $converted = true;
                $base = (int)base_convert(substr($sub,1),36,10);
                $adjusted = ($base*strlen(self::$pre_indicator)) + $marker;

                $unicode[] = self::reverseForPlain($adjusted);
            }
        }

        return $unicode;
    }

    /**
     * Is the codepoint copied to the output unconverted?
     *
     * @param int $codepoint
     * @return bool
     */
    private static function isPlain($codepoint) {
        return ($codepoint < 127 && (strpos(self::$plain.self::$post_indicator,chr($codepoint))!==false));
    }

    /**
     * Is the codepoint one of the pre_indicator characters?
     *
     * @param int $codepoint
     * @return bool
     */
    private static function isPreIndicator($codepoint) {
        return ($codepoint < 127 && (strpos(self::$pre_indicator,chr($codepoint)) !== false));
    }

    /**
     * Adjust for plain and non-printable (ascii 0-31) characters
     *
     * Reserved characters (plain, pre_indicator, post_indicator) are never
     * converted, so their codepoints are removed from the number range, and
     * the range is shifted so that SPACE (0x20) becomes 0.
     *
     * @param int $codepoint  a codepoint that is not a reserved character
     * @return int            adjusted value (negative for codepoints below 0x20)
     */
    private static function adjustForPlain($codepoint) {
        // count the reserved characters that sit below this codepoint
        $below = 0;
        foreach (self::getAdjustments() as $reserved) {
            if (ord($reserved) > $codepoint) {
                break;
            }
            $below++;
        }

        // close the holes, then subtract the number of non-printable characters
        return $codepoint - $below - ord(' ');
    }

    /**
     * Exact inverse of adjustForPlain()
     *
     * @param int $adjusted  value produced by adjustForPlain()
     * @return int           original codepoint
     */
    private static function reverseForPlain($adjusted) {
        // reverse adjustment for non-printable characters
        $codepoint = $adjusted + ord(' ');

        // Re-open the holes: walking the reserved characters in ascending
        // order, every reserved character at or below the running value
        // pushes it up by one. The comparison must include equality,
        // otherwise the value equal to the lowest reserved codepoint
        // (e.g. '&' right above '%') is decoded as the reserved character.
        foreach (self::getAdjustments() as $reserved) {
            if ($codepoint < ord($reserved)) {
                break;
            }
            $codepoint++;
        }

        return $codepoint;
    }

    /**
     * Lazily build the sorted list of reserved characters
     *
     * @return array  unique plain, pre_indicator and post_indicator
     *                characters in ascending byte order
     */
    private static function getAdjustments() {
        if (empty(self::$adjustments)) {
            // de-duplicate: the post_indicator may also be listed in $plain
            $reserved = array_unique(str_split(self::$plain.self::$pre_indicator.self::$post_indicator));
            sort($reserved, SORT_STRING);
            self::$adjustments = $reserved;
        }

        return self::$adjustments;
    }
}
211