<?php

/**
 * Class to safely store UTF-8 in a Filename
 *
 * Encodes a UTF-8 string using only the "plain" characters listed in
 * self::$plain (0-9, a-z, '_', '-', '/') plus the two indicator characters
 * '%' (pre_indicator) and '.' (post_indicator).
 *
 * - Plain characters in the original string are copied through unchanged.
 * - Every other character is "converted": it is written as '%' followed by
 *   a base36 number derived from its codepoint.
 * - The transition from a converted substring back to plain characters is
 *   marked with a '.'.
 *
 * All methods are static; the class keeps no per-instance state.
 *
 * @author Christopher Smith
 * @date   2010-04-02
 */
class SafeFN {

    // these characters aren't converted
    private static $plain = '/_-0123456789abcdefghijklmnopqrstuvwxyz';
    // starts a converted codepoint
    private static $pre_indicator = '%';
    // marks the switch from converted back to plain; it is also treated as a plain character
    private static $post_indicator = '.';
    // sorted list of all reserved characters; lazily built, use getAdjustments()
    private static $adjustments = array();

    /**
     * Convert an UTF-8 string to a safe ASCII string
     *
     * Conversion process, per codepoint:
     *  - if the codepoint is a plain character,
     *    - if the previous character was "converted", append post_indicator
     *      to the output
     *    - append the ASCII byte for the character to the output
     *  - if the codepoint is the pre_indicator itself, append it on its own
     *  - otherwise
     *    - reduce the codepoint value to fill the holes left by the reserved
     *      characters (see adjustForPlain())
     *    - choose the marker character by taking the modulus
     *      (number of possible pre_indicators) of the modified codepoint
     *    - calculate the value for base36 conversion by integer division
     *      (number of possible pre_indicators) of the modified codepoint
     *    - append the marker character followed by the base36 string
     *
     * Codepoints below 0x20 yield a negative adjusted value and are not
     * representable; callers can screen input with validate_printable_utf8().
     *
     * @param string $utf8 UTF-8 encoded input
     * @return string encoded string
     */
    public static function encode($utf8) {
        return self::unicode_safe(self::utf8_unicode($utf8));
    }

    /**
     * Convert a string produced by encode() back to UTF-8
     *
     * Decoding process:
     *  - lower-case the input, then split it into substrings in front of
     *    every indicator character
     *  - a substring not starting with a pre_indicator is plain text: drop
     *    its leading post_indicator if it directly follows a converted
     *    substring, then emit the codepoint of each remaining character
     *  - a substring that is only the pre_indicator is a literal pre_indicator
     *  - otherwise convert the rest of the substring from base36 to an int,
     *    multiply by the number of pre_indicator characters, add the
     *    pre_indicator position and reverse the adjustment for the reserved
     *    characters (see reverseForPlain())
     *
     * @param string $safe encoded string
     * @return string decoded UTF-8 string
     */
    public static function decode($safe) {
        return self::unicode_utf8(self::safe_unicode(strtolower($safe)));
    }

    /**
     * Check that a string contains none of the bytes 0x01-0x1f
     *
     * @param string $printable_utf8 string to check
     * @return bool true when no byte in the range 0x01-0x1f is present
     */
    public static function validate_printable_utf8($printable_utf8) {
        return !preg_match('/[\x01-\x1f]/', $printable_utf8);
    }

    /**
     * Check that a string consists only of characters encode() can emit
     *
     * strspn() is used instead of a regular expression because $plain
     * contains '/' and '-', which would both need escaping inside a
     * '/'-delimited character class.
     *
     * @param string $safe string to check
     * @return bool true when every byte is a plain or indicator character
     *              (an empty string is considered valid)
     */
    public static function validate_safe($safe) {
        $allowed = self::$plain.self::$post_indicator.self::$pre_indicator;
        return strspn($safe, $allowed) === strlen($safe);
    }

    /**
     * Thin wrapper around the global utf8_to_unicode() helper, which is not
     * defined in this file.
     *
     * @param string $utf8
     * @return array presumably a list of integer codepoints, as consumed by
     *               unicode_safe() - confirm against the helper
     */
    private static function utf8_unicode($utf8) {
        return utf8_to_unicode($utf8);
    }

    /**
     * Thin wrapper around the global unicode_to_utf8() helper, which is not
     * defined in this file.
     *
     * @param array $unicode list of integer codepoints
     * @return string presumably the UTF-8 encoding of the codepoints -
     *                confirm against the helper
     */
    private static function unicode_utf8($unicode) {
        return unicode_to_utf8($unicode);
    }

    /**
     * Encode a list of codepoints into the safe representation
     *
     * @param array $unicode list of integer codepoints
     * @return string
     */
    private static function unicode_safe($unicode) {
        $safe        = '';
        $converted   = false;  // was the previously written character a converted one?
        $markerCount = strlen(self::$pre_indicator);

        foreach ($unicode as $codepoint) {
            if (self::isPlain($codepoint)) {
                if ($converted) {
                    // close the converted run so the decoder knows plain text follows
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } else if (self::isPreIndicator($codepoint)) {
                // a literal pre_indicator is written bare, with no base36 payload
                $converted = true;
                $safe .= chr($codepoint);

            } else {
                $converted = true;
                $adjusted  = self::adjustForPlain($codepoint);

                $marker = $adjusted % $markerCount;
                $base   = (int) ($adjusted / $markerCount);

                $safe .= self::$pre_indicator[$marker];
                $safe .= base_convert((string) $base, 10, 36);
            }
        }

        return $safe;
    }

    /**
     * Decode the safe representation into a list of codepoints
     *
     * @param string $safe encoded (already lower-cased) string
     * @return array list of integer codepoints
     */
    private static function safe_unicode($safe) {
        $unicode = array();
        // zero-width split: every piece after the first starts with an indicator character
        $split = preg_split(
            '/(?=['.self::$post_indicator.self::$pre_indicator.'])/',
            $safe,
            -1,
            PREG_SPLIT_NO_EMPTY
        );
        $markerCount = strlen(self::$pre_indicator);

        $converted = false;
        foreach ($split as $sub) {
            if (($marker = strpos(self::$pre_indicator, $sub[0])) === false) {
                if ($converted) {
                    // strip post_indicator
                    $sub = substr($sub, 1);
                    $converted = false;
                }
                $length = strlen($sub);
                for ($i = 0; $i < $length; $i++) {
                    $unicode[] = ord($sub[$i]);
                }
            } else if (strlen($sub) == 1) {
                // a bare pre_indicator stands for itself
                $converted = true;
                $unicode[] = ord($sub);
            } else {
                // a single codepoint in our base
                $converted = true;
                $base      = (int) base_convert(substr($sub, 1), 36, 10);
                $adjusted  = ($base * $markerCount) + $marker;

                $unicode[] = self::reverseForPlain($adjusted);
            }
        }

        return $unicode;
    }

    /**
     * Is the codepoint one of the characters copied through unchanged?
     * The post_indicator counts as plain here.
     *
     * @param int $codepoint
     * @return bool
     */
    private static function isPlain($codepoint) {
        return ($codepoint < 127
            && (strpos(self::$plain.self::$post_indicator, chr($codepoint)) !== false));
    }

    /**
     * Is the codepoint one of the pre_indicator characters?
     *
     * @param int $codepoint
     * @return bool
     */
    private static function isPreIndicator($codepoint) {
        return ($codepoint < 127
            && (strpos(self::$pre_indicator, chr($codepoint)) !== false));
    }

    /**
     * Adjust for plain and non-printable (ascii 0-31) characters;
     * this makes SPACE (0x20) the first character we allow.
     *
     * The codepoint is reduced by the number of reserved characters that
     * sort below it, then by ord(' ').
     *
     * @param int $codepoint a codepoint that is not itself a reserved character
     * @return int adjusted value
     */
    private static function adjustForPlain($codepoint) {
        $adjustment = self::getAdjustments();
        $total      = count($adjustment);

        if ($codepoint > ord($adjustment[$total - 1])) {
            // codepoint is higher than that of the reserved character with the highest codepoint
            $adjusted = $codepoint - $total;
        } else if ($codepoint > ord($adjustment[0])) {
            // find how many reserved characters lie below the codepoint
            for ($i = 1; $i < $total; $i++) {
                if ($codepoint < ord($adjustment[$i])) {
                    break;
                }
            }
            $adjusted = $codepoint - $i;
        } else {
            $adjusted = $codepoint;
        }

        // subtract number of non-printable characters and return
        return $adjusted - ord(' ');
    }

    /**
     * Undo adjustForPlain()
     *
     * @param int $adjusted value produced by adjustForPlain()
     * @return int original codepoint
     */
    private static function reverseForPlain($adjusted) {
        $adjustment = self::getAdjustments();
        $total      = count($adjustment);

        // reverse adjustment for non-printable characters
        $adjusted += ord(' ');

        if ($adjusted + $total > ord($adjustment[$total - 1])) {
            $adjusted += $total;
        } else if ($adjusted >= ord($adjustment[0])) {
            // '>=' rather than '>': the codepoint directly above the lowest
            // reserved character is adjusted down onto that character's
            // value and still has to be shifted back up.
            for ($i = 1; $i < $total; $i++) {
                if ($adjusted + $i < ord($adjustment[$i])) {
                    break;
                }
            }
            $adjusted += $i;
        }

        return $adjusted;
    }

    /**
     * Get the reserved characters (plain + both indicators) as a list of
     * one-character strings sorted by byte value. Built once and cached.
     *
     * @return array
     */
    private static function getAdjustments() {
        if (empty(self::$adjustments)) {
            self::$adjustments = str_split(self::$plain.self::$pre_indicator.self::$post_indicator);
            // SORT_STRING: order strictly by byte value, never numerically
            sort(self::$adjustments, SORT_STRING);
        }

        return self::$adjustments;
    }
}