<?php

/**
 * Class to safely store UTF-8 in a Filename
 *
 * Encodes a UTF-8 string using only the following characters: 0-9a-z_.-/%
 * Characters listed in $plain (0-9, a-z, '/', '_' and '-') are preserved
 * from the original string, they are "plain".
 * All other characters are represented by a substring that starts
 * with '%', they are "converted".
 * The transition from converted substrings to plain characters is
 * marked with a '.'
 *
 * All methods are static; the class keeps no per-instance state.
 *
 * @author Christopher Smith
 * @date 2010-04-02
 */
class SafeFN {

    private static $plain = '/_-0123456789abcdefghijklmnopqrstuvwxyz'; // these characters aren't converted
    private static $pre_indicator = '%';
    private static $post_indicator = '.'; // this character can be included in "plain" set
    private static $adjustments = array(); // must be initialized, use getAdjustments()

    /**
     * Convert a UTF-8 string to a safe ASCII string
     *
     * conversion process
     *   - if codepoint is a plain character,
     *     - if previous character was "converted", append post_indicator
     *       to output
     *     - append ascii byte for character to output (continue to
     *       next character)
     *
     *   - reduce codepoint value to fill the holes left by "plain"
     *   - choose marker character for conversion by taking modulus
     *     (number of possible pre_indicators) of modified codepoint
     *   - calculate value for conversion to base36 by integer division
     *     (number of possible pre_indicators) of modified codepoint
     *   - convert above value to a base36 string
     *   - append marker character followed by base36 string to
     *     output (continue to next character)
     *
     * @param string $utf8 the string to encode
     * @return string the encoded string
     */
    public static function encode($utf8) {
        return self::unicode_safe(self::utf8_unicode($utf8));
    }

    /**
     * Convert a safe ASCII string back to UTF-8
     *
     * The input is lower-cased before it is decoded.
     *
     * decoding process
     *   - split the string into substrings at marker characters,
     *     discarding post_indicator character but keeping
     *     pre_indicator characters (along with their following
     *     base36 string)
     *   - check the first character of the substring
     *     - if it is not a pre_indicator character, convert each
     *       character in the substring into its codepoint value
     *       and append to output (continue to next substring)
     *     - if it is a pre_indicator character, get its position in the
     *       pre_indicator string (order is important)
     *   - convert the remainder of the string from base36 to base10
     *     and then to an (int).
     *   - multiply the converted int by the number of pre_indicator
     *     characters and add the pre_indicator position
     *   - reverse the conversion adjustment for codepoint holes left by
     *     "plain" characters
     *   - append resulting codepoint value to output (continue to next
     *     substring)
     *
     * @param string $safe a string produced by encode()
     * @return string the decoded string
     */
    public static function decode($safe) {
        return self::unicode_utf8(self::safe_unicode(strtolower($safe)));
    }

    /**
     * Check that a string holds none of the control bytes 0x01-0x1f
     *
     * @param string $printable_utf8 the string to check
     * @return bool true when no such byte is present
     */
    public static function validate_printable_utf8($printable_utf8) {
        return !preg_match('/[\x01-\x1f]/', $printable_utf8);
    }

    /**
     * Check that a string consists only of characters encode() may emit
     *
     * @param string $safe the string to check
     * @return bool true when every character is a plain character,
     *              the post_indicator or a pre_indicator
     */
    public static function validate_safe($safe) {
        // The allowed set contains the '/' delimiter and a '-' that would
        // otherwise form a range inside the character class, so it has to
        // be quoted before it is embedded in the pattern.
        $allowed = preg_quote(self::$plain.self::$post_indicator.self::$pre_indicator, '/');
        return !preg_match('/[^'.$allowed.']/', $safe);
    }

    /**
     * Turn a UTF-8 string into a list of codepoints
     *
     * Delegates to the global function utf8_to_unicode(), which is not
     * defined in this file and has to be provided by the including code.
     *
     * @param string $utf8
     * @return array list of codepoints (as returned by utf8_to_unicode())
     */
    private static function utf8_unicode($utf8) {
        return utf8_to_unicode($utf8);
    }

    /**
     * Turn a list of codepoints into a UTF-8 string
     *
     * Delegates to the global function unicode_to_utf8(), which is not
     * defined in this file and has to be provided by the including code.
     *
     * @param array $unicode list of codepoints
     * @return string (as returned by unicode_to_utf8())
     */
    private static function unicode_utf8($unicode) {
        return unicode_to_utf8($unicode);
    }

    /**
     * Encode a list of codepoints into the safe representation
     *
     * @param array $unicode list of integer codepoints
     * @return string the encoded string
     */
    private static function unicode_safe($unicode) {

        $safe = '';
        $converted = false;
        $markers = strlen(self::$pre_indicator);

        foreach ($unicode as $codepoint) {
            if (self::isPlain($codepoint)) {
                // leaving a converted run: mark the transition
                if ($converted) {
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } else if (self::isPreIndicator($codepoint)) {
                // a literal pre_indicator is emitted on its own, without a
                // base36 payload; safe_unicode() recognises it by its length
                $converted = true;
                $safe .= chr($codepoint);

            } else {
                $converted = true;
                $adjusted = self::adjustForPlain($codepoint);

                $marker = $adjusted % $markers;
                $base = (int) ($adjusted / $markers);

                $safe .= self::$pre_indicator[$marker];
                $safe .= base_convert((string)$base, 10, 36);
            }
        }
        return $safe;
    }

    /**
     * Decode the safe representation into a list of codepoints
     *
     * @param string $safe the (lower case) encoded string
     * @return array list of integer codepoints
     */
    private static function safe_unicode($safe) {
        $unicode = array();
        $markers = strlen(self::$pre_indicator);

        // split in front of every indicator so that each substring starts
        // with the indicator it belongs to
        $indicators = preg_quote(self::$post_indicator.self::$pre_indicator, '/');
        $split = preg_split('/(?=['.$indicators.'])/', $safe, -1, PREG_SPLIT_NO_EMPTY);

        $converted = false;
        foreach ($split as $sub) {
            if (($marker = strpos(self::$pre_indicator, $sub[0])) === false) {
                if ($converted) {
                    // strip post_indicator
                    $sub = substr($sub, 1);
                    $converted = false;
                }
                $length = strlen($sub);
                for ($i = 0; $i < $length; $i++) {
                    $unicode[] = ord($sub[$i]);
                }
            } else if (strlen($sub) == 1) {
                // a pre_indicator without payload stands for itself
                $converted = true;
                $unicode[] = ord($sub);
            } else {
                // a single codepoint in our base
                $converted = true;
                $base = (int) base_convert(substr($sub, 1), 36, 10);
                $adjusted = ($base * $markers) + $marker;

                $unicode[] = self::reverseForPlain($adjusted);
            }
        }

        return $unicode;
    }

    /**
     * Is the codepoint copied to the output unchanged?
     *
     * @param int $codepoint
     * @return bool true for characters in $plain and for the post_indicator
     */
    private static function isPlain($codepoint) {
        return ($codepoint < 127 && (strpos(self::$plain.self::$post_indicator, chr($codepoint)) !== false));
    }

    /**
     * Is the codepoint one of the pre_indicator characters?
     *
     * @param int $codepoint
     * @return bool
     */
    private static function isPreIndicator($codepoint) {
        return ($codepoint < 127 && (strpos(self::$pre_indicator, chr($codepoint)) !== false));
    }

    /**
     * adjust for plain and non-printable (ascii 0-31)
     * this makes SPACE (0x20) the first character we allow
     *
     * NOTE(review): for codepoints below 0x20 the result is negative;
     * presumably callers are expected to reject such input beforehand
     * (see validate_printable_utf8()) - confirm.
     *
     * @param int $codepoint a codepoint that is neither plain nor a pre_indicator
     * @return int the codepoint with the holes left by the reserved
     *             characters closed, shifted down by ord(' ')
     */
    private static function adjustForPlain($codepoint) {
        $adjustment = self::getAdjustments();
        $total = count($adjustment);

        // codepoint is higher than that of the plain character with the highest codepoint
        if ($codepoint > ord($adjustment[$total - 1])) {
            $adjusted = $codepoint - $total;
        } else if ($codepoint > ord($adjustment[0])) {
            // count the reserved characters that sort below the codepoint
            for ($i = 1; $i < $total; $i++) {
                if ($codepoint < ord($adjustment[$i])) {
                    break;
                }
            }
            $adjusted = $codepoint - $i;
        } else {
            $adjusted = $codepoint;
        }

        // subtract number of non-printable characters and return
        return $adjusted - ord(' ');
    }

    /**
     * Inverse of adjustForPlain()
     *
     * @param int $adjusted a value produced by adjustForPlain()
     * @return int the original codepoint
     */
    private static function reverseForPlain($adjusted) {
        $adjustment = self::getAdjustments();
        $total = count($adjustment);

        // reverse adjustment for non-printable characters
        $adjusted += ord(' ');

        if ($adjusted + $total > ord($adjustment[$total - 1])) {
            $adjusted += $total;
        } else if ($adjusted >= ord($adjustment[0])) {
            // ">=" is required: the codepoint directly above the lowest
            // reserved character is shifted down by one by adjustForPlain()
            // and therefore arrives here equal to ord($adjustment[0])
            for ($i = 1; $i < $total; $i++) {
                if ($adjusted + $i < ord($adjustment[$i])) {
                    break;
                }
            }
            $adjusted += $i;
        }

        return $adjusted;
    }

    /**
     * Get the sorted list of all reserved characters (plain characters,
     * pre_indicator and post_indicator); built on first use and cached
     * in self::$adjustments
     *
     * @return array list of single character strings in ascending order
     */
    private static function getAdjustments() {
        if (empty(self::$adjustments)) {
            self::$adjustments = str_split(self::$plain.self::$pre_indicator.self::$post_indicator);
            sort(self::$adjustments);
        }

        return self::$adjustments;
    }
}