<?php

/**
 * Class to safely store UTF-8 in a Filename
 *
 * Encodes a utf8 string using only the characters listed in $plain
 * (-./[_ 0-9 a-z) plus the two indicator characters '%' and ']'.
 * Characters from $plain in the original string are preserved ("plain").
 * All other characters are represented by a substring that starts
 * with '%' ("converted").
 * The transition from converted substrings back to plain characters is
 * marked with a ']'.
 *
 * All methods are static: the helper functions in this file call them as
 * SafeFN::encode() / SafeFN::decode(). Calling them on an instance still works.
 *
 * @author   Christopher Smith
 * @date     2010-04-02
 */
class SafeFN {
    // 'safe' characters are a superset of $plain, $pre_indicator and $post_indicator
    private static $plain = '-./[_0123456789abcdefghijklmnopqrstuvwxyz'; // these characters aren't converted
    private static $pre_indicator = '%';
    private static $post_indicator = ']';

    /**
     * Convert an UTF-8 string to a safe ASCII String
     *
     *  conversion process
     *    - if codepoint is a plain or post_indicator character,
     *      - if previous character was "converted", append post_indicator to output, clear "converted" flag
     *      - append ascii byte for character to output
     *      (continue to next character)
     *
     *    - if codepoint is a pre_indicator character,
     *      - append ascii byte for character to output, set "converted" flag
     *      (continue to next character)
     *
     *    (all remaining characters)
     *    - reduce codepoint value for non-printable ASCII characters (0x00 - 0x1f).  Space becomes our zero.
     *    - convert reduced value to base36 (0-9a-z)
     *    - append $pre_indicator character followed by base36 string to output, set converted flag
     *      (continue to next character)
     *
     * Relies on utf8_to_unicode(), which is not defined in this file and must
     * be loaded by the caller's environment.
     *
     * @param    string    $filename     a utf8 string, should only include printable characters - not 0x00-0x1f
     * @return   string    an encoded representation of $filename using only 'safe' ASCII characters
     *
     * @author   Christopher Smith <chris@jalakai.co.uk>
     */
    public static function encode($filename) {
        return self::unicode_to_safe(utf8_to_unicode($filename));
    }

    /**
     *  decoding process
     *    - split the string into substrings at any occurrence of pre or post indicator characters
     *    - check the first character of the substring
     *      - if its not a pre_indicator character
     *        - if previous character was converted, skip over post_indicator character
     *        - copy codepoint values of remaining characters to the output array
     *        - clear any converted flag
     *      (continue to next substring)
     *
     *     _ else (its a pre_indicator character)
     *       - if string length is 1, copy the pre_indicator character to the output array
     *       (continue to next substring)
     *
     *       - else (string length > 1)
     *         - skip the pre-indicator character and convert remaining string from base36 to base10
     *         - increase codepoint value for non-printable ASCII characters (add 0x20)
     *         - append codepoint to output array
     *       (continue to next substring)
     *
     * The input is lower-cased before decoding. Relies on unicode_to_utf8(),
     * which is not defined in this file.
     *
     * @param    string    $filename     a 'safe' encoded ASCII string,
     * @return   string    decoded utf8 representation of $filename
     *
     * @author   Christopher Smith <chris@jalakai.co.uk>
     */
    public static function decode($filename) {
        return unicode_to_utf8(self::safe_to_unicode(strtolower($filename)));
    }

    /**
     * Check that a string contains none of the control bytes 0x01-0x1f
     *
     * @param    string    $printable_utf8   the string to check
     * @return   bool      true when no byte in the range 0x01-0x1f is present
     */
    public static function validate_printable_utf8($printable_utf8) {
        return !preg_match('#[\x01-\x1f]#',$printable_utf8);
    }

    /**
     * Check that a string consists only of 'safe' characters
     * ($plain, $pre_indicator and $post_indicator)
     *
     * The allowed characters are passed through preg_quote() because they
     * include ']' and '[' which would otherwise terminate or corrupt the
     * character class.
     *
     * @param    string    $safe   the string to check
     * @return   bool      true when every character is a 'safe' character
     */
    public static function validate_safe($safe) {
        $allowed = preg_quote(self::$plain.self::$post_indicator.self::$pre_indicator, '#');
        return !preg_match('#[^'.$allowed.']#',$safe);
    }

    /**
     * convert an array of unicode codepoints into 'safe_filename' format
     *
     * @param    array  int    $unicode    an array of unicode codepoints
     * @return   string        the unicode represented in 'safe_filename' format
     *
     * @author   Christopher Smith <chris@jalakai.co.uk>
     */
    private static function unicode_to_safe($unicode) {

        $safe = '';
        $converted = false;
        $passthrough = self::$plain.self::$post_indicator;
        $pre_ord = ord(self::$pre_indicator);

        foreach ($unicode as $codepoint) {
            if ($codepoint < 127 && (strpos($passthrough,chr($codepoint))!==false)) {
                // plain character: close any preceding converted run first
                if ($converted) {
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } else if ($codepoint == $pre_ord) {
                // a literal pre_indicator is emitted on its own (no base36 payload)
                $safe .= self::$pre_indicator;
                $converted = true;
            } else {
                // everything else: pre_indicator + base36 of (codepoint - 32)
                $safe .= self::$pre_indicator.base_convert((string)($codepoint-32),10,36);
                $converted = true;
            }
        }
        // always terminate a trailing converted run
        if($converted) $safe .= self::$post_indicator;
        return $safe;
    }

    /**
     * convert a 'safe_filename' string into an array of unicode codepoints
     *
     * @param   string         $safe     a filename in 'safe_filename' format
     * @return  array   int    an array of unicode codepoints
     *
     * @author   Christopher Smith <chris@jalakai.co.uk>
     */
    private static function safe_to_unicode($safe) {

        $unicode = array();
        // zero-width split in front of every indicator character, so each
        // substring starts with the indicator that introduced it
        $indicators = preg_quote(self::$post_indicator.self::$pre_indicator, '#');
        $split = preg_split('#(?=['.$indicators.'])#',$safe,-1,PREG_SPLIT_NO_EMPTY);

        $converted = false;
        foreach ($split as $sub) {
            if ($sub[0] !== self::$pre_indicator) {
                // plain (unconverted) characters, optionally starting with a post_indicator
                // set initial value to skip any post_indicator
                $length = strlen($sub);
                for ($i=($converted?1:0); $i < $length; $i++) {
                    $unicode[] = ord($sub[$i]);
                }
                $converted = false;
            } else if (strlen($sub)==1) {
                // a pre_indicator character in the real data
                $unicode[] = ord($sub);
                $converted = true;
            } else {
                // a single codepoint in base36, adjusted for initial 32 non-printable chars
                $unicode[] = 32 + (int)base_convert(substr($sub,1),36,10);
                $converted = true;
            }
        }

        return $unicode;
    }

}

/**
 * Encode a UTF-8 filename to use on any filesystem
 *
 * Uses the 'fnencode' option to determine encoding
 *
 * When the second parameter is true the string will
 * be encoded only if non ASCII characters are detected -
 * This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * A missing 'fnencode' option is treated like any value other than
 * 'utf-8' or 'safe', i.e. the name is url-encoded.
 *
 * @param  string $file  the filename to encode
 * @param  bool   $safe  return $file untouched when it already looks encoded
 * @return string        the encoded filename
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 */
function dwiki_encodeFN($file,$safe=true){
    global $Dwfck_conf_values;
    $mode = isset($Dwfck_conf_values['fnencode']) ? $Dwfck_conf_values['fnencode'] : '';

    if($mode === 'utf-8') return $file;

    // NOTE(review): this shortcut accepts A-Z although SafeFN::decode()
    // lower-cases its input - confirm that is intended for 'safe' mode.
    if($safe && preg_match('#^[a-zA-Z0-9/_\-\\]%]+$#',$file)){
        return $file;
    }

    if($mode === 'safe'){
        return SafeFN::encode($file);
    }

    // default: url-encode but keep directory separators readable
    $file = urlencode($file);
    $file = str_replace('%2F','/',$file);
    return $file;
}

/**
 * Decode a filename back to UTF-8
 *
 * Uses the 'fnencode' option to determine encoding
 *
 * @param  string $file  the encoded filename
 * @return string        the decoded filename
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 */
function dwiki_decodeFN($file){
    global $Dwfck_conf_values;
    $mode = isset($Dwfck_conf_values['fnencode']) ? $Dwfck_conf_values['fnencode'] : '';

    if($mode === 'utf-8') return $file;

    if($mode === 'safe'){
        return SafeFN::decode($file);
    }

    return urldecode($file);
}


/**
 * Append a debug message to safe_dbg.txt
 *
 * Logging is switched off; the function currently does nothing. Set
 * $enabled to true locally to turn it back on while debugging.
 *
 * @param mixed $what  message to log; arrays are rendered with print_r()
 * @return void
 */
function safe_write_debug($what) {
    $enabled = false;
    if(!$enabled) return;

    if(is_array($what)) {
        $what = print_r($what,true);
    }
    $dwfckFHandle = fopen("safe_dbg.txt", "a");
    if($dwfckFHandle === false) return; // log file not writable: nothing to do
    fwrite($dwfckFHandle, "$what\n");
    fclose($dwfckFHandle);
}