xref: /dokuwiki/inc/utf8.php (revision 10f09f2a66400f77b4696f973c4c526424e44bc1)
<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
882257610Sandi
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are kept as path separators and are never encoded.
 *
 * In safe mode (the default) a name that consists only of letters,
 * digits and the characters / _ - . % is returned untouched. An already
 * encoded name is made up of exactly these characters, so the function
 * can be applied to the same string several times without encoding it
 * twice.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file the filename to encode
 * @param  boolean $safe leave names alone that need no (further) encoding
 * @return string
 */
function utf8_encodeFN($file, $safe = true)
{
    $needsNoEncoding = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#', $file);

    if (!$needsNoEncoding) {
        // encode everything, then bring the path separators back
        $file = str_replace('%2F', '/', urlencode($file));
    }

    return $file;
}
3049c713a3Sandi
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode(), provided as the counterpart of
 * utf8_encodeFN().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string
 */
function utf8_decodeFN($file)
{
    return urldecode($file);
}
4349c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * The string is scanned byte-wise (no UTF-8 mode) for any byte that has
 * the high bit set. An empty string counts as ASCII.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to check
 * @return boolean true if no byte above 0x7F was found
 */
function utf8_isASCII($str)
{
    // a single byte in the range 0x80-0xFF is enough to fail the check
    return !preg_match('/[\x80-\xFF]/', $str);
}
5544f669e9Sandi
/**
 * Strips all highbyte chars
 *
 * Every byte above 0x7F is removed, so multi-byte UTF-8 characters
 * vanish completely. Returns a pure ASCII7 string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string the input without any byte in the range 0x80-0xFF
 */
function utf8_strip($str)
{
    // byte-wise match on purpose (no /u): lead and continuation bytes alike
    return preg_replace('/[\x80-\xFF]/', '', $str);
}
72e1906e6eSandi
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks over the bytes and verifies that every lead byte is followed by
 * the number of 10bbbbbb continuation bytes its bit pattern announces.
 * Only this structure is checked: lead bytes announcing 5 and 6 byte
 * sequences are accepted and the encoded values are not inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the string to check
 * @return boolean true if the byte structure is consistent
 */
function utf8_check($Str)
{
    $len = strlen($Str);

    for ($i = 0; $i < $len; $i++) {
        $byte = ord($Str[$i]);

        if ($byte < 0x80) {
            continue;                       // 0bbbbbbb - plain ASCII
        }

        // number of continuation bytes announced by the lead byte
        if (($byte & 0xE0) == 0xC0) {
            $trail = 1;                     // 110bbbbb
        } elseif (($byte & 0xF0) == 0xE0) {
            $trail = 2;                     // 1110bbbb
        } elseif (($byte & 0xF8) == 0xF0) {
            $trail = 3;                     // 11110bbb
        } elseif (($byte & 0xFC) == 0xF8) {
            $trail = 4;                     // 111110bb
        } elseif (($byte & 0xFE) == 0xFC) {
            $trail = 5;                     // 1111110b
        } else {
            return false;                   // does not match any model
        }

        // each announced byte has to exist and to look like 10bbbbbb
        while ($trail-- > 0) {
            $i++;
            if ($i == $len || (ord($Str[$i]) & 0xC0) != 0x80) {
                return false;
            }
        }
    }

    return true;
}
9549c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() turns every character that does not exist in
 * ISO-8859-1 into a single '?'. For the purpose of counting this is
 * alright: the decoded string has one byte per character, so its byte
 * length is the character count. It's even faster than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string UTF-8 encoded string
 * @return int number of characters
 */
function utf8_strlen($string)
{
    $singleByte = utf8_decode($string);
    return strlen($singleByte);
}
1102f954959Sandi
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 * Uses the mbstring extension if it is available and UTF8_NOMBSTRING is
 * not defined, otherwise a regular expression based fallback.
 *
 * Note: the fallback supports negative offsets and lengths but is slower
 * when they are used, because the string is split into single characters.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  integer $offset number of UTF-8 characters offset (from left)
 * @param  integer $length (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null)
{
    // both values end up inside a regular expression below
    $offset = (int) $offset;
    if ($length !== null) {
        $length = (int) $length;
    }

    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_substr')) {
        if ($length === null) {
            // the encoding argument comes after the length, so an explicit
            // "up to the end" length is passed
            return mb_substr($str, $offset, mb_strlen($str, 'utf-8'), 'utf-8');
        }
        return mb_substr($str, $offset, $length, 'utf-8');
    }

    if ($offset >= 0 && ($length === null || $length >= 0)) {
        if ($length === null) {
            $quantifier = '*';
        } else {
            $strlen = strlen(utf8_decode($str));
            if ($offset > $strlen) {
                return '';
            }

            if (($offset + $length) > $strlen) {
                // requested more than is left: take the rest
                $quantifier = '*';
            } else {
                $quantifier = '{' . $length . '}';
            }
        }

        // skip $offset characters, then capture the wanted part
        $pattern = '/^.{' . $offset . '}(.' . $quantifier . ')/us';
        if (preg_match($pattern, $str, $matches) && isset($matches[1])) {
            return $matches[1];
        }
        return false;
    }

    // Handle negatives using different, slower technique
    // From: http://www.php.net/manual/en/function.substr.php#44838
    // The s modifier keeps newlines in the list of characters.
    preg_match_all('/./us', $str, $ar);
    if ($length !== null) {
        return join('', array_slice($ar[0], $offset, $length));
    }
    return join('', array_slice($ar[0], $offset));
}
168*10f09f2aSAndreas Gohr
169*10f09f2aSAndreas Gohr
/**
 * Unicode aware replacement for substr_replace()
 *
 * The result is assembled from the first $start characters of the
 * string (only when $start is positive), the replacement, and everything
 * from character $start + $length onwards. With the default length of 0
 * the replacement is inserted at $start.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string      UTF-8 encoded string
 * @param  string  $replacement the text to put in
 * @param  integer $start       character offset where the replacement starts
 * @param  integer $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start, $length = 0)
{
    $head = ($start > 0) ? utf8_substr($string, 0, $start) : '';
    $tail = utf8_substr($string, $start + $length);

    return $head . $replacement . $tail;
}
183dc57ef04Sandi
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a UTF-8 mode regular expression
 * for preg_split(). An empty separator raises an E_USER_WARNING and
 * makes the function return FALSE.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep the separator
 * @param  string $str UTF-8 encoded string to split
 * @return mixed  the result of preg_split() or FALSE for an empty separator
 */
function utf8_explode($sep, $str)
{
    if ($sep != '') {
        $pattern = '!' . preg_quote($sep, '!') . '!u';
        return preg_split($pattern, $str);
    }

    trigger_error('Empty delimiter', E_USER_WARNING);
    return FALSE;
}
199f29317c1Sandi
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and applied as UTF-8 mode regular
 * expressions. Replacements are inserted literally, as str_replace()
 * does: '$' and '\' in a replacement are escaped so that they are not
 * taken for backreferences by preg_replace().
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed $s   search string or array of search strings
 * @param  mixed $r   replacement string or array of replacement strings
 * @param  mixed $str the subject handed to preg_replace()
 * @return mixed the result of preg_replace()
 */
function utf8_str_replace($s, $r, $str)
{
    // neutralise the backreference syntax ($1, ${1}, \1) of preg_replace()
    if (is_array($r)) {
        foreach ($r as $k => $v) {
            $r[$k] = addcslashes($v, '\\$');
        }
    } else {
        $r = addcslashes($r, '\\$');
    }

    // turn every search string into a literal pattern
    if (is_array($s)) {
        foreach ($s as $k => $v) {
            $s[$k] = '!' . preg_quote($v, '!') . '!u';
        }
    } else {
        $s = '!' . preg_quote($s, '!') . '!u';
    }

    return preg_replace($s, $r, $str);
}
217f29317c1Sandi
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the call is passed on to ltrim(). Otherwise the
 * characters of the charlist are stripped from the beginning of the
 * string, using a UTF-8 mode regular expression.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist the characters to strip
 * @return string
 */
function utf8_ltrim($str, $charlist = '')
{
    if ($charlist == '') {
        return ltrim($str);
    }

    // quote charlist for use in a characterclass: \ - ] [ / and ^
    // (an unquoted ^ at the start would negate the whole class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\${1}', $charlist);

    return preg_replace('/^[' . $charlist . ']+/u', '', $str);
}
233f29317c1Sandi
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the call is passed on to rtrim(). Otherwise the
 * characters of the charlist are stripped from the very end of the
 * string, using a UTF-8 mode regular expression.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist the characters to strip
 * @return string
 */
function utf8_rtrim($str, $charlist = '')
{
    if ($charlist == '') {
        return rtrim($str);
    }

    // quote charlist for use in a characterclass: \ - ] [ / and ^
    // (an unquoted ^ at the start would negate the whole class)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!', '\\\${1}', $charlist);

    // D: let $ match at the very end only, not before a final newline
    return preg_replace('/[' . $charlist . ']+$/uD', '', $str);
}
249f29317c1Sandi
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the call is passed on to trim(). Otherwise the
 * characters of the charlist are stripped from both ends of the string
 * by utf8_rtrim() and utf8_ltrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist the characters to strip
 * @return string
 */
function utf8_trim($str, $charlist = '')
{
    if ($charlist == '') {
        return trim($str);
    }

    // the charlist has to reach both helpers, without it they only
    // strip whitespace
    return utf8_ltrim(utf8_rtrim($str, $charlist), $charlist);
}
262f29317c1Sandi
2632f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Otherwise the string is turned
 * into code points and every code point that has a non-empty entry in
 * the global $UTF8_UPPER_TO_LOWER table is replaced by that entry.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string)
{
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower')) {
        return mb_strtolower($string, 'utf-8');
    }

    global $UTF8_UPPER_TO_LOWER;
    $uni = utf8_to_unicode($string);
    foreach ($uni as $i => $codepoint) {
        // most code points are not in the table, look them up without
        // raising an undefined index notice
        if (!empty($UTF8_UPPER_TO_LOWER[$codepoint])) {
            $uni[$i] = $UTF8_UPPER_TO_LOWER[$codepoint];
        }
    }
    return unicode_to_utf8($uni);
}
28782257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Otherwise the string is turned
 * into code points and every code point that has a non-empty entry in
 * the global $UTF8_LOWER_TO_UPPER table is replaced by that entry.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string)
{
    // test for the function that is actually called
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper')) {
        return mb_strtoupper($string, 'utf-8');
    }

    global $UTF8_LOWER_TO_UPPER;
    $uni = utf8_to_unicode($string);
    foreach ($uni as $i => $codepoint) {
        // most code points are not in the table, look them up without
        // raising an undefined index notice
        if (!empty($UTF8_LOWER_TO_UPPER[$codepoint])) {
            $uni[$i] = $UTF8_LOWER_TO_UPPER[$codepoint];
        }
    }
    return unicode_to_utf8($uni);
}
31182257610Sandi
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
 * $UTF8_UPPER_ACCENTS (accented character => replacement).
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or
 * upper ($case = 1) letters. Default is to deaccent both cases
 * ($case = 0).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $string UTF-8 encoded string
 * @param  integer $case   -1 lower case only, 1 upper case only, 0 both
 * @return string
 */
function utf8_deaccent($string, $case = 0)
{
    $doLower = ($case <= 0);
    $doUpper = ($case >= 0);

    if ($doLower) {
        global $UTF8_LOWER_ACCENTS;
        $accented = array_keys($UTF8_LOWER_ACCENTS);
        $plain    = array_values($UTF8_LOWER_ACCENTS);
        $string   = str_replace($accented, $plain, $string);
    }

    if ($doUpper) {
        global $UTF8_UPPER_ACCENTS;
        $accented = array_keys($UTF8_UPPER_ACCENTS);
        $plain    = array_values($UTF8_UPPER_ACCENTS);
        $string   = str_replace($accented, $plain, $string);
    }

    return $string;
}
33182257610Sandi
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned as it is. Everything else is translated
 * with strtr() using the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_romanize($string)
{
    if (!utf8_isASCII($string)) {
        global $UTF8_ROMANIZATION;
        $string = strtr($string, $UTF8_ROMANIZATION);
    }

    return $string;
}
3438a831f2bSAndreas Gohr
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip are taken from the global $UTF8_SPECIAL_CHARS
 * array of code points. The range 0x00 to 0x19 is added to the stripped
 * chars by this function (it is not included in $UTF8_SPECIAL_CHARS).
 *
 * The quoted list of special chars is built on the first call only and
 * kept in a static variable for all following calls.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string, $repl = '', $additional = '')
{
    global $UTF8_SPECIAL_CHARS;

    static $specials = null;
    if ($specials === null) {
        $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
    }

    // $additional goes into the character class as it was passed in
    $charclass = '[' . $additional . '\x00-\x19' . $specials . ']';

    return preg_replace('/' . $charclass . '/u', $repl, $string);
}
365099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. The fallback splits the
 * haystack at the needle and measures the part in front of the first
 * occurrence; for a non-zero offset the haystack is cut first and the
 * offset is added to the position found in the remainder.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string  $haystack UTF-8 encoded string to search in
 * @param  string  $needle   UTF-8 encoded string to search for
 * @param  integer $offset   number of characters to skip before searching
 * @return mixed   character position of the needle or FALSE if not found
 */
function utf8_strpos($haystack, $needle, $offset = 0)
{
    if (!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos')) {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }

    if (!$offset) {
        // utf8_explode() gives FALSE for an empty needle
        $ar = utf8_explode($needle, $haystack);
        if (is_array($ar) && count($ar) > 1) {
            return utf8_strlen($ar[0]);
        }
        return false;
    }

    if (!is_int($offset)) {
        trigger_error('Offset must be an integer', E_USER_WARNING);
        return false;
    }

    // search the remainder, then shift the result back by the offset
    $haystack = utf8_substr($haystack, $offset);
    $pos      = utf8_strpos($haystack, $needle);
    if ($pos !== false) {
        return $pos + $offset;
    }
    return false;
}
3982f954959Sandi
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Only two byte sequences (U+0080 to U+07FF) are turned into decimal
 * numeric entities. ASCII and longer sequences are copied unchanged, as
 * is a two byte lead that has no continuation byte behind it.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string
 */
function utf8_tohtml($str)
{
    $ret  = '';
    $max  = strlen($str);
    $last = 0;  // start of the run of bytes that is copied unchanged

    for ($i = 0; $i < $max; $i++) {
        $c1 = ord($str[$i]);
        if (($c1 >> 5) != 6) {
            continue;   // no 110x xxxx prefix, not a two byte lead
        }
        if ($i + 1 >= $max) {
            break;      // lead byte at the very end, nothing follows
        }
        $c2 = ord($str[$i + 1]);
        if (($c2 & 0xC0) != 0x80) {
            continue;   // next byte is no 10xx xxxx continuation
        }

        // append all the regular bytes we've passed
        $ret .= substr($str, $last, $i - $last);

        // five payload bits of the lead, six of the continuation byte
        $codepoint = (($c1 & 0x1F) << 6) | ($c2 & 0x3F);
        $ret .= '&#' . $codepoint . ';';

        $i++;           // the continuation byte is consumed as well
        $last = $i + 1;
    }

    return $ret . substr($str, $last); // append the last batch of regular bytes
}
425ea2eed85Sandi
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING
 *
 * Without $strict nothing is rejected: a byte that is not legal at its
 * position is skipped, and the value of an illegal sequence (non-shortest
 * form, surrogate, out of range) is still added to the output. A sequence
 * that is cut off by the end of the string is dropped in both modes.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str, $strict = false)
{
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ($mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & $in)) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & $in)) {
                // First octet of 2 octet sequence
                $mUcs4  = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & $in)) {
                // First octet of 3 octet sequence
                $mUcs4  = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & $in)) {
                // First octet of 4 octet sequence
                $mUcs4  = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & $in)) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4  = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & $in)) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4  = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif ($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & $in)) {

                // Legal continuation: put its six payload bits in place
                $shift  = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if ($strict) {
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif ($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }
    return $out;
}
59782257610Sandi
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING.
 * Without $strict such values are skipped.
 *
 * Note: the string is built by plain concatenation. No output buffer is
 * opened, so returning early leaves no buffer behind.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr, $strict = false)
{
    if (!is_array($arr)) return '';

    $result = '';

    foreach ($arr as $k => $cp) {

        # ASCII range (including control chars)
        if (($cp >= 0) && ($cp <= 0x007f)) {

            $result .= chr($cp);

        # 2 byte sequence
        } else if (($cp >= 0x0080) && ($cp <= 0x07ff)) {

            $result .= chr(0xc0 | ($cp >> 6));
            $result .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if ($strict) {
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if (($cp >= 0x0800) && ($cp <= 0xffff)) {

            $result .= chr(0xe0 | ($cp >> 12));
            $result .= chr(0x80 | (($cp >> 6) & 0x003f));
            $result .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else if (($cp >= 0x10000) && ($cp <= 0x10ffff)) {

            $result .= chr(0xf0 | ($cp >> 18));
            $result .= chr(0x80 | (($cp >> 12) & 0x3f));
            $result .= chr(0x80 | (($cp >> 6) & 0x3f));
            $result .= chr(0x80 | ($cp & 0x3f));

        # negative or above 0x10FFFF
        } elseif ($strict) {

            trigger_error(
                'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
                );

            // out of range
            return FALSE;
        }
    }

    return $result;
}
68882257610Sandi
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() when the mbstring extension is available and
 * UTF8_NOMBSTRING is not defined. Otherwise the string is decoded with
 * utf8_to_unicode() and the code points are packed by hand; code points
 * above U+FFFF are written as surrogate pairs.
 *
 * @param  string $str UTF-8 encoded input (passed by reference, not modified)
 * @param  bool   $bom prepend the big endian byte order mark (0xFE 0xFF)
 * @return string the UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding'))
    return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  // NOTE(review): utf8_to_unicode() is defined elsewhere; presumably it can
  // signal failure with a non-array value - in that case nothing is appended
  if(!is_array($uni)) return $out;

  foreach($uni as $cp){
    if($cp > 0xFFFF){
      // pack('n') keeps only the low 16 bits, so a supplementary code point
      // has to be split into a high and a low surrogate
      $cp -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
70515fa0b4fSAndreas Gohr
70615fa0b4fSAndreas Gohr/**
70715fa0b4fSAndreas Gohr * UTF-8 to UTF-16BE conversion.
70815fa0b4fSAndreas Gohr *
70915fa0b4fSAndreas Gohr * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
71015fa0b4fSAndreas Gohr */
71115fa0b4fSAndreas Gohrfunction utf16be_to_utf8(&$str) {
71215fa0b4fSAndreas Gohr  $uni = unpack('n*',$str);
71315fa0b4fSAndreas Gohr  return unicode_to_utf8($uni);
71415fa0b4fSAndreas Gohr}
71515fa0b4fSAndreas Gohr
71615fa0b4fSAndreas Gohr/**
71782257610Sandi * UTF-8 Case lookup table
71882257610Sandi *
71982257610Sandi * This lookuptable defines the upper case letters to their correspponding
72082257610Sandi * lower case letter in UTF-8
72182257610Sandi *
72282257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
72382257610Sandi */
7248a831f2bSAndreas Gohrstatic $UTF8_LOWER_TO_UPPER = array(
  // Keys are lower case code points, values the matching upper case code points.
  // A few pairs (0x00FE, 0x00F0, 0x03BB, 0x0457) are listed more than once with
  // the same value; the repeats are redundant and do not change the array.
72582257610Sandi  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
72682257610Sandi  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
72782257610Sandi  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
72882257610Sandi  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
72982257610Sandi  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
73082257610Sandi  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
73182257610Sandi  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
73282257610Sandi  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
73382257610Sandi  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
73482257610Sandi  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
73582257610Sandi  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
73682257610Sandi  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
73782257610Sandi  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
73882257610Sandi  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
73982257610Sandi  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
74082257610Sandi  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
74182257610Sandi  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
74282257610Sandi  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
74382257610Sandi  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
74482257610Sandi  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
74582257610Sandi  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
74682257610Sandi  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
74782257610Sandi  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
74882257610Sandi  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
74982257610Sandi  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
75082257610Sandi  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
75182257610Sandi  0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
75282257610Sandi  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
75382257610Sandi  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
75482257610Sandi  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
75582257610Sandi  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
75682257610Sandi  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
75782257610Sandi  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
75882257610Sandi  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
75982257610Sandi  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
76082257610Sandi  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
76182257610Sandi  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
76282257610Sandi  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
76382257610Sandi  0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
76482257610Sandi  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
76582257610Sandi  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
76682257610Sandi  0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
76782257610Sandi  0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
76882257610Sandi);
76982257610Sandi
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps upper case letters to their corresponding
 * lower case letters (it does so by flipping $UTF8_LOWER_TO_UPPER)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
// Reverse map (upper => lower) built by swapping the keys and values of
// $UTF8_LOWER_TO_UPPER; the @ operator silences any diagnostics array_flip() emits.
77882257610Sandi$UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);
77982257610Sandi
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table defines ASCII-7 replacements for accented characters.
 * These are lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
78982257610Sandi$UTF8_LOWER_ACCENTS = array(
  // Keys are single accented characters, values their ASCII replacement.
  // Most map to one letter; 'ß', 'ö', 'ä', 'ü', 'þ', 'ð' and 'æ' expand to two.
79082257610Sandi  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
79182257610Sandi  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
79282257610Sandi  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
79382257610Sandi  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
79482257610Sandi  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
79582257610Sandi  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
79682257610Sandi  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
79782257610Sandi  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
79882257610Sandi  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
79982257610Sandi  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
80082257610Sandi  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
80182257610Sandi  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
80282257610Sandi  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
80382257610Sandi  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
8040c59b0cfSandi  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
80582257610Sandi);
80682257610Sandi
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table defines ASCII-7 replacements for accented characters.
 * These are upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
81682257610Sandi$UTF8_UPPER_ACCENTS = array(
  // Keys are single accented capitals, values their ASCII replacement.
  // Two-letter replacements ('Oe', 'Ae', 'Ue', 'Th', 'Dh') capitalise only
  // the first letter.
817df3ecd55SAndreas Gohr  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
818df3ecd55SAndreas Gohr  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
819df3ecd55SAndreas Gohr  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
820df3ecd55SAndreas Gohr  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
821df3ecd55SAndreas Gohr  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
822df3ecd55SAndreas Gohr  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
823df3ecd55SAndreas Gohr  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
824df3ecd55SAndreas Gohr  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
825df3ecd55SAndreas Gohr  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
826df3ecd55SAndreas Gohr  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
827df3ecd55SAndreas Gohr  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
828df3ecd55SAndreas Gohr  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
829df3ecd55SAndreas Gohr  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
830df3ecd55SAndreas Gohr  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
831df3ecd55SAndreas Gohr  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
83282257610Sandi);
83382257610Sandi
834099ada41Sandi/**
835099ada41Sandi * UTF-8 array of common special characters
836099ada41Sandi *
837099ada41Sandi * This array should contain all special characters (not a letter or digit)
838099ada41Sandi * defined in the various local charsets - it's not a complete list of non-alphanum
839099ada41Sandi * characters in UTF-8. It's not perfect but should match most cases of special
840099ada41Sandi * chars.
841099ada41Sandi *
842099ada41Sandi * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
843ad81d431SAndreas Gohr * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
844099ada41Sandi *
845099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org>
846099ada41Sandi * @see    utf8_stripspecials()
847099ada41Sandi */
848099ada41Sandi$UTF8_SPECIAL_CHARS = array(
  // Plain list of Unicode code points. The blank columns in the first rows
  // mark values deliberately left out: 0x002a (*), 0x002d (-), 0x002e (.),
  // 0x003a (:) and 0x005f (_); digits and ASCII letters are skipped as well.
  // NOTE(review): 0x03b2 and 0x03c6 are Greek letters that also appear as keys
  // in $UTF8_LOWER_TO_UPPER - confirm they are meant to count as special chars.
849099ada41Sandi  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
850ad81d431SAndreas Gohr  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
8515c812709Sandi          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
8525c812709Sandi  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
853099ada41Sandi  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
854099ada41Sandi  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
855099ada41Sandi  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
856099ada41Sandi  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
857099ada41Sandi  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
858099ada41Sandi  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
859099ada41Sandi  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
860099ada41Sandi  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
861099ada41Sandi  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
862099ada41Sandi  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
863099ada41Sandi  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
864099ada41Sandi  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
865099ada41Sandi  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
866099ada41Sandi  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
867099ada41Sandi  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
868099ada41Sandi  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
869099ada41Sandi  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
870099ada41Sandi  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
871099ada41Sandi  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
872099ada41Sandi  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
873099ada41Sandi  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
874099ada41Sandi  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
875099ada41Sandi  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
876099ada41Sandi  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
877099ada41Sandi  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
878099ada41Sandi  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
879099ada41Sandi  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
880099ada41Sandi  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
881099ada41Sandi  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
882099ada41Sandi  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
883099ada41Sandi  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
884099ada41Sandi  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
885099ada41Sandi  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
886099ada41Sandi  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
887099ada41Sandi  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
888099ada41Sandi  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
889099ada41Sandi  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
890099ada41Sandi  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
891099ada41Sandi  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
892099ada41Sandi  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
893099ada41Sandi  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
894099ada41Sandi  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
895099ada41Sandi  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
896099ada41Sandi  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
897099ada41Sandi  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
898099ada41Sandi  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
899099ada41Sandi);
900340756e4Sandi
9018a831f2bSAndreas Gohr/**
9028a831f2bSAndreas Gohr * Romanization lookup table
9038a831f2bSAndreas Gohr *
 * This lookup table provides a way to transform strings written in scripts
 * other than the Latin alphabet into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-Latin to ASCII, and it works by simple character replacement
 * only. Peculiarities of each language are not supported.
9108a831f2bSAndreas Gohr *
9118a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org>
9128a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com>
9138a831f2bSAndreas Gohr * @link   http://www.uconv.com/translit.htm
9148a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi>
9158a831f2bSAndreas Gohr * @link   http://kanjidict.stc.cx/hiragana.php?src=2
9168a831f2bSAndreas Gohr * @link   http://www.translatum.gr/converter/greek-transliteration.htm
9178a831f2bSAndreas Gohr * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
9188a831f2bSAndreas Gohr * @link   http://www.btranslations.com/resources/romanization/korean.asp
9198a831f2bSAndreas Gohr */
9208a831f2bSAndreas Gohr$UTF8_ROMANIZATION = array(
  // Keys are one or more non-Latin characters, values their ASCII replacement.
  // Several keys occur more than once (e.g. 'ふ' => 'fu' and later 'ふ' => 'hu');
  // in a PHP array literal the value given last for a key is the one that is kept.
9218a831f2bSAndreas Gohr  //russian cyrillic
9228a831f2bSAndreas Gohr  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
9238a831f2bSAndreas Gohr  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
9248a831f2bSAndreas Gohr  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
9258a831f2bSAndreas Gohr  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
9268a831f2bSAndreas Gohr  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
9278a831f2bSAndreas Gohr  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
928d8cb2602SDenis Simakov  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
929d8cb2602SDenis Simakov  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
9308a831f2bSAndreas Gohr  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
9318a831f2bSAndreas Gohr  // Ukrainian cyrillic
9328a831f2bSAndreas Gohr  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
9338a831f2bSAndreas Gohr  // Georgian
  // NOTE(review): 'კ' and 'პ' both map to 'p' - confirm 'კ' is not meant to be 'k'.
9348a831f2bSAndreas Gohr  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
9358a831f2bSAndreas Gohr  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
9368a831f2bSAndreas Gohr  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
9378a831f2bSAndreas Gohr  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
9388a831f2bSAndreas Gohr  'ჰ'=>'xh',
9398a831f2bSAndreas Gohr  //Sanskrit
9408a831f2bSAndreas Gohr  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
9418a831f2bSAndreas Gohr  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
9428a831f2bSAndreas Gohr  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
9438a831f2bSAndreas Gohr  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
9448a831f2bSAndreas Gohr  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
9458a831f2bSAndreas Gohr  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
9468a831f2bSAndreas Gohr  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
9478a831f2bSAndreas Gohr  //Hebrew
9483dbad6dcSDenis Simakov  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
9493dbad6dcSDenis Simakov  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
9503dbad6dcSDenis Simakov  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
9518a831f2bSAndreas Gohr  'ש'=>'sh','ת'=>'t',
9528a831f2bSAndreas Gohr  //Arabic
9538a831f2bSAndreas Gohr  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
9548a831f2bSAndreas Gohr  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
9558a831f2bSAndreas Gohr  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
9568a831f2bSAndreas Gohr  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
9578a831f2bSAndreas Gohr
9588a831f2bSAndreas Gohr  // Japanese hiragana
9598a831f2bSAndreas Gohr  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
9608a831f2bSAndreas Gohr  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
9618a831f2bSAndreas Gohr  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
9628a831f2bSAndreas Gohr  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
9638a831f2bSAndreas Gohr  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
9648a831f2bSAndreas Gohr  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
9658a831f2bSAndreas Gohr  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
9668a831f2bSAndreas Gohr  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
9678a831f2bSAndreas Gohr  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
9688a831f2bSAndreas Gohr  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
9698a831f2bSAndreas Gohr  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
9708a831f2bSAndreas Gohr  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
9718a831f2bSAndreas Gohr  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
9728a831f2bSAndreas Gohr  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
9738a831f2bSAndreas Gohr  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
9748a831f2bSAndreas Gohr  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
9758a831f2bSAndreas Gohr  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
9768a831f2bSAndreas Gohr  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
9778a831f2bSAndreas Gohr  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
9788a831f2bSAndreas Gohr  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
9798a831f2bSAndreas Gohr  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
9808a831f2bSAndreas Gohr  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
9818a831f2bSAndreas Gohr  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
9828a831f2bSAndreas Gohr  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
9838a831f2bSAndreas Gohr  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
9848a831f2bSAndreas Gohr  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
9858a831f2bSAndreas Gohr  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
9868a831f2bSAndreas Gohr  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
9878a831f2bSAndreas Gohr  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
9888a831f2bSAndreas Gohr  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
9898a831f2bSAndreas Gohr  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
9908a831f2bSAndreas Gohr  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
9918a831f2bSAndreas Gohr  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
9928a831f2bSAndreas Gohr  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
9938a831f2bSAndreas Gohr  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
9948a831f2bSAndreas Gohr  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
9958a831f2bSAndreas Gohr  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
9968a831f2bSAndreas Gohr  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
9978a831f2bSAndreas Gohr  'じゅ'=>'zyu',
9988a831f2bSAndreas Gohr  // Japanese katakana
9998a831f2bSAndreas Gohr  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
10008a831f2bSAndreas Gohr  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
10018a831f2bSAndreas Gohr  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
10028a831f2bSAndreas Gohr  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
10038a831f2bSAndreas Gohr  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
10048a831f2bSAndreas Gohr  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
10058a831f2bSAndreas Gohr  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
10068a831f2bSAndreas Gohr  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
10078a831f2bSAndreas Gohr  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
10088a831f2bSAndreas Gohr  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
10098a831f2bSAndreas Gohr  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
10108a831f2bSAndreas Gohr  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
10118a831f2bSAndreas Gohr  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
10128a831f2bSAndreas Gohr  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
10138a831f2bSAndreas Gohr  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
10148a831f2bSAndreas Gohr  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
10158a831f2bSAndreas Gohr  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
10168a831f2bSAndreas Gohr  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
10178a831f2bSAndreas Gohr  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
10188a831f2bSAndreas Gohr  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
10198a831f2bSAndreas Gohr  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
10208a831f2bSAndreas Gohr  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
10218a831f2bSAndreas Gohr  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
10228a831f2bSAndreas Gohr  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
10238a831f2bSAndreas Gohr  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
10248a831f2bSAndreas Gohr  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
10258a831f2bSAndreas Gohr  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
10268a831f2bSAndreas Gohr  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
10278a831f2bSAndreas Gohr  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
10288a831f2bSAndreas Gohr  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
10298a831f2bSAndreas Gohr  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
10308a831f2bSAndreas Gohr  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
10318a831f2bSAndreas Gohr  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
10328a831f2bSAndreas Gohr  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
10338a831f2bSAndreas Gohr  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
10348a831f2bSAndreas Gohr  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
10358a831f2bSAndreas Gohr  'ジョ'=>'zyo','ジュ'=>'zyu',
10368a831f2bSAndreas Gohr
10378a831f2bSAndreas Gohr  // "Greeklish"
  // NOTE(review): 'Δ'/'δ' map to 'E'/'e' - confirm this is intended and not 'D'/'d'.
10388a831f2bSAndreas Gohr  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
10398a831f2bSAndreas Gohr  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
10408a831f2bSAndreas Gohr
10418a831f2bSAndreas Gohr  // Thai
  // NOTE(review): 'รร' ('a' then 'an') and 'ว' ('w' then 'ua') are each listed
  // twice with different values; only the later value survives - confirm.
10428a831f2bSAndreas Gohr  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
10438a831f2bSAndreas Gohr  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
10448a831f2bSAndreas Gohr  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
10458a831f2bSAndreas Gohr  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
10468a831f2bSAndreas Gohr  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
10478a831f2bSAndreas Gohr  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
10488a831f2bSAndreas Gohr  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
10498a831f2bSAndreas Gohr  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
10508a831f2bSAndreas Gohr  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
10518a831f2bSAndreas Gohr  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
10528a831f2bSAndreas Gohr  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
10538a831f2bSAndreas Gohr  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
10548a831f2bSAndreas Gohr  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
10558a831f2bSAndreas Gohr  'เ–ียว'=>'iao',
10568a831f2bSAndreas Gohr
10578a831f2bSAndreas Gohr  // Korean
  // NOTE(review): 'ㅛ' maps to 'oy', the same value as 'ㅚ' - confirm it is not meant to be 'yo'.
10588a831f2bSAndreas Gohr  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
10598a831f2bSAndreas Gohr  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
10608a831f2bSAndreas Gohr  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
10618a831f2bSAndreas Gohr  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
10628a831f2bSAndreas Gohr  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
10638a831f2bSAndreas Gohr  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
10648a831f2bSAndreas Gohr);
1065340756e4Sandi
1066340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
10678a831f2bSAndreas Gohr
1068