xref: /dokuwiki/inc/utf8.php (revision 8a831f2bba632ebf9e24d0f2f407c5b42cebffe5)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
54a47269fSandi * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are not encoded.
 *
 * When $safe is true (the default) the string is returned untouched if it
 * consists solely of ASCII letters, digits and the characters / _ - . %
 * This makes it safe to run the function multiple times on the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file the filename to encode
 * @param  bool   $safe skip encoding when the name contains only safe characters
 * @return string the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  // \z anchors at the very end; a plain $ would also accept a trailing newline
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+\z#',$file)){
    return $file;
  }
  // encode everything, then restore the path separators
  return str_replace('%2F','/',urlencode($file));
}
3049c713a3Sandi
/**
 * Decode a URL-encoded filename
 *
 * Thin wrapper that hands the name straight to urldecode().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string the decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
4349c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to check
 * @return bool   true when no byte has its high bit set
 */
function utf8_isASCII($str){
  // byte-wise match (no /u): any byte >= 0x80 disqualifies the string
  return !preg_match('/[\x80-\xFF]/',$str);
}
5544f669e9Sandi
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte with a value of 128 or above
 * is removed, all other bytes are kept in order.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string the string without any byte >= 0x80
 */
function utf8_strip($str){
  // byte-wise match (no /u) so every single high byte is dropped
  return preg_replace('/[\x80-\xFF]+/','',$str);
}
72e1906e6eSandi
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks over the bytes and verifies that every lead byte is followed by
 * the number of 10bbbbbb continuation bytes its bit pattern announces.
 * Lead bytes for sequences of up to six bytes are accepted.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the string to check
 * @return bool   false on the first byte that does not fit the scheme
 */
function utf8_check($Str) {
  $len = strlen($Str);
  $pos = 0;
  while ($pos < $len) {
    $lead = ord($Str[$pos]);

    if ($lead < 0x80) {                    // 0bbbbbbb - plain ASCII
      $pos++;
      continue;
    }

    if     (($lead & 0xE0) == 0xC0) $follow = 1; // 110bbbbb
    elseif (($lead & 0xF0) == 0xE0) $follow = 2; // 1110bbbb
    elseif (($lead & 0xF8) == 0xF0) $follow = 3; // 11110bbb
    elseif (($lead & 0xFC) == 0xF8) $follow = 4; // 111110bb
    elseif (($lead & 0xFE) == 0xFC) $follow = 5; // 1111110b
    else return false;                           // matches no model

    // the announced number of 10bbbbbb bytes has to follow
    while ($follow-- > 0) {
      $pos++;
      if ($pos == $len) return false;
      if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
    }
    $pos++;
  }
  return true;
}
9549c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * The string is run through utf8_decode() first. Characters outside of
 * ISO-8859-1 come out of that as a single '?', which does not matter when
 * all we want is the number of characters.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string UTF-8 encoded string
 * @return int    length of the decoded string
 */
function utf8_strlen($string){
  $singleByte = utf8_decode($string);
  return strlen($singleByte);
}
1102f954959Sandi
/**
 * Unicode aware replacement for substr()
 *
 * The string is split into an array of characters which is then sliced
 * with array_slice(), so $start and $length count characters, not bytes.
 *
 * @author lmak at NOSPAM dot iti dot gr
 * @link   http://www.php.net/manual/en/function.substr.php
 * @see    substr()
 * @param  string   $str    UTF-8 encoded string
 * @param  int      $start  index of the first character to return
 * @param  int|null $length number of characters; null returns the rest of the string
 * @return string
 */
function utf8_substr($str,$start,$length=null){
   // the s modifier lets . match newlines too - without it they would get lost
   preg_match_all("/./us", $str, $ar);

   // strict check: a length of 0 has to yield an empty string, not the rest
   if($length !== null) {
       return join("",array_slice($ar[0],$start,$length));
   } else {
       return join("",array_slice($ar[0],$start));
   }
}
127f29317c1Sandi
/**
 * Unicode aware replacement for substr_replace()
 *
 * The result is assembled from the characters in front of $start, the
 * replacement and everything from character $start+$length onwards. With
 * the default $length of 0 nothing is removed, the replacement is inserted.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string      UTF-8 encoded input
 * @param  string $replacement text to put in
 * @param  int    $start       character index at which the replacement starts
 * @param  int    $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $head = ($start > 0) ? utf8_substr($string, 0, $start) : '';
  $tail = utf8_substr($string, $start+$length);
  return $head.$replacement.$tail;
}
141dc57ef04Sandi
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a UTF-8 pattern for preg_split().
 * An empty separator raises an E_USER_WARNING and yields FALSE.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep the separator
 * @param  string $str the string to split
 * @return array|false
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return FALSE;
}
157f29317c1Sandi
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and turned into UTF-8 patterns, replacements
 * are escaped so that they are inserted literally, just like str_replace()
 * would do.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  string|array $s   string(s) to search for
 * @param  string|array $r   replacement(s)
 * @param  string|array $str subject(s)
 * @return string|array
 */
function utf8_str_replace($s,$r,$str){
  if(!is_array($s)){
    $s = '!'.preg_quote($s,'!').'!u';
  }else{
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }

  // preg_replace() would read \1, $1 or ${1} in a replacement as a
  // backreference - escape both introducers to keep the text literal
  $literal = array('\\' => '\\\\', '$' => '\\$');
  if(is_array($r)){
    foreach ($r as $k => $v) {
      $r[$k] = strtr($v,$literal);
    }
  }else{
    $r = strtr($r,$literal);
  }

  return preg_replace($s,$r,$str);
}
175f29317c1Sandi
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the call is passed on to ltrim(). Otherwise every
 * character of the charlist is stripped from the beginning of the string;
 * the charlist is a plain list of characters, ranges are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass (covers ^ - ] [ \ and the delimiter)
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
191f29317c1Sandi
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the call is passed on to rtrim(). Otherwise every
 * character of the charlist is stripped from the end of the string;
 * the charlist is a plain list of characters, ranges are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass (covers ^ - ] [ \ and the delimiter)
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/['.$charlist.']+$/u','',$str);
}
207f29317c1Sandi
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the call is passed on to trim(). Otherwise the
 * charlist is handed to utf8_rtrim() and utf8_ltrim() in turn.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip from both ends
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to be passed on, otherwise only whitespace is trimmed
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
220f29317c1Sandi
2212f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available (and UTF8_NOMBSTRING is not
 * defined). Otherwise the string is converted to code points which are
 * mapped through $UTF8_UPPER_TO_LOWER; code points without an entry in
 * that table are left alone.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() avoids an undefined index access for unmapped code points
    if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
24582257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available (and UTF8_NOMBSTRING is not
 * defined). Otherwise the string is converted to code points which are
 * mapped through $UTF8_LOWER_TO_UPPER; code points without an entry in
 * that table are left alone.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string){
  // test for the function that is actually called below
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // isset() avoids an undefined index access for unmapped code points
    if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
26982257610Sandi
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The replacements are taken from $UTF8_LOWER_ACCENTS and
 * $UTF8_UPPER_ACCENTS. $case selects the tables to apply: a negative value
 * uses the lower case table only, a positive value the upper case table
 * only, 0 (the default) applies the lower case table first and the upper
 * case table second.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @param  int    $case   -1, 0 or 1 as described above
 * @return string
 */
function utf8_deaccent($string,$case=0){
  global $UTF8_LOWER_ACCENTS, $UTF8_UPPER_ACCENTS;

  // collect the tables to apply, lower case always goes first
  $tables = array();
  if($case <= 0) $tables[] = $UTF8_LOWER_ACCENTS;
  if($case >= 0) $tables[] = $UTF8_UPPER_ACCENTS;

  foreach($tables as $map){
    $string = str_replace(array_keys($map),array_values($map),$string);
  }
  return $string;
}
28982257610Sandi
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is handed back as it is, anything else is translated
 * with strtr() using the $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_romanize($string){
  global $UTF8_ROMANIZATION;

  if(!utf8_isASCII($string)){
    $string = strtr($string,$UTF8_ROMANIZATION);
  }
  return $string;
}
301*8a831f2bSAndreas Gohr
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * All characters listed in $UTF8_SPECIAL_CHARS are replaced. The bytes
 * 0x00 to 0x19 are added to the character class here because they are not
 * part of that array. The quoted list of specials is built on the first
 * call only and kept in a static variable.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;
  static $specials = null;

  if($specials === null){
    $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
  }

  // $additional goes into the class unquoted, it is up to the caller to escape it
  $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
  return preg_replace($pattern,$repl,$string);
}
323099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available (and UTF8_NOMBSTRING is not
 * defined). The fallback splits the haystack at the needle and returns
 * the character length of the part in front of the first occurrence.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack UTF-8 encoded string to search in
 * @param  string $needle   string to search for
 * @param  int    $offset   character index to start searching at
 * @return int|false character position of the needle or false if not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  if(!$offset){
    $ar = utf8_explode($needle, $haystack);
    // utf8_explode() hands back FALSE for an empty needle
    if ( is_array($ar) && count($ar) > 1 ) {
       return utf8_strlen($ar[0]);
    }
    return false;
  }else{
    if ( !is_int($offset) ) {
      trigger_error('Offset must be an integer',E_USER_WARNING);
      return false;
    }

    // search in the remainder only and shift the result back
    $rest = utf8_substr($haystack, $offset);

    if ( false !== ($pos = utf8_strpos($rest,$needle))){
       return $pos + $offset;
    }
    return false;
  }
}
3562f954959Sandi
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Only two byte sequences (lead byte 110xxxxx) are turned into decimal
 * numeric entities. All other bytes, including longer multi byte
 * sequences, are copied to the result unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string
 */
function utf8_tohtml ($str) {
  $ret = '';
  $max = strlen($str);
  $last = 0;  // index of the first byte that was not copied to $ret yet
  for ($i=0; $i<$max; $i++) {
    $c1 = ord($str[$i]);
    if ($c1>>5 != 6) continue;   // not a 110x xxxx lead byte
    if ($i+1 >= $max) break;     // truncated sequence, keep the byte as it is

    $ret .= substr($str, $last, $i-$last); // append all the regular characters we've passed
    $c2 = ord($str[++$i]);                 // the continuation byte
    // code point = 5 payload bits of the lead byte + 6 bits of the continuation byte
    $ret .= '&#' . ((($c1 & 31) << 6) | ($c2 & 63)) . ';';
    $last = $i+1;
  }
  return $ret . substr($str, $last); // append the last batch of regular characters
}
383ea2eed85Sandi
/**
 * This function returns any UTF-8 encoded text as a list of
 * Unicode values:
 *
 * Bytes below 128 are taken over directly. Any other byte starts or
 * continues a sequence of two, three or four bytes (the length is chosen
 * from the value of the first byte) which is combined into one code point.
 * The input is not validated.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    unicode_to_utf8()
 * @param  string $str UTF-8 encoded string (taken by reference, not modified)
 * @return array  list of integer code points
 */
function utf8_to_unicode( &$str ) {
  $unicode = array();
  $values = array();
  $lookingFor = 1;
  $len = strlen( $str );

  for ($i = 0; $i < $len; $i++ ) {
    $thisValue = ord( $str[ $i ] );
    if ( $thisValue < 128 ) {
      $unicode[] = $thisValue;
      continue;
    }

    // first byte of a sequence tells how many bytes belong to it
    if ( count( $values ) == 0 ) {
      if     ( $thisValue < 224 ) $lookingFor = 2;
      elseif ( $thisValue < 240 ) $lookingFor = 3;
      else                        $lookingFor = 4;
    }
    $values[] = $thisValue;

    if ( count( $values ) == $lookingFor ) {
      if ( $lookingFor == 4 ) {
        $number = ( ( $values[0] % 8 ) * 262144 ) + ( ( $values[1] % 64 ) * 4096 ) +
                  ( ( $values[2] % 64 ) * 64 ) + ( $values[3] % 64 );
      } elseif ( $lookingFor == 3 ) {
        $number = ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 );
      } else {
        $number = ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
      }
      $unicode[] = $number;
      $values = array();
      $lookingFor = 1;
    }
  }
  return $unicode;
}
41582257610Sandi
/**
 * This function converts a Unicode array back to its UTF-8 representation
 *
 * Code points are written as one, two, three or four bytes depending on
 * their value. Anything that is not an array yields an empty string.
 *
 * @author Scott Michael Reynen <scott@randomchaos.com>
 * @link   http://www.randomchaos.com/document.php?source=php_and_unicode
 * @see    utf8_to_unicode()
 * @param  array  $str list of integer code points (taken by reference, not modified)
 * @return string UTF-8 encoded string
 */
function unicode_to_utf8( &$str ) {
  if (!is_array($str)) return '';

  $utf8 = '';
  foreach( $str as $unicode ) {
    if ( $unicode < 128 ) {
      $utf8.= chr( $unicode );
    } elseif ( $unicode < 2048 ) {
      $utf8.= chr( 192 | ( $unicode >> 6 ) );
      $utf8.= chr( 128 | ( $unicode & 63 ) );
    } elseif ( $unicode < 65536 ) {
      $utf8.= chr( 224 | ( $unicode >> 12 ) );
      $utf8.= chr( 128 | ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 | ( $unicode & 63 ) );
    } else {
      // beyond the BMP a fourth byte is needed
      $utf8.= chr( 240 | ( $unicode >> 18 ) );
      $utf8.= chr( 128 | ( ( $unicode >> 12 ) & 63 ) );
      $utf8.= chr( 128 | ( ( $unicode >> 6 ) & 63 ) );
      $utf8.= chr( 128 | ( $unicode & 63 ) );
    }
  }
  return $utf8;
}
44182257610Sandi
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * mb_convert_encoding() does the work if it is available and
 * UTF8_NOMBSTRING is not defined. The fallback packs every code point
 * returned by utf8_to_unicode() into one big endian 16 bit unit, so it
 * may really be UCS-2 due to utf8_to_unicode limits.
 *
 * @param  string $str UTF-8 encoded string (taken by reference)
 * @param  bool   $bom prepend the byte order mark FE FF
 * @return string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $prefix = $bom ? "\xFE\xFF" : '';

  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding')){
    return $prefix.mb_convert_encoding($str,'UTF-16BE','UTF-8');
  }

  $packed = '';
  foreach(utf8_to_unicode($str) as $cp){
    $packed .= pack('n',$cp);
  }
  return $prefix.$packed;
}
45815fa0b4fSAndreas Gohr
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The input is unpacked into big endian 16 bit units and each unit is
 * passed to unicode_to_utf8() as one code point.
 *
 * @param  string $str UTF-16BE encoded string (taken by reference)
 * @return string
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  return unicode_to_utf8($units);
}
46815fa0b4fSAndreas Gohr
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the Unicode code points of lower case letters
 * (keys) to the code points of their corresponding upper case letters
 * (values).
 *
 * A few keys (0x00FE, 0x00F0, 0x03BB, 0x0457) are listed more than once,
 * always with the same value.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_strtoupper()
 */
static $UTF8_LOWER_TO_UPPER = array(
  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
  0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
  0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
  0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
  0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
);
52282257610Sandi
/**
 * UTF-8 Case lookup table
 *
 * This lookup table maps the Unicode code points of upper case letters
 * (keys) to the code points of their corresponding lower case letters
 * (values). It is built by flipping $UTF8_LOWER_TO_UPPER.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_strtolower()
 */
$UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);
53282257610Sandi
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented characters (keys, as UTF-8 strings) to
 * replacements from the ASCII-7 range (values). It holds lower case
 * letters only. Some characters are replaced by two letters
 * (e.g. 'ß' => 'ss', 'ö' => 'oe').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);
55982257610Sandi
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented characters (keys, as UTF-8 strings) to
 * replacements from the ASCII-7 range (values). It holds upper case
 * letters only. Some characters are replaced by two letters
 * (e.g. 'Ö' => 'Oe', 'Þ' => 'Th').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);
58682257610Sandi
587099ada41Sandi/**
588099ada41Sandi * UTF-8 array of common special characters
589099ada41Sandi *
590099ada41Sandi * This array should contain all special characters (not a letter or digit)
591099ada41Sandi * defined in the various local charsets - it's not a complete list of non-alphanum
592099ada41Sandi * characters in UTF-8. It's not perfect but should match most cases of special
593099ada41Sandi * chars.
594099ada41Sandi *
595099ada41Sandi * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
596ad81d431SAndreas Gohr * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
597099ada41Sandi *
598099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org>
599099ada41Sandi * @see    utf8_stripspecials()
600099ada41Sandi */
$UTF8_SPECIAL_CHARS = array(
  // ASCII range, starting at 0x1a. Not listed here: 0x2a '*', 0x2d '-',
  // 0x2e '.', 0x30-0x3a (digits and ':'), 0x41-0x5a, 0x5f '_' and 0x61-0x7a.
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c,
  0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  // 0x7f up to 0xbf without gaps, followed by selected higher code points
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);
653340756e4Sandi
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a script
 * that is not based on Latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-Latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
 *
 * Keys are UTF-8 strings of one or more characters, values are their ASCII
 * replacements. PHP keeps only the LAST value for a repeated array key, so
 * wherever a key is listed more than once below (several kana and a few Thai
 * sequences are) only the final entry is effective.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
$UTF8_ROMANIZATION = array(
  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'th','Щ'=>'Th','ъ'=>'qh',
  'Ъ'=>'Qh','ы'=>'y','Ы'=>'Y','ь'=>'q','Ь'=>'Q','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian ('კ' is k'an and maps to 'k'; 'პ' is p'ar and maps to 'p')
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Hebrew (alef .. zayin were previously shifted by one letter; alef was missing)
  'א'=>'a','ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'k','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'p','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  // (many keys repeat below; the last occurrence of a key is the effective one)
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  // no 'い'=>'yi' entry here: it would override 'い'=>'i' above
  'よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  // (many keys repeat below; the last occurrence of a key is the effective one)
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  // no 'イ'=>'yi' entry here: it would override 'イ'=>'i' above
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish" (delta is transliterated as 'D'/'d')
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  // NOTE(review): many vowel keys below contain a literal '–' (en dash),
  // apparently the consonant placeholder copied from the source table; those
  // keys only match text that really contains that dash - confirm intent.
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean ('ㅛ' is "yo"; 'ㅚ' is "oy")
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
818340756e4Sandi
819340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
820*8a831f2bSAndreas Gohr
821