xref: /dokuwiki/inc/utf8.php (revision 54662a044eb6fd07b3246aed0d598d02bf7a1e4a)
1ed7b5f09Sandi<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
882257610Sandi
9ab77016bSAndreas Gohr
/**
 * Decide once whether the mbstring extension should be used.
 *
 * UTF8_MBSTRING is left untouched when it is already defined. Otherwise it
 * becomes 1 when mb_substr() exists and UTF8_NOMBSTRING is not defined,
 * and 0 in every other case.
 */
if(!defined('UTF8_MBSTRING')){
  $utf8_use_mb = function_exists('mb_substr') && !defined('UTF8_NOMBSTRING');
  define('UTF8_MBSTRING', $utf8_use_mb ? 1 : 0);
  unset($utf8_use_mb);
}
20ab77016bSAndreas Gohr
21ab77016bSAndreas Gohr
/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will
 * be encoded only if characters outside of [a-zA-Z0-9/_-.%] are
 * detected - This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file The filename to encode
 * @param  bool   $safe Skip encoding when the name only has safe chars
 * @return string The encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  // The D modifier anchors $ at the very end of the subject. Without it a
  // name ending in "\n" would pass the check and be returned unencoded.
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
    return $file;
  }
  $file = urlencode($file);
  // urlencode() also encodes the path separator - undo that
  $file = str_replace('%2F','/',$file);
  return $file;
}
4349c713a3Sandi
/**
 * URL-Decode a filename
 *
 * Thin wrapper that hands the name to urldecode() and returns the result.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file The encoded filename
 * @return string The decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
5649c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str The string to inspect
 * @return bool   true when no byte has a value above 127
 */
function utf8_isASCII($str){
  // Square brackets are used for the byte access: the curly brace offset
  // syntax is no longer accepted by PHP 8. The length is read only once.
  $len = strlen($str);
  for($i=0; $i<$len; $i++){
    if(ord($str[$i]) > 127) return false;
  }
  return true;
}
6844f669e9Sandi
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte with a value of 128 or more
 * is dropped, all other bytes are kept in their original order.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str The string to strip
 * @return string The remaining 7bit bytes
 */
function utf8_strip($str){
  $ascii = '';
  // Square brackets are used for the byte access: the curly brace offset
  // syntax is no longer accepted by PHP 8. The length is read only once.
  $len = strlen($str);
  for($i=0; $i<$len; $i++){
    if(ord($str[$i]) < 128){
      $ascii .= $str[$i];
    }
  }
  return $ascii;
}
85e1906e6eSandi
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks over the bytes and verifies that every lead byte is followed by
 * the number of 10bbbbbb continuation bytes its bit pattern announces.
 * Lead bytes announcing up to five continuation bytes are accepted.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str The string to check
 * @return bool   false on the first byte that breaks the pattern
 */
function utf8_check($Str) {
 $total = strlen($Str);
 $pos   = 0;
 while ($pos < $total) {
  $lead = ord($Str[$pos]);
  if ($lead < 0x80) { # 0bbbbbbb
   $pos++;
   continue;
  }
  if     (($lead & 0xE0) == 0xC0) $tail = 1; # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0) $tail = 2; # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0) $tail = 3; # 11110bbb
  elseif (($lead & 0xFC) == 0xF8) $tail = 4; # 111110bb
  elseif (($lead & 0xFE) == 0xFC) $tail = 5; # 1111110b
  else return false; # Does not match any model
  # the announced number of 10bbbbbb bytes has to follow
  while ($tail > 0) {
   $pos++;
   if ($pos == $total) return false;
   if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
   $tail--;
  }
  $pos++;
 }
 return true;
}
10849c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * The string is run through utf8_decode() and the byte length of that
 * result is returned. utf8_decode() converts characters that are not in
 * ISO-8859-1 to '?', which is alright for the purpose of counting.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string The UTF-8 string to measure
 * @return int    Byte length of the utf8_decode() result
 */
function utf8_strlen($string){
  $singlebyte = utf8_decode($string);
  return strlen($singlebyte);
}
1232f954959Sandi
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 * Note: supports use of negative offsets and lengths but will be slower
 * when doing so
 *
 * When UTF8_MBSTRING is set the work is delegated to mb_substr() and its
 * result is returned directly. Otherwise a regular expression based
 * fallback is used.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        // The result has to be returned here - otherwise the mbstring work
        // is thrown away and the slower fallback below runs anyway.
        // An omitted length is not passed on to mb_substr().
        if( $length === null ){
            return mb_substr($str, $offset);
        }else{
            return mb_substr($str, $offset, $length);
        }
    }

    if ( $offset >= 0 && $length >= 0 ) {
        // Build the quantifier for the captured part of the pattern
        if ( $length === null ) {
            $length = '*';
        } else {
            $strlen = strlen(utf8_decode($str));
            if ( $offset > $strlen ) {
                return '';
            }

            if ( ( $offset + $length ) > $strlen ) {
               // requested range runs past the end: take all that is left
               $length = '*';
            } else {
                $length = '{'.$length.'}';
            }
        }

        // skip $offset characters, capture the requested amount
        $pattern = '/^.{'.$offset.'}(.'.$length.')/us';
        preg_match($pattern, $str, $matches);

        if ( isset($matches[1]) ) {
            return $matches[1];
        }
        return false;

    } else {
        // Handle negatives using different, slower technique
        // From: http://www.php.net/manual/en/function.substr.php#44838
        // The s modifier makes the dot match newlines as well, like in the
        // pattern above. Without it newlines would vanish from the result.
        preg_match_all('/./us', $str, $ar);
        if( $length !== null ) {
            return join('',array_slice($ar[0],$offset,$length));
        } else {
            return join('',array_slice($ar[0],$offset));
        }
    }
}
18110f09f2aSAndreas Gohr
18210f09f2aSAndreas Gohr
/**
 * Unicode aware replacement for substr_replace()
 *
 * The result is assembled from three pieces: the first $start characters
 * of the string (only when $start is greater than zero), the replacement
 * and whatever follows after character position $start+$length.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string      The UTF-8 string to work on
 * @param  string $replacement The text to put in
 * @param  int    $start       Character offset where the replacement starts
 * @param  int    $length      Number of characters to drop (default 0)
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $head = '';
  if($start>0){
    $head = utf8_substr($string, 0, $start);
  }
  $tail = utf8_substr($string, $start+$length);
  return $head.$replacement.$tail;
}
196dc57ef04Sandi
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a UTF-8 mode pattern for
 * preg_split(). An empty separator raises an E_USER_WARNING and makes
 * the function return FALSE.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep The separator
 * @param  string $str The string to split
 * @return mixed  array of parts or FALSE on empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return FALSE;
}
212f29317c1Sandi
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and turned into UTF-8 mode patterns, the
 * replacing itself is done by preg_replace(). Replacement strings are
 * escaped so they are inserted literally, as str_replace() would do.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   Search string or array of search strings
 * @param  mixed  $r   Replacement string or array of replacement strings
 * @param  mixed  $str The subject handed to preg_replace()
 * @return mixed  The result of preg_replace()
 */
function utf8_str_replace($s,$r,$str){
  if(!is_array($s)){
    $s = '!'.preg_quote($s,'!').'!u';
  }else{
    foreach ($s as $k => $v) {
      // name the delimiter here as well, like in the scalar branch
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }

  // preg_replace() reads $n and \n in the replacement as backreferences.
  // Escape backslash and dollar so the text is inserted as given.
  $literal = array('\\' => '\\\\', '$' => '\\$');
  if(is_array($r)){
    foreach ($r as $k => $v) {
      $r[$k] = strtr($v,$literal);
    }
  }else{
    $r = strtr($r,$literal);
  }

  return preg_replace($s,$r,$str);
}
230f29317c1Sandi
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the call is passed on to ltrim(). With a charlist
 * all leading characters contained in it are removed using a UTF-8 mode
 * regular expression.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      The string to trim
 * @param  string $charlist Characters to strip (taken literally)
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass
  //the caret is quoted too - a leading ^ would negate the class otherwise
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
246f29317c1Sandi
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the call is passed on to rtrim(). With a charlist
 * all trailing characters contained in it are removed using a UTF-8 mode
 * regular expression.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      The string to trim
 * @param  string $charlist Characters to strip (taken literally)
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass
  //the caret is quoted too - a leading ^ would negate the class otherwise
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/['.$charlist.']+$/u','',$str);
}
262f29317c1Sandi
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the call is passed on to trim(). With a charlist
 * the string is trimmed on the right and then on the left, using that
 * same charlist for both sides.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      The string to trim
 * @param  string $charlist Characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to be handed on - otherwise only whitespace is trimmed
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
275f29317c1Sandi
2762f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Otherwise the string is split
 * into code points and each one that has a (non-empty) entry in the
 * global $UTF8_UPPER_TO_LOWER table is swapped for its mapped value.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string The UTF-8 string to convert
 * @return string
 */
function utf8_strtolower($string){
  if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // !empty() instead of a bare lookup: code points without an entry
    // must not raise an undefined index notice
    if(!empty($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
29982257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Otherwise the string is split
 * into code points and each one that has a (non-empty) entry in the
 * global $UTF8_LOWER_TO_UPPER table is swapped for its mapped value.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string The UTF-8 string to convert
 * @return string
 */
function utf8_strtoupper($string){
  if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // !empty() instead of a bare lookup: code points without an entry
    // must not raise an undefined index notice
    if(!empty($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
32282257610Sandi
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The keys of the global tables $UTF8_LOWER_ACCENTS and
 * $UTF8_UPPER_ACCENTS are replaced by their values. The optional
 * parameter selects the tables: $case = -1 uses the lower case table
 * only, $case = 1 the upper case table only, $case = 0 (default) both.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string The UTF-8 string to deaccent
 * @param  int    $case   Which letters to handle, see above
 * @return string
 */
function utf8_deaccent($string,$case=0){
  if($case <= 0){
    global $UTF8_LOWER_ACCENTS;
    $accented = array_keys($UTF8_LOWER_ACCENTS);
    $plain    = array_values($UTF8_LOWER_ACCENTS);
    $string   = str_replace($accented,$plain,$string);
  }
  if($case >= 0){
    global $UTF8_UPPER_ACCENTS;
    $accented = array_keys($UTF8_UPPER_ACCENTS);
    $plain    = array_values($UTF8_UPPER_ACCENTS);
    $string   = str_replace($accented,$plain,$string);
  }
  return $string;
}
34282257610Sandi
/**
 * Romanize a non-latin string
 *
 * Strings for which utf8_isASCII() reports true are returned untouched.
 * All others are translated with strtr() using the global
 * $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string The UTF-8 string to romanize
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
  }

  //nothing to do
  return $string;
}
3548a831f2bSAndreas Gohr
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip come from the global $UTF8_SPECIAL_CHARS list,
 * which is converted and quoted on the first call only and then kept in
 * a static variable. The byte range 0x00 to 0x19 and the contents of
 * $additional are added to the stripped chars on every call.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;

  // quoted special chars, built once per request
  static $specials = null;
  if($specials === null){
    $asUtf8   = unicode_to_utf8($UTF8_SPECIAL_CHARS);
    $specials = preg_quote($asUtf8, '/');
  }

  $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
  return preg_replace($pattern,$repl,$string);
}
376099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. Otherwise the haystack is split
 * at the needle and the character length of the first part is the
 * position. A non-zero offset is handled by cutting the haystack first
 * and adding the offset to the position found in the remainder.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack The UTF-8 string to search in
 * @param  string $needle   The UTF-8 string to search for
 * @param  int    $offset   Character offset to start the search at
 * @return mixed  character position or false when nothing was found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(UTF8_MBSTRING){
    return mb_strpos($haystack,$needle,$offset,'utf-8');
  }

  if($offset){
    if ( !is_int($offset) ) {
      trigger_error('Offset must be an integer',E_USER_WARNING);
      return false;
    }

    // search the remainder only, then shift the result back
    $rest  = utf8_substr($haystack, $offset);
    $found = utf8_strpos($rest,$needle);
    if ( $found === false ) {
      return false;
    }
    return $found + $offset;
  }

  // no offset: everything in front of the first needle is the position
  $parts = utf8_explode($needle, $haystack);
  if ( count($parts) > 1 ) {
    return utf8_strlen($parts[0]);
  }
  return false;
}
4082f954959Sandi
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII bytes are copied as they are. Every structurally complete two,
 * three or four byte sequence is replaced by a decimal numeric entity
 * (&#NNN;) of its code point. Bytes that do not start such a sequence
 * (stray continuation bytes, truncated sequences, other lead bytes) are
 * copied unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str The UTF-8 string to encode
 * @return string The string with multibyte characters as entities
 */
function utf8_tohtml ($str) {
  $ret = '';
  $max = strlen($str);
  $i   = 0;
  while ($i < $max) {
    $lead = ord($str[$i]);

    // find out how many continuation bytes the lead byte announces
    if (($lead & 0xE0) == 0xC0) {        // 110x xxxx
      $extra = 1;
      $cp    = $lead & 0x1F;
    } elseif (($lead & 0xF0) == 0xE0) {  // 1110 xxxx
      $extra = 2;
      $cp    = $lead & 0x0F;
    } elseif (($lead & 0xF8) == 0xF0) {  // 1111 0xxx
      $extra = 3;
      $cp    = $lead & 0x07;
    } else {                             // ASCII or not a usable lead byte
      $ret .= $str[$i];
      $i++;
      continue;
    }

    // collect the payload bits, never reading past the end of the string
    $ok = ($i + $extra < $max);
    for ($j = 1; $ok && $j <= $extra; $j++) {
      $byte = ord($str[$i + $j]);
      if (($byte & 0xC0) != 0x80) {
        $ok = false;
      } else {
        $cp = ($cp << 6) | ($byte & 0x3F);
      }
    }

    if (!$ok) {
      // broken sequence: keep the byte, look at the next one separately
      $ret .= $str[$i];
      $i++;
      continue;
    }

    $ret .= '&#' . $cp . ';';
    $i   += $extra + 1;
  }
  return $ret;
}
435ea2eed85Sandi
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING
 *
 * If $strict is false (default) bytes that fit nowhere are skipped
 * silently and completed sequences are added to the output even when
 * they fail the validity checks.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        // square brackets: the curly brace offset syntax is gone in PHP 8
        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }
    return $out;
}
60782257610Sandi
60882257610Sandi/**
6091abfaba4SAndreas Gohr * Takes an array of ints representing the Unicode characters and returns
6101abfaba4SAndreas Gohr * a UTF-8 string. Astral planes are supported ie. the ints in the
6111abfaba4SAndreas Gohr * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
6121abfaba4SAndreas Gohr * are not allowed.
61382257610Sandi *
6141abfaba4SAndreas Gohr * If $strict is set to true the function returns false if the input
6151abfaba4SAndreas Gohr * array contains ints that represent surrogates or are outside the
6161abfaba4SAndreas Gohr * Unicode range and raises a PHP error at level E_USER_WARNING
6171abfaba4SAndreas Gohr *
6181abfaba4SAndreas Gohr * Note: this function has been modified slightly in this library to use
6191abfaba4SAndreas Gohr * output buffering to concatenate the UTF-8 string (faster) as well as
6201abfaba4SAndreas Gohr * reference the array by it's keys
6211abfaba4SAndreas Gohr *
6221abfaba4SAndreas Gohr * @param  array of unicode code points representing a string
6231abfaba4SAndreas Gohr * @param  boolean Check for invalid sequences?
6241abfaba4SAndreas Gohr * @return mixed UTF-8 string or FALSE if array contains invalid code points
6251abfaba4SAndreas Gohr * @author <hsivonen@iki.fi>
6261abfaba4SAndreas Gohr * @author Harry Fuecks <hfuecks@gmail.com>
6271abfaba4SAndreas Gohr * @see    utf8_to_unicode
6281abfaba4SAndreas Gohr * @link   http://hsivonen.iki.fi/php-utf8/
6291abfaba4SAndreas Gohr * @link   http://sourceforge.net/projects/phputf8/
63082257610Sandi */
function unicode_to_utf8($arr,$strict=false) {
    // Anything that is not an array of code points yields an empty string.
    if (!is_array($arr)) return '';

    // The encoded bytes are echoed into a private output buffer and fetched
    // at the end. Every exit path has to close that buffer again, otherwise
    // the bytes written so far end up in the caller's output.
    ob_start();

    foreach (array_keys($arr) as $k) {

        $cp = $arr[$k];

        # Outside of the Unicode range (negative or above U+10FFFF)
        if ( ($cp < 0) || ($cp > 0x10ffff) ) {

            if($strict){
                // close the buffer before reporting, so neither the partial
                // result nor the warning text stays trapped in it
                ob_end_clean();
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }
            // non-strict: silently skip the value

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            echo chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            echo chr(0xc0 | ($cp >> 6));
            echo chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate: an error in strict mode, skipped otherwise
            if($strict){
                ob_end_clean();
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            echo chr(0xe0 | ($cp >> 12));
            echo chr(0x80 | (($cp >> 6) & 0x003f));
            echo chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence (everything that is left is <= 0x10ffff)
        } else {

            echo chr(0xf0 | ($cp >> 18));
            echo chr(0x80 | (($cp >> 12) & 0x3f));
            echo chr(0x80 | (($cp >> 6) & 0x3f));
            echo chr(0x80 | ($cp & 0x3f));

        }
    }

    $result = ob_get_contents();
    ob_end_clean();
    return $result;
}
69882257610Sandi
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * When UTF8_MBSTRING is set the work is delegated to mb_convert_encoding().
 * Otherwise the string is decoded with utf8_to_unicode() and every code point
 * is packed as a big endian 16 bit unit; code points above U+FFFF are written
 * as a surrogate pair instead of being truncated to 16 bit.
 *
 * @param  string  $str UTF-8 string (passed by reference, not modified here)
 * @param  boolean $bom prepend the UTF-16BE byte order mark?
 * @return string  the UTF-16BE encoded bytes
 * @see    utf8_to_unicode
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  // nothing to iterate over if the decoder did not hand back an array
  if(!is_array($uni)) return $out;

  foreach($uni as $cp){
    if($cp > 0xFFFF){
      // supplementary plane: split into high and low surrogate
      $cp  -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x03FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
71415fa0b4fSAndreas Gohr
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The input is split into big endian 16 bit units. A high surrogate that is
 * directly followed by a low surrogate is merged into the code point the pair
 * stands for; all values are then encoded with unicode_to_utf8(). Unpaired
 * surrogates are passed on unchanged and left to unicode_to_utf8() to handle.
 *
 * @param  string $str UTF-16BE bytes (passed by reference, not modified here)
 * @return string the UTF-8 encoded string
 * @see    unicode_to_utf8
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  // unicode_to_utf8() answers '' for anything that is not an array
  if(!is_array($units)) return unicode_to_utf8($units);

  // unpack() numbers its result from 1, reindex for simple look-ahead
  $units = array_values($units);
  $cnt   = count($units);
  $uni   = array();

  for($i=0; $i<$cnt; $i++){
    $cp = $units[$i];
    if($cp >= 0xD800 && $cp <= 0xDBFF && ($i+1) < $cnt &&
       $units[$i+1] >= 0xDC00 && $units[$i+1] <= 0xDFFF){
      // valid surrogate pair: rebuild the supplementary code point
      $cp = 0x10000 + (($cp & 0x03FF) << 10) + ($units[$i+1] & 0x03FF);
      $i++;
    }
    $uni[] = $cp;
  }
  return unicode_to_utf8($uni);
}
72415fa0b4fSAndreas Gohr
725ab77016bSAndreas Gohr
// The case lookup tables are only built when mb_string is not available
if(!UTF8_MBSTRING){

  /**
   * Unicode case lookup table (lower case to upper case)
   *
   * Maps the code point of a lower case letter to the code point of the
   * corresponding upper case letter. Keys and values are Unicode code points,
   * not UTF-8 byte sequences. A few pairs are listed more than once, always
   * with the same value, which has no effect on the resulting array.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
  global $UTF8_LOWER_TO_UPPER;
  $UTF8_LOWER_TO_UPPER = array(
    0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
    0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
    0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
    0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
    0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
    0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
    0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
    0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
    0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
    0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
    0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
    0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
    0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
    0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
    0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
    0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
    0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
    0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
    0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
    0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
    0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
    0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
    0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
    0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
    0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
    0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
    0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
    0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
    0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
    0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
    0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
    0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
    0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
    0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
    0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
    0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
    0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
    0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
    0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
    0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
    0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
    0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
    0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
  );

  /**
   * Unicode case lookup table (upper case to lower case)
   *
   * Maps the code point of an upper case letter to the code point of the
   * corresponding lower case letter. It is built by flipping
   * $UTF8_LOWER_TO_UPPER, so both tables always cover the same pairs.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
  global $UTF8_UPPER_TO_LOWER;
  // all values above are integers, so array_flip() has nothing to complain
  // about and needs no error suppression
  $UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);

} // end of case lookup tables
796ab77016bSAndreas Gohr
797ab77016bSAndreas Gohr
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Maps accented (and some other non-ASCII) lower case letters to a replacement
 * made of plain 7-bit ASCII letters. A replacement may be longer than one
 * character (e.g. 'ß' => 'ss', 'ä' => 'ae'). Only lower case letters are
 * listed here.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
);
82582257610Sandi
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps accented (and some other non-ASCII) upper case letters to a replacement
 * made of plain 7-bit ASCII letters. Multi character replacements capitalize
 * the first letter only (e.g. 'Ä' => 'Ae'). Only upper case letters are
 * listed here.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
);
85382257610Sandi
/**
 * UTF-8 array of common special characters
 *
 * This array lists the Unicode code points of special characters (neither a
 * letter nor a digit) defined in the various local charsets. It is not a
 * complete list of the non-alphanumeric characters in Unicode - it is not
 * perfect but should match most cases of special chars.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array. The space
 * 0x20 is! These chars are _not_ in the array either:
 * _ (0x5f), : (0x3a), . (0x2e), - (0x2d), * (0x2a)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);
921340756e4Sandi
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a script
 * that is not based upon latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
 *
 * Several keys are listed more than once with different values (for example
 * 'ふ', 'し' or 'ち'); PHP keeps the value of the last occurrence of a key.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
global $UTF8_ROMANIZATION;
$UTF8_ROMANIZATION = array(
  // Russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  // Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  // Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  // Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish"
  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1087340756e4Sandi
1088340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
10898a831f2bSAndreas Gohr
1090