xref: /dokuwiki/inc/utf8.php (revision 1abfaba47f3c1c4352eb728afcd4b6871b7bb4a0)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
54a47269fSandi * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are not encoded.
 *
 * When the second parameter is true the string is encoded only if it
 * contains characters other than ASCII letters, digits and / _ - . %
 * An already encoded name consists solely of such characters, which
 * makes it safe to run the function multiple times on the same string
 * (default is true).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file filename to encode
 * @param  boolean $safe return names made of URL-safe characters untouched
 * @return string  the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  // The D modifier makes $ match at the very end only. Without it a name
  // with a trailing newline would count as safe and be returned unencoded.
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
    return $file;
  }
  // urlencode() escapes slashes as well, so put them back afterwards
  return str_replace('%2F','/',urlencode($file));
}
3049c713a3Sandi
/**
 * URL-decode a filename
 *
 * Thin wrapper that hands the name to urldecode() and returns the result.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file URL-encoded filename
 * @return string the decoded filename
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
4349c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * An empty string counts as ASCII.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $str string to inspect
 * @return boolean true if no byte has its high bit set
 */
function utf8_isASCII($str){
  // The pattern has no /u modifier on purpose: it works on raw bytes and
  // looks for the first one above 0x7F.
  return !preg_match('/[\x80-\xFF]/',$str);
}
5544f669e9Sandi
/**
 * Strips all highbyte chars
 *
 * Every byte above 0x7F is removed, so the result is a pure 7bit ASCII
 * string. Multibyte characters disappear completely, they are not
 * transliterated.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str string to clean
 * @return string the ASCII-only remainder
 */
function utf8_strip($str){
  // byte-wise match (no /u modifier): drop runs of bytes with the high bit set
  return preg_replace('/[\x80-\xFF]+/','',$str);
}
72e1906e6eSandi
/**
 * Checks whether a string is well-formed UTF-8
 *
 * The check is delegated to PCRE: matching with the /u modifier fails
 * when the subject is not valid UTF-8. Unlike a test that only looks at
 * the lead/continuation bit patterns, this also rejects overlong
 * encodings, surrogates and the obsolete 5 and 6 byte forms.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string  $Str string to check
 * @return boolean true if the string is valid UTF-8 (an empty string is valid)
 */
function utf8_check($Str) {
 // The empty pattern matches any valid subject at offset 0; an invalid
 // subject makes preg_match() return false instead of 1.
 return preg_match('//u',$Str) === 1;
}
9549c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * The string is run through utf8_decode() and the byte length of the
 * result is returned. Characters without an ISO-8859-1 equivalent come
 * out as '?', which is fine for counting because each of them still
 * occupies exactly one byte afterwards.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string UTF-8 encoded string
 * @return int    number of characters
 */
function utf8_strlen($string){
  $singlebyte = utf8_decode($string);
  return strlen($singlebyte);
}
1102f954959Sandi
/**
 * Unicode aware replacement for substr()
 *
 * The string is split into characters and the requested range is cut
 * out with array_slice(), so $start and $length count characters and
 * follow array_slice()'s rules for negative values.
 *
 * @author lmak at NOSPAM dot iti dot gr
 * @link   http://www.php.net/manual/en/function.substr.php
 * @see    substr()
 * @param  string   $str    UTF-8 encoded string
 * @param  int      $start  index of the first character to return
 * @param  int|null $length number of characters, null for "up to the end"
 * @return string   the substring; empty if $str is empty or not valid UTF-8
 */
function utf8_substr($str,$start,$length=null){
   // The s modifier lets the dot match newlines as well, otherwise they
   // would be missing from the result. Nothing matched means an empty
   // string, a failed match means invalid UTF-8.
   if(!preg_match_all('/./su', $str, $ar)){
       return '';
   }

   // strict test: an explicit length of 0 has to yield an empty string
   if($length === null) {
       return join('',array_slice($ar[0],$start));
   }
   return join('',array_slice($ar[0],$start,$length));
}
127f29317c1Sandi
/**
 * Unicode aware replacement for substr_replace()
 *
 * The result is assembled from three parts: the first $start characters
 * of $string (only when $start is positive), the replacement, and the
 * rest of $string beginning at character $start+$length. With the
 * default length of 0 the replacement is simply inserted.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @see    utf8_substr()
 * @param  string $string      UTF-8 encoded string
 * @param  string $replacement text to put in
 * @param  int    $start       character position where the replacement goes
 * @param  int    $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $head = ($start>0) ? utf8_substr($string, 0, $start) : '';
  $tail = utf8_substr($string, $start+$length);
  return $head.$replacement.$tail;
}
141dc57ef04Sandi
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a literal pattern for preg_split()
 * in UTF-8 mode. An empty separator raises an E_USER_WARNING.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep separator to split at
 * @param  string $str UTF-8 encoded string to split
 * @return mixed  whatever preg_split() returns, false for an empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return false;
}
157f29317c1Sandi
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are turned into literal patterns and handed to
 * preg_replace() in UTF-8 mode. Search and replacement may each be a
 * string or an array, as with str_replace().
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  string|array $s   what to search for
 * @param  string|array $r   what to put in its place
 * @param  string|array $str UTF-8 encoded subject
 * @return mixed the result of preg_replace()
 */
function utf8_str_replace($s,$r,$str){
  // The delimiter has to be passed to preg_quote() in both branches,
  // otherwise a '!' inside a search string ends the pattern early.
  if(is_array($s)){
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }else{
    $s = '!'.preg_quote($s,'!').'!u';
  }

  // str_replace() inserts the replacement verbatim. preg_replace() would
  // read "$1" or "\1" as backreferences, so escape backslash and dollar.
  if(is_array($r)){
    foreach ($r as $k => $v) {
      $r[$k] = addcslashes($v,'\\$');
    }
  }else{
    $r = addcslashes($r,'\\$');
  }

  return preg_replace($s,$r,$str);
}
175f29317c1Sandi
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the native ltrim() is used. With a charlist every
 * character in it is taken literally; ranges like "a..z" are not
 * supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip from the beginning
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  // Quote the charlist for use in a character class. preg_quote() also
  // escapes '^', which would otherwise negate or break the class when
  // it is the first character.
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
191f29317c1Sandi
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the native rtrim() is used. With a charlist every
 * character in it is taken literally; ranges like "a..z" are not
 * supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip from the end
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  // Quote the charlist for use in a character class. preg_quote() also
  // escapes '^', which would otherwise negate or break the class when
  // it is the first character.
  $charlist = preg_quote($charlist,'/');

  // D modifier: $ must match at the very end only. Without it the pattern
  // would also match right before a trailing newline and strip characters
  // that are not at the end of the string at all.
  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
207f29317c1Sandi
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the native trim() is used. With a charlist every
 * character in it is taken literally and stripped from both ends of
 * the string; ranges like "a..z" are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip from both ends
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // The charlist has to be honoured on both sides. It is quoted for use
  // in a character class (this covers '^' and '-' as well).
  $class = '['.preg_quote($charlist,'/').']+';

  // One pass strips the leading run and the trailing run. The D modifier
  // pins $ to the real end of the string, even when it ends in a newline.
  return preg_replace('/^'.$class.'|'.$class.'$/uD','',$str);
}
220f29317c1Sandi
2212f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses the mb_string extension if it is available and the constant
 * UTF8_NOMBSTRING is not defined. Otherwise the string is converted to
 * code points and each one is looked up in $UTF8_UPPER_TO_LOWER.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string the lower cased string
 */
function utf8_strtolower($string){
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtolower'))
    return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // Only code points that have a lower case counterpart are in the
    // table. isset() keeps the lookup silent for everything else.
    if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
24582257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses the mb_string extension if it is available and the constant
 * UTF8_NOMBSTRING is not defined. Otherwise the string is converted to
 * code points and each one is looked up in $UTF8_LOWER_TO_UPPER.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string the upper cased string
 */
function utf8_strtoupper($string){
  // test for the function that is actually called
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strtoupper'))
    return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // Only code points that have an upper case counterpart are in the
    // table. isset() keeps the lookup silent for everything else.
    if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
26982257610Sandi
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The replacements are taken from the global tables $UTF8_LOWER_ACCENTS
 * and $UTF8_UPPER_ACCENTS (key is replaced by value). The optional
 * parameter selects which table is applied: lower case only
 * ($case = -1), upper case only ($case = 1) or both, lower case first
 * ($case = 0, the default).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @param  int    $case   -1, 0 or 1, see above
 * @return string
 */
function utf8_deaccent($string,$case=0){
  if($case <= 0){
    global $UTF8_LOWER_ACCENTS;
    $search  = array_keys($UTF8_LOWER_ACCENTS);
    $replace = array_values($UTF8_LOWER_ACCENTS);
    $string  = str_replace($search,$replace,$string);
  }
  if($case >= 0){
    global $UTF8_UPPER_ACCENTS;
    $search  = array_keys($UTF8_UPPER_ACCENTS);
    $replace = array_values($UTF8_UPPER_ACCENTS);
    $string  = str_replace($search,$replace,$string);
  }
  return $string;
}
28982257610Sandi
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned untouched. Everything else is translated
 * with strtr() using the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_isASCII()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_romanize($string){
  global $UTF8_ROMANIZATION;

  if(!utf8_isASCII($string)){
    return strtr($string,$UTF8_ROMANIZATION);
  }
  return $string; //nothing to do
}
3018a831f2bSAndreas Gohr
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip come from the global $UTF8_SPECIAL_CHARS list
 * of code points. In addition the range 0x00 to 0x19 is stripped, which
 * is not part of that list. The quoted list is built on the first call
 * and kept in a static variable for later calls.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;

  static $specials = null;
  if($specials === null){
    $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
  }

  // $additional goes into the character class as it is, unquoted
  $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
  return preg_replace($pattern,$repl,$string);
}
323099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses the mb_string extension if it is available and the constant
 * UTF8_NOMBSTRING is not defined. The fallback splits the haystack at
 * the needle and returns the character length of the first piece.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack UTF-8 encoded string to search in
 * @param  string $needle   UTF-8 encoded string to search for
 * @param  int    $offset   character position to start searching at
 * @return mixed  character position of the first match or false if there is none
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_strpos'))
    return mb_strpos($haystack,$needle,$offset,'utf-8');

  if(!$offset){
    $ar = utf8_explode($needle, $haystack);
    // utf8_explode() returns false for an empty needle; more than one
    // piece means the needle occurs, right after the first piece
    if ( is_array($ar) && count($ar) > 1 ) {
       return utf8_strlen($ar[0]);
    }
    return false;
  }

  if ( !is_int($offset) ) {
    trigger_error('Offset must be an integer',E_USER_WARNING);
    return false;
  }

  // search in the part behind the offset and shift the result back
  $haystack = utf8_substr($haystack, $offset);

  if ( false !== ($pos = utf8_strpos($haystack,$needle))){
     return $pos + $offset;
  }
  return false;
}
3562f954959Sandi
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Every 2, 3 or 4 byte sequence is replaced by a decimal numeric
 * entity (&#NNN;). ASCII is copied as it is. Bytes that do not form a
 * complete sequence (stray continuation bytes, unknown lead bytes,
 * sequences cut off at the end of the string) are copied unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string the string with all multibyte characters as entities
 */
function utf8_tohtml ($str) {
  $ret = '';
  $max = strlen($str);
  $last = 0;  // index of the first byte not yet copied to $ret
  for ($i=0; $i<$max; $i++) {
    $lead = ord($str[$i]);
    if ($lead < 0x80) continue;            // ASCII, copied in bulk later

    // The lead byte tells how many continuation bytes follow and
    // contributes the highest bits of the code point.
    if (($lead & 0xE0) == 0xC0) {          // 110x xxxx
      $extra = 1;
      $cp = $lead & 0x1F;
    } elseif (($lead & 0xF0) == 0xE0) {    // 1110 xxxx
      $extra = 2;
      $cp = $lead & 0x0F;
    } elseif (($lead & 0xF8) == 0xF0) {    // 1111 0xxx
      $extra = 3;
      $cp = $lead & 0x07;
    } else {
      continue;                            // not a lead byte, leave it alone
    }

    // sequence is cut off by the end of the string
    if ($i + $extra >= $max) break;

    // every continuation byte (10xx xxxx) adds six more bits
    $ok = true;
    for ($j=1; $j<=$extra; $j++) {
      $byte = ord($str[$i+$j]);
      if (($byte & 0xC0) != 0x80) {
        $ok = false;
        break;
      }
      $cp = ($cp << 6) | ($byte & 0x3F);
    }
    if (!$ok) continue;

    // append the regular characters we've passed, then the entity
    $ret .= substr($str, $last, $i-$last) . '&#' . $cp . ';';
    $i += $extra;
    $last = $i+1;
  }
  return $ret . substr($str, $last); // append the last batch of regular characters
}
383ea2eed85Sandi
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING. This includes a multi-octet sequence that is cut
 * off by the end of the string.
 *
 * If $strict is false nothing is reported: octets that cannot start a
 * sequence are skipped, an octet that should be a continuation but is
 * not is dropped, and code points from illegal sequences are returned
 * as they were decoded.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: its six payload bits go to the position
                // that belongs to this octet within the sequence.
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }

    // The input ended while continuation octets were still outstanding.
    // The loop above cannot notice this because no further octet arrives.
    if ($strict && $mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                'sequence at end of UTF-8 input, byte '.$len,
                E_USER_WARNING
            );
        return FALSE;
    }

    return $out;
}
55582257610Sandi
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING.
 * If $strict is false such values are silently left out.
 *
 * The string is built by plain concatenation. No output buffer is
 * involved, so an early return cannot leave a buffer open.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points;
 *               an empty string if $arr is not an array
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $result = '';

    foreach ($arr as $k => $cp) {

        # ASCII range (including control chars)
        if ( ($cp >= 0) && ($cp <= 0x007f) ) {

            $result .= chr($cp);

        # 2 byte sequence (the lower bound keeps negative values out)
        } else if ( ($cp >= 0x0080) && ($cp <= 0x07ff) ) {

            $result .= chr(0xc0 | ($cp >> 6));
            $result .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ( ($cp >= 0x0800) && ($cp <= 0xffff) ) {

            $result .= chr(0xe0 | ($cp >> 12));
            $result .= chr(0x80 | (($cp >> 6) & 0x003f));
            $result .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else if ( ($cp >= 0x10000) && ($cp <= 0x10ffff) ) {

            $result .= chr(0xf0 | ($cp >> 18));
            $result .= chr(0x80 | (($cp >> 12) & 0x3f));
            $result .= chr(0x80 | (($cp >> 6) & 0x3f));
            $result .= chr(0x80 | ($cp & 0x3f));

        # negative or above 0x10FFFF
        } elseif($strict) {

            trigger_error(
                'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
                );

            // out of range
            return FALSE;
        }
    }

    return $result;
}
64682257610Sandi
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() if the mb_string extension is available
 * and the constant UTF8_NOMBSTRING is not defined. The fallback decodes
 * the string with utf8_to_unicode() and packs the code points itself;
 * code points above U+FFFF are written as surrogate pairs.
 *
 * @see    utf8_to_unicode
 * @param  string  $str UTF-8 encoded string (taken by reference, not modified)
 * @param  boolean $bom prepend the big endian byte order mark?
 * @return string  the UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(!defined('UTF8_NOMBSTRING') && function_exists('mb_convert_encoding'))
    return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  foreach($uni as $cp){
    if($cp > 0xFFFF){
      // Does not fit into one 16 bit unit: split the 20 bits above
      // 0x10000 into a high and a low surrogate.
      $cp -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
66315fa0b4fSAndreas Gohr
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The input is split into big-endian 16-bit units. A high surrogate
 * (0xD800-0xDBFF) directly followed by a low surrogate (0xDC00-0xDFFF) is
 * combined into one codepoint above U+FFFF; every other unit, including an
 * unpaired surrogate, is passed on unchanged. The resulting codepoints are
 * handed to unicode_to_utf8().
 *
 * @param string $str  UTF-16BE encoded input (taken by reference, not modified)
 * @return mixed       whatever unicode_to_utf8() returns for the codepoints
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  // defensive: treat a failed unpack() as "no units" instead of passing a
  // non-array on to unicode_to_utf8()
  if(!is_array($units)) $units = array();

  $uni  = array();
  $idx  = 0;     // keys start at 1, exactly like the array unpack() returns
  $high = null;  // pending high surrogate waiting for its low partner
  foreach($units as $unit){
    if($high !== null){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        $uni[++$idx] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high = null;
        continue;
      }
      // no low surrogate followed: keep the lone high surrogate as it was
      $uni[++$idx] = $high;
      $high = null;
    }
    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
    }else{
      $uni[++$idx] = $unit;
    }
  }
  if($high !== null) $uni[++$idx] = $high;

  return unicode_to_utf8($uni);
}
67315fa0b4fSAndreas Gohr
67415fa0b4fSAndreas Gohr/**
67582257610Sandi * UTF-8 Case lookup table
67682257610Sandi *
67782257610Sandi * This lookup table maps the codepoint of a lower case letter to the codepoint
67882257610Sandi * of its corresponding upper case letter.
67982257610Sandi *
 * The keys 0x00F0, 0x00FE, 0x03BB and 0x0457 are listed more than once, each
 * time with the same value, so the repetition does not change the array.
 *
68082257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
68182257610Sandi */
6828a831f2bSAndreas Gohrstatic $UTF8_LOWER_TO_UPPER = array(
68382257610Sandi  0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
68482257610Sandi  0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
68582257610Sandi  0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
68682257610Sandi  0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
68782257610Sandi  0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
68882257610Sandi  0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
68982257610Sandi  0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
69082257610Sandi  0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
69182257610Sandi  0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
69282257610Sandi  0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
69382257610Sandi  0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
69482257610Sandi  0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
69582257610Sandi  0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
69682257610Sandi  0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
69782257610Sandi  0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
69882257610Sandi  0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
69982257610Sandi  0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
70082257610Sandi  0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
70182257610Sandi  0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
70282257610Sandi  0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
70382257610Sandi  0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
70482257610Sandi  0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
70582257610Sandi  0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
70682257610Sandi  0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
70782257610Sandi  0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
70882257610Sandi  0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
70982257610Sandi  0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
71082257610Sandi  0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
71182257610Sandi  0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
71282257610Sandi  0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
71382257610Sandi  0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
71482257610Sandi  0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
71582257610Sandi  0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
71682257610Sandi  0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
71782257610Sandi  0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
71882257610Sandi  0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
71982257610Sandi  0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
72082257610Sandi  0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
72182257610Sandi  0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
72282257610Sandi  0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
72382257610Sandi  0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
72482257610Sandi  0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
72582257610Sandi  0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
72682257610Sandi);
72782257610Sandi
72882257610Sandi/**
72982257610Sandi * UTF-8 Case lookup table
73082257610Sandi *
73182257610Sandi * This lookup table maps the codepoint of an upper case letter to the codepoint
73282257610Sandi * of its corresponding lower case letter (it does so by flipping $UTF8_LOWER_TO_UPPER)
73382257610Sandi *
 * The @ operator silences any diagnostics array_flip() might raise.
 *
73482257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
73582257610Sandi */
73682257610Sandi$UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);
73782257610Sandi
73882257610Sandi/**
73982257610Sandi * UTF-8 lookup table for lower case accented letters
74082257610Sandi *
74182257610Sandi * This lookup table defines ASCII-7 replacements for accented characters.
74282257610Sandi * These are lower case letters only.
74382257610Sandi *
 * Some characters are replaced by more than one letter (for example
 * 'ß' => 'ss', 'ä' => 'ae', 'þ' => 'th').
 *
74482257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
74582257610Sandi * @see    utf8_deaccent()
74682257610Sandi */
74782257610Sandi$UTF8_LOWER_ACCENTS = array(
74882257610Sandi  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
74982257610Sandi  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
75082257610Sandi  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
75182257610Sandi  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
75282257610Sandi  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
75382257610Sandi  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
75482257610Sandi  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
75582257610Sandi  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
75682257610Sandi  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
75782257610Sandi  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
75882257610Sandi  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
75982257610Sandi  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
76082257610Sandi  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
76182257610Sandi  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
7620c59b0cfSandi  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u',
76382257610Sandi);
76482257610Sandi
76582257610Sandi/**
76682257610Sandi * UTF-8 lookup table for upper case accented letters
76782257610Sandi *
76882257610Sandi * This lookup table defines ASCII-7 replacements for accented characters.
76982257610Sandi * These are upper case letters only.
77082257610Sandi *
 * Multi-letter replacements capitalise only their first letter (for example
 * 'Ä' => 'Ae'). Unlike $UTF8_LOWER_ACCENTS this table has no counterpart for
 * 'ß' and 'µ'.
 *
77182257610Sandi * @author Andreas Gohr <andi@splitbrain.org>
77282257610Sandi * @see    utf8_deaccent()
77382257610Sandi */
77482257610Sandi$UTF8_UPPER_ACCENTS = array(
775df3ecd55SAndreas Gohr  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
776df3ecd55SAndreas Gohr  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
777df3ecd55SAndreas Gohr  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
778df3ecd55SAndreas Gohr  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
779df3ecd55SAndreas Gohr  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
780df3ecd55SAndreas Gohr  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
781df3ecd55SAndreas Gohr  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
782df3ecd55SAndreas Gohr  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
783df3ecd55SAndreas Gohr  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
784df3ecd55SAndreas Gohr  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
785df3ecd55SAndreas Gohr  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
786df3ecd55SAndreas Gohr  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
787df3ecd55SAndreas Gohr  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
788df3ecd55SAndreas Gohr  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
789df3ecd55SAndreas Gohr  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae',
79082257610Sandi);
79182257610Sandi
792099ada41Sandi/**
793099ada41Sandi * UTF-8 array of common special characters
794099ada41Sandi *
795099ada41Sandi * This array should contain all special characters (not a letter or digit)
796099ada41Sandi * defined in the various local charsets - it is not a complete list of
797099ada41Sandi * non-alphanumeric characters in UTF-8. It is not perfect but should match
798099ada41Sandi * most cases of special chars. Entries are codepoints, not UTF-8 byte strings.
799099ada41Sandi *
800099ada41Sandi * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
801ad81d431SAndreas Gohr * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 * (the blank columns in the first rows below mark where they were left out).
 *
 * NOTE(review): 0x03b2 and 0x03c6 are the Greek letters beta and phi, and
 * 0x03c6 is also a key of $UTF8_LOWER_TO_UPPER - confirm they belong here.
802099ada41Sandi *
803099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org>
804099ada41Sandi * @see    utf8_stripspecials()
805099ada41Sandi */
806099ada41Sandi$UTF8_SPECIAL_CHARS = array(
807099ada41Sandi  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
808ad81d431SAndreas Gohr  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
8095c812709Sandi          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
8105c812709Sandi  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
811099ada41Sandi  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
812099ada41Sandi  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
813099ada41Sandi  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
814099ada41Sandi  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
815099ada41Sandi  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
816099ada41Sandi  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
817099ada41Sandi  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
818099ada41Sandi  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
819099ada41Sandi  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
820099ada41Sandi  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
821099ada41Sandi  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
822099ada41Sandi  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
823099ada41Sandi  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
824099ada41Sandi  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
825099ada41Sandi  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
826099ada41Sandi  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
827099ada41Sandi  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
828099ada41Sandi  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
829099ada41Sandi  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
830099ada41Sandi  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
831099ada41Sandi  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
832099ada41Sandi  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
833099ada41Sandi  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
834099ada41Sandi  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
835099ada41Sandi  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
836099ada41Sandi  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
837099ada41Sandi  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
838099ada41Sandi  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
839099ada41Sandi  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
840099ada41Sandi  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
841099ada41Sandi  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
842099ada41Sandi  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
843099ada41Sandi  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
844099ada41Sandi  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
845099ada41Sandi  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
846099ada41Sandi  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
847099ada41Sandi  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
848099ada41Sandi  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
849099ada41Sandi  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
850099ada41Sandi  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
851099ada41Sandi  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
852099ada41Sandi  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
853099ada41Sandi  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
854099ada41Sandi  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
855099ada41Sandi  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
856099ada41Sandi  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
857099ada41Sandi);
858340756e4Sandi
8598a831f2bSAndreas Gohr/**
8608a831f2bSAndreas Gohr * Romanization lookup table
8618a831f2bSAndreas Gohr *
8628a831f2bSAndreas Gohr * This lookup table provides a way to transform strings written in scripts
8638a831f2bSAndreas Gohr * that are not based upon latin letters into plain ASCII.
8648a831f2bSAndreas Gohr *
8658a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works
8668a831f2bSAndreas Gohr * one way, from non-latin to ASCII, and it works by simple character
8678a831f2bSAndreas Gohr * replacement only. Specialities of each language are not supported.
 *
 * Several keys occur more than once with different values (for example
 * 'ふ' => 'fu' and later 'ふ' => 'hu'). In a PHP array literal the last value
 * given for a key is the one that is kept, so the order of entries matters.
 *
 * NOTE(review): a few entries look suspicious and should be confirmed before
 * being changed, since existing romanized names may depend on them:
 * Georgian 'კ' => 'p' (same value as 'პ'), Hebrew starting with 'ב' => 'a',
 * Greeklish 'Δ' => 'E' / 'δ' => 'e', Korean 'ㅛ' => 'oy' (same value as 'ㅚ').
8688a831f2bSAndreas Gohr *
8698a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org>
8708a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com>
8718a831f2bSAndreas Gohr * @link   http://www.uconv.com/translit.htm
8728a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi>
8738a831f2bSAndreas Gohr * @link   http://kanjidict.stc.cx/hiragana.php?src=2
8748a831f2bSAndreas Gohr * @link   http://www.translatum.gr/converter/greek-transliteration.htm
8758a831f2bSAndreas Gohr * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
8768a831f2bSAndreas Gohr * @link   http://www.btranslations.com/resources/romanization/korean.asp
8778a831f2bSAndreas Gohr */
8788a831f2bSAndreas Gohr$UTF8_ROMANIZATION = array(
8798a831f2bSAndreas Gohr  //russian cyrillic
8808a831f2bSAndreas Gohr  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
8818a831f2bSAndreas Gohr  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
8828a831f2bSAndreas Gohr  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
8838a831f2bSAndreas Gohr  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
8848a831f2bSAndreas Gohr  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
8858a831f2bSAndreas Gohr  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
8868a831f2bSAndreas Gohr  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'th','Щ'=>'Th','ъ'=>'qh',
8878a831f2bSAndreas Gohr  'Ъ'=>'Qh','ы'=>'y','Ы'=>'Y','ь'=>'q','Ь'=>'Q','э'=>'eh','Э'=>'Eh','ю'=>'ju',
8888a831f2bSAndreas Gohr  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
8898a831f2bSAndreas Gohr  // Ukrainian cyrillic
8908a831f2bSAndreas Gohr  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
8918a831f2bSAndreas Gohr  // Georgian
8928a831f2bSAndreas Gohr  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
8938a831f2bSAndreas Gohr  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
8948a831f2bSAndreas Gohr  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
8958a831f2bSAndreas Gohr  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
8968a831f2bSAndreas Gohr  'ჰ'=>'xh',
8978a831f2bSAndreas Gohr  //Sanskrit
8988a831f2bSAndreas Gohr  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
8998a831f2bSAndreas Gohr  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
9008a831f2bSAndreas Gohr  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
9018a831f2bSAndreas Gohr  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
9028a831f2bSAndreas Gohr  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
9038a831f2bSAndreas Gohr  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
9048a831f2bSAndreas Gohr  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
9058a831f2bSAndreas Gohr  //Hebrew
9068a831f2bSAndreas Gohr  'ב'=>'a','ג'=>'b','ד'=>'g','ה'=>'d','ו'=>'x','ז'=>'v','ח'=>'kh','ט'=>'th',
9078a831f2bSAndreas Gohr  'י'=>'y','ך'=>'k','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
9088a831f2bSAndreas Gohr  'ס'=>'s','ע'=>'ah','ף'=>'p','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
9098a831f2bSAndreas Gohr  'ש'=>'sh','ת'=>'t',
9108a831f2bSAndreas Gohr  //Arabic
9118a831f2bSAndreas Gohr  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
9128a831f2bSAndreas Gohr  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
9138a831f2bSAndreas Gohr  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
9148a831f2bSAndreas Gohr  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
9158a831f2bSAndreas Gohr
9168a831f2bSAndreas Gohr  // Japanese hiragana
9178a831f2bSAndreas Gohr  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
9188a831f2bSAndreas Gohr  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
9198a831f2bSAndreas Gohr  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
9208a831f2bSAndreas Gohr  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
9218a831f2bSAndreas Gohr  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
9228a831f2bSAndreas Gohr  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
9238a831f2bSAndreas Gohr  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
9248a831f2bSAndreas Gohr  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
9258a831f2bSAndreas Gohr  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
9268a831f2bSAndreas Gohr  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
9278a831f2bSAndreas Gohr  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
9288a831f2bSAndreas Gohr  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
9298a831f2bSAndreas Gohr  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
9308a831f2bSAndreas Gohr  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
9318a831f2bSAndreas Gohr  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
9328a831f2bSAndreas Gohr  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
9338a831f2bSAndreas Gohr  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
9348a831f2bSAndreas Gohr  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
9358a831f2bSAndreas Gohr  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
9368a831f2bSAndreas Gohr  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
9378a831f2bSAndreas Gohr  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
9388a831f2bSAndreas Gohr  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
9398a831f2bSAndreas Gohr  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
9408a831f2bSAndreas Gohr  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
9418a831f2bSAndreas Gohr  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
9428a831f2bSAndreas Gohr  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
9438a831f2bSAndreas Gohr  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
9448a831f2bSAndreas Gohr  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
9458a831f2bSAndreas Gohr  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
9468a831f2bSAndreas Gohr  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
9478a831f2bSAndreas Gohr  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
9488a831f2bSAndreas Gohr  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
9498a831f2bSAndreas Gohr  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
9508a831f2bSAndreas Gohr  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
9518a831f2bSAndreas Gohr  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
9528a831f2bSAndreas Gohr  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
9538a831f2bSAndreas Gohr  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
9548a831f2bSAndreas Gohr  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
9558a831f2bSAndreas Gohr  'じゅ'=>'zyu',
9568a831f2bSAndreas Gohr  // Japanese katakana
9578a831f2bSAndreas Gohr  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
9588a831f2bSAndreas Gohr  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
9598a831f2bSAndreas Gohr  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
9608a831f2bSAndreas Gohr  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
9618a831f2bSAndreas Gohr  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
9628a831f2bSAndreas Gohr  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
9638a831f2bSAndreas Gohr  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
9648a831f2bSAndreas Gohr  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
9658a831f2bSAndreas Gohr  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
9668a831f2bSAndreas Gohr  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
9678a831f2bSAndreas Gohr  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
9688a831f2bSAndreas Gohr  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
9698a831f2bSAndreas Gohr  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
9708a831f2bSAndreas Gohr  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
9718a831f2bSAndreas Gohr  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
9728a831f2bSAndreas Gohr  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
9738a831f2bSAndreas Gohr  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
9748a831f2bSAndreas Gohr  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
9758a831f2bSAndreas Gohr  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
9768a831f2bSAndreas Gohr  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
9778a831f2bSAndreas Gohr  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
9788a831f2bSAndreas Gohr  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
9798a831f2bSAndreas Gohr  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
9808a831f2bSAndreas Gohr  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
9818a831f2bSAndreas Gohr  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
9828a831f2bSAndreas Gohr  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
9838a831f2bSAndreas Gohr  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
9848a831f2bSAndreas Gohr  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
9858a831f2bSAndreas Gohr  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
9868a831f2bSAndreas Gohr  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
9878a831f2bSAndreas Gohr  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
9888a831f2bSAndreas Gohr  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
9898a831f2bSAndreas Gohr  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
9908a831f2bSAndreas Gohr  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
9918a831f2bSAndreas Gohr  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
9928a831f2bSAndreas Gohr  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
9938a831f2bSAndreas Gohr  'ジョ'=>'zyo','ジュ'=>'zyu',
9948a831f2bSAndreas Gohr
9958a831f2bSAndreas Gohr  // "Greeklish"
9968a831f2bSAndreas Gohr  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
9978a831f2bSAndreas Gohr  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
9988a831f2bSAndreas Gohr
9998a831f2bSAndreas Gohr  // Thai
10008a831f2bSAndreas Gohr  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
10018a831f2bSAndreas Gohr  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
10028a831f2bSAndreas Gohr  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
10038a831f2bSAndreas Gohr  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
10048a831f2bSAndreas Gohr  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
10058a831f2bSAndreas Gohr  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
10068a831f2bSAndreas Gohr  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
10078a831f2bSAndreas Gohr  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
10088a831f2bSAndreas Gohr  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
10098a831f2bSAndreas Gohr  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
10108a831f2bSAndreas Gohr  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
10118a831f2bSAndreas Gohr  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
10128a831f2bSAndreas Gohr  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
10138a831f2bSAndreas Gohr  'เ–ียว'=>'iao',
10148a831f2bSAndreas Gohr
10158a831f2bSAndreas Gohr  // Korean
10168a831f2bSAndreas Gohr  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
10178a831f2bSAndreas Gohr  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
10188a831f2bSAndreas Gohr  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
10198a831f2bSAndreas Gohr  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
10208a831f2bSAndreas Gohr  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
10218a831f2bSAndreas Gohr  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
10228a831f2bSAndreas Gohr);
1023340756e4Sandi
1024340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
10258a831f2bSAndreas Gohr
1026