xref: /dokuwiki/inc/utf8.php (revision 720307d9ce09c85c3b6037fed985128b2bd860a4)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
54a47269fSandi * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
9ab77016bSAndreas Gohr
/**
 * Detect mbstring support
 *
 * Defines UTF8_MBSTRING (1 or 0) unless it has been defined already.
 * Defining UTF8_NOMBSTRING beforehand forces the pure PHP fallbacks even
 * when the extension is loaded. When mbstring is used, its internal
 * encoding is switched to UTF-8.
 */
if(!defined('UTF8_MBSTRING')){
  if(defined('UTF8_NOMBSTRING') || !function_exists('mb_substr')){
    // extension missing or explicitly disabled
    define('UTF8_MBSTRING',0);
  }else{
    define('UTF8_MBSTRING',1);
    mb_internal_encoding('UTF-8');
  }
}
21ab77016bSAndreas Gohr
22ab77016bSAndreas Gohr
/**
 * URL-encode a filename so it may contain unicode characters
 *
 * Slashes are kept as they are (urlencode()'s %2F is turned back into /).
 *
 * With $safe enabled (the default) a string that consists only of
 * letters, digits and the characters / _ - . % is returned untouched.
 * This makes it harmless to run the function several times on the
 * same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file filename to encode
 * @param  bool   $safe skip encoding when the name looks encoded already
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  $looksEncoded = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file);
  if(!$looksEncoded){
    // encode everything, then restore the path separators
    $file = str_replace('%2F','/',urlencode($file));
  }
  return $file;
}
4449c713a3Sandi
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode(), the counterpart of utf8_encodeFN()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
5749c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * Implemented as a byte-wise regular expression (no /u modifier) looking
 * for any byte with the high bit set. This replaces the former loop that
 * used the $str{$i} offset syntax, which is a parse error since PHP 8.0.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return bool   true when no byte is larger than 127 (also for '')
 */
function utf8_isASCII($str){
  return !preg_match('/[\x80-\xFF]/', (string) $str);
}
6944f669e9Sandi
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte above 127 is removed, which
 * drops multi byte characters completely. The pattern is applied
 * byte-wise (no /u modifier). This replaces the former loop that used
 * the $str{$i} offset syntax, which is a parse error since PHP 8.0.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string
 */
function utf8_strip($str){
  return preg_replace('/[\x80-\xFF]/', '', (string) $str);
}
86e1906e6eSandi
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks the bytes and verifies that every lead byte is followed by the
 * number of continuation bytes (10bbbbbb) it announces. Lead bytes for
 * sequences of up to six bytes are accepted; code point ranges and
 * overlong forms are not checked.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return bool
 */
function utf8_check($Str) {
 $total = strlen($Str);
 $pos   = 0;
 while ($pos < $total) {
  $lead = ord($Str[$pos]);

  if ($lead < 0x80) {                      # 0bbbbbbb
   $pos++;
   continue;
  }

  if     (($lead & 0xE0) == 0xC0) $follow = 1; # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0) $follow = 2; # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0) $follow = 3; # 11110bbb
  elseif (($lead & 0xFC) == 0xF8) $follow = 4; # 111110bb
  elseif (($lead & 0xFE) == 0xFC) $follow = 5; # 1111110b
  else return false;                           # Does not match any model

  # the announced number of 10bbbbbb bytes has to follow
  while ($follow-- > 0) {
   $pos++;
   if ($pos == $total) return false;
   if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
  }
  $pos++;
 }
 return true;
}
10949c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * Counts characters by removing all UTF-8 continuation bytes
 * (0x80-0xBF) and measuring what is left: every character contributes
 * exactly one non-continuation byte. For well-formed UTF-8 this gives
 * the same result as the former strlen(utf8_decode()) implementation,
 * without calling utf8_decode(), which is deprecated as of PHP 8.2.
 *
 * Stray continuation bytes in malformed input are not counted.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @param  string $string
 * @return int    number of characters
 */
function utf8_strlen($string){
  return strlen(preg_replace('/[\x80-\xBF]+/', '', (string) $string));
}
1242f954959Sandi
/**
 * Build a PCRE fragment matching exactly $count characters
 *
 * A single {n} quantifier is limited to 65535 by PCRE, so larger counts
 * are expressed as repeated groups of 65535 plus a remainder.
 *
 * @param  int $count number of characters (>= 0)
 * @return string     regular expression fragment (no delimiters)
 */
function _utf8_substr_quantifier($count){
    $count  = (int) $count;
    $chunks = (int) floor($count / 65535);
    $rest   = $count % 65535;

    $fragment = '';
    if ( $chunks > 0 ) {
        $fragment .= '(?:.{65535}){'.$chunks.'}';
    }
    return $fragment.'.{'.$rest.'}';
}

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 * Note: supports use of negative offsets and lengths but will be slower
 * when doing so
 *
 * Uses mb_substr() when UTF8_MBSTRING is set. The fallback works with
 * regular expressions and
 *  - keeps newlines when negative offsets/lengths are used (the
 *    character splitter runs in dot-all mode)
 *  - supports offsets and lengths beyond PCRE's 65535 quantifier limit
 *  - casts offset and length to int before they are put into a pattern
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }else{
            return mb_substr($str, $offset, $length);
        }
    }

    // the values end up inside a regular expression - make sure they are numbers
    $offset = (int) $offset;
    if ( $length !== null ) {
        $length = (int) $length;
    }

    if ( $offset >= 0 && ( $length === null || $length >= 0 ) ) {
        if ( $length === null ) {
            $take = '.*';
        } else {
            // character count: bytes that are not UTF-8 continuation bytes
            $strlen = strlen(preg_replace('/[\x80-\xBF]+/', '', (string) $str));
            if ( $offset > $strlen ) {
                return '';
            }

            if ( ( $offset + $length ) > $strlen ) {
                // requested more than available: take the rest
                $take = '.*';
            } else {
                $take = _utf8_substr_quantifier($length);
            }
        }

        $pattern = '/^'._utf8_substr_quantifier($offset).'('.$take.')/us';
        preg_match($pattern, $str, $matches);

        if ( isset($matches[1]) ) {
            return $matches[1];
        }
        return false;

    } else {
        // Handle negatives using different, slower technique
        // From: http://www.php.net/manual/en/function.substr.php#44838
        // The s modifier is required, otherwise newlines are lost
        preg_match_all('/./us', $str, $ar);
        if( $length !== null ) {
            return join('',array_slice($ar[0],$offset,$length));
        } else {
            return join('',array_slice($ar[0],$offset));
        }
    }
}
18210f09f2aSAndreas Gohr
18310f09f2aSAndreas Gohr
/**
 * Unicode aware replacement for substr_replace()
 *
 * The result is assembled from the part of $string in front of $start
 * (only when $start is greater than zero), the replacement and the part
 * of $string beginning at $start+$length. Both parts are cut out with
 * utf8_substr().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string
 * @param  string $replacement
 * @param  int    $start  character offset where the replacement begins
 * @param  int    $length number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $head = ($start>0) ? utf8_substr($string, 0, $start) : '';
  $tail = utf8_substr($string, $start+$length);
  return $head.$replacement.$tail;
}
197dc57ef04Sandi
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as pattern for preg_split() in UTF-8
 * mode. An empty separator raises an E_USER_WARNING and yields FALSE.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep separator
 * @param  string $str string to split
 * @return mixed  array of parts or FALSE on empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return FALSE;
}
213f29317c1Sandi
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and run through preg_replace() in UTF-8
 * mode. Like str_replace() this is a literal replacement:
 *  - search strings are quoted for the '!' delimiter, for single strings
 *    as well as for arrays (search strings containing '!' used to break
 *    the pattern in the array case)
 *  - backslashes and dollar signs in the replacement are escaped so they
 *    are not interpreted as backreferences
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   search string or array of search strings
 * @param  mixed  $r   replacement string or array of replacement strings
 * @param  string $str subject
 * @return string
 */
function utf8_str_replace($s,$r,$str){
  // escapes needed to have preg_replace() insert the replacement verbatim
  $literal = array('\\' => '\\\\', '$' => '\\$');

  if(is_array($s)){
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }else{
    $s = '!'.preg_quote($s,'!').'!u';
  }

  if(is_array($r)){
    foreach ($r as $k => $v) {
      $r[$k] = strtr($v,$literal);
    }
  }else{
    $r = strtr($r,$literal);
  }

  return preg_replace($s,$r,$str);
}
231f29317c1Sandi
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist this is plain ltrim(). Otherwise the characters in
 * $charlist are removed from the beginning of the string. The charlist
 * is quoted with preg_quote() before it is placed in a character class,
 * so every character (including ^ - ] and \) is taken literally. Ranges
 * like "a..z" as known from ltrim() are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
247f29317c1Sandi
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist this is plain rtrim(). Otherwise the characters in
 * $charlist are removed from the end of the string. The charlist is
 * quoted with preg_quote() before it is placed in a character class, so
 * every character (including ^ - ] and \) is taken literally. Ranges
 * like "a..z" as known from rtrim() are not supported.
 *
 * The pattern uses the D modifier: without it $ also matches in front of
 * a trailing newline and characters before that newline would be
 * stripped although they are not at the end of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
263f29317c1Sandi
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist this is plain trim(). Otherwise the characters in
 * $charlist are removed from both ends of the string. (The charlist
 * used to be dropped, so only whitespace was trimmed.)
 *
 * The charlist is quoted with preg_quote(), every character in it is
 * taken literally; ranges like "a..z" are not supported. Both ends are
 * handled by one pattern, D makes $ match at the very end only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  //quote charlist for use in a characterclass
  $class = '['.preg_quote($charlist,'/').']+';

  return preg_replace('/^'.$class.'|'.$class.'$/uD','',$str);
}
276f29317c1Sandi
2772f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Otherwise the string is turned
 * into code points, each code point with a (non-empty) entry in the
 * global $UTF8_UPPER_TO_LOWER table is replaced by its mapping and the
 * result is converted back to UTF-8.
 *
 * The table lookup is guarded, code points without a mapping no longer
 * raise "undefined array key/offset" errors.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
  if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  foreach ($uni as $idx => $codepoint){
    if(!empty($UTF8_UPPER_TO_LOWER[$codepoint])){
      $uni[$idx] = $UTF8_UPPER_TO_LOWER[$codepoint];
    }
  }
  return unicode_to_utf8($uni);
}
30082257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Otherwise the string is turned
 * into code points, each code point with a (non-empty) entry in the
 * global $UTF8_LOWER_TO_UPPER table is replaced by its mapping and the
 * result is converted back to UTF-8.
 *
 * The table lookup is guarded, code points without a mapping no longer
 * raise "undefined array key/offset" errors.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
  if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  foreach ($uni as $idx => $codepoint){
    if(!empty($UTF8_LOWER_TO_UPPER[$codepoint])){
      $uni[$idx] = $UTF8_LOWER_TO_UPPER[$codepoint];
    }
  }
  return unicode_to_utf8($uni);
}
32382257610Sandi
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The replacements are taken from the global tables $UTF8_LOWER_ACCENTS
 * and $UTF8_UPPER_ACCENTS (accented character => replacement).
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case   -1 lower only, 1 upper only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
  global $UTF8_LOWER_ACCENTS, $UTF8_UPPER_ACCENTS;

  // collect the tables to apply, lower case first
  $tables = array();
  if($case <= 0) $tables[] = $UTF8_LOWER_ACCENTS;
  if($case >= 0) $tables[] = $UTF8_UPPER_ACCENTS;

  foreach ($tables as $map){
    $string = str_replace(array_keys($map),array_values($map),$string);
  }
  return $string;
}
34382257610Sandi
/**
 * Romanize a non-latin string
 *
 * Pure ASCII strings are returned as they are, everything else is
 * translated with strtr() using the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    return strtr($string,$UTF8_ROMANIZATION);
  }

  //nothing to do
  return $string;
}
3558a831f2bSAndreas Gohr
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip come from the global $UTF8_SPECIAL_CHARS2
 * string. It is quoted for use in a regular expression once and cached
 * for later calls. In addition the bytes 0x00 to 0x19 are stripped
 * (they are not part of the special chars list).
 *
 * $additional is inserted into the character class as it is, it has to
 * be valid inside a regexp character class.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS, $UTF8_SPECIAL_CHARS2;

  // quoted special chars, built on first use only
  static $specials = null;
  if($specials === null){
    $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
  return preg_replace($pattern,$repl,$string);
}
379099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. The fallback splits the
 * haystack at the needle and measures the part in front of the first
 * occurrence. With an offset the haystack is shortened first and the
 * offset is added to the position found in the remainder.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string  $haystack
 * @param  string  $needle
 * @param  integer $offset character offset to start searching from
 * @return mixed   character position or false when not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

  if($offset){
    if ( !is_int($offset) ) {
      trigger_error('Offset must be an integer',E_USER_WARNING);
      return false;
    }

    // search in the remainder only
    $rest  = utf8_substr($haystack, $offset);
    $found = utf8_strpos($rest,$needle);
    if ( $found === false ) {
      return false;
    }
    return $found + $offset;
  }

  $parts = utf8_explode($needle, $haystack);
  if ( count($parts) > 1 ) {
    // length of everything in front of the first needle
    return utf8_strlen($parts[0]);
  }
  return false;
}
4112f954959Sandi
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII bytes are copied. Every well-formed two, three or four byte
 * sequence is replaced by the numeric entity (&#NNN;) of the code point
 * it encodes. Bytes that do not start such a sequence, or whose
 * continuation bytes are missing or malformed, are copied unchanged.
 *
 * Changes to the former implementation:
 *  - the code point is assembled from the payload bits of all bytes
 *    (the old "$c1 * 100 + $c2" gave wrong numbers from U+0100 upwards)
 *  - three and four byte sequences are encoded as well
 *  - a sequence cut off at the end of the string is not read past the end
 *  - no $str{$i} offset syntax (parse error since PHP 8.0)
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 string
 * @return string
 */
function utf8_tohtml ($str) {
  $str = (string) $str;
  $ret = '';
  $max = strlen($str);
  $i   = 0;

  while ($i < $max) {
    $lead = ord($str[$i]);

    // number of continuation bytes and payload bits of the lead byte
    if (($lead & 0xE0) == 0xC0) {        // 110x xxxx
      $extra = 1;
      $code  = $lead & 0x1F;
    } elseif (($lead & 0xF0) == 0xE0) {  // 1110 xxxx
      $extra = 2;
      $code  = $lead & 0x0F;
    } elseif (($lead & 0xF8) == 0xF0) {  // 1111 0xxx
      $extra = 3;
      $code  = $lead & 0x07;
    } else {
      // ASCII or a byte that can not start a sequence handled here
      $ret .= $str[$i];
      $i++;
      continue;
    }

    // all continuation bytes have to exist and look like 10xx xxxx
    $valid = ($i + $extra < $max);
    for ($k = 1; $valid && $k <= $extra; $k++) {
      $next = ord($str[$i + $k]);
      if (($next & 0xC0) != 0x80) {
        $valid = false;
      } else {
        $code = ($code << 6) | ($next & 0x3F);
      }
    }

    if ($valid) {
      $ret .= '&#' . $code . ';';
      $i   += $extra + 1;
    } else {
      // broken sequence: keep the byte and go on with the next one
      $ret .= $str[$i];
      $i++;
    }
  }

  return $ret;
}
438ea2eed85Sandi
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING. Without $strict, bytes that fit neither as lead
 * nor as continuation byte are skipped and code points failing the
 * validity checks are still added to the output.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * Bytes are read with the $str[$i] syntax; the former $str{$i} form is a
 * parse error since PHP 8.0.
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }
    return $out;
}
61082257610Sandi
61182257610Sandi/**
6121abfaba4SAndreas Gohr * Takes an array of ints representing the Unicode characters and returns
6131abfaba4SAndreas Gohr * a UTF-8 string. Astral planes are supported ie. the ints in the
6141abfaba4SAndreas Gohr * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
6151abfaba4SAndreas Gohr * are not allowed.
61682257610Sandi *
6171abfaba4SAndreas Gohr * If $strict is set to true the function returns false if the input
6181abfaba4SAndreas Gohr * array contains ints that represent surrogates or are outside the
6191abfaba4SAndreas Gohr * Unicode range and raises a PHP error at level E_USER_WARNING
6201abfaba4SAndreas Gohr *
6211abfaba4SAndreas Gohr * Note: this function has been modified slightly in this library to use
6221abfaba4SAndreas Gohr * output buffering to concatenate the UTF-8 string (faster) as well as
6231abfaba4SAndreas Gohr * reference the array by it's keys
6241abfaba4SAndreas Gohr *
6251abfaba4SAndreas Gohr * @param  array of unicode code points representing a string
6261abfaba4SAndreas Gohr * @param  boolean Check for invalid sequences?
6271abfaba4SAndreas Gohr * @return mixed UTF-8 string or FALSE if array contains invalid code points
6281abfaba4SAndreas Gohr * @author <hsivonen@iki.fi>
6291abfaba4SAndreas Gohr * @author Harry Fuecks <hfuecks@gmail.com>
6301abfaba4SAndreas Gohr * @see    utf8_to_unicode
6311abfaba4SAndreas Gohr * @link   http://hsivonen.iki.fi/php-utf8/
6321abfaba4SAndreas Gohr * @link   http://sourceforge.net/projects/phputf8/
63382257610Sandi */
function unicode_to_utf8($arr,$strict=false) {
    // Anything that is not an array of code points yields an empty string
    if (!is_array($arr)) return '';

    // The encoded bytes are echoed into an output buffer and collected at
    // the end. Every exit path below has to close this buffer again,
    // otherwise the partial result leaks into the regular page output.
    ob_start();

    foreach ($arr as $k => $cp) {

        # Negative values are not code points at all
        if ($cp < 0) {

            if($strict){
                // drop the partial result before reporting, so neither the
                // bytes nor a displayed warning end up in a dangling buffer
                ob_end_clean();
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }
            // non-strict: skipped silently like any other out of range value

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            echo chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            echo chr(0xc0 | ($cp >> 6));
            echo chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate; it is dropped unless strict checking is on
            if($strict){
                ob_end_clean();
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            echo chr(0xe0 | ($cp >> 12));
            echo chr(0x80 | (($cp >> 6) & 0x003f));
            echo chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else if ($cp <= 0x10ffff) {

            echo chr(0xf0 | ($cp >> 18));
            echo chr(0x80 | (($cp >> 12) & 0x3f));
            echo chr(0x80 | (($cp >> 6) & 0x3f));
            echo chr(0x80 | ($cp & 0x3f));

        # Above the Unicode range
        } elseif($strict) {

            ob_end_clean();
            trigger_error(
                'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
                );

            // out of range
            return FALSE;
        }
    }

    // fetch the collected bytes and close the buffer in one step
    return ob_get_clean();
}
70182257610Sandi
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * With mb_string available the conversion is delegated to
 * mb_convert_encoding(). Otherwise the string is decoded with
 * utf8_to_unicode() (defined elsewhere in this file, expected to return a
 * list of code points) and packed manually: BMP code points become a single
 * 16 bit unit, everything above U+FFFF is written as a surrogate pair
 * instead of being truncated to 16 bits.
 *
 * @param  string $str  UTF-8 string (passed by reference, not modified here)
 * @param  bool   $bom  prepend the UTF-16BE byte order mark (FE FF)?
 * @return string UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-8' === 'UTF-16BE' ? 'UTF-8' : 'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  foreach($uni as $cp){
    if($cp > 0xFFFF){
      // supplementary planes: split the 20 bit offset into high/low surrogate
      $cp -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x03FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
71715fa0b4fSAndreas Gohr
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The input is unpacked into 16 bit big endian units. A high surrogate
 * (D800-DBFF) directly followed by a low surrogate (DC00-DFFF) is merged
 * into the supplementary code point it encodes; all other units, including
 * unpaired surrogates, are handed to unicode_to_utf8() unchanged.
 *
 * @param  string $str  UTF-16BE string (passed by reference, not modified here)
 * @return string whatever unicode_to_utf8() returns for the decoded code points
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  // keep the old behaviour for unpack failures: unicode_to_utf8() turns
  // any non-array into an empty string
  if(!is_array($units)) return unicode_to_utf8($units);

  $uni  = array();
  $high = 0; // high surrogate still waiting for its low partner, 0 if none
  foreach($units as $unit){
    if($high){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        // complete pair -> one code point above U+FFFF
        $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high  = 0;
        continue;
      }
      // partner missing: pass the lone surrogate on untouched
      $uni[] = $high;
      $high  = 0;
    }

    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
    }else{
      $uni[] = $unit;
    }
  }
  if($high) $uni[] = $high;

  return unicode_to_utf8($uni);
}
72715fa0b4fSAndreas Gohr
/**
 * Replace bad bytes with an alternative character
 *
 * The string is consumed one match at a time: every well-formed UTF-8
 * character is copied to the result, every byte that does not start a
 * well-formed sequence is substituted with $replace.
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string to search
 * @param string to replace bad bytes with (defaults to '', i.e. bad bytes
 *               are simply removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte
    $pattern = '/'.$UTF8_BAD.'/S';

    $result = '';
    $length = strlen($str);
    $pos    = 0;

    // Walk through the string by offset instead of chopping the matched
    // prefix off with substr(), which recopied the remainder on every step.
    // Every byte is matched by one of the alternatives, so each match
    // starts exactly at $pos.
    while ($pos < $length && preg_match($pattern, $str, $matches, 0, $pos)) {
        // group 2 only takes part in the match for the "invalid byte" branch
        $result .= isset($matches[2]) ? $replace : $matches[0];
        $pos    += strlen($matches[0]);
    }

    return $result;
}
767ab77016bSAndreas Gohr
/**
 * Move a byte offset onto the nearest UTF-8 character boundary
 *
 * Offsets at or below zero are answered with 0, offsets at or beyond the
 * end of the string with the string's byte length. Any other offset is
 * shifted for as long as it points at a continuation byte (10xxxxxx).
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction in which to look for the boundary,
 *                           false = backwards (start of the current character)
 *                           true  = forwards (start of the next character)
 *
 * @return int            byte index into $str sitting on a character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  // clamp to the start of the string
  if ($i <= 0) return 0;

  // clamp to the end of the string
  $limit = strlen($str);
  if ($i>=$limit) return $limit;

  if (!$next) {
    // step back to the lead byte of the character $i points into
    for (; $i; $i--) {
      if ((ord($str[$i]) & 0xC0) != 0x80) break;
    }
    return $i;
  }

  // step forward to the lead byte of the following character
  for (; $i<$limit; $i++) {
    if ((ord($str[$i]) & 0xC0) != 0x80) break;
  }
  return $i;
}
7965953e889Schris
797ab77016bSAndreas Gohr// only needed if no mb_string available
798ab77016bSAndreas Gohrif(!UTF8_MBSTRING){
799ab77016bSAndreas Gohr
  /**
   * UTF-8 Case lookup table
   *
   * This lookup table maps lower case letters to their corresponding
   * upper case letters. Keys and values are Unicode code points.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
80854662a04SAndreas Gohr  global $UTF8_LOWER_TO_UPPER;
80954662a04SAndreas Gohr  $UTF8_LOWER_TO_UPPER = array(
81082257610Sandi    0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
81182257610Sandi    0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
81282257610Sandi    0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
81382257610Sandi    0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
81482257610Sandi    0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
81582257610Sandi    0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
81682257610Sandi    0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
81782257610Sandi    0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
81882257610Sandi    0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
81982257610Sandi    0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
82082257610Sandi    0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
82182257610Sandi    0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
82282257610Sandi    0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
82382257610Sandi    0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
82482257610Sandi    0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
82582257610Sandi    0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
82682257610Sandi    0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
82782257610Sandi    0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
82882257610Sandi    0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
82982257610Sandi    0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
83082257610Sandi    0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
83182257610Sandi    0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
83282257610Sandi    0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
83382257610Sandi    0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
83482257610Sandi    0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
83582257610Sandi    0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
83682257610Sandi    0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
83782257610Sandi    0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
83882257610Sandi    0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
83982257610Sandi    0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
84082257610Sandi    0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
84182257610Sandi    0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
84282257610Sandi    0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
84382257610Sandi    0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
84482257610Sandi    0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
84582257610Sandi    0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
84682257610Sandi    0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
84782257610Sandi    0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
84882257610Sandi    0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
84982257610Sandi    0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
85082257610Sandi    0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
85182257610Sandi    0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
85282257610Sandi    0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
85382257610Sandi  );
85482257610Sandi
  /**
   * UTF-8 Case lookup table
   *
   * This lookup table maps upper case letters to their corresponding
   * lower case letters (it is built by flipping $UTF8_LOWER_TO_UPPER)
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
86354662a04SAndreas Gohr  global $UTF8_UPPER_TO_LOWER;
86482257610Sandi  $UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);
86582257610Sandi
866ab77016bSAndreas Gohr} // end of case lookup tables
867ab77016bSAndreas Gohr
868ab77016bSAndreas Gohr
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented characters to replacements from the
 * ASCII-7 range. It contains lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
87854662a04SAndreas Gohrglobal $UTF8_LOWER_ACCENTS;
87982257610Sandi$UTF8_LOWER_ACCENTS = array(
88082257610Sandi  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
88182257610Sandi  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
88282257610Sandi  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
88382257610Sandi  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
88482257610Sandi  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
88582257610Sandi  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
88682257610Sandi  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
88782257610Sandi  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
88882257610Sandi  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
88982257610Sandi  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
89082257610Sandi  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
89182257610Sandi  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
89282257610Sandi  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
89382257610Sandi  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
89474c0c504Schris  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
89582257610Sandi);
89682257610Sandi
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented characters to replacements from the
 * ASCII-7 range. It contains upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
90654662a04SAndreas Gohrglobal $UTF8_UPPER_ACCENTS;
90782257610Sandi$UTF8_UPPER_ACCENTS = array(
908df3ecd55SAndreas Gohr  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
909df3ecd55SAndreas Gohr  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
910df3ecd55SAndreas Gohr  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
911df3ecd55SAndreas Gohr  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
912df3ecd55SAndreas Gohr  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
913df3ecd55SAndreas Gohr  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
914df3ecd55SAndreas Gohr  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
915df3ecd55SAndreas Gohr  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
916df3ecd55SAndreas Gohr  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
917df3ecd55SAndreas Gohr  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
918df3ecd55SAndreas Gohr  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
919df3ecd55SAndreas Gohr  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
920df3ecd55SAndreas Gohr  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
921df3ecd55SAndreas Gohr  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
92274c0c504Schris  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
92382257610Sandi);
92482257610Sandi
925099ada41Sandi/**
926099ada41Sandi * UTF-8 array of common special characters
927099ada41Sandi *
928099ada41Sandi * This array should contain all special characters (not a letter or digit)
929099ada41Sandi * defined in the various local charsets - it's not a complete list of non-alphanum
930099ada41Sandi * characters in UTF-8. It's not perfect but should match most cases of special
931099ada41Sandi * chars.
932099ada41Sandi *
933099ada41Sandi * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
934ad81d431SAndreas Gohr * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
935099ada41Sandi *
936099ada41Sandi * @author Andreas Gohr <andi@splitbrain.org>
937099ada41Sandi * @see    utf8_stripspecials()
938099ada41Sandi */
93954662a04SAndreas Gohrglobal $UTF8_SPECIAL_CHARS;
940099ada41Sandi$UTF8_SPECIAL_CHARS = array(
941099ada41Sandi  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
942ad81d431SAndreas Gohr  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
9435c812709Sandi          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
9445c812709Sandi  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
945099ada41Sandi  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
946099ada41Sandi  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
947099ada41Sandi  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
948099ada41Sandi  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
949099ada41Sandi  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
950099ada41Sandi  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
951099ada41Sandi  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
952099ada41Sandi  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
953099ada41Sandi  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
954099ada41Sandi  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
955099ada41Sandi  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
956099ada41Sandi  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
957099ada41Sandi  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
958099ada41Sandi  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
959099ada41Sandi  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
960099ada41Sandi  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
961099ada41Sandi  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
962099ada41Sandi  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
963099ada41Sandi  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
964099ada41Sandi  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
965099ada41Sandi  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
966099ada41Sandi  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
967099ada41Sandi  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
968099ada41Sandi  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
969099ada41Sandi  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
970099ada41Sandi  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
971099ada41Sandi  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
972099ada41Sandi  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
973099ada41Sandi  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
974099ada41Sandi  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
975099ada41Sandi  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
976099ada41Sandi  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
977099ada41Sandi  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
978099ada41Sandi  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
979099ada41Sandi  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
980099ada41Sandi  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
981099ada41Sandi  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
982099ada41Sandi  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
983099ada41Sandi  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
984099ada41Sandi  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
985099ada41Sandi  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
986099ada41Sandi  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
987099ada41Sandi  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
988099ada41Sandi  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
989099ada41Sandi  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
990099ada41Sandi  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
991099ada41Sandi);
992340756e4Sandi
993*720307d9Schris// utf8 version of above data
994*720307d9Schrisglobal $UTF8_SPECIAL_CHARS2;
995*720307d9Schris$UTF8_SPECIAL_CHARS2 =
996*720307d9Schris    ' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
997*720307d9Schris    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
998*720307d9Schris    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·βφϑϒϕϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
999*720307d9Schris    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1000*720307d9Schris    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1001*720307d9Schris    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1002*720307d9Schris    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1003*720307d9Schris    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1004*720307d9Schris    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1005*720307d9Schris    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1006*720307d9Schris    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1007*720307d9Schris    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1008*720307d9Schris    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1009*720307d9Schris    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1010*720307d9Schris    '➷➸➹➺➻➼➽➾�'.
1011*720307d9Schris    '�ﹼﹽ';
1012*720307d9Schris
10138a831f2bSAndreas Gohr/**
10148a831f2bSAndreas Gohr * Romanization lookup table
10158a831f2bSAndreas Gohr *
10168a831f2bSAndreas Gohr * This lookup tables provides a way to transform strings written in a language
10178a831f2bSAndreas Gohr * different from the ones based upon latin letters into plain ASCII.
10188a831f2bSAndreas Gohr *
10198a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works
10208a831f2bSAndreas Gohr * oneway from nonlatin to ASCII and it works by simple character replacement
10218a831f2bSAndreas Gohr * only. Specialities of each language are not supported.
10228a831f2bSAndreas Gohr *
10238a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org>
10248a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com>
10258a831f2bSAndreas Gohr * @link   http://www.uconv.com/translit.htm
10268a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi>
10278a831f2bSAndreas Gohr * @link   http://kanjidict.stc.cx/hiragana.php?src=2
10288a831f2bSAndreas Gohr * @link   http://www.translatum.gr/converter/greek-transliteration.htm
10298a831f2bSAndreas Gohr * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
10308a831f2bSAndreas Gohr * @link   http://www.btranslations.com/resources/romanization/korean.asp
10318a831f2bSAndreas Gohr */
103254662a04SAndreas Gohrglobal $UTF8_ROMANIZATION;
10338a831f2bSAndreas Gohr$UTF8_ROMANIZATION = array(
10348a831f2bSAndreas Gohr  //russian cyrillic
10358a831f2bSAndreas Gohr  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
10368a831f2bSAndreas Gohr  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
10378a831f2bSAndreas Gohr  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
10388a831f2bSAndreas Gohr  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
10398a831f2bSAndreas Gohr  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
10408a831f2bSAndreas Gohr  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1041d8cb2602SDenis Simakov  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1042d8cb2602SDenis Simakov  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
10438a831f2bSAndreas Gohr  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
10448a831f2bSAndreas Gohr  // Ukrainian cyrillic
10458a831f2bSAndreas Gohr  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
10468a831f2bSAndreas Gohr  // Georgian
10478a831f2bSAndreas Gohr  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
10488a831f2bSAndreas Gohr  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
10498a831f2bSAndreas Gohr  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
10508a831f2bSAndreas Gohr  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
10518a831f2bSAndreas Gohr  'ჰ'=>'xh',
10528a831f2bSAndreas Gohr  //Sanskrit
10538a831f2bSAndreas Gohr  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
10548a831f2bSAndreas Gohr  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
10558a831f2bSAndreas Gohr  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
10568a831f2bSAndreas Gohr  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
10578a831f2bSAndreas Gohr  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
10588a831f2bSAndreas Gohr  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
10598a831f2bSAndreas Gohr  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
10608a831f2bSAndreas Gohr  //Hebrew
10613dbad6dcSDenis Simakov  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
10623dbad6dcSDenis Simakov  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
10633dbad6dcSDenis Simakov  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
10648a831f2bSAndreas Gohr  'ש'=>'sh','ת'=>'t',
10658a831f2bSAndreas Gohr  //Arabic
10668a831f2bSAndreas Gohr  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
10678a831f2bSAndreas Gohr  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
10688a831f2bSAndreas Gohr  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
10698a831f2bSAndreas Gohr  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
10708a831f2bSAndreas Gohr
10718a831f2bSAndreas Gohr  // Japanese hiragana
10728a831f2bSAndreas Gohr  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
10738a831f2bSAndreas Gohr  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
10748a831f2bSAndreas Gohr  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
10758a831f2bSAndreas Gohr  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
10768a831f2bSAndreas Gohr  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
10778a831f2bSAndreas Gohr  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
10788a831f2bSAndreas Gohr  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
10798a831f2bSAndreas Gohr  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
10808a831f2bSAndreas Gohr  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
10818a831f2bSAndreas Gohr  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
10828a831f2bSAndreas Gohr  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
10838a831f2bSAndreas Gohr  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
10848a831f2bSAndreas Gohr  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
10858a831f2bSAndreas Gohr  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
10868a831f2bSAndreas Gohr  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
10878a831f2bSAndreas Gohr  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
10888a831f2bSAndreas Gohr  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
10898a831f2bSAndreas Gohr  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
10908a831f2bSAndreas Gohr  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
10918a831f2bSAndreas Gohr  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
10928a831f2bSAndreas Gohr  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
10938a831f2bSAndreas Gohr  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
10948a831f2bSAndreas Gohr  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
10958a831f2bSAndreas Gohr  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
10968a831f2bSAndreas Gohr  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
10978a831f2bSAndreas Gohr  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
10988a831f2bSAndreas Gohr  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
10998a831f2bSAndreas Gohr  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
11008a831f2bSAndreas Gohr  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
11018a831f2bSAndreas Gohr  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
11028a831f2bSAndreas Gohr  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
11038a831f2bSAndreas Gohr  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
11048a831f2bSAndreas Gohr  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
11058a831f2bSAndreas Gohr  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
11068a831f2bSAndreas Gohr  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
11078a831f2bSAndreas Gohr  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
11088a831f2bSAndreas Gohr  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
11098a831f2bSAndreas Gohr  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
11108a831f2bSAndreas Gohr  'じゅ'=>'zyu',
11118a831f2bSAndreas Gohr  // Japanese katakana
11128a831f2bSAndreas Gohr  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
11138a831f2bSAndreas Gohr  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
11148a831f2bSAndreas Gohr  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
11158a831f2bSAndreas Gohr  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
11168a831f2bSAndreas Gohr  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
11178a831f2bSAndreas Gohr  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
11188a831f2bSAndreas Gohr  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
11198a831f2bSAndreas Gohr  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
11208a831f2bSAndreas Gohr  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
11218a831f2bSAndreas Gohr  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
11228a831f2bSAndreas Gohr  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
11238a831f2bSAndreas Gohr  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
11248a831f2bSAndreas Gohr  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
11258a831f2bSAndreas Gohr  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
11268a831f2bSAndreas Gohr  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
11278a831f2bSAndreas Gohr  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
11288a831f2bSAndreas Gohr  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
11298a831f2bSAndreas Gohr  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
11308a831f2bSAndreas Gohr  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
11318a831f2bSAndreas Gohr  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
11328a831f2bSAndreas Gohr  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
11338a831f2bSAndreas Gohr  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
11348a831f2bSAndreas Gohr  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
11358a831f2bSAndreas Gohr  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
11368a831f2bSAndreas Gohr  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
11378a831f2bSAndreas Gohr  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
11388a831f2bSAndreas Gohr  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
11398a831f2bSAndreas Gohr  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
11408a831f2bSAndreas Gohr  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
11418a831f2bSAndreas Gohr  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
11428a831f2bSAndreas Gohr  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
11438a831f2bSAndreas Gohr  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
11448a831f2bSAndreas Gohr  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
11458a831f2bSAndreas Gohr  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
11468a831f2bSAndreas Gohr  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
11478a831f2bSAndreas Gohr  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
11488a831f2bSAndreas Gohr  'ジョ'=>'zyo','ジュ'=>'zyu',
11498a831f2bSAndreas Gohr
11508a831f2bSAndreas Gohr  // "Greeklish"
11518a831f2bSAndreas Gohr  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
11528a831f2bSAndreas Gohr  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
11538a831f2bSAndreas Gohr
11548a831f2bSAndreas Gohr  // Thai
11558a831f2bSAndreas Gohr  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
11568a831f2bSAndreas Gohr  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
11578a831f2bSAndreas Gohr  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
11588a831f2bSAndreas Gohr  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
11598a831f2bSAndreas Gohr  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
11608a831f2bSAndreas Gohr  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
11618a831f2bSAndreas Gohr  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
11628a831f2bSAndreas Gohr  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
11638a831f2bSAndreas Gohr  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
11648a831f2bSAndreas Gohr  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
11658a831f2bSAndreas Gohr  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
11668a831f2bSAndreas Gohr  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
11678a831f2bSAndreas Gohr  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
11688a831f2bSAndreas Gohr  'เ–ียว'=>'iao',
11698a831f2bSAndreas Gohr
11708a831f2bSAndreas Gohr  // Korean
11718a831f2bSAndreas Gohr  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
11728a831f2bSAndreas Gohr  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
11738a831f2bSAndreas Gohr  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
11748a831f2bSAndreas Gohr  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
11758a831f2bSAndreas Gohr  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
11768a831f2bSAndreas Gohr  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
11778a831f2bSAndreas Gohr);
1178340756e4Sandi
1179340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
11808a831f2bSAndreas Gohr
1181