xref: /dokuwiki/inc/utf8.php (revision f5e334dec128887a129dfd43e0dc7edfd0c06c8d)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
54a47269fSandi * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * Detect whether the mbstring extension may be used
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has
 * not been defined, otherwise 0. A value defined before this point is kept.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// make UTF-8 the default for all mb_* calls when the extension is in use
if(UTF8_MBSTRING) mb_internal_encoding('UTF-8');
215e613a5cSchris
22ab77016bSAndreas Gohr
/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will
 * be encoded only if characters outside of [a-zA-Z0-9/_-.%]
 * are detected - This makes it safe to run it multiple
 * times on the same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file the filename to encode
 * @param  boolean $safe skip encoding when the name looks already encoded
 * @return string  the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
  // The D modifier anchors $ at the very end of the string. Without it a
  // name ending in a newline would be treated as safe and returned unencoded.
  if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
    return $file;
  }
  $file = urlencode($file);
  $file = str_replace('%2F','/',$file); // keep directory separators readable
  return $file;
}
4449c713a3Sandi
/**
 * URL-Decode a filename
 *
 * Thin wrapper that hands the name to urldecode() and returns the result
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
5749c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * The check is done bytewise (no /u modifier), so it works on any
 * byte string regardless of whether it is valid UTF-8.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $str the string to inspect
 * @return boolean true if no byte has the high bit set (also for '')
 */
function utf8_isASCII($str){
  // a single bytewise scan replaces the former $str{$i} loop, whose curly
  // brace offset syntax is no longer available in current PHP versions
  return !preg_match('/[\x80-\xFF]/', $str);
}
6944f669e9Sandi
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte >= 0x80 is removed, all
 * other bytes are kept in their original order.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string the input without any high bytes
 */
function utf8_strip($str){
  // bytewise pattern (no /u) so malformed UTF-8 is handled as well; this
  // replaces the $str{$i} loop that relied on removed offset syntax
  return (string) preg_replace('/[\x80-\xFF]+/', '', $str);
}
86e1906e6eSandi
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks the bytes and verifies that every lead byte is followed by the
 * number of 10bbbbbb continuation bytes its bit pattern announces. Lead
 * bytes announcing up to five continuation bytes are accepted; the decoded
 * values themselves are not inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 */
function utf8_check($Str) {
 $total = strlen($Str);
 $pos   = 0;
 while ($pos < $total) {
  $lead = ord($Str[$pos]);
  $pos++;
  if ($lead < 0x80) continue;                    # 0bbbbbbb
  if     (($lead & 0xE0) == 0xC0) $follow = 1;   # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0) $follow = 2;   # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0) $follow = 3;   # 11110bbb
  elseif (($lead & 0xFC) == 0xF8) $follow = 4;   # 111110bb
  elseif (($lead & 0xFE) == 0xFC) $follow = 5;   # 1111110b
  else return false;                             # Does not match any model
  # each announced byte has to exist and to look like 10bbbbbb
  while ($follow > 0) {
   if ($pos == $total) return false;
   if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
   $pos++;
   $follow--;
  }
 }
 return true;
}
11049c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * In UTF-8 every character consists of exactly one non-continuation byte
 * (ASCII or lead byte) plus zero or more continuation bytes (0x80-0xBF).
 * Dropping the continuation bytes therefore leaves one byte per character.
 *
 * This replaces the former strlen(utf8_decode()) trick, because
 * utf8_decode() is deprecated in current PHP versions. For well-formed
 * UTF-8 the result is the same.
 * NOTE(review): malformed input (e.g. stray continuation bytes) may be
 * counted differently than utf8_decode() did - confirm no caller relies
 * on that.
 *
 * @see    strlen()
 * @param  string  $string UTF-8 encoded string
 * @return integer number of characters
 */
function utf8_strlen($string){
  return strlen((string) preg_replace('/[\x80-\xBF]/', '', (string) $string));
}
1252f954959Sandi
/**
 * Build a PCRE fragment matching exactly $count characters
 *
 * PCRE only supports repetition counts below 65536, so larger counts are
 * expressed as repeated groups of 65535 characters plus a remainder.
 *
 * @param  integer $count number of characters to match (>= 0)
 * @return string  pattern fragment without delimiters or capture group
 */
function _utf8_substr_repeat($count) {
    $full = (int)($count/65535);
    $rest = $count%65535;
    $pattern = $full ? '(?:.{65535}){'.$full.'}' : '';
    return $pattern.'.{'.$rest.'}';
}

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 *
 * Uses mb_substr() when UTF8_MBSTRING is set. Otherwise the substring is
 * cut out with a regular expression using the 'u' modifier. Like mb_substr
 * (and unlike substr) this returns an empty string rather than false when
 * nothing can be extracted.
 *
 * Counting the characters of the string is relatively expensive, so it is
 * only done when needed (negative offset or a given length).
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }
        return mb_substr($str, $offset, $length);
    }

    // cast parameters to appropriate types to avoid multiple notices/warnings
    $str = (string)$str;
    $offset = (int)$offset;
    if (!is_null($length)) $length = (int)$length;

    // handle trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    $strlen = null;   // character count, determined lazily

    // normalise -ve offsets (a tail anchored pattern would be horribly slow)
    if ($offset < 0) {
        $strlen = utf8_strlen($str);
        $offset = max(0, $strlen + $offset);
    }

    // offset: skip $offset characters in a non-captured group
    if ($offset > 0) {
        $offset_pattern = '^(?:'._utf8_substr_repeat($offset).')';
    } else {
        $offset_pattern = '^';                    // offset == 0; just anchor the pattern
    }

    // length: decide what is captured
    if (is_null($length)) {
        $length_pattern = '(.*)$';                // the rest of the string
    } else {
        if (is_null($strlen)) $strlen = utf8_strlen($str);
        if ($offset > $strlen) return '';         // another trivial case

        if ($length > 0) {
            // reduce any length that would go past the end of the string
            $length = min($strlen-$offset, $length);
            // +ve length: a captured group of $length characters
            $length_pattern = '('._utf8_substr_repeat($length).')';
        } else {
            // -ve length ($length === 0 was handled above): capture everything
            // except -$length characters anchored at the end of the string
            if ($length < ($offset - $strlen)) return '';
            $length_pattern = '(.*)(?:'._utf8_substr_repeat(-$length).')$';
        }
    }

    if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
    return $match[1];
}
22710f09f2aSAndreas Gohr
/**
 * Unicode aware replacement for substr_replace()
 *
 * Replaces $length characters of $string, starting at character $start,
 * with $replacement. As with substr_replace() a negative $start counts
 * from the end of the string and a negative $length gives the number of
 * characters to keep at the end of the string.
 *
 * Note that - unlike substr_replace() - the default $length is 0, i.e.
 * the replacement is inserted and nothing is removed.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string      UTF-8 encoded input
 * @param  string  $replacement text to put in
 * @param  integer $start       character offset, negative counts from the end
 * @param  integer $length      characters to replace, negative stops that
 *                              many characters before the end
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  // negative values are relative to the end, translate them into plain
  // offsets first (the character count is only needed in that case)
  if($start < 0 || $length < 0){
    $strlen = utf8_strlen($string);
    if($start < 0) $start = max(0, $strlen + $start);
    if($start > $strlen) $start = $strlen;
    if($length < 0) $length = max(0, $strlen - $start + $length);
  }

  $ret = '';
  if($start>0) $ret .= utf8_substr($string, 0, $start);
  $ret .= $replacement;
  $ret .= utf8_substr($string, $start+$length);
  return $ret;
}
241dc57ef04Sandi
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a /u pattern for preg_split().
 * An empty separator raises an E_USER_WARNING and yields FALSE.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return FALSE;
}
257f29317c1Sandi
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and applied as /u patterns through
 * preg_replace(). Replacements are inserted literally: backslashes and
 * dollar signs are escaped so they are not taken as backreferences.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   string or array of strings to search for
 * @param  mixed  $r   string or array of strings to insert
 * @param  mixed  $str subject string (or array of subjects)
 * @return mixed  result of preg_replace()
 */
function utf8_str_replace($s,$r,$str){
  if(!is_array($s)){
    $s = '!'.preg_quote($s,'!').'!u';
  }else{
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }

  // preg_replace() would expand $n and \n in the replacement, which is
  // not what a str_replace() substitute should do
  if(!is_array($r)){
    $r = addcslashes($r,'\\$');
  }else{
    foreach ($r as $k => $v) {
      $r[$k] = addcslashes($v,'\\$');
    }
  }

  return preg_replace($s,$r,$str);
}
275f29317c1Sandi
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist this is plain ltrim(). With a charlist every listed
 * character is removed from the start of the string; the list is taken
 * literally (no a..z ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  // quote charlist for use in a character class - preg_quote also covers
  // '^', which would otherwise negate the class when listed first
  $charlist = preg_quote($charlist,'/');

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
291f29317c1Sandi
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist this is plain rtrim(). With a charlist every listed
 * character is removed from the end of the string; the list is taken
 * literally (no a..z ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  // quote charlist for use in a character class - preg_quote also covers
  // '^', which would otherwise negate the class when listed first
  $charlist = preg_quote($charlist,'/');

  // D modifier: $ has to match at the very end only, not in front of a
  // trailing newline that is not part of the charlist
  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
307f29317c1Sandi
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist this is plain trim(). With a charlist the listed
 * characters are removed from both ends of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to be handed on, otherwise only whitespace is trimmed
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
320f29317c1Sandi
3212f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Otherwise the string is turned
 * into code points and each one having an entry in $UTF8_UPPER_TO_LOWER
 * is replaced by its mapped value.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string){
  if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most code points have no mapping - probe with !empty() instead of
    // reading a missing index
    if(!empty($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
34482257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Otherwise the string is turned
 * into code points and each one having an entry in $UTF8_LOWER_TO_UPPER
 * is replaced by its mapped value.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string){
  if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most code points have no mapping - probe with !empty() instead of
    // reading a missing index
    if(!empty($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
36782257610Sandi
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The optional $case parameter selects the tables to apply: -1 uses the
 * lower case table only, 1 the upper case table only and 0 (default)
 * both of them, lower case first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function utf8_deaccent($string,$case=0){
  if($case <= 0){
    global $UTF8_LOWER_ACCENTS;
    $accented = array_keys($UTF8_LOWER_ACCENTS);
    $plain    = array_values($UTF8_LOWER_ACCENTS);
    $string   = str_replace($accented,$plain,$string);
  }
  if($case >= 0){
    global $UTF8_UPPER_ACCENTS;
    $accented = array_keys($UTF8_UPPER_ACCENTS);
    $plain    = array_values($UTF8_UPPER_ACCENTS);
    $string   = str_replace($accented,$plain,$string);
  }
  return $string;
}
38782257610Sandi
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned as is, everything else is translated
 * through the $UTF8_ROMANIZATION table using strtr().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    $string = strtr($string,$UTF8_ROMANIZATION);
  }
  return $string;
}
3998a831f2bSAndreas Gohr
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Besides the characters listed in $UTF8_SPECIAL_CHARS2 the bytes 0x00
 * to 0x19 are stripped as well. The quoted character list is built on
 * the first call and reused afterwards.
 *
 * NOTE(review): the range ends at 0x19, so 0x1A-0x1F are kept - confirm
 * whether 0x1F was meant.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS, $UTF8_SPECIAL_CHARS2;

  static $specials = null;
  if($specials === null){
    $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  // $additional goes into the class unquoted, callers have to escape it
  $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
  return preg_replace($pattern,$repl,$string);
}
423099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. The fallback splits the
 * haystack at the needle and measures the part in front of the first
 * occurrence.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string  $haystack UTF-8 encoded string to search in
 * @param  string  $needle   UTF-8 encoded string to search for
 * @param  integer $offset   character offset to start at, negative
 *                           values count from the end of the haystack
 * @return mixed   character position of the first match or false
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

  $shift = 0;   // characters cut off in front of the searched part
  if($offset){
    if ( !is_int($offset) ) {
      trigger_error('Offset must be an integer',E_USER_WARNING);
      return false;
    }

    // translate an offset counted from the end into a plain one, otherwise
    // the position found in the tail could not be mapped back
    if ( $offset < 0 ) {
      $offset = max(0, utf8_strlen($haystack) + $offset);
    }

    $shift    = $offset;
    $haystack = utf8_substr($haystack, $offset);
  }

  // utf8_explode() yields FALSE for an empty needle or a failed split,
  // which must not be handed to count()
  $ar = utf8_explode($needle, $haystack);
  if ( !is_array($ar) || count($ar) < 2 ) {
    return false;
  }
  return utf8_strlen($ar[0]) + $shift;
}
4552f954959Sandi
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII is copied unchanged. Every complete 2, 3 or 4 byte sequence is
 * replaced by its decimal numeric entity (&#NNN;). Bytes that do not
 * form a complete sequence (stray continuation bytes, invalid lead
 * bytes, sequences cut off at the end) are copied unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string the string with non-ASCII characters as entities
 */
function utf8_tohtml ($str) {
  $ret  = '';
  $max  = strlen($str);
  $last = 0;  // index of the first byte not yet copied to $ret
  for ($i=0; $i<$max; $i++) {
    $c1 = ord($str[$i]);
    if ($c1 < 0x80) continue;                                    // ASCII

    // the lead byte tells the number of continuation bytes and carries
    // the topmost bits of the code point
    if     (($c1 & 0xE0) == 0xC0) { $n = 1; $cp = $c1 & 0x1F; }  // 110x xxxx
    elseif (($c1 & 0xF0) == 0xE0) { $n = 2; $cp = $c1 & 0x0F; }  // 1110 xxxx
    elseif (($c1 & 0xF8) == 0xF0) { $n = 3; $cp = $c1 & 0x07; }  // 1111 0xxx
    else continue;                                               // not a lead byte

    if ($i + $n >= $max) break;   // sequence is cut off, keep the rest as is

    // every continuation byte adds six more bits
    $valid = true;
    for ($j=1; $j<=$n; $j++) {
      $cn = ord($str[$i+$j]);
      if (($cn & 0xC0) != 0x80) { $valid = false; break; }
      $cp = ($cp << 6) | ($cn & 0x3F);
    }
    if (!$valid) continue;

    // append the regular characters passed so far, then the entity
    $ret .= substr($str, $last, $i-$last) . '&#' . $cp . ';';
    $i   += $n;
    $last = $i+1;
  }
  return $ret . substr($str, $last); // append the last batch of regular characters
}
482ea2eed85Sandi
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING. This includes a multi-octet sequence that is cut
 * off by the end of the string.
 *
 * If $strict is false, illegal lead octets and octets interrupting a
 * sequence are skipped, illegal code points are passed through.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: add its six payload bits at the position
                // given by the number of octets still outstanding
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }

    // a sequence still open here was cut off by the end of the input
    if ($strict && $mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                '   sequence in UTF-8 at byte '.$len,
                E_USER_WARNING
            );

        return FALSE;
    }

    return $out;
}
65482257610Sandi
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
 * level E_USER_WARNING. Without $strict such values are silently skipped.
 *
 * Note: the result is assembled by plain string concatenation. No output
 * buffer is opened, so the early returns taken in strict mode cannot leave
 * a buffer level behind.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points;
 *               an empty string if $arr is not an array
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $utf8 = '';

    foreach ($arr as $k => $cp) {

        # Outside the Unicode range; checked first so that negative values
        # can never reach the byte arithmetic below
        if ( ($cp < 0) || ($cp > 0x10ffff) ) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $utf8 .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $utf8 .= chr(0xc0 | ($cp >> 6));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $utf8 .= chr(0xe0 | ($cp >> 12));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
            $utf8 .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence (everything left is <= 0x10ffff)
        } else {

            $utf8 .= chr(0xf0 | ($cp >> 18));
            $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
            $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
            $utf8 .= chr(0x80 | ($cp & 0x3f));

        }
    }

    return $utf8;
}
74582257610Sandi
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the string
 * is decoded with utf8_to_unicode() and every code point is packed as a
 * big-endian 16 bit unit; code points above 0xFFFF (up to 0x10FFFF) are
 * written as a surrogate pair instead of being cut down to 16 bits.
 *
 * @param  string  $str UTF-8 string (taken by reference, not modified here)
 * @param  boolean $bom prepend the UTF-16BE byte order mark (FE FF)?
 * @return string  UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  // utf8_to_unicode() has paths returning FALSE; don't iterate a non-array
  if(!is_array($uni)) return $out;

  foreach($uni as $cp){
    if($cp > 0xFFFF && $cp <= 0x10FFFF){
      // supplementary plane: split into high and low surrogate
      $cp  -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
76115fa0b4fSAndreas Gohr
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The input is read as big-endian 16 bit units. A high surrogate directly
 * followed by a low surrogate is combined into the supplementary code point
 * it stands for; the resulting code points are encoded by unicode_to_utf8().
 * Unpaired surrogates are handed on unchanged, so unicode_to_utf8() treats
 * them exactly as it did before.
 *
 * @param  string $str UTF-16BE string (taken by reference, not modified here)
 * @return string UTF-8 encoded string
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return unicode_to_utf8($units);

  $uni  = array();
  $high = 0; // pending high surrogate, 0 when there is none
  foreach($units as $unit){
    if($high){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high  = 0;
        continue;
      }
      // high surrogate without a partner
      $uni[] = $high;
      $high  = 0;
    }
    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
    }else{
      $uni[] = $unit;
    }
  }
  if($high) $uni[] = $high;

  return unicode_to_utf8($uni);
}
77115fa0b4fSAndreas Gohr
/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is scanned left to right: each step consumes either one well
 * formed UTF-8 sequence, which is copied, or a single byte that does not
 * start one, which is replaced.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string $str     string to search
 * @param string $replace string to replace bad bytes with (defaults to '',
 *                        i.e. bad bytes are dropped) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    $result = '';
    $length = strlen($str);
    $pos    = 0;

    // \G pins every match to $pos, so the subject never has to be cut down
    // and re-copied between iterations
    while ($pos < $length && preg_match('/\G'.$UTF8_BAD.'/S', $str, $matches, 0, $pos)) {
        // group 2 only takes part in the match for the "invalid byte" branch
        $result .= isset($matches[2]) ? $replace : $matches[0];
        $pos    += strlen($matches[0]);
    }

    return $result;
}
811ab77016bSAndreas Gohr
/**
 * Move a byte index onto a UTF-8 character boundary
 *
 * Bytes of the form 10xxxxxx are continuation bytes. While the index points
 * at one of them it is moved, one byte at a time, in the requested direction
 * until it no longer does or an end of the string is reached.
 *
 * @param $str   string   utf8 character string (passed by reference, not modified)
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = backwards (start of the current character)
 *                           true  = forwards (start of the next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary;
 *                        0 if $i <= 0, strlen($str) if $i is at or beyond the end
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  // indexes outside of the string snap to the nearest end
  if ($i <= 0) return 0;

  $size = strlen($str);
  if ($i >= $size) return $size;

  if ($next) {
    for (; $i < $size; $i++) {
      if ((ord($str[$i]) & 0xC0) != 0x80) break;
    }
  } else {
    for (; $i; $i--) {
      if ((ord($str[$i]) & 0xC0) != 0x80) break;
    }
  }

  return $i;
}
8405953e889Schris
// only needed if no mb_string available
if(!UTF8_MBSTRING){

  /**
   * Case lookup table (lower case to upper case)
   *
   * Maps the Unicode code point of a lower case letter (key) to the code
   * point of the corresponding upper case letter (value). Every key is
   * listed exactly once.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
  global $UTF8_LOWER_TO_UPPER;
  $UTF8_LOWER_TO_UPPER = array(
    0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
    0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
    0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
    0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
    0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
    0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
    0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
    0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
    0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
    0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
    0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
    0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
    0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
    0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
    0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
    0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
    0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
    0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
    0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
    0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
    0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
    0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
    0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
    0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
    0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
    0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
    0x044C=>0x042C, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
    0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
    0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
    0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
    0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
    0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
    0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
    0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
    0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
    0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
    0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
    0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
    0x0446=>0x0426, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
    0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
    0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
    0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
    0x0123=>0x0122,
  );

  /**
   * Case lookup table (upper case to lower case)
   *
   * Maps the Unicode code point of an upper case letter (key) to the code
   * point of the corresponding lower case letter (value). It is built by
   * flipping $UTF8_LOWER_TO_UPPER.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
  global $UTF8_UPPER_TO_LOWER;
  // all values above are integers, so array_flip() has nothing to complain about
  $UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);

} // end of case lookup tables
911ab77016bSAndreas Gohr
912ab77016bSAndreas Gohr
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Maps lower case accented (and other non-ASCII latin) letters, given as
 * UTF-8 strings, to a replacement made of plain lower case ASCII letters.
 * Some replacements are two letters long (e.g. 'ß' becomes 'ss').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à'=>'a', 'ô'=>'o', 'ď'=>'d', 'ḟ'=>'f', 'ë'=>'e', 'š'=>'s', 'ơ'=>'o',
  'ß'=>'ss', 'ă'=>'a', 'ř'=>'r', 'ț'=>'t', 'ň'=>'n', 'ā'=>'a', 'ķ'=>'k',
  'ŝ'=>'s', 'ỳ'=>'y', 'ņ'=>'n', 'ĺ'=>'l', 'ħ'=>'h', 'ṗ'=>'p', 'ó'=>'o',
  'ú'=>'u', 'ě'=>'e', 'é'=>'e', 'ç'=>'c', 'ẁ'=>'w', 'ċ'=>'c', 'õ'=>'o',
  'ṡ'=>'s', 'ø'=>'o', 'ģ'=>'g', 'ŧ'=>'t', 'ș'=>'s', 'ė'=>'e', 'ĉ'=>'c',
  'ś'=>'s', 'î'=>'i', 'ű'=>'u', 'ć'=>'c', 'ę'=>'e', 'ŵ'=>'w', 'ṫ'=>'t',
  'ū'=>'u', 'č'=>'c', 'ö'=>'oe', 'è'=>'e', 'ŷ'=>'y', 'ą'=>'a', 'ł'=>'l',
  'ų'=>'u', 'ů'=>'u', 'ş'=>'s', 'ğ'=>'g', 'ļ'=>'l', 'ƒ'=>'f', 'ž'=>'z',
  'ẃ'=>'w', 'ḃ'=>'b', 'å'=>'a', 'ì'=>'i', 'ï'=>'i', 'ḋ'=>'d', 'ť'=>'t',
  'ŗ'=>'r', 'ä'=>'ae', 'í'=>'i', 'ŕ'=>'r', 'ê'=>'e', 'ü'=>'ue', 'ò'=>'o',
  'ē'=>'e', 'ñ'=>'n', 'ń'=>'n', 'ĥ'=>'h', 'ĝ'=>'g', 'đ'=>'d', 'ĵ'=>'j',
  'ÿ'=>'y', 'ũ'=>'u', 'ŭ'=>'u', 'ư'=>'u', 'ţ'=>'t', 'ý'=>'y', 'ő'=>'o',
  'â'=>'a', 'ľ'=>'l', 'ẅ'=>'w', 'ż'=>'z', 'ī'=>'i', 'ã'=>'a', 'ġ'=>'g',
  'ṁ'=>'m', 'ō'=>'o', 'ĩ'=>'i', 'ù'=>'u', 'į'=>'i', 'ź'=>'z', 'á'=>'a',
  'û'=>'u', 'þ'=>'th', 'ð'=>'dh', 'æ'=>'ae', 'µ'=>'u', 'ĕ'=>'e',
);
94082257610Sandi
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps upper case accented (and other non-ASCII latin) letters, given as
 * UTF-8 strings, to a replacement made of plain ASCII letters. Replacements
 * that are two letters long start with a capital (e.g. 'Ö' becomes 'Oe').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À'=>'A', 'Ô'=>'O', 'Ď'=>'D', 'Ḟ'=>'F', 'Ë'=>'E', 'Š'=>'S', 'Ơ'=>'O',
  'Ă'=>'A', 'Ř'=>'R', 'Ț'=>'T', 'Ň'=>'N', 'Ā'=>'A', 'Ķ'=>'K',
  'Ŝ'=>'S', 'Ỳ'=>'Y', 'Ņ'=>'N', 'Ĺ'=>'L', 'Ħ'=>'H', 'Ṗ'=>'P', 'Ó'=>'O',
  'Ú'=>'U', 'Ě'=>'E', 'É'=>'E', 'Ç'=>'C', 'Ẁ'=>'W', 'Ċ'=>'C', 'Õ'=>'O',
  'Ṡ'=>'S', 'Ø'=>'O', 'Ģ'=>'G', 'Ŧ'=>'T', 'Ș'=>'S', 'Ė'=>'E', 'Ĉ'=>'C',
  'Ś'=>'S', 'Î'=>'I', 'Ű'=>'U', 'Ć'=>'C', 'Ę'=>'E', 'Ŵ'=>'W', 'Ṫ'=>'T',
  'Ū'=>'U', 'Č'=>'C', 'Ö'=>'Oe', 'È'=>'E', 'Ŷ'=>'Y', 'Ą'=>'A', 'Ł'=>'L',
  'Ų'=>'U', 'Ů'=>'U', 'Ş'=>'S', 'Ğ'=>'G', 'Ļ'=>'L', 'Ƒ'=>'F', 'Ž'=>'Z',
  'Ẃ'=>'W', 'Ḃ'=>'B', 'Å'=>'A', 'Ì'=>'I', 'Ï'=>'I', 'Ḋ'=>'D', 'Ť'=>'T',
  'Ŗ'=>'R', 'Ä'=>'Ae', 'Í'=>'I', 'Ŕ'=>'R', 'Ê'=>'E', 'Ü'=>'Ue', 'Ò'=>'O',
  'Ē'=>'E', 'Ñ'=>'N', 'Ń'=>'N', 'Ĥ'=>'H', 'Ĝ'=>'G', 'Đ'=>'D', 'Ĵ'=>'J',
  'Ÿ'=>'Y', 'Ũ'=>'U', 'Ŭ'=>'U', 'Ư'=>'U', 'Ţ'=>'T', 'Ý'=>'Y', 'Ő'=>'O',
  'Â'=>'A', 'Ľ'=>'L', 'Ẅ'=>'W', 'Ż'=>'Z', 'Ī'=>'I', 'Ã'=>'A', 'Ġ'=>'G',
  'Ṁ'=>'M', 'Ō'=>'O', 'Ĩ'=>'I', 'Ù'=>'U', 'Į'=>'I', 'Ź'=>'Z', 'Á'=>'A',
  'Û'=>'U', 'Þ'=>'Th', 'Ð'=>'Dh', 'Æ'=>'Ae', 'Ĕ'=>'E',
);
96882257610Sandi
/**
 * Array of common special characters, given as Unicode code points
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of
 * non-alphanumeric characters in Unicode. It's not perfect but should match
 * most cases of special chars.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array; 0x1a to
 * 0x1f and the space 0x20 are. These chars are _not_ in the array either:
 * asterisk (0x2a), minus (0x2d), dot (0x2e), colon (0x3a), underscore (0x5f)
 *
 * The values are listed in ascending order, grouped by Unicode block.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  // Basic Latin: controls from 0x1a, space, punctuation, DEL
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c, 0x002f, 0x003b,
  0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e, 0x0060,
  0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
  // Latin-1 Supplement: C1 controls, punctuation and symbols
  0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
  0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
  0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
  0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
  0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
  0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
  0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
  0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
  0x00d7, 0x00f7,
  // Spacing Modifier Letters
  0x02c7, 0x02d8, 0x02d9, 0x02da, 0x02db, 0x02dc, 0x02dd,
  // Combining Diacritical Marks
  0x0300, 0x0301, 0x0303, 0x0309, 0x0323,
  // Greek
  0x0384, 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6,
  // Hebrew
  0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9,
  0x05bb, 0x05bc, 0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3,
  0x05f3, 0x05f4,
  // Arabic
  0x060c, 0x061b, 0x061f, 0x0640,
  0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651, 0x0652,
  0x066a,
  // Thai
  0x0e3f,
  // General Punctuation
  0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044,
  // Currency Symbols
  0x20a7, 0x20aa, 0x20ab, 0x20ac,
  // Letterlike Symbols
  0x2116, 0x2118, 0x2122, 0x2126, 0x2135,
  // Arrows
  0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195,
  0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4,
  // Mathematical Operators
  0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5,
  // Miscellaneous Technical
  0x2310, 0x2320, 0x2321, 0x2329, 0x232a,
  // Enclosed Alphanumerics
  0x2469,
  // Box Drawing
  0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c,
  // Block Elements
  0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 0x2591, 0x2592, 0x2593,
  // Geometric Shapes
  0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  // Miscellaneous Symbols
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666,
  // Dingbats
  0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be,
  // Private Use Area
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe,
  // Arabic Presentation Forms-B
  0xfe7c, 0xfe7d,
);
1036340756e4Sandi
// The special characters from $UTF8_SPECIAL_CHARS above as one UTF-8 encoded string
1038720307d9Schrisglobal $UTF8_SPECIAL_CHARS2;
1039720307d9Schris$UTF8_SPECIAL_CHARS2 =
1040720307d9Schris    ' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1041720307d9Schris    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1042720307d9Schris    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·βφϑϒϕϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1043720307d9Schris    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1044720307d9Schris    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1045720307d9Schris    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1046720307d9Schris    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1047720307d9Schris    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1048720307d9Schris    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1049720307d9Schris    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1050720307d9Schris    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1051720307d9Schris    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1052720307d9Schris    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1053720307d9Schris    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1054720307d9Schris    '➷➸➹➺➻➼➽➾�'.
1055720307d9Schris    '�ﹼﹽ';
1056720307d9Schris
10578a831f2bSAndreas Gohr/**
10588a831f2bSAndreas Gohr * Romanization lookup table
10598a831f2bSAndreas Gohr *
10608a831f2bSAndreas Gohr * This lookup tables provides a way to transform strings written in a language
10618a831f2bSAndreas Gohr * different from the ones based upon latin letters into plain ASCII.
10628a831f2bSAndreas Gohr *
10638a831f2bSAndreas Gohr * Please note: this is not a scientific transliteration table. It only works
10648a831f2bSAndreas Gohr * oneway from nonlatin to ASCII and it works by simple character replacement
10658a831f2bSAndreas Gohr * only. Specialities of each language are not supported.
10668a831f2bSAndreas Gohr *
10678a831f2bSAndreas Gohr * @author Andreas Gohr <andi@splitbrain.org>
10688a831f2bSAndreas Gohr * @author Vitaly Blokhin <vitinfo@vitn.com>
10698a831f2bSAndreas Gohr * @link   http://www.uconv.com/translit.htm
10708a831f2bSAndreas Gohr * @author Bisqwit <bisqwit@iki.fi>
10718a831f2bSAndreas Gohr * @link   http://kanjidict.stc.cx/hiragana.php?src=2
10728a831f2bSAndreas Gohr * @link   http://www.translatum.gr/converter/greek-transliteration.htm
10738a831f2bSAndreas Gohr * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
10748a831f2bSAndreas Gohr * @link   http://www.btranslations.com/resources/romanization/korean.asp
10758a831f2bSAndreas Gohr */
107654662a04SAndreas Gohrglobal $UTF8_ROMANIZATION;
10778a831f2bSAndreas Gohr$UTF8_ROMANIZATION = array(
10788a831f2bSAndreas Gohr  //russian cyrillic
10798a831f2bSAndreas Gohr  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
10808a831f2bSAndreas Gohr  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
10818a831f2bSAndreas Gohr  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
10828a831f2bSAndreas Gohr  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
10838a831f2bSAndreas Gohr  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
10848a831f2bSAndreas Gohr  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1085d8cb2602SDenis Simakov  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1086*f5e334deSAndreas Gohr  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
10878a831f2bSAndreas Gohr  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
10888a831f2bSAndreas Gohr  // Ukrainian cyrillic
10898a831f2bSAndreas Gohr  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
10908a831f2bSAndreas Gohr  // Georgian
10918a831f2bSAndreas Gohr  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
10928a831f2bSAndreas Gohr  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
10938a831f2bSAndreas Gohr  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
10948a831f2bSAndreas Gohr  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
10958a831f2bSAndreas Gohr  'ჰ'=>'xh',
10968a831f2bSAndreas Gohr  //Sanskrit
10978a831f2bSAndreas Gohr  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
10988a831f2bSAndreas Gohr  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
10998a831f2bSAndreas Gohr  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
11008a831f2bSAndreas Gohr  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
11018a831f2bSAndreas Gohr  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
11028a831f2bSAndreas Gohr  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
11038a831f2bSAndreas Gohr  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
11048a831f2bSAndreas Gohr  //Hebrew
11053dbad6dcSDenis Simakov  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
11063dbad6dcSDenis Simakov  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
11073dbad6dcSDenis Simakov  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
11088a831f2bSAndreas Gohr  'ש'=>'sh','ת'=>'t',
11098a831f2bSAndreas Gohr  //Arabic
11108a831f2bSAndreas Gohr  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
11118a831f2bSAndreas Gohr  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
11128a831f2bSAndreas Gohr  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
11138a831f2bSAndreas Gohr  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
11148a831f2bSAndreas Gohr
11158a831f2bSAndreas Gohr  // Japanese hiragana
11168a831f2bSAndreas Gohr  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
11178a831f2bSAndreas Gohr  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
11188a831f2bSAndreas Gohr  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
11198a831f2bSAndreas Gohr  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
11208a831f2bSAndreas Gohr  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
11218a831f2bSAndreas Gohr  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
11228a831f2bSAndreas Gohr  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
11238a831f2bSAndreas Gohr  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
11248a831f2bSAndreas Gohr  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
11258a831f2bSAndreas Gohr  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
11268a831f2bSAndreas Gohr  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
11278a831f2bSAndreas Gohr  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
11288a831f2bSAndreas Gohr  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
11298a831f2bSAndreas Gohr  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
11308a831f2bSAndreas Gohr  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
11318a831f2bSAndreas Gohr  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
11328a831f2bSAndreas Gohr  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
11338a831f2bSAndreas Gohr  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
11348a831f2bSAndreas Gohr  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
11358a831f2bSAndreas Gohr  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
11368a831f2bSAndreas Gohr  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
11378a831f2bSAndreas Gohr  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
11388a831f2bSAndreas Gohr  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
11398a831f2bSAndreas Gohr  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
11408a831f2bSAndreas Gohr  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
11418a831f2bSAndreas Gohr  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
11428a831f2bSAndreas Gohr  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
11438a831f2bSAndreas Gohr  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
11448a831f2bSAndreas Gohr  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
11458a831f2bSAndreas Gohr  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
11468a831f2bSAndreas Gohr  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
11478a831f2bSAndreas Gohr  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
11488a831f2bSAndreas Gohr  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
11498a831f2bSAndreas Gohr  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
11508a831f2bSAndreas Gohr  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
11518a831f2bSAndreas Gohr  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
11528a831f2bSAndreas Gohr  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
11538a831f2bSAndreas Gohr  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
11548a831f2bSAndreas Gohr  'じゅ'=>'zyu',
11558a831f2bSAndreas Gohr  // Japanese katakana
11568a831f2bSAndreas Gohr  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
11578a831f2bSAndreas Gohr  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
11588a831f2bSAndreas Gohr  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
11598a831f2bSAndreas Gohr  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
11608a831f2bSAndreas Gohr  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
11618a831f2bSAndreas Gohr  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
11628a831f2bSAndreas Gohr  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
11638a831f2bSAndreas Gohr  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
11648a831f2bSAndreas Gohr  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
11658a831f2bSAndreas Gohr  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
11668a831f2bSAndreas Gohr  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
11678a831f2bSAndreas Gohr  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
11688a831f2bSAndreas Gohr  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
11698a831f2bSAndreas Gohr  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
11708a831f2bSAndreas Gohr  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
11718a831f2bSAndreas Gohr  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
11728a831f2bSAndreas Gohr  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
11738a831f2bSAndreas Gohr  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
11748a831f2bSAndreas Gohr  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
11758a831f2bSAndreas Gohr  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
11768a831f2bSAndreas Gohr  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
11778a831f2bSAndreas Gohr  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
11788a831f2bSAndreas Gohr  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
11798a831f2bSAndreas Gohr  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
11808a831f2bSAndreas Gohr  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
11818a831f2bSAndreas Gohr  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
11828a831f2bSAndreas Gohr  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
11838a831f2bSAndreas Gohr  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
11848a831f2bSAndreas Gohr  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
11858a831f2bSAndreas Gohr  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
11868a831f2bSAndreas Gohr  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
11878a831f2bSAndreas Gohr  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
11888a831f2bSAndreas Gohr  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
11898a831f2bSAndreas Gohr  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
11908a831f2bSAndreas Gohr  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
11918a831f2bSAndreas Gohr  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
11928a831f2bSAndreas Gohr  'ジョ'=>'zyo','ジュ'=>'zyu',
11938a831f2bSAndreas Gohr
11948a831f2bSAndreas Gohr  // "Greeklish"
11958a831f2bSAndreas Gohr  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
11968a831f2bSAndreas Gohr  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
11978a831f2bSAndreas Gohr
11988a831f2bSAndreas Gohr  // Thai
11998a831f2bSAndreas Gohr  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
12008a831f2bSAndreas Gohr  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
12018a831f2bSAndreas Gohr  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
12028a831f2bSAndreas Gohr  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
12038a831f2bSAndreas Gohr  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
12048a831f2bSAndreas Gohr  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
12058a831f2bSAndreas Gohr  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
12068a831f2bSAndreas Gohr  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
12078a831f2bSAndreas Gohr  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
12088a831f2bSAndreas Gohr  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
12098a831f2bSAndreas Gohr  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
12108a831f2bSAndreas Gohr  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
12118a831f2bSAndreas Gohr  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
12128a831f2bSAndreas Gohr  'เ–ียว'=>'iao',
12138a831f2bSAndreas Gohr
12148a831f2bSAndreas Gohr  // Korean
12158a831f2bSAndreas Gohr  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
12168a831f2bSAndreas Gohr  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
12178a831f2bSAndreas Gohr  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
12188a831f2bSAndreas Gohr  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
12198a831f2bSAndreas Gohr  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
12208a831f2bSAndreas Gohr  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
12218a831f2bSAndreas Gohr);
1222340756e4Sandi
1223340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
12248a831f2bSAndreas Gohr
1225