xref: /dokuwiki/inc/utf8.php (revision 5e613a5c5e3fb29292e23a5fd83f17f25567a747)
1ed7b5f09Sandi<?php
282257610Sandi/**
382257610Sandi * UTF8 helper functions
482257610Sandi *
54a47269fSandi * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
682257610Sandi * @author     Andreas Gohr <andi@splitbrain.org>
782257610Sandi */
882257610Sandi
/**
 * Decide once whether the mbstring extension is used by the helpers below.
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING has
 * not been defined, otherwise 0. A value defined beforehand is left alone.
 * When mbstring is in use its internal encoding is switched to UTF-8.
 */
if(!defined('UTF8_MBSTRING')){
  $utf8_use_mb = function_exists('mb_substr') && !defined('UTF8_NOMBSTRING');
  define('UTF8_MBSTRING', $utf8_use_mb ? 1 : 0);
  unset($utf8_use_mb);
}

if(UTF8_MBSTRING){
  mb_internal_encoding('UTF-8');
}
21*5e613a5cSchris
22ab77016bSAndreas Gohr
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are left unencoded.
 *
 * With $safe enabled (the default) the name is returned untouched when it
 * consists solely of ASCII letters, digits and the characters / _ - . %
 * which makes it harmless to run the function repeatedly on the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file the filename to encode
 * @param  boolean $safe skip encoding when only safe characters are present
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  $mustEncode = !$safe || !preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file);
  if(!$mustEncode) return $file;

  // encode everything, then restore the path separators
  return str_replace('%2F','/',urlencode($file));
}
4449c713a3Sandi
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
5749c713a3Sandi
/**
 * Checks if a string contains 7bit ASCII only
 *
 * Implemented as a single byte-wise regular expression search for any
 * byte with the high bit set. This replaces the former character loop,
 * which relied on the curly brace string offset syntax that is no
 * longer accepted by PHP 8.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to inspect
 * @return boolean true if no byte above 127 is present (also for '')
 */
function utf8_isASCII($str){
  // no /u modifier on purpose: we look at raw bytes, not characters
  return !preg_match('/[\x80-\xFF]/', $str);
}
6944f669e9Sandi
/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string: every byte with a value of 128 or above
 * is removed, all other bytes are kept in their original order.
 *
 * The former byte loop used the curly brace string offset syntax, which
 * is no longer accepted by PHP 8; a byte-wise regular expression does
 * the same job.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string
 */
function utf8_strip($str){
  // no /u modifier on purpose: we operate on raw bytes
  return preg_replace('/[\x80-\xFF]/', '', $str);
}
86e1906e6eSandi
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Walks over the bytes and verifies that every lead byte is followed by
 * the number of continuation bytes (10bbbbbb) its bit pattern announces.
 * Lead bytes announcing up to five continuation bytes are accepted.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the string to check
 * @return boolean false on the first malformed or truncated sequence
 */
function utf8_check($Str) {
 $total = strlen($Str);
 for ($pos=0; $pos<$total; $pos++) {
  $lead = ord($Str[$pos]);

  // plain ASCII byte, nothing to verify
  if ($lead < 0x80) continue;

  // number of continuation bytes announced by the lead byte
  if     (($lead & 0xE0) == 0xC0) $trail = 1; # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0) $trail = 2; # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0) $trail = 3; # 11110bbb
  elseif (($lead & 0xFC) == 0xF8) $trail = 4; # 111110bb
  elseif (($lead & 0xFE) == 0xFC) $trail = 5; # 1111110b
  else return false;                          # matches no model

  // each announced byte has to exist and look like 10bbbbbb
  while ($trail-- > 0) {
   $pos++;
   if ($pos == $total) return false;
   if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
  }
 }
 return true;
}
11049c713a3Sandi
/**
 * Unicode aware replacement for strlen()
 *
 * The string is run through utf8_decode(), which turns every character
 * into exactly one byte (characters outside ISO-8859-1 become '?').
 * That is good enough for counting and even faster than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string the UTF-8 string to measure
 * @return integer number of characters
 */
function utf8_strlen($string){
  $singleByte = utf8_decode($string);
  return strlen($singleByte);
}
1252f954959Sandi
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 * Three strategies are used:
 *  - mb_substr() when the mbstring extension is enabled
 *  - a regular expression for non-negative offsets/lengths below 65534
 *  - conversion of character offsets to byte offsets (via
 *    _utf8_byteindex()) followed by a plain substr() for everything else,
 *    i.e. negative or very large values
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    // strategy 1: let mbstring do the work
    if(UTF8_MBSTRING){
        return ( $length === null )
            ? mb_substr($str, $offset)
            : mb_substr($str, $offset, $length);
    }

    $regexUsable = ( $offset >= 0 && $length >= 0 && $offset < 65534 && $length < 65534 );

    // strategy 2: skip $offset characters with a regexp and capture the rest
    if ( $regexUsable ) {
        $quantifier = '*'; // "up to the end" unless an exact length fits
        if ( $length !== null ) {
            $chars = strlen(utf8_decode($str));
            if ( $offset > $chars ) {
                return '';
            }
            if ( ( $offset + $length ) <= $chars ) {
                $quantifier = '{'.$length.'}';
            }
        }

        preg_match('/^.{'.$offset.'}(.'.$quantifier.')/us', $str, $hit);
        return isset($hit[1]) ? $hit[1] : false;
    }

    // strategy 3: convert character offsets to byte offsets and use substr()
    // 3.1 normalise parameters into a positive offset and length
    $chars = strlen(utf8_decode($str));

    if ($offset < 0) {
        $offset = max($chars+$offset,0);
    }
    if ($offset >= $chars) return false;

    // 3.2a no length given: only the start byte offset is needed
    if ($length === null) {
        list($from) = _utf8_byteindex($str,$offset);
        return substr($str,$from);
    }

    // a negative length counts back from the end of the string
    if ($length < 0) {
        $length = $chars-$offset+$length;
        if ($length < 0) return '';
    }

    if ($length === 0) return '';
    if ($chars - $offset < $length) $length = $chars-$offset;

    // 3.2b convert to start and end byte offsets
    list($from,$to) = _utf8_byteindex($str,$offset,$offset+$length);
    return substr($str,$from,$to-$from);
}
20310f09f2aSAndreas Gohr
20410f09f2aSAndreas Gohr
/**
 * Unicode aware replacement for substr_replace()
 *
 * Replaces $length characters of $string, beginning at character
 * position $start, with $replacement. Unlike substr_replace() the default
 * $length is 0, i.e. the replacement is inserted.
 *
 * Negative values follow the substr_replace() conventions: a negative
 * $start counts from the end of the string, a negative $length gives the
 * number of characters from the end at which replacing stops. Previously
 * a negative $start dropped the leading part of the string and a negative
 * $length was simply added to the offset.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string      the UTF-8 input string
 * @param  string  $replacement the text to put in
 * @param  integer $start       character offset where replacing starts
 * @param  integer $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $strlen = utf8_strlen($string);

  // turn a negative or oversized start into a real character position
  if($start < 0) $start = max($strlen + $start, 0);
  if($start > $strlen) $start = $strlen;

  // negative length: stop that many characters before the end
  if($length < 0) $length = max($strlen - $start + $length, 0);

  $ret = '';
  if($start > 0) $ret .= utf8_substr($string, 0, $start);
  $ret .= $replacement;
  // only fetch a tail when there is one, utf8_substr may return false otherwise
  if($start + $length < $strlen) $ret .= utf8_substr($string, $start+$length);
  return $ret;
}
218dc57ef04Sandi
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a UTF-8 regular expression for
 * preg_split(). An empty separator raises an E_USER_WARNING and yields
 * false.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep the separator
 * @param  string $str the string to split
 * @return mixed  array of parts or FALSE for an empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return FALSE;
}
234f29317c1Sandi
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are converted into quoted UTF-8 regular expressions and
 * handed to preg_replace(). Both search and replacement may be strings
 * or arrays, as with str_replace().
 *
 * Everything is treated literally: search strings are quoted including
 * the '!' pattern delimiter (also for arrays, where this was missing)
 * and '\' and '$' in the replacement are escaped so that preg_replace()
 * does not interpret them as back-references.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   string or array of strings to search for
 * @param  mixed  $r   string or array of strings to replace with
 * @param  string $str the UTF-8 subject
 * @return string
 */
function utf8_str_replace($s,$r,$str){
  // build one literal pattern per search string
  if(!is_array($s)){
    $s = '!'.preg_quote($s,'!').'!u';
  }else{
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }

  // keep '$1', '\1' and friends literal in the replacement
  if(!is_array($r)){
    $r = addcslashes($r,'\\$');
  }else{
    foreach ($r as $k => $v) {
      $r[$k] = addcslashes($v,'\\$');
    }
  }

  return preg_replace($s,$r,$str);
}
252f29317c1Sandi
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the native ltrim() is used. Otherwise all leading
 * characters contained in $charlist are removed; the charlist is a plain
 * list of UTF-8 characters (no ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      the string to trim
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass ('^' is included, otherwise
  //a leading caret would negate the class)
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
268f29317c1Sandi
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the native rtrim() is used. Otherwise all trailing
 * characters contained in $charlist are removed; the charlist is a plain
 * list of UTF-8 characters (no ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      the string to trim
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass ('^' is included, otherwise
  //a leading caret would negate the class)
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/['.$charlist.']+$/u','',$str);
}
284f29317c1Sandi
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the native trim() is used. Otherwise the characters
 * of $charlist are stripped from both ends. The charlist is now passed on
 * to utf8_rtrim() and utf8_ltrim(); it used to be dropped, so that only
 * whitespace was removed.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      the string to trim
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
297f29317c1Sandi
2982f954959Sandi
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. The fallback converts the
 * string to code points and maps each one through the global
 * $UTF8_UPPER_TO_LOWER table; code points without an entry are kept
 * (and no longer trigger an undefined index notice).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string the UTF-8 string to convert
 * @return string
 */
function utf8_strtolower($string){
  if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most code points have no mapping - only touch the ones that do
    if(!empty($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
32182257610Sandi
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. The fallback converts the
 * string to code points and maps each one through the global
 * $UTF8_LOWER_TO_UPPER table; code points without an entry are kept
 * (and no longer trigger an undefined index notice).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string the UTF-8 string to convert
 * @return string
 */
function utf8_strtoupper($string){
  if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most code points have no mapping - only touch the ones that do
    if(!empty($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
34482257610Sandi
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The replacements are taken from the global tables $UTF8_LOWER_ACCENTS
 * and $UTF8_UPPER_ACCENTS (accented character => replacement).
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper
 * ($case = 1) letters. Default is to deaccent both cases ($case = 0).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string  $string the UTF-8 string to convert
 * @param  integer $case   -1 lower only, 1 upper only, 0 both
 * @return string
 */
function utf8_deaccent($string,$case=0){
  $doLower = ($case <= 0);
  $doUpper = ($case >= 0);

  if($doLower){
    global $UTF8_LOWER_ACCENTS;
    $from   = array_keys($UTF8_LOWER_ACCENTS);
    $to     = array_values($UTF8_LOWER_ACCENTS);
    $string = str_replace($from,$to,$string);
  }

  if($doUpper){
    global $UTF8_UPPER_ACCENTS;
    $from   = array_keys($UTF8_UPPER_ACCENTS);
    $to     = array_values($UTF8_UPPER_ACCENTS);
    $string = str_replace($from,$to,$string);
  }

  return $string;
}
36482257610Sandi
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned unchanged, everything else is translated
 * with strtr() using the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string the UTF-8 string to romanize
 * @return string
 */
function utf8_romanize($string){
  global $UTF8_ROMANIZATION;

  if(!utf8_isASCII($string)){
    $string = strtr($string,$UTF8_ROMANIZATION);
  }
  return $string;
}
3768a831f2bSAndreas Gohr
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip come from the global $UTF8_SPECIAL_CHARS2
 * string; it is quoted once and cached for later calls. The byte range
 * \x00-\x19 is always added to the stripped characters.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;
  global $UTF8_SPECIAL_CHARS2;

  // quoting the special chars is done only once per request
  static $specials = null;
  if($specials === null){
    $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  // $additional is inserted as is, callers have to quote it themselves
  $class = $additional.'\x00-\x19'.$specials;
  return preg_replace('/['.$class.']/u',$repl,$string);
}
400099ada41Sandi
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. The fallback splits the
 * haystack at the needle and measures the first part; a non-zero offset
 * is handled by cutting the haystack first and adding the offset to the
 * position found in the remainder.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string  $haystack the UTF-8 string to search in
 * @param  string  $needle   the UTF-8 string to search for
 * @param  integer $offset   character position to start searching at
 * @return mixed   character position or FALSE if not found
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

  if($offset){
    if ( !is_int($offset) ) {
      trigger_error('Offset must be an integer',E_USER_WARNING);
      return false;
    }

    // search in the remainder and shift the result back
    $rest  = utf8_substr($haystack, $offset);
    $found = utf8_strpos($rest,$needle);
    if ( $found === false ) return false;
    return $found + $offset;
  }

  // no offset: everything before the first needle is the position
  $parts = utf8_explode($needle, $haystack);
  if ( count($parts) > 1 ) {
    return utf8_strlen($parts[0]);
  }
  return false;
}
4322f954959Sandi
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * ASCII bytes are copied as they are, every well-formed multi byte
 * sequence (2, 3 or 4 bytes) is replaced by a decimal numeric entity
 * (&#NNN;). Bytes that do not form a complete sequence are copied
 * unchanged.
 *
 * Changes compared to the previous implementation:
 *  - the code point is computed correctly (the high part used to be
 *    multiplied by 100 instead of 256, breaking everything from U+0100 on)
 *  - 3 and 4 byte sequences are encoded as well
 *  - truncated sequences no longer read beyond the end of the string
 *  - no use of the curly brace string offset syntax rejected by PHP 8
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str the UTF-8 string to encode
 * @return string
 */
function utf8_tohtml ($str) {
  $ret = '';
  $max = strlen($str);
  $i   = 0;

  while ($i < $max) {
    $lead = ord($str[$i]);

    // how many continuation bytes does the lead byte announce?
    if (($lead & 0xE0) == 0xC0) {        // 110x xxxx
      $extra = 1;
      $cp    = $lead & 0x1F;
    } elseif (($lead & 0xF0) == 0xE0) {  // 1110 xxxx
      $extra = 2;
      $cp    = $lead & 0x0F;
    } elseif (($lead & 0xF8) == 0xF0) {  // 1111 0xxx
      $extra = 3;
      $cp    = $lead & 0x07;
    } else {
      // ASCII, stray continuation byte or illegal lead byte: copy as is
      $ret .= $str[$i];
      $i++;
      continue;
    }

    // collect the continuation bytes (10xx xxxx), 6 payload bits each
    $valid = ($i + $extra < $max);
    for ($j = 1; $valid && $j <= $extra; $j++) {
      $next = ord($str[$i + $j]);
      if (($next & 0xC0) != 0x80) {
        $valid = false;
      } else {
        $cp = ($cp << 6) | ($next & 0x3F);
      }
    }

    if (!$valid) {
      // broken or truncated sequence: keep the lead byte and move on
      $ret .= $str[$i];
      $i++;
      continue;
    }

    $ret .= '&#' . $cp . ';';
    $i   += $extra + 1;
  }

  return $ret;
}
459ea2eed85Sandi
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING. Without $strict, illegal lead and continuation
 * octets are skipped silently and illegal code points are passed through.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * Bytes are read with the square bracket string offset syntax; the curly
 * brace form used before is rejected by PHP 8.
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  UTF-8 encoded string
 * @param  boolean Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: merge its 6 payload bits into the code point
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }
    return $out;
}
63182257610Sandi
63282257610Sandi/**
6331abfaba4SAndreas Gohr * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
63782257610Sandi *
6381abfaba4SAndreas Gohr * If $strict is set to true the function returns false if the input
6391abfaba4SAndreas Gohr * array contains ints that represent surrogates or are outside the
6401abfaba4SAndreas Gohr * Unicode range and raises a PHP error at level E_USER_WARNING
6411abfaba4SAndreas Gohr *
 * Note: this function has been modified slightly in this library to use
 * output buffering to concatenate the UTF-8 string (faster) as well as
 * reference the array by its keys
6451abfaba4SAndreas Gohr *
6461abfaba4SAndreas Gohr * @param  array of unicode code points representing a string
6471abfaba4SAndreas Gohr * @param  boolean Check for invalid sequences?
6481abfaba4SAndreas Gohr * @return mixed UTF-8 string or FALSE if array contains invalid code points
6491abfaba4SAndreas Gohr * @author <hsivonen@iki.fi>
6501abfaba4SAndreas Gohr * @author Harry Fuecks <hfuecks@gmail.com>
6511abfaba4SAndreas Gohr * @see    utf8_to_unicode
6521abfaba4SAndreas Gohr * @link   http://hsivonen.iki.fi/php-utf8/
6531abfaba4SAndreas Gohr * @link   http://sourceforge.net/projects/phputf8/
65482257610Sandi */
6551abfaba4SAndreas Gohrfunction unicode_to_utf8($arr,$strict=false) {
6561abfaba4SAndreas Gohr    if (!is_array($arr)) return '';
6571abfaba4SAndreas Gohr    ob_start();
658f949a01cSAndreas Gohr
6591abfaba4SAndreas Gohr    foreach (array_keys($arr) as $k) {
6601abfaba4SAndreas Gohr
6611abfaba4SAndreas Gohr        # ASCII range (including control chars)
6621abfaba4SAndreas Gohr        if ( ($arr[$k] >= 0) && ($arr[$k] <= 0x007f) ) {
6631abfaba4SAndreas Gohr
6641abfaba4SAndreas Gohr            echo chr($arr[$k]);
6651abfaba4SAndreas Gohr
6661abfaba4SAndreas Gohr        # 2 byte sequence
6671abfaba4SAndreas Gohr        } else if ($arr[$k] <= 0x07ff) {
6681abfaba4SAndreas Gohr
6691abfaba4SAndreas Gohr            echo chr(0xc0 | ($arr[$k] >> 6));
6701abfaba4SAndreas Gohr            echo chr(0x80 | ($arr[$k] & 0x003f));
6711abfaba4SAndreas Gohr
6721abfaba4SAndreas Gohr        # Byte order mark (skip)
6731abfaba4SAndreas Gohr        } else if($arr[$k] == 0xFEFF) {
6741abfaba4SAndreas Gohr
6751abfaba4SAndreas Gohr            // nop -- zap the BOM
6761abfaba4SAndreas Gohr
6771abfaba4SAndreas Gohr        # Test for illegal surrogates
6781abfaba4SAndreas Gohr        } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {
6791abfaba4SAndreas Gohr
6801abfaba4SAndreas Gohr            // found a surrogate
6811abfaba4SAndreas Gohr            if($strict){
6821abfaba4SAndreas Gohr                trigger_error(
6831abfaba4SAndreas Gohr                    'unicode_to_utf8: Illegal surrogate '.
6841abfaba4SAndreas Gohr                        'at index: '.$k.', value: '.$arr[$k],
6851abfaba4SAndreas Gohr                    E_USER_WARNING
6861abfaba4SAndreas Gohr                    );
6871abfaba4SAndreas Gohr                return FALSE;
6881abfaba4SAndreas Gohr            }
6891abfaba4SAndreas Gohr
6901abfaba4SAndreas Gohr        # 3 byte sequence
6911abfaba4SAndreas Gohr        } else if ($arr[$k] <= 0xffff) {
6921abfaba4SAndreas Gohr
6931abfaba4SAndreas Gohr            echo chr(0xe0 | ($arr[$k] >> 12));
6941abfaba4SAndreas Gohr            echo chr(0x80 | (($arr[$k] >> 6) & 0x003f));
6951abfaba4SAndreas Gohr            echo chr(0x80 | ($arr[$k] & 0x003f));
6961abfaba4SAndreas Gohr
6971abfaba4SAndreas Gohr        # 4 byte sequence
6981abfaba4SAndreas Gohr        } else if ($arr[$k] <= 0x10ffff) {
6991abfaba4SAndreas Gohr
7001abfaba4SAndreas Gohr            echo chr(0xf0 | ($arr[$k] >> 18));
7011abfaba4SAndreas Gohr            echo chr(0x80 | (($arr[$k] >> 12) & 0x3f));
7021abfaba4SAndreas Gohr            echo chr(0x80 | (($arr[$k] >> 6) & 0x3f));
7031abfaba4SAndreas Gohr            echo chr(0x80 | ($arr[$k] & 0x3f));
7041abfaba4SAndreas Gohr
7051abfaba4SAndreas Gohr        } elseif($strict) {
7061abfaba4SAndreas Gohr
7071abfaba4SAndreas Gohr            trigger_error(
7081abfaba4SAndreas Gohr                'unicode_to_utf8: Codepoint out of Unicode range '.
7091abfaba4SAndreas Gohr                    'at index: '.$k.', value: '.$arr[$k],
7101abfaba4SAndreas Gohr                E_USER_WARNING
7111abfaba4SAndreas Gohr                );
7121abfaba4SAndreas Gohr
7131abfaba4SAndreas Gohr            // out of range
7141abfaba4SAndreas Gohr            return FALSE;
71582257610Sandi        }
71682257610Sandi    }
7171abfaba4SAndreas Gohr
7181abfaba4SAndreas Gohr    $result = ob_get_contents();
7191abfaba4SAndreas Gohr    ob_end_clean();
7201abfaba4SAndreas Gohr    return $result;
72182257610Sandi}
72282257610Sandi
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() when mbstring is available. Otherwise the
 * string is decoded with utf8_to_unicode() and every code point is written
 * as a big endian 16 bit unit. Code points above 0xFFFF are written as a
 * surrogate pair instead of being truncated to 16 bits.
 *
 * @param  string  $str UTF-8 string (taken by reference, not modified here)
 * @param  boolean $bom prepend the UTF-16BE byte order mark (FE FF)?
 * @return string  the UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  // anything but an array holds no code points to encode
  if(!is_array($uni)) return $out;

  foreach($uni as $cp){
    if($cp > 0xFFFF){
      // supplementary plane: high surrogate carries the top 10 bits,
      // low surrogate the bottom 10 bits of ($cp - 0x10000)
      $cp  -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
73815fa0b4fSAndreas Gohr
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The string is split into big endian 16 bit units. A high surrogate
 * followed by a low surrogate is combined into the code point above
 * 0xFFFF it stands for; all other units are passed on unchanged. The
 * resulting code points are encoded by unicode_to_utf8() in non-strict
 * mode.
 *
 * @param  string $str UTF-16BE string (taken by reference, not modified here)
 * @return string the UTF-8 encoded string
 * @see    unicode_to_utf8
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $uni  = array();
  $high = null;                                    // high surrogate waiting for its partner
  foreach($units as $unit){
    if($high !== null){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high  = null;
        continue;
      }
      // no low surrogate followed: pass the lone unit on as it is
      $uni[] = $high;
      $high  = null;
    }

    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
    }else{
      $uni[] = $unit;
    }
  }
  if($high !== null) $uni[] = $high;

  return unicode_to_utf8($uni);
}
74815fa0b4fSAndreas Gohr
/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is walked from front to back with an offset into the
 * original string. Each step matches either one well formed UTF-8
 * sequence, which is copied, or a single stray byte, which is replaced.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string $str     string to search
 * @param string $replace string to replace bad bytes with (defaults to '',
 *                        i.e. bad bytes are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte
    $pattern = '/'.$UTF8_BAD.'/S';

    $result = '';
    $len    = strlen($str);
    $pos    = 0;

    // Every byte matches one of the alternatives, so each match starts
    // exactly at $pos. Moving the offset avoids copying the remaining
    // string after every character.
    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        if (isset($matches[2]) && $matches[2] !== '') {
            // second group is only filled by the "invalid byte" alternative
            $result .= $replace;
        } else {
            $result .= $matches[0];
        }
        $pos += strlen($matches[0]);
    }

    return $result;
}
788ab77016bSAndreas Gohr
/**
 * Move a byte index so that it does not point into the middle of a
 * multi byte UTF-8 character
 *
 * A byte whose two top bits are "10" is a continuation byte. Starting at
 * $i such bytes are skipped, either backwards or forwards, until a byte
 * that is not a continuation byte (or an end of the string) is reached.
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = back to the first byte of the current character
 *                           true  = forward to the first byte of the next character
 *
 * @return int            byte index of a character boundary; 0 when $i is not
 *                        positive, strlen($str) when $i is at or behind the end
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  // indexes outside of the string are clamped to its ends
  if ($i <= 0) return 0;

  $limit = strlen($str);
  if ($i>=$limit) return $limit;

  if ($next) {
    // walk forward over continuation bytes
    for (; $i<$limit; $i++) {
      if ((ord($str[$i]) & 0xC0) != 0x80) break;
    }
    return $i;
  }

  // walk backward over continuation bytes, stopping at index 0 at the latest
  for (; $i; $i--) {
    if ((ord($str[$i]) & 0xC0) != 0x80) break;
  }
  return $i;
}
8175953e889Schris
/**
 * determine the byte indexes into a utf-8 string for one or more character offsets
 * PRIVATE  (could be made public with proper parameter checking)
 *
 * The first argument is the string, all further arguments are character
 * offsets. The offsets are sorted before they are processed, so the byte
 * indexes come back in ascending offset order, which is not necessarily
 * the order in which the offsets were passed.
 *
 * A bytes per character ratio is estimated from the start of the string
 * and refined step by step; once the estimate is within 7 characters of
 * the wanted offset the remaining distance is walked character by
 * character. Offsets that are zero or negative yield byte index 0.
 *
 * @author  Chris Smith <chris@jalakai.co.uk>
 *
 * @param   string    $str      utf8 string
 * @param   int       $offset   any number of character offsets into $str
 *
 * @return  array     byte indexes into $str, one index for each offset argument
 *                    (false if the first argument is not a string)
 */
function _utf8_byteindex() {

  $args = func_get_args();
  $str = array_shift($args);                       // plain assignment: a function result cannot be bound by reference
  if (!is_string($str)) return false;

  $result = array();

  // use a short piece of str to estimate bytes per character
  $i = utf8_correctIdx($str, 300, true);           // $i (& $j) -> byte indexes into $str
  $c = utf8_strlen(substr($str,0,$i));             // $c -> character offset into $str

  sort($args);                                     // deal with arguments from lowest to highest
  foreach ($args as $offset) {

    // nothing lies in front of the string: offsets <= 0 map to byte 0 and must
    // not touch $i/$c, which later (larger) offsets still rely on
    if ($offset <= 0) { $result[] = 0; continue; }

    $safety_valve = 50;                            // ensure no endless looping

    do {
      // apply latest bytes/character estimate to offset; while no character has
      // been counted (empty string) assume one byte per character rather than
      // dividing by zero
      $j = ($c > 0) ? (int)($offset * $i/$c) : (int)$offset;
      $j = utf8_correctIdx($str, $j, true);        // correct to utf8 character boundary

      if ($j > $i) {
        $c += utf8_strlen(substr($str,$i,$j-$i));  // determine new character offset
      } else {
        $c -= utf8_strlen(substr($str,$j,$i-$j));  // ditto
      }

      $error = abs($c-$offset);

      // same index as before means $i and $c did not change either, so every
      // further round would compute exactly the same values
      $stalled = ($j == $i);

      $i = $j;                                     // ready for next time around
    } while (($error > 7) && !$stalled && --$safety_valve) ;    // from 7 it is faster to iterate over the string

    if ($error && $error <= 7) {
      if ($c < $offset) {
        // move up
        while ($error--) { $i = utf8_correctIdx($str,++$i,true); }
      } else {
        // move down
        while ($error--) { $i = utf8_correctIdx($str,--$i,false); }
      }
      $c = $offset;                                // ready for next arg
    }
    $result[] = $i;
  }

  return $result;
}
880*5e613a5cSchris
// only needed if no mb_string available
if(!UTF8_MBSTRING){

  /**
   * UTF-8 Case lookup table
   *
   * This lookup table maps the Unicode code point of a lower case letter
   * (key) to the code point of its corresponding upper case letter (value),
   * e.g. 0x0061 (a) => 0x0041 (A)
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
  global $UTF8_LOWER_TO_UPPER;
  $UTF8_LOWER_TO_UPPER = array(
    0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
    0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
    0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
    0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
    0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
    0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
    0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
    0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
    0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
    0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
    0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
    0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
    0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
    0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
    0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
    0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
    0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
    0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
    0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
    0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
    0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
    0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
    0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
    0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
    0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
    0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
    0x044C=>0x042C, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
    0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
    0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
    0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
    0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
    0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
    0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
    0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
    0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
    0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
    0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
    0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
    0x0446=>0x0426, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
    0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
    0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
    0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
    0x0123=>0x0122,
  );

  /**
   * UTF-8 Case lookup table
   *
   * This lookup table maps the Unicode code point of an upper case letter
   * (key) to the code point of its corresponding lower case letter (value).
   * It is created by flipping $UTF8_LOWER_TO_UPPER.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   */
  global $UTF8_UPPER_TO_LOWER;
  $UTF8_UPPER_TO_LOWER = array_flip($UTF8_LOWER_TO_UPPER);

} // end of case lookup tables
951ab77016bSAndreas Gohr
952ab77016bSAndreas Gohr
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Each key is a lower case accented (or otherwise non ASCII) letter, the
 * value is its replacement made of plain ASCII-7 letters. Some letters
 * are replaced by two letters (e.g. 'ä' becomes 'ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à'=>'a',  'ô'=>'o',  'ď'=>'d',  'ḟ'=>'f',  'ë'=>'e',  'š'=>'s',  'ơ'=>'o',
  'ß'=>'ss', 'ă'=>'a',  'ř'=>'r',  'ț'=>'t',  'ň'=>'n',  'ā'=>'a',  'ķ'=>'k',
  'ŝ'=>'s',  'ỳ'=>'y',  'ņ'=>'n',  'ĺ'=>'l',  'ħ'=>'h',  'ṗ'=>'p',  'ó'=>'o',
  'ú'=>'u',  'ě'=>'e',  'é'=>'e',  'ç'=>'c',  'ẁ'=>'w',  'ċ'=>'c',  'õ'=>'o',
  'ṡ'=>'s',  'ø'=>'o',  'ģ'=>'g',  'ŧ'=>'t',  'ș'=>'s',  'ė'=>'e',  'ĉ'=>'c',
  'ś'=>'s',  'î'=>'i',  'ű'=>'u',  'ć'=>'c',  'ę'=>'e',  'ŵ'=>'w',  'ṫ'=>'t',
  'ū'=>'u',  'č'=>'c',  'ö'=>'oe', 'è'=>'e',  'ŷ'=>'y',  'ą'=>'a',  'ł'=>'l',
  'ų'=>'u',  'ů'=>'u',  'ş'=>'s',  'ğ'=>'g',  'ļ'=>'l',  'ƒ'=>'f',  'ž'=>'z',
  'ẃ'=>'w',  'ḃ'=>'b',  'å'=>'a',  'ì'=>'i',  'ï'=>'i',  'ḋ'=>'d',  'ť'=>'t',
  'ŗ'=>'r',  'ä'=>'ae', 'í'=>'i',  'ŕ'=>'r',  'ê'=>'e',  'ü'=>'ue', 'ò'=>'o',
  'ē'=>'e',  'ñ'=>'n',  'ń'=>'n',  'ĥ'=>'h',  'ĝ'=>'g',  'đ'=>'d',  'ĵ'=>'j',
  'ÿ'=>'y',  'ũ'=>'u',  'ŭ'=>'u',  'ư'=>'u',  'ţ'=>'t',  'ý'=>'y',  'ő'=>'o',
  'â'=>'a',  'ľ'=>'l',  'ẅ'=>'w',  'ż'=>'z',  'ī'=>'i',  'ã'=>'a',  'ġ'=>'g',
  'ṁ'=>'m',  'ō'=>'o',  'ĩ'=>'i',  'ù'=>'u',  'į'=>'i',  'ź'=>'z',  'á'=>'a',
  'û'=>'u',  'þ'=>'th', 'ð'=>'dh', 'æ'=>'ae', 'µ'=>'u',  'ĕ'=>'e',
);
98082257610Sandi
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Each key is an upper case accented (or otherwise non ASCII) letter, the
 * value is its replacement made of plain ASCII-7 letters. Some letters
 * are replaced by two letters (e.g. 'Ä' becomes 'Ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À'=>'A',  'Ô'=>'O',  'Ď'=>'D',  'Ḟ'=>'F',  'Ë'=>'E',  'Š'=>'S',  'Ơ'=>'O',
  'Ă'=>'A',  'Ř'=>'R',  'Ț'=>'T',  'Ň'=>'N',  'Ā'=>'A',  'Ķ'=>'K',
  'Ŝ'=>'S',  'Ỳ'=>'Y',  'Ņ'=>'N',  'Ĺ'=>'L',  'Ħ'=>'H',  'Ṗ'=>'P',  'Ó'=>'O',
  'Ú'=>'U',  'Ě'=>'E',  'É'=>'E',  'Ç'=>'C',  'Ẁ'=>'W',  'Ċ'=>'C',  'Õ'=>'O',
  'Ṡ'=>'S',  'Ø'=>'O',  'Ģ'=>'G',  'Ŧ'=>'T',  'Ș'=>'S',  'Ė'=>'E',  'Ĉ'=>'C',
  'Ś'=>'S',  'Î'=>'I',  'Ű'=>'U',  'Ć'=>'C',  'Ę'=>'E',  'Ŵ'=>'W',  'Ṫ'=>'T',
  'Ū'=>'U',  'Č'=>'C',  'Ö'=>'Oe', 'È'=>'E',  'Ŷ'=>'Y',  'Ą'=>'A',  'Ł'=>'L',
  'Ų'=>'U',  'Ů'=>'U',  'Ş'=>'S',  'Ğ'=>'G',  'Ļ'=>'L',  'Ƒ'=>'F',  'Ž'=>'Z',
  'Ẃ'=>'W',  'Ḃ'=>'B',  'Å'=>'A',  'Ì'=>'I',  'Ï'=>'I',  'Ḋ'=>'D',  'Ť'=>'T',
  'Ŗ'=>'R',  'Ä'=>'Ae', 'Í'=>'I',  'Ŕ'=>'R',  'Ê'=>'E',  'Ü'=>'Ue', 'Ò'=>'O',
  'Ē'=>'E',  'Ñ'=>'N',  'Ń'=>'N',  'Ĥ'=>'H',  'Ĝ'=>'G',  'Đ'=>'D',  'Ĵ'=>'J',
  'Ÿ'=>'Y',  'Ũ'=>'U',  'Ŭ'=>'U',  'Ư'=>'U',  'Ţ'=>'T',  'Ý'=>'Y',  'Ő'=>'O',
  'Â'=>'A',  'Ľ'=>'L',  'Ẅ'=>'W',  'Ż'=>'Z',  'Ī'=>'I',  'Ã'=>'A',  'Ġ'=>'G',
  'Ṁ'=>'M',  'Ō'=>'O',  'Ĩ'=>'I',  'Ù'=>'U',  'Į'=>'I',  'Ź'=>'Z',  'Á'=>'A',
  'Û'=>'U',  'Þ'=>'Th', 'Ð'=>'Dh', 'Æ'=>'Ae', 'Ĕ'=>'E',
);
100882257610Sandi
/**
 * UTF-8 array of common special characters
 *
 * The list holds the Unicode code points of special characters (not a
 * letter or digit) defined in the various local charsets. It is not a
 * complete list of the non-alphanumeric characters in UTF-8, so it is not
 * perfect, but it should match most cases of special chars.
 *
 * Not part of the list:
 *   - the control chars 0x00 to 0x19 (the space 0x20 _is_ listed)
 *   - * (0x2a), - (0x2d), . (0x2e), : (0x3a) and _ (0x5f)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c,
  0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
);
1076340756e4Sandi
// utf8 version of above data
//
// The string is built from $UTF8_SPECIAL_CHARS when the file is loaded
// instead of being spelled out as a literal. A literal of raw multi byte,
// control and private use characters is easily damaged by editors and
// transfers, and it can silently drift away from the code point table it
// is meant to mirror.
global $UTF8_SPECIAL_CHARS, $UTF8_SPECIAL_CHARS2;
$UTF8_SPECIAL_CHARS2 = unicode_to_utf8($UTF8_SPECIAL_CHARS);
1096720307d9Schris
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a language
 * different from the ones based upon latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple character replacement
 * only. Specialities of each language are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
111654662a04SAndreas Gohrglobal $UTF8_ROMANIZATION;
11178a831f2bSAndreas Gohr$UTF8_ROMANIZATION = array(
11188a831f2bSAndreas Gohr  //russian cyrillic
11198a831f2bSAndreas Gohr  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
11208a831f2bSAndreas Gohr  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
11218a831f2bSAndreas Gohr  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
11228a831f2bSAndreas Gohr  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
11238a831f2bSAndreas Gohr  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
11248a831f2bSAndreas Gohr  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1125d8cb2602SDenis Simakov  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1126d8cb2602SDenis Simakov  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
11278a831f2bSAndreas Gohr  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
11288a831f2bSAndreas Gohr  // Ukrainian cyrillic
11298a831f2bSAndreas Gohr  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
11308a831f2bSAndreas Gohr  // Georgian
11318a831f2bSAndreas Gohr  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
11328a831f2bSAndreas Gohr  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
11338a831f2bSAndreas Gohr  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
11348a831f2bSAndreas Gohr  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
11358a831f2bSAndreas Gohr  'ჰ'=>'xh',
11368a831f2bSAndreas Gohr  //Sanskrit
11378a831f2bSAndreas Gohr  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
11388a831f2bSAndreas Gohr  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
11398a831f2bSAndreas Gohr  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
11408a831f2bSAndreas Gohr  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
11418a831f2bSAndreas Gohr  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
11428a831f2bSAndreas Gohr  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
11438a831f2bSAndreas Gohr  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
11448a831f2bSAndreas Gohr  //Hebrew
11453dbad6dcSDenis Simakov  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
11463dbad6dcSDenis Simakov  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
11473dbad6dcSDenis Simakov  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
11488a831f2bSAndreas Gohr  'ש'=>'sh','ת'=>'t',
11498a831f2bSAndreas Gohr  //Arabic
11508a831f2bSAndreas Gohr  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
11518a831f2bSAndreas Gohr  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
11528a831f2bSAndreas Gohr  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
11538a831f2bSAndreas Gohr  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
11548a831f2bSAndreas Gohr
11558a831f2bSAndreas Gohr  // Japanese hiragana
11568a831f2bSAndreas Gohr  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
11578a831f2bSAndreas Gohr  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
11588a831f2bSAndreas Gohr  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
11598a831f2bSAndreas Gohr  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
11608a831f2bSAndreas Gohr  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
11618a831f2bSAndreas Gohr  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
11628a831f2bSAndreas Gohr  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
11638a831f2bSAndreas Gohr  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
11648a831f2bSAndreas Gohr  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
11658a831f2bSAndreas Gohr  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
11668a831f2bSAndreas Gohr  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
11678a831f2bSAndreas Gohr  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
11688a831f2bSAndreas Gohr  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
11698a831f2bSAndreas Gohr  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
11708a831f2bSAndreas Gohr  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
11718a831f2bSAndreas Gohr  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
11728a831f2bSAndreas Gohr  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
11738a831f2bSAndreas Gohr  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
11748a831f2bSAndreas Gohr  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
11758a831f2bSAndreas Gohr  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
11768a831f2bSAndreas Gohr  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
11778a831f2bSAndreas Gohr  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
11788a831f2bSAndreas Gohr  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
11798a831f2bSAndreas Gohr  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
11808a831f2bSAndreas Gohr  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
11818a831f2bSAndreas Gohr  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
11828a831f2bSAndreas Gohr  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
11838a831f2bSAndreas Gohr  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
11848a831f2bSAndreas Gohr  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
11858a831f2bSAndreas Gohr  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
11868a831f2bSAndreas Gohr  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
11878a831f2bSAndreas Gohr  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
11888a831f2bSAndreas Gohr  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
11898a831f2bSAndreas Gohr  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
11908a831f2bSAndreas Gohr  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
11918a831f2bSAndreas Gohr  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
11928a831f2bSAndreas Gohr  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
11938a831f2bSAndreas Gohr  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
11948a831f2bSAndreas Gohr  'じゅ'=>'zyu',
11958a831f2bSAndreas Gohr  // Japanese katakana
11968a831f2bSAndreas Gohr  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
11978a831f2bSAndreas Gohr  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
11988a831f2bSAndreas Gohr  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
11998a831f2bSAndreas Gohr  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
12008a831f2bSAndreas Gohr  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
12018a831f2bSAndreas Gohr  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
12028a831f2bSAndreas Gohr  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
12038a831f2bSAndreas Gohr  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
12048a831f2bSAndreas Gohr  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
12058a831f2bSAndreas Gohr  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
12068a831f2bSAndreas Gohr  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
12078a831f2bSAndreas Gohr  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
12088a831f2bSAndreas Gohr  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
12098a831f2bSAndreas Gohr  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
12108a831f2bSAndreas Gohr  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
12118a831f2bSAndreas Gohr  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
12128a831f2bSAndreas Gohr  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
12138a831f2bSAndreas Gohr  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
12148a831f2bSAndreas Gohr  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
12158a831f2bSAndreas Gohr  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
12168a831f2bSAndreas Gohr  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
12178a831f2bSAndreas Gohr  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
12188a831f2bSAndreas Gohr  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
12198a831f2bSAndreas Gohr  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
12208a831f2bSAndreas Gohr  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
12218a831f2bSAndreas Gohr  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
12228a831f2bSAndreas Gohr  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
12238a831f2bSAndreas Gohr  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
12248a831f2bSAndreas Gohr  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
12258a831f2bSAndreas Gohr  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
12268a831f2bSAndreas Gohr  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
12278a831f2bSAndreas Gohr  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
12288a831f2bSAndreas Gohr  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
12298a831f2bSAndreas Gohr  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
12308a831f2bSAndreas Gohr  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
12318a831f2bSAndreas Gohr  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
12328a831f2bSAndreas Gohr  'ジョ'=>'zyo','ジュ'=>'zyu',
12338a831f2bSAndreas Gohr
12348a831f2bSAndreas Gohr  // "Greeklish"
12358a831f2bSAndreas Gohr  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
12368a831f2bSAndreas Gohr  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
12378a831f2bSAndreas Gohr
12388a831f2bSAndreas Gohr  // Thai
12398a831f2bSAndreas Gohr  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
12408a831f2bSAndreas Gohr  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
12418a831f2bSAndreas Gohr  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
12428a831f2bSAndreas Gohr  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
12438a831f2bSAndreas Gohr  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
12448a831f2bSAndreas Gohr  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
12458a831f2bSAndreas Gohr  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
12468a831f2bSAndreas Gohr  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
12478a831f2bSAndreas Gohr  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
12488a831f2bSAndreas Gohr  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
12498a831f2bSAndreas Gohr  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
12508a831f2bSAndreas Gohr  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
12518a831f2bSAndreas Gohr  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
12528a831f2bSAndreas Gohr  'เ–ียว'=>'iao',
12538a831f2bSAndreas Gohr
12548a831f2bSAndreas Gohr  // Korean
12558a831f2bSAndreas Gohr  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
12568a831f2bSAndreas Gohr  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
12578a831f2bSAndreas Gohr  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
12588a831f2bSAndreas Gohr  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
12598a831f2bSAndreas Gohr  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
12608a831f2bSAndreas Gohr  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
12618a831f2bSAndreas Gohr);
1262340756e4Sandi
1263340756e4Sandi//Setup VIM: ex: et ts=2 enc=utf-8 :
12648a831f2bSAndreas Gohr
1265