xref: /dokuwiki/inc/utf8.php (revision 58723938c4ac0722ab18a345d286634a9f823b39)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Detect whether the mbstring extension may be used
 *
 * UTF8_MBSTRING is set to 1 when mb_substr() exists and UTF8_NOMBSTRING
 * has not been defined, otherwise it is set to 0. A value that was defined
 * before this point is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// let the mb_* functions work on UTF-8 by default
if(UTF8_MBSTRING){
  mb_internal_encoding('UTF-8');
}
21
22
/**
 * URL-encode a filename so it may contain Unicode characters
 *
 * Slashes are kept as they are (not encoded).
 *
 * When $safe is true (the default) a string that consists only of
 * ASCII letters, digits and the characters / _ - . % is returned
 * unchanged. This makes it safe to run the function several times
 * on the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file the filename to encode
 * @param  bool   $safe leave strings alone that look encoded already
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  $skip = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file);
  if(!$skip){
    // encode everything, then put the path separators back
    $file = str_replace('%2F','/',urlencode($file));
  }
  return $file;
}
44
/**
 * URL-decode a filename
 *
 * Thin wrapper that hands the string to urldecode().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
57
/**
 * Checks if a string contains 7bit ASCII only
 *
 * The check is done byte-wise: the string counts as ASCII when no byte
 * has its high bit set. An empty string is considered ASCII.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to test
 * @return bool   true if all bytes are below 0x80
 */
function utf8_isASCII($str){
  // no /u modifier on purpose: the pattern has to look at raw bytes
  return !preg_match('/[\x80-\xFF]/',$str);
}
69
/**
 * Strips all highbyte chars
 *
 * Every byte with a value of 128 or above is removed, what is left
 * is a pure 7bit ASCII string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string the input without any bytes >= 0x80
 */
function utf8_strip($str){
  // no /u modifier on purpose: the pattern has to look at raw bytes
  return preg_replace('/[\x80-\xFF]+/','',$str);
}
86
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Only the bit patterns are inspected: every lead byte has to be followed
 * by the number of 10bbbbbb continuation bytes it announces. Lead bytes for
 * sequences of up to six bytes are accepted and the decoded values are not
 * examined.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the string to check
 * @return bool   false as soon as a byte does not fit the pattern
 */
function utf8_check($Str) {
 $len = strlen($Str);
 $pos = 0;
 while ($pos < $len) {
  $lead = ord($Str[$pos]);

  // how many continuation bytes does the lead byte announce?
  if     ($lead < 0x80)            $trail = 0; # 0bbbbbbb
  elseif (($lead & 0xE0) == 0xC0)  $trail = 1; # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0)  $trail = 2; # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0)  $trail = 3; # 11110bbb
  elseif (($lead & 0xFC) == 0xF8)  $trail = 4; # 111110bb
  elseif (($lead & 0xFE) == 0xFC)  $trail = 5; # 1111110b
  else return false;                           # Does not match any model

  // each announced byte has to exist and look like 10bbbbbb
  for ($seen = 0; $seen < $trail; $seen++) {
   $pos++;
   if ($pos == $len) return false;
   if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
  }

  $pos++;
 }
 return true;
}
110
/**
 * Unicode aware replacement for strlen()
 *
 * In UTF-8 every character consists of exactly one byte that is not a
 * continuation byte (0x80-0xBF), so the number of characters equals the
 * number of bytes that remain once all continuation bytes are removed.
 *
 * This avoids utf8_decode(), which is deprecated as of PHP 8.2. For well
 * formed UTF-8 the result is the same; a stray continuation byte in
 * malformed input is not counted.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @param  string $string UTF-8 encoded string
 * @return int    number of characters
 */
function utf8_strlen($string){
  // byte-wise pattern (no /u modifier) dropping all continuation bytes
  return strlen(preg_replace('/[\x80-\xBF]+/','',(string)$string));
}
125
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 * With mbstring enabled (UTF8_MBSTRING) the work is handed to mb_substr(),
 * otherwise a PCRE pattern with the 'u' modifier is built to skip $offset
 * characters and capture the requested part.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param  string  $str    UTF-8 encoded string
 * @param  integer $offset number of UTF-8 characters offset (from left),
 *                         a negative value counts from the end
 * @param  integer $length (optional) length in UTF-8 characters from offset,
 *                         a negative value stops that many characters
 *                         before the end
 * @return string  the requested part; an empty string when nothing matches
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }else{
            return mb_substr($str, $offset, $length);
        }
    }

    /*
     * Notes:
     *
     * no mb string support, so we'll use pcre regex's with 'u' flag
     * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
     * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
     *
     * substr documentation states false can be returned in some cases (e.g. offset > string length)
     * mb_substr never returns false, it will return an empty string instead.
     *
     * calculating the number of characters in the string is a relatively expensive operation, so
     * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
     */

    // cast parameters to appropriate types to avoid multiple notices/warnings
    $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
    $offset = (int)$offset;
    if (!is_null($length)) $length = (int)$length;

    // handle trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    $offset_pattern = '';
    $length_pattern = '';

    // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
    if ($offset < 0) {
      $strlen = utf8_strlen($str);                // see notes
      $offset = $strlen + $offset;
      if ($offset < 0) $offset = 0;
    }

    // establish a pattern for offset, a non-captured group equal in length to offset
    if ($offset > 0) {
      $Ox = (int)($offset/65535);
      $Oy = $offset%65535;

      if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
      $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
    } else {
      $offset_pattern = '^';                      // offset == 0; just anchor the pattern
    }

    // establish a pattern for length
    if (is_null($length)) {
      $length_pattern = '(.*)$';                  // the rest of the string
    } else {

      if (!isset($strlen)) $strlen = utf8_strlen($str);    // see notes
      if ($offset > $strlen) return '';           // another trivial case

      if ($length > 0) {

        $length = min($strlen-$offset, $length);  // reduce any length that would go passed the end of the string

        $Lx = (int)($length/65535);
        $Ly = $length%65535;

        // +ve length requires ... a captured group of length characters
        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
        $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

      } else if ($length < 0) {

        if ($length < ($offset - $strlen)) return '';

        $Lx = (int)((-$length)/65535);
        $Ly = (-$length)%65535;

        // -ve length requires ... capture everything except a group of -length characters
        //                         anchored at the tail-end of the string
        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
        $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
      }
    }

    // 's' lets '.' match newlines as well; no match yields an empty string
    if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
    return $match[1];
}
227
/**
 * Unicode aware replacement for substr_replace()
 *
 * $start and $length are counted in characters and follow the rules of
 * substr_replace(): a negative $start counts from the end of the string,
 * a negative $length stops replacing that many characters before the end.
 * Unlike substr_replace() the default $length is 0, i.e. $replacement is
 * inserted at $start without removing anything.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string      UTF-8 encoded input
 * @param  string $replacement text to put in
 * @param  int    $start       character offset where the replacing starts
 * @param  int    $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $start  = (int)$start;
  $length = (int)$length;
  $total  = utf8_strlen($string);

  // translate $start into an offset within 0..$total
  if($start < 0){
    $start = max(0, $total + $start);
  }elseif($start > $total){
    $start = $total;
  }

  // first character behind the replaced part, never before $start
  if($length < 0){
    $end = max($start, $total + $length);
  }else{
    $end = min($total, $start + $length);
  }

  return utf8_substr($string, 0, $start).$replacement.utf8_substr($string, $end);
}
241
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as a PCRE pattern with the 'u'
 * modifier. An empty separator raises an E_USER_WARNING.
 *
 * @TODO   support third limit arg
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep the separator
 * @param  string $str the string to split
 * @return mixed  whatever preg_split() returns, false on an empty separator
 */
function utf8_explode($sep, $str) {
  if ( $sep != '' ) {
    $pattern = '!'.preg_quote($sep,'!').'!u';
    return preg_split($pattern,$str);
  }

  trigger_error('Empty delimiter',E_USER_WARNING);
  return false;
}
257
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and applied one after the other as PCRE
 * patterns with the 'u' modifier. Replacements are inserted literally:
 * backslashes and dollar signs are escaped so they are not read as
 * backreferences. Empty search strings are skipped, like str_replace()
 * does.
 *
 * If $r is an array, its values are paired with the search strings by
 * position; search strings without a partner are replaced by ''.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  string|array $s   what to search for
 * @param  string|array $r   what to replace it with
 * @param  string|array $str the subject
 * @return mixed the subject with replacements done, null on a PCRE error
 */
function utf8_str_replace($s,$r,$str){
  $search   = is_array($s) ? array_values($s) : array($s);
  $rIsArray = is_array($r);
  $repl     = $rIsArray ? array_values($r) : $r;

  foreach ($search as $i => $needle) {
    $needle = (string)$needle;
    if ($needle === '') continue;   // an empty pattern would match everywhere

    if ($rIsArray) {
      $with = isset($repl[$i]) ? $repl[$i] : '';
    } else {
      $with = $repl;
    }

    // escape \ and $ so preg_replace() inserts the replacement verbatim
    $str = preg_replace('!'.preg_quote($needle,'!').'!u', addcslashes($with,'\\$'), $str);
    if ($str === null) break;       // PCRE error, e.g. malformed UTF-8
  }
  return $str;
}
275
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a $charlist the call is passed on to ltrim(). Otherwise every
 * character of $charlist is stripped from the beginning of the string.
 * The list is taken literally, ranges like "a..z" are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass ('^' would negate the class)
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
291
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a $charlist the call is passed on to rtrim(). Otherwise every
 * character of $charlist is stripped from the end of the string.
 * The list is taken literally, ranges like "a..z" are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass ('^' would negate the class)
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/['.$charlist.']+$/u','',$str);
}
307
/**
 * Unicode aware replacement for trim()
 *
 * Without a $charlist the call is passed on to trim(). Otherwise the
 * characters of $charlist are stripped from both ends of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to be handed on, otherwise only whitespace is stripped
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
320
321
/**
 * This is a unicode aware replacement for strtolower()
 *
 * With UTF8_MBSTRING set, mb_strtolower() does the work. Otherwise the
 * string is translated through the global $UTF8_UPPER_TO_LOWER table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string){
  if(!UTF8_MBSTRING){
    global $UTF8_UPPER_TO_LOWER;
    return strtr($string,$UTF8_UPPER_TO_LOWER);
  }

  return mb_strtolower($string,'utf-8');
}
337
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * With UTF8_MBSTRING set, mb_strtoupper() does the work. Otherwise the
 * string is translated through the global $UTF8_LOWER_TO_UPPER table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string){
  if(!UTF8_MBSTRING){
    global $UTF8_LOWER_TO_UPPER;
    return strtr($string,$UTF8_LOWER_TO_UPPER);
  }

  return mb_strtoupper($string,'utf-8');
}
353
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The replacements come from the global tables $UTF8_LOWER_ACCENTS and
 * $UTF8_UPPER_ACCENTS. Use $case to restrict the work to lower case
 * letters ($case = -1) or upper case letters ($case = 1). By default
 * ($case = 0) both tables are applied, lower case first.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @param  int    $case   -1, 0 or 1, see above
 * @return string
 */
function utf8_deaccent($string,$case=0){
  $doLower = ($case <= 0);
  $doUpper = ($case >= 0);

  if($doLower){
    global $UTF8_LOWER_ACCENTS;
    $string = strtr($string,$UTF8_LOWER_ACCENTS);
  }

  if($doUpper){
    global $UTF8_UPPER_ACCENTS;
    $string = strtr($string,$UTF8_UPPER_ACCENTS);
  }

  return $string;
}
373
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned as it is, everything else is translated
 * through the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    $string = strtr($string,$UTF8_ROMANIZATION);
  }

  return $string;
}
385
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters to strip are read from the global $UTF8_SPECIAL_CHARS2
 * the first time the function is called; the quoted list is kept in a
 * static variable for later calls. The bytes 0x00 to 0x19 and whatever
 * is passed in $additional are stripped as well.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS, $UTF8_SPECIAL_CHARS2;

  static $quoted = null;
  if($quoted === null){
    $quoted = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  // $additional goes into the character class unquoted
  $pattern = '/['.$additional.'\x00-\x19'.$quoted.']/u';
  return preg_replace($pattern,$repl,$string);
}
409
/**
 * This is an Unicode aware replacement for strpos
 *
 * The needle is searched with strpos(), then the byte position found is
 * converted into a character position. A hit in front of the character
 * offset is skipped and the search is repeated further into the string.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strpos()
 * @param  string  $haystack UTF-8 encoded string to search in
 * @param  string  $needle   UTF-8 encoded string to search for
 * @param  integer $offset   character offset to start searching at
 * @return mixed   character position of the first hit, false if there is none
 */
function utf8_strpos($haystack, $needle, $offset=0){
    // number of bytes the byte offset has to be moved ahead of $offset
    $skew = 0;

    do {
        $bytePos = strpos($haystack, $needle, $offset + $skew);
        if ($bytePos === false) {
            return false;
        }

        // translate the byte position into a character position
        $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

        if ($charPos < $offset) {
            // hit lies before the requested offset: search on behind it
            $skew = $bytePos - $charPos;
        }
    } while ($charPos < $offset);

    return $charPos;
}
438
439
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Code points below 0x80 are kept as plain characters, code points below
 * 0x100 become decimal entities (&#233;) and everything above becomes a
 * hexadecimal entity (&#x20ac;).
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string
 */
function utf8_tohtml ($str) {
    $pieces = array();
    foreach (utf8_to_unicode($str) as $cp) {
        if ($cp >= 0x100) {
            $pieces[] = '&#x'.dechex($cp).';';
        } elseif ($cp >= 0x80) {
            $pieces[] = "&#$cp;";
        } else {
            $pieces[] = chr($cp);
        }
    }
    return implode('', $pieces);
}
459
/**
 * Decodes HTML entities to UTF-8 characters
 *
 * Convert any &#..; entity to a codepoint,
 * The entities flag defaults to only decoding numeric entities.
 * Pass any other value than null (e.g. HTML_ENTITIES) and named
 * entities, including &amp; &lt; etc. are handled as well. Doing both
 * in a single pass avoids the problem that would occur if you
 * had to decode "&amp;#38;&#38;amp;#38;"
 *
 * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
 * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
 * what it should be                   -> "&#38;&amp#38;"
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @param  string  $str      UTF-8 encoded string
 * @param  mixed   $entities null to decode numeric entities only,
 *                           anything else to decode named entities as well
 * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
 */
function utf8_unhtml($str, $entities=null) {
    // the decoder is created on the first call and shared afterwards
    static $decoder = null;
    if ($decoder === null) {
        $decoder = new utf8_entity_decoder();
    }

    if ($entities !== null) {
        // numeric and named entities
        $pattern  = '/&(#)?([Xx])?([0-9A-Za-z]+);/m';
        $callback = array(&$decoder, 'decode');
    } else {
        // numeric entities only
        $pattern  = '/(&#([Xx])?([0-9A-Za-z]+);)/m';
        $callback = 'utf8_decode_numeric';
    }

    return preg_replace_callback($pattern, $callback, $str);
}
/**
 * Callback for utf8_unhtml(): turn one numeric entity into UTF-8
 *
 * @param  array $ent match array; index 2 holds the optional 'x' or 'X'
 *                    marker for hexadecimal notation, index 3 the number
 * @return string the UTF-8 encoded character
 */
function utf8_decode_numeric($ent) {
    $isHex = in_array($ent[2], array('X', 'x'));
    $cp    = $isHex ? hexdec($ent[3]) : intval($ent[3]);

    return unicode_to_utf8(array($cp));
}
/**
 * Decoder for numeric and named HTML entities, used by utf8_unhtml()
 *
 * Holds a table that maps the named entities known to
 * get_html_translation_table() (e.g. '&auml;') to their UTF-8 encoded
 * characters.
 */
class utf8_entity_decoder {
    /** @var array entity (including & and ;) => UTF-8 encoded character */
    var $table;

    /**
     * Build the entity table
     *
     * The table is requested in ISO-8859-1 where the encoding can be
     * chosen: makeutf8() relies on each character being a single byte,
     * which does not hold for the UTF-8 table newer PHP versions return
     * by default.
     */
    function __construct() {
        if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1');
        } else {
            // older versions have no encoding parameter and return ISO-8859-1
            $table = get_html_translation_table(HTML_ENTITIES);
        }
        $table = array_flip($table);
        $this->table = array_map(array($this,'makeutf8'), $table);
    }

    /**
     * Old style constructor, kept for PHP versions that do not know
     * __construct() and for code calling it directly
     */
    function utf8_entity_decoder() {
        $this->__construct();
    }

    /**
     * Convert a single byte ISO-8859-1 character to UTF-8
     *
     * @param  string $c one ISO-8859-1 character
     * @return string the same character encoded as UTF-8
     */
    function makeutf8($c) {
        return unicode_to_utf8(array(ord($c)));
    }

    /**
     * Callback for utf8_unhtml(): decode one entity
     *
     * @param  array $ent match array; index 0 is the complete entity,
     *                    index 1 is '#' for numeric entities
     * @return string the decoded character, unknown named entities are
     *                returned unchanged
     */
    function decode($ent) {
        if ($ent[1] == '#') {
            return utf8_decode_numeric($ent);
        } elseif (array_key_exists($ent[0],$this->table)) {
            return $this->table[$ent[0]];
        } else {
            return $ent[0];
        }
    }
}
521
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING. This includes a multi-octet sequence that is cut
 * off by the end of the string.
 *
 * Without $strict, octets that do not fit are skipped and illegal
 * codepoints (non-shortest forms, surrogates, values above 0x10FFFF)
 * are passed through to the output.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or false if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return false;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return false;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return false;
            }
        }
    }

    // the string ended while continuation octets were still expected
    if ($strict && $mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                'sequence in UTF-8 at byte '.$len,
                E_USER_WARNING
            );

        return false;
    }

    return $out;
}
693
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
 * level E_USER_WARNING. Without $strict such values are skipped.
 *
 * The result is built by plain string concatenation, so no output buffer
 * is opened and nothing is left behind when the function returns early.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or false if array contains invalid code points;
 *               an empty string if $arr is not an array
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';
    $out = '';

    foreach ($arr as $k => $cp) {

        # ASCII range (including control chars)
        if ( ($cp >= 0) && ($cp <= 0x007f) ) {

            $out .= chr($cp);

        # 2 byte sequence
        } else if ( ($cp >= 0x0080) && ($cp <= 0x07ff) ) {

            $out .= chr(0xc0 | ($cp >> 6));
            $out .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }

        # 3 byte sequence
        } else if ( ($cp >= 0x0800) && ($cp <= 0xffff) ) {

            $out .= chr(0xe0 | ($cp >> 12));
            $out .= chr(0x80 | (($cp >> 6) & 0x003f));
            $out .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else if ( ($cp >= 0x10000) && ($cp <= 0x10ffff) ) {

            $out .= chr(0xf0 | ($cp >> 18));
            $out .= chr(0x80 | (($cp >> 12) & 0x3f));
            $out .= chr(0x80 | (($cp >> 6) & 0x3f));
            $out .= chr(0x80 | ($cp & 0x3f));

        # negative or above 0x10FFFF
        } elseif($strict) {

            trigger_error(
                'unicode_to_utf8: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
                );

            // out of range
            return false;
        }
    }

    return $out;
}
784
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * With UTF8_MBSTRING set, mb_convert_encoding() does the conversion.
 * Otherwise the string is decoded with utf8_to_unicode() and packed as
 * big endian 16 bit units; code points above 0xFFFF are written as a
 * surrogate pair, values that UTF-16 can not express (above 0x10FFFF)
 * as U+FFFD.
 *
 * @param  string $str UTF-8 encoded string (taken by reference, not modified)
 * @param  bool   $bom prepend the big endian byte order mark?
 * @return string UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  foreach($uni as $cp){
    if($cp > 0x10FFFF){
      // not representable in UTF-16
      $out .= pack('n',0xFFFD);
    }elseif($cp > 0xFFFF){
      // split into high and low surrogate
      $cp -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x03FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
800
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The string is unpacked into big endian 16 bit units. A high surrogate
 * followed by a low surrogate is combined into one code point, all other
 * units are handed to unicode_to_utf8() as they are.
 *
 * @param  string $str UTF-16BE encoded string (taken by reference, not modified)
 * @return string UTF-8 encoded string
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $units = array_values($units);
  $count = count($units);
  $uni   = array();

  for($i = 0; $i < $count; $i++){
    $unit = $units[$i];

    $isPair = ($unit >= 0xD800) && ($unit <= 0xDBFF) && ($i + 1 < $count) &&
              ($units[$i+1] >= 0xDC00) && ($units[$i+1] <= 0xDFFF);

    if($isPair){
      $uni[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i+1] - 0xDC00);
      $i++; // the low surrogate has been consumed
    }else{
      $uni[] = $unit;
    }
  }

  return unicode_to_utf8($uni);
}
810
/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is walked from left to right with an anchored, byte-wise
 * pattern: well formed sequences are copied, every other byte is
 * replaced by $replace.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param  string $str     string to search
 * @param  string $replace string to replace bad bytes with (defaults to '',
 *                         i.e. bad bytes are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    // \G anchors each match at the current position, 's' lets the
    // fallback '.' match any byte
    $pattern = '/\G'.$UTF8_BAD.'/sS';

    $str = (string)$str;
    $len = strlen($str);
    $pos = 0;
    $out = '';

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // group 2 is only set when the invalid byte alternative matched
        $out .= isset($matches[2]) ? $replace : $matches[0];
        $pos += strlen($matches[0]);
    }

    return $out;
}
850
/**
 * adjust a byte index into a utf8 string to a utf8 character boundary
 *
 * An index that points at a continuation byte (10bbbbbb) is moved until it
 * does not do so any longer. Indexes of 0 or below yield 0, indexes at or
 * behind the end of the string yield the length of the string.
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = towards the start (current character)
 *                           true = towards the end (next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  if ($i <= 0) return 0;

  $limit = strlen($str);
  if ($i >= $limit) return $limit;

  // walk forwards or backwards until we leave the continuation bytes
  $step = $next ? 1 : -1;
  while (($i > 0) && ($i < $limit) && ((ord($str[$i]) & 0xC0) == 0x80)) {
    $i += $step;
  }

  return $i;
}
879
880// only needed if no mb_string available
881if(!UTF8_MBSTRING){
882  /**
883   * UTF-8 Case lookup table
884   *
   * This lookup table maps each lower case letter to its corresponding
   * upper case letter in UTF-8
887   *
888   * @author Andreas Gohr <andi@splitbrain.org>
889   */
890  global $UTF8_LOWER_TO_UPPER;
891  $UTF8_LOWER_TO_UPPER = array(
892    "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
893    "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
894    "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
895    "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
896    "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
897    "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
898    "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
899    "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
900    "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
901    "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
902    "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
903    "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
904    "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
905    "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
906    "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
907    "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
908    "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
909    "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
910    "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
911    "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
912    "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
913    "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
914    "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
915    "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
916    "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
917    "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
918    "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
919    "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
920    "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
921    "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
922    "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
923    "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
924    "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
925    "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
926    "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
927    "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
928    "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
929    "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
930    "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
931    "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
932    "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
933    "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
934    "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
935    "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
936    "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
937    "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
938    "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
939    "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
940    "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
941    "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
942    "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
943    "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
944    "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
945    "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
946    "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
947    "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
948    "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
949    "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
950    "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
951    "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
952    "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
953    "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
954    "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
955    "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
956    "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
957    "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
958    "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
959  );
960
961  /**
962   * UTF-8 Case lookup table
963   *
964   * This lookuptable defines the lower case letters to their correspponding
965   * upper case letter in UTF-8
966   *
967   * @author Andreas Gohr <andi@splitbrain.org>
968   */
969  global $UTF8_UPPER_TO_LOWER;
970  $UTF8_UPPER_TO_LOWER = array (
971    "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
972    "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
973    "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
974    "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
975    "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
976    "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
977    "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
978    "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
979    "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
980    "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
981    "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
982    "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
983    "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
984    "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
985    "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
986    "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
987    "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
988    "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
989    "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
990    "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
991    "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
992    "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
993    "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
994    "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
995    "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
996    "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
997    "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
998    "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
999    "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1000    "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1001    "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1002    "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1003    "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1004    "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1005    "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1006    "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1007    "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1008    "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1009    "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1010    "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1011    "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1012    "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1013    "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1014    "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1015    "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1016    "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1017    "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1018    "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1019    "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1020    "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1021    "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1022    "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1023    "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1024    "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1025    "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1026    "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1027    "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1028    "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1029    "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1030    "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1031    "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1032    "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1033    "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1034    "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1035    "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1036    "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1037    "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1038  );
1039}; // end of case lookup tables
1040
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Maps lower case accented (and a few other non-ASCII Latin) characters to
 * replacement strings from the ASCII-7 range. A replacement may be longer
 * than one character (e.g. 'ß' => 'ss', 'ö' => 'oe'). Only lower case
 * letters are listed here.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
    'à'=>'a',  'ô'=>'o',  'ď'=>'d',  'ḟ'=>'f',  'ë'=>'e',  'š'=>'s',  'ơ'=>'o',  'ß'=>'ss',
    'ă'=>'a',  'ř'=>'r',  'ț'=>'t',  'ň'=>'n',  'ā'=>'a',  'ķ'=>'k',  'ŝ'=>'s',  'ỳ'=>'y',
    'ņ'=>'n',  'ĺ'=>'l',  'ħ'=>'h',  'ṗ'=>'p',  'ó'=>'o',  'ú'=>'u',  'ě'=>'e',  'é'=>'e',
    'ç'=>'c',  'ẁ'=>'w',  'ċ'=>'c',  'õ'=>'o',  'ṡ'=>'s',  'ø'=>'o',  'ģ'=>'g',  'ŧ'=>'t',
    'ș'=>'s',  'ė'=>'e',  'ĉ'=>'c',  'ś'=>'s',  'î'=>'i',  'ű'=>'u',  'ć'=>'c',  'ę'=>'e',
    'ŵ'=>'w',  'ṫ'=>'t',  'ū'=>'u',  'č'=>'c',  'ö'=>'oe', 'è'=>'e',  'ŷ'=>'y',  'ą'=>'a',
    'ł'=>'l',  'ų'=>'u',  'ů'=>'u',  'ş'=>'s',  'ğ'=>'g',  'ļ'=>'l',  'ƒ'=>'f',  'ž'=>'z',
    'ẃ'=>'w',  'ḃ'=>'b',  'å'=>'a',  'ì'=>'i',  'ï'=>'i',  'ḋ'=>'d',  'ť'=>'t',  'ŗ'=>'r',
    'ä'=>'ae', 'í'=>'i',  'ŕ'=>'r',  'ê'=>'e',  'ü'=>'ue', 'ò'=>'o',  'ē'=>'e',  'ñ'=>'n',
    'ń'=>'n',  'ĥ'=>'h',  'ĝ'=>'g',  'đ'=>'d',  'ĵ'=>'j',  'ÿ'=>'y',  'ũ'=>'u',  'ŭ'=>'u',
    'ư'=>'u',  'ţ'=>'t',  'ý'=>'y',  'ő'=>'o',  'â'=>'a',  'ľ'=>'l',  'ẅ'=>'w',  'ż'=>'z',
    'ī'=>'i',  'ã'=>'a',  'ġ'=>'g',  'ṁ'=>'m',  'ō'=>'o',  'ĩ'=>'i',  'ù'=>'u',  'į'=>'i',
    'ź'=>'z',  'á'=>'a',  'û'=>'u',  'þ'=>'th', 'ð'=>'dh', 'æ'=>'ae', 'µ'=>'u',  'ĕ'=>'e',
);
1068
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps upper case accented (and a few other non-ASCII Latin) characters to
 * replacement strings from the ASCII-7 range. A replacement may be longer
 * than one character (e.g. 'Ö' => 'Oe', 'Þ' => 'Th'). Only upper case
 * letters are listed here.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
    'À'=>'A',  'Ô'=>'O',  'Ď'=>'D',  'Ḟ'=>'F',  'Ë'=>'E',  'Š'=>'S',  'Ơ'=>'O',  'Ă'=>'A',
    'Ř'=>'R',  'Ț'=>'T',  'Ň'=>'N',  'Ā'=>'A',  'Ķ'=>'K',  'Ŝ'=>'S',  'Ỳ'=>'Y',  'Ņ'=>'N',
    'Ĺ'=>'L',  'Ħ'=>'H',  'Ṗ'=>'P',  'Ó'=>'O',  'Ú'=>'U',  'Ě'=>'E',  'É'=>'E',  'Ç'=>'C',
    'Ẁ'=>'W',  'Ċ'=>'C',  'Õ'=>'O',  'Ṡ'=>'S',  'Ø'=>'O',  'Ģ'=>'G',  'Ŧ'=>'T',  'Ș'=>'S',
    'Ė'=>'E',  'Ĉ'=>'C',  'Ś'=>'S',  'Î'=>'I',  'Ű'=>'U',  'Ć'=>'C',  'Ę'=>'E',  'Ŵ'=>'W',
    'Ṫ'=>'T',  'Ū'=>'U',  'Č'=>'C',  'Ö'=>'Oe', 'È'=>'E',  'Ŷ'=>'Y',  'Ą'=>'A',  'Ł'=>'L',
    'Ų'=>'U',  'Ů'=>'U',  'Ş'=>'S',  'Ğ'=>'G',  'Ļ'=>'L',  'Ƒ'=>'F',  'Ž'=>'Z',  'Ẃ'=>'W',
    'Ḃ'=>'B',  'Å'=>'A',  'Ì'=>'I',  'Ï'=>'I',  'Ḋ'=>'D',  'Ť'=>'T',  'Ŗ'=>'R',  'Ä'=>'Ae',
    'Í'=>'I',  'Ŕ'=>'R',  'Ê'=>'E',  'Ü'=>'Ue', 'Ò'=>'O',  'Ē'=>'E',  'Ñ'=>'N',  'Ń'=>'N',
    'Ĥ'=>'H',  'Ĝ'=>'G',  'Đ'=>'D',  'Ĵ'=>'J',  'Ÿ'=>'Y',  'Ũ'=>'U',  'Ŭ'=>'U',  'Ư'=>'U',
    'Ţ'=>'T',  'Ý'=>'Y',  'Ő'=>'O',  'Â'=>'A',  'Ľ'=>'L',  'Ẅ'=>'W',  'Ż'=>'Z',  'Ī'=>'I',
    'Ã'=>'A',  'Ġ'=>'G',  'Ṁ'=>'M',  'Ō'=>'O',  'Ĩ'=>'I',  'Ù'=>'U',  'Į'=>'I',  'Ź'=>'Z',
    'Á'=>'A',  'Û'=>'U',  'Þ'=>'Th', 'Ð'=>'Dh', 'Æ'=>'Ae', 'Ĕ'=>'E',
);
1096
/**
 * UTF-8 array of common special characters
 *
 * Lists, as integer code points, the special characters (not a letter or
 * digit) defined in the various local charsets. It is not a complete list of
 * the non-alphanumeric characters in UTF-8 and it is not perfect, but it
 * should match most cases of special chars.
 *
 * The control characters 0x00 to 0x19 are _not_ included in this array. The
 * space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 * (the blank slots in the first rows appear to mark where they were left out).
 *
 * Every code point is listed exactly once and in ascending order.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
  0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
);
1173
1174// utf8 version of above data
1175global $UTF8_SPECIAL_CHARS2;
1176$UTF8_SPECIAL_CHARS2 =
1177    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1178    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1179    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·βφϑϒϕϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1180    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1181    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1182    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1183    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1184    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1185    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1186    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1187    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1188    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1189    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1190    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1191    '➷➸➹➺➻➼➽➾'.
1192    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1193    '�'.
1194    '�ﹼﹽ'.
1195    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1196    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○';
1197
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a script
 * that is not based upon latin letters into plain ASCII. Each key is a
 * character (or short character sequence) in the foreign script, each value
 * is its ASCII replacement; the replacement may be an empty string.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
 *
 * Some keys occur more than once (the kana sections list several romaji
 * spellings for the same syllable, and the Thai 'รร' and 'ว' each appear
 * twice). PHP keeps the value of the LAST occurrence of a duplicate key, so
 * only the last listed spelling is effective.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
global $UTF8_ROMANIZATION;
$UTF8_ROMANIZATION = array(
  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian ('კ' is the letter k'ani, hence 'k'; 'პ' is the one that maps to 'p')
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish" (delta is romanized as 'D'/'d')
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean ('ㅛ' is "yo"; "oy" belongs to 'ㅚ')
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1363
1364//Setup VIM: ex: et ts=2 enc=utf-8 :
1365
1366