xref: /dokuwiki/inc/utf8.php (revision b13c8432d362b38b5a791001798dd8b704d96ff5)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9/**
10 * check for mb_string support
11 */
if (!defined('UTF8_MBSTRING')) {
    // Use mbstring only when the extension is loaded and the caller has not
    // opted out by defining UTF8_NOMBSTRING beforehand.
    define(
        'UTF8_MBSTRING',
        (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0
    );
}

// All mb_* calls in this file rely on UTF-8 being the internal encoding.
if (UTF8_MBSTRING) {
    mb_internal_encoding('UTF-8');
}
21
if(!function_exists('utf8_encodeFN')){
    /**
     * URL-encode a filename so that it may contain Unicode characters
     *
     * Slashes are left unencoded.
     *
     * When $safe is true (the default) a string that consists solely of
     * ASCII letters, digits and the characters / _ - . % is returned
     * untouched. This makes it safe to run the function several times on
     * the same string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    urlencode
     * @param  string $file filename to encode
     * @param  bool   $safe return the input unchanged if it only holds safe characters
     * @return string
     */
    function utf8_encodeFN($file,$safe=true){
        $needsEncoding = !($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file));
        if(!$needsEncoding){
            return $file;
        }

        // urlencode() escapes the slash as well; turn it back afterwards
        return str_replace('%2F','/',urlencode($file));
    }
}
45
if(!function_exists('utf8_decodeFN')){
    /**
     * URL-decode a filename
     *
     * Thin wrapper around urldecode(), the counterpart of utf8_encodeFN().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    urldecode
     * @param  string $file URL-encoded filename
     * @return string
     */
    function utf8_decodeFN($file){
        return urldecode($file);
    }
}
60
if(!function_exists('utf8_isASCII')){
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * @author Andreas Haerter <netzmeister@andreas-haerter.de>
     * @param  string $str string to inspect
     * @return bool   false as soon as one byte outside 0x00-0x7F is found
     */
    function utf8_isASCII($str){
        // byte-wise match (no /u modifier): look for any high byte
        if(preg_match('/(?:[^\x00-\x7F])/', $str) === 1){
            return false;
        }
        return true;
    }
}
71
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Returns a pure ASCII7 string: every byte with a value of 128 or above
     * is dropped, all other bytes are kept in order.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $str input string
     * @return string input without any byte >= 0x80
     */
    function utf8_strip($str){
        $ascii = '';
        $len   = strlen($str);          // loop invariant, computed once
        for($i=0; $i<$len; $i++){
            // square brackets: the $str{$i} offset syntax was removed in PHP 8.0
            $byte = $str[$i];
            if(ord($byte) < 128){
                $ascii .= $byte;
            }
        }
        return $ascii;
    }
}
90
if(!function_exists('utf8_check')){
    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * Only the byte structure is checked: every lead byte (2 to 6 byte
     * forms are accepted) must be followed by the matching number of
     * 10bbbbbb continuation bytes. Code point values are not inspected.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     * @param  string $Str string to check
     * @return bool
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos   = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);
            $pos++;

            if ($lead < 0x80) continue;                    # 0bbbbbbb

            # number of continuation bytes announced by the lead byte
            if     (($lead & 0xE0) == 0xC0) $follow = 1;   # 110bbbbb
            elseif (($lead & 0xF0) == 0xE0) $follow = 2;   # 1110bbbb
            elseif (($lead & 0xF8) == 0xF0) $follow = 3;   # 11110bbb
            elseif (($lead & 0xFC) == 0xF8) $follow = 4;   # 111110bb
            elseif (($lead & 0xFE) == 0xFC) $follow = 5;   # 1111110b
            else return false;                             # Does not match any model

            # each announced byte must exist and look like 10bbbbbb
            while ($follow-- > 0) {
                if ($pos >= $total) return false;
                if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
                $pos++;
            }
        }
        return true;
    }
}
117
if(!function_exists('utf8_strlen')){
    /**
     * Unicode aware replacement for strlen()
     *
     * utf8_decode() turns every character into exactly one byte (characters
     * that are not in ISO-8859-1 become '?'), so the byte length of its
     * result equals the number of characters. The original author notes
     * this is even faster than mb_strlen.
     *
     * NOTE(review): utf8_decode() is deprecated as of PHP 8.2.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     * @param  string $string UTF-8 encoded string
     * @return int    number of characters
     */
    function utf8_strlen($string){
        $singleByte = utf8_decode($string);
        return strlen($singleByte);
    }
}
134
if(!function_exists('utf8_substr')){
    /**
     * UTF-8 aware alternative to substr
     *
     * Return part of a string given character offset (and optionally length)
     *
     * With mbstring available the call is handed to mb_substr(). Otherwise
     * a PCRE pattern with the 'u' modifier is built: an uncaptured group
     * that skips $offset characters followed by a captured group for the
     * wanted part.
     *
     * Notes on the PCRE fallback:
     *  - a single repetition is limited to 65535 here, so larger counts are
     *    expressed as repeated groups of 65535 characters plus a remainder
     *  - like mb_substr (and unlike substr) an empty string is returned
     *    where substr would return false
     *  - counting the characters of the string is relatively expensive, so
     *    it is only done when a negative offset or a length makes it
     *    necessary
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     * @param  string  $str
     * @param  integer $offset number of UTF-8 characters offset (from left)
     * @param  integer $length (optional) length in UTF-8 characters from offset
     * @return string
     */
    function utf8_substr($str, $offset, $length = null) {
        if(UTF8_MBSTRING){
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        // cast parameters to appropriate types to avoid multiple notices/warnings
        $str    = (string)$str;
        $offset = (int)$offset;
        if ($length !== null) $length = (int)$length;

        // trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        // character count of $str, filled in lazily
        $charCount = null;

        // turn a negative offset into the equivalent positive one; a tail
        // anchored pattern would work as well but is horribly slow
        if ($offset < 0) {
            $charCount = strlen(utf8_decode($str));
            $offset    = max(0, $charCount + $offset);
        }

        // skip pattern: anchor plus an uncaptured group of $offset characters
        $skip = '^';
        if ($offset > 0) {
            $chunks    = (int)($offset/65535);
            $remainder = $offset%65535;
            $big       = $chunks ? '(?:.{65535}){'.$chunks.'}' : '';
            $skip      = '^(?:'.$big.'.{'.$remainder.'})';
        }

        // capture pattern
        $take = '';
        if ($length === null) {
            $take = '(.*)$';                          // the rest of the string
        } else {
            if ($charCount === null) $charCount = strlen(utf8_decode($str));
            if ($offset > $charCount) return '';      // nothing left to return

            if ($length > 0) {
                // never ask for more characters than remain after the offset
                $length = min($charCount-$offset, $length);

                $chunks    = (int)($length/65535);
                $remainder = $length%65535;
                $big       = $chunks ? '(?:.{65535}){'.$chunks.'}' : '';

                // a captured group of exactly $length characters
                $take = '('.$big.'.{'.$remainder.'})';

            } else if ($length < 0) {
                if ($length < ($offset - $charCount)) return '';

                $omit      = -$length;
                $chunks    = (int)($omit/65535);
                $remainder = $omit%65535;
                $big       = $chunks ? '(?:.{65535}){'.$chunks.'}' : '';

                // capture everything except the last -$length characters,
                // which are matched by a group anchored at the end
                $take = '(.*)(?:'.$big.'.{'.$remainder.'})$';
            }
        }

        if (!preg_match('#'.$skip.$take.'#us',$str,$match)) return '';
        return $match[1];
    }
}
238
if(!function_exists('utf8_substr_replace')){
    /**
     * Unicode aware replacement for substr_replace()
     *
     * $start and $length are counted in characters. Negative values are
     * interpreted like substr_replace() does: a negative $start counts
     * from the end of the string, a negative $length gives the number of
     * characters before the end at which replacing stops.
     *
     * Note that $length defaults to 0 (insert at $start), not to the
     * length of the string as in substr_replace().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     * @param  string $string      UTF-8 encoded input
     * @param  string $replacement text to put in place of the removed part
     * @param  int    $start       character offset at which replacing starts
     * @param  int    $length      number of characters to replace
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $total  = utf8_strlen($string);
        $start  = (int)$start;
        $length = (int)$length;

        // bring $start into the range 0..$total
        if($start < 0){
            $start = max(0, $total + $start);
        }elseif($start > $total){
            $start = $total;
        }

        // bring $length into the range 0..($total - $start)
        if($length < 0){
            $length = max(0, $total - $start + $length);
        }elseif($length > $total - $start){
            $length = $total - $start;
        }

        $head = ($start > 0) ? utf8_substr($string, 0, $start) : '';
        $tail = ($start + $length < $total) ? utf8_substr($string, $start+$length) : '';

        return $head.$replacement.$tail;
    }
}
254
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist the call is passed on to ltrim(). Otherwise every
     * character of $charlist is removed from the beginning of $str.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     * @param  string $str      UTF-8 encoded string
     * @param  string $charlist characters to strip
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // quote charlist for use in a character class; the caret has to be
        // escaped too, otherwise a leading ^ negates (or breaks) the class
        $charlist = preg_replace('!([\\\\\\-\\]\\[/\\^])!','\\\${1}',$charlist);

        return preg_replace('/^['.$charlist.']+/u','',$str);
    }
}
272
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist the call is passed on to rtrim(). Otherwise every
     * character of $charlist is removed from the end of $str.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     * @param  string $str      UTF-8 encoded string
     * @param  string $charlist characters to strip
     * @return string
     */
    function  utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // quote charlist for use in a character class; the caret has to be
        // escaped too, otherwise a leading ^ negates (or breaks) the class
        $charlist = preg_replace('!([\\\\\\-\\]\\[/\\^])!','\\\${1}',$charlist);

        // D modifier: $ must only match at the very end, not in front of a
        // trailing newline that is not part of the charlist
        return preg_replace('/['.$charlist.']+$/uD','',$str);
    }
}
290
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Without a charlist the call is passed on to trim(). Otherwise the
     * string is trimmed on the right and then on the left with the UTF-8
     * aware helpers.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     * @param  string $str      UTF-8 encoded string
     * @param  string $charlist characters to strip
     * @return string
     */
    function  utf8_trim($str,$charlist='') {
        if($charlist != ''){
            $str = utf8_rtrim($str,$charlist);
            return utf8_ltrim($str,$charlist);
        }
        return trim($str);
    }
}
305
if(!function_exists('utf8_strtolower')){
    /**
     * This is a unicode aware replacement for strtolower()
     *
     * Uses the mb_string extension if available, otherwise the global
     * $UTF8_UPPER_TO_LOWER translation table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtolower($string){
        if(!UTF8_MBSTRING){
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string,$UTF8_UPPER_TO_LOWER);
        }

        return mb_strtolower($string,'utf-8');
    }
}
323
if(!function_exists('utf8_strtoupper')){
    /**
     * This is a unicode aware replacement for strtoupper()
     *
     * Uses the mb_string extension if available, otherwise the global
     * $UTF8_LOWER_TO_UPPER translation table.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_strtoupper($string){
        if(!UTF8_MBSTRING){
            global $UTF8_LOWER_TO_UPPER;
            return strtr($string,$UTF8_LOWER_TO_UPPER);
        }

        return mb_strtoupper($string,'utf-8');
    }
}
341
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     * Make a string's first character uppercase
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $chars = utf8_strlen($str);

        if($chars == 0) return '';
        if($chars == 1) return utf8_strtoupper($str);

        // split into the first character and everything after it
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        $first = $matches[1];
        $rest  = $matches[2];
        return utf8_strtoupper($first).$rest;
    }
}
363
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     * Uppercase the first character of each word in a string
     *
     * @author Harry Fuecks
     * @param  string $str
     * @return string with first char of each word uppercase
     * @see http://www.php.net/ucwords
     */
    function utf8_ucwords($str) {
        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
        // form feeds, horizontal tabs, vertical tabs, linefeeds, carriage returns and spaces
        // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
    }

    /**
     * Callback function for preg_replace_callback call in utf8_ucwords
     * You don't need to call this yourself
     *
     * $matches[0] is the leading whitespace plus the word, $matches[2] the
     * leading whitespace (empty at the start of the string) and
     * $matches[3] the first character of the word.
     *
     * @author Harry Fuecks
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     * @see utf8_ucwords
     * @see utf8_strtoupper
     */
    function utf8_ucwords_callback($matches) {
        $leadingws = $matches[2];
        $first     = $matches[3];

        // Cut the remainder by the byte lengths of the captured parts. The
        // previous ltrim() based approach used ltrim's default character
        // set, which differs from the whitespace set of the pattern (it
        // lacks the form feed and includes NUL) and corrupted such words.
        $rest = substr($matches[0], strlen($leadingws) + strlen($first));

        return $leadingws . utf8_strtoupper($first) . $rest;
    }
}
400
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * The replacements come from the global tables $UTF8_LOWER_ACCENTS
     * and $UTF8_UPPER_ACCENTS.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @param  int    $case   -1 lower case only, 1 upper case only, 0 both
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if($doLower){
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string,$UTF8_LOWER_ACCENTS);
        }
        if($doUpper){
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string,$UTF8_UPPER_ACCENTS);
        }
        return $string;
    }
}
422
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned unchanged, everything else is run
     * through the global $UTF8_ROMANIZATION translation table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    function utf8_romanize($string){
        if(!utf8_isASCII($string)){
            global $UTF8_ROMANIZATION;
            return strtr($string,$UTF8_ROMANIZATION);
        }

        return $string; //nothing to do
    }
}
436
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * Besides the characters listed in the global $UTF8_SPECIAL_CHARS2 the
     * bytes 0x00 to 0x19 are stripped as well.
     *
     * $additional is put into the regexp character class as is, so the
     * caller is responsible for escaping it.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS;
        global $UTF8_SPECIAL_CHARS2;

        // the quoted list is built on the first call and then reused
        static $specials = null;
        if($specials === null){
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        $pattern = '/['.$additional.'\x00-\x19'.$specials.']/u';
        return preg_replace($pattern,$repl,$string);
    }
}
462
if(!function_exists('utf8_strpos')){
    /**
     * This is an Unicode aware replacement for strpos
     *
     * The needle is searched byte-wise with strpos() and the byte position
     * found is converted into a character position. A hit that lies in
     * front of the requested character offset is skipped and the search is
     * repeated further to the right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset   character offset at which the search starts
     * @return integer|false     character position of the needle, false if not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // extra bytes to add to the byte offset of the next search
        $skew = 0;

        do {
            $bytePos = strpos($haystack, $needle, $offset + $skew);
            if ($bytePos === false) {
                return false;
            }

            // character index that corresponds to the byte position
            $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

            if ($charPos < $offset) {
                $skew = $bytePos - $charPos;
            }
        } while ($charPos < $offset);

        return $charPos;
    }
}
493
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are kept as they are, code points below 0x100
     * become decimal entities and everything else hexadecimal entities.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     * @param  string $str UTF-8 encoded string
     * @return string
     */
    function utf8_tohtml ($str) {
        $pieces = array();
        foreach (utf8_to_unicode($str) as $cp) {
            if ($cp >= 0x100) {
                $pieces[] = '&#x'.dechex($cp).';';
            } elseif ($cp >= 0x80) {
                $pieces[] = "&#$cp;";
            } else {
                $pieces[] = chr($cp);
            }
        }
        return implode('', $pieces);
    }
}
515
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES (any non-null value enables it) and named
     * entities, including &amp; &lt; etc. are handled as well. Doing both
     * in one pass avoids the problem that would occur if you had to decode
     * "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // one decoder instance is shared by all calls
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            // numeric and named entities
            return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                         array(&$decoder, 'decode'), $str);
        }

        // numeric entities only
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }
}
547
if(!function_exists('utf8_decode_numeric')){
    /**
     * Callback turning one numeric HTML entity into its UTF-8 character
     *
     * @see    utf8_unhtml
     * @param  array $ent regexp match; index 2 holds 'x' or 'X' for a
     *                    hexadecimal entity, index 3 the digits
     * @return string UTF-8 encoded character
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] == 'X' || $ent[2] == 'x');
        $cp    = $isHex ? hexdec($ent[3]) : intval($ent[3]);

        return unicode_to_utf8(array($cp));
    }
}
562
if(!class_exists('utf8_entity_decoder')){
    /**
     * Decodes numeric and named HTML entities to UTF-8
     *
     * Used as preg_replace_callback() handler by utf8_unhtml().
     */
    class utf8_entity_decoder {
        /** @var array maps an entity ('&auml;') to its UTF-8 encoded character */
        var $table;

        /**
         * Builds the entity lookup table
         *
         * The table is requested in ISO-8859-1 explicitly: makeutf8()
         * reads each character with ord(), which only works for single
         * byte characters, while newer PHP versions return UTF-8 encoded
         * characters by default.
         */
        function __construct() {
            $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1');
            $table = array_flip($table);
            $this->table = array_map(array($this,'makeutf8'), $table);
        }

        /**
         * Old style constructor, kept for code that calls it explicitly
         *
         * PHP 8 no longer treats a method named like the class as the
         * constructor, so the work is done in __construct().
         */
        function utf8_entity_decoder() {
            $this->__construct();
        }

        /**
         * Converts one ISO-8859-1 character to UTF-8
         *
         * @param  string $c single byte character
         * @return string
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Callback decoding one matched entity
         *
         * @param  array $ent regexp match; index 0 is the whole entity,
         *                    index 1 is '#' for numeric entities
         * @return string decoded character, or the entity itself if unknown
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            }
            if (array_key_exists($ent[0],$this->table)) {
                return $this->table[$ent[0]];
            }
            return $ent[0];
        }
    }
}
585
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored.
     *
     * If $strict is set to true the function returns false and raises a
     * PHP error at level E_USER_WARNING when it meets an illegal lead
     * byte, a missing continuation byte, a non-shortest form, a surrogate
     * or a code point above 0x10FFFF. Without $strict bad bytes are
     * skipped and illegal code points are passed through.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @param  string  $str    UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // cached expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // cached Unicode character
        $mBytes = 1;     // cached expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square brackets: the $str{$i} offset syntax was removed in PHP 8.0
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4  = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4  = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4  = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4  = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4  = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation: add its six payload bits
                    $shift = ($mState - 1) * 6;
                    $mUcs4 |= ($in & 0x0000003F) << $shift;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     *((0xC0 & (*in) != 0x80) && (mState != 0))
                     * Incomplete multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
759
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING.
     * Without $strict such values are skipped.
     *
     * The result is built by plain string concatenation. Output buffering
     * is deliberately not used: an early return in strict mode would leave
     * the buffer open.
     *
     * @param  array   $arr    unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed UTF-8 string or false if array contains invalid code points
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {

            # ASCII range (including control chars)
            if ( ($cp >= 0) && ($cp <= 0x007f) ) {

                $utf8 .= chr($cp);

            # 2 byte sequence
            } else if ($cp <= 0x07ff) {

                $utf8 .= chr(0xc0 | ($cp >> 6));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            # Byte order mark (skip)
            } else if($cp == 0xFEFF) {

                // nop -- zap the BOM

            # Test for illegal surrogates
            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

                // found a surrogate
                if($strict){
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                        );
                    return false;
                }

            # 3 byte sequence
            } else if ($cp <= 0xffff) {

                $utf8 .= chr(0xe0 | ($cp >> 12));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                $utf8 .= chr(0x80 | ($cp & 0x003f));

            # 4 byte sequence
            } else if ($cp <= 0x10ffff) {

                $utf8 .= chr(0xf0 | ($cp >> 18));
                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                $utf8 .= chr(0x80 | ($cp & 0x3f));

            } elseif($strict) {

                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );

                // out of range
                return false;
            }
        }

        return $utf8;
    }
}
852
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when mbstring is available. The fallback
     * decodes the string with utf8_to_unicode() and writes code points
     * above 0xFFFF as surrogate pairs.
     *
     * @param  string $str UTF-8 encoded string (taken by reference, not modified)
     * @param  bool   $bom prepend the big endian byte order mark?
     * @return string UTF-16BE encoded string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-8' === 'UTF-8' ? 'UTF-16BE' : 'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if($cp > 0xFFFF && $cp <= 0x10FFFF){
                // astral plane: split into a high and a low surrogate
                $cp  -= 0x10000;
                $out .= pack('n', 0xD800 | ($cp >> 10));
                $out .= pack('n', 0xDC00 | ($cp & 0x3FF));
            }else{
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
870
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The string is read as 16 bit big endian units. A high surrogate that
     * is followed by a low surrogate is combined into one code point, so
     * characters above 0xFFFF survive the conversion. Unpaired surrogates
     * are handed to unicode_to_utf8() unchanged, which skips them.
     *
     * @param  string $str UTF-16BE encoded string (taken by reference, not modified)
     * @return string UTF-8 encoded string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if(!is_array($units)) return '';

        $uni  = array();
        $high = null;                   // pending high surrogate, if any
        foreach($units as $unit){
            if($high !== null){
                if($unit >= 0xDC00 && $unit <= 0xDFFF){
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high  = null;
                    continue;
                }
                // no low surrogate followed: keep the lone unit as it was
                $uni[] = $high;
                $high  = null;
            }

            if($unit >= 0xD800 && $unit <= 0xDBFF){
                $high = $unit;
            }else{
                $uni[] = $unit;
            }
        }
        if($high !== null) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}
882
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is scanned from left to right with a running offset. At
     * every position the pattern matches either one valid UTF-8 character,
     * which is copied, or a single invalid byte, which is replaced.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     * @param  string $str     string to search
     * @param  string $replace string to replace bad bytes with (defaults to '',
     *                         i.e. bad bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        $UTF8_BAD =
         '([\x00-\x7F]'.                          # ASCII (including control chars)
         '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
         '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
         '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
         '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
         '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
         '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
         '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
         '|(.{1}))';                              # invalid byte
        $pattern = '/'.$UTF8_BAD.'/S';

        $result = '';
        $pos    = 0;
        $len    = strlen($str);

        // advance by offset instead of cutting the string after every match
        while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
            // group 2 is only set when the "invalid byte" branch matched
            $result .= isset($matches[2]) ? $replace : $matches[0];
            $pos    += strlen($matches[0]);
        }
        return $result;
    }
}
924
if(!function_exists('utf8_correctIdx')){
    /**
     * Snap a byte index into a utf8 string to a utf8 character boundary
     *
     * An index at or below 0 yields 0, an index at or beyond the byte length
     * of the string yields that length. Any other index is moved off UTF-8
     * continuation bytes (bit pattern 10xxxxxx) in the requested direction.
     *
     * @param $str   string   utf8 character string
     * @param $i     int      byte index into $str
     * @param $next  bool     direction to search for boundary,
     *                           false = backwards (start of the current character)
     *                           true = forwards (start of the next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        // clamp to the start of the string
        if ($i <= 0) return 0;

        // clamp to the end of the string
        $bytes = strlen($str);
        if ($i >= $bytes) return $bytes;

        if ($next) {
            // step forward until the byte is no continuation byte (or the end is hit)
            for (; $i < $bytes; $i++) {
                if ((ord($str[$i]) & 0xC0) != 0x80) break;
            }
        } else {
            // step backward until the byte is no continuation byte (or index 0 is hit)
            for (; $i; $i--) {
                if ((ord($str[$i]) & 0xC0) != 0x80) break;
            }
        }

        return $i;
    }
}
955
956// only needed if no mb_string available
957if(!UTF8_MBSTRING){
958  /**
959   * UTF-8 Case lookup table
960   *
   * This lookup table maps lower case letters to their corresponding
   * upper case letters in UTF-8
963   *
964   * @author Andreas Gohr <andi@splitbrain.org>
965   */
966  global $UTF8_LOWER_TO_UPPER;
967  if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
968    "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
969    "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
970    "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
971    "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
972    "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
973    "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
974    "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
975    "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
976    "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
977    "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
978    "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
979    "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
980    "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
981    "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
982    "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
983    "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
984    "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
985    "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
986    "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
987    "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
988    "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
989    "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
990    "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
991    "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
992    "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
993    "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
994    "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
995    "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
996    "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
997    "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
998    "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
999    "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1000    "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1001    "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1002    "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1003    "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1004    "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1005    "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1006    "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1007    "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1008    "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1009    "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1010    "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1011    "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1012    "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1013    "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1014    "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1015    "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1016    "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1017    "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1018    "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1019    "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1020    "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1021    "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1022    "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1023    "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1024    "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1025    "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1026    "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1027    "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1028    "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1029    "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1030    "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1031    "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1032    "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1033    "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1034    "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1035  );
1036
1037  /**
1038   * UTF-8 Case lookup table
1039   *
   * This lookup table maps upper case letters to their corresponding
   * lower case letters in UTF-8
1042   *
1043   * @author Andreas Gohr <andi@splitbrain.org>
1044   */
1045  global $UTF8_UPPER_TO_LOWER;
1046  if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1047    "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
1048    "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
1049    "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1050    "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1051    "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1052    "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1053    "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1054    "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1055    "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1056    "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1057    "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1058    "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1059    "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1060    "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1061    "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1062    "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1063    "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1064    "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1065    "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1066    "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1067    "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1068    "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1069    "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1070    "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1071    "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1072    "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1073    "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1074    "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1075    "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1076    "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1077    "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1078    "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1079    "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1080    "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1081    "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1082    "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1083    "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1084    "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1085    "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1086    "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1087    "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1088    "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1089    "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1090    "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1091    "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1092    "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1093    "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1094    "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1095    "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1096    "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1097    "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1098    "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1099    "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1100    "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1101    "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1102    "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1103    "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1104    "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1105    "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1106    "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1107    "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1108    "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1109    "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1110    "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1111    "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1112    "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1113    "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1114  );
1115}; // end of case lookup tables
1116
1117/**
1118 * UTF-8 lookup table for lower case accented letters
1119 *
 * This lookup table maps accented characters to replacements from the ASCII-7
 * range. It covers lower case letters only.
1122 *
1123 * @author Andreas Gohr <andi@splitbrain.org>
1124 * @see    utf8_deaccent()
1125 */
1126global $UTF8_LOWER_ACCENTS;
1127if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1128  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1129  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1130  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1131  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1132  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1133  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1134  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1135  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1136  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1137  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1138  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1139  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1140  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1141  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1142  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1143);
1144
1145/**
1146 * UTF-8 lookup table for upper case accented letters
1147 *
 * This lookup table maps accented characters to replacements from the ASCII-7
 * range. It covers upper case letters only.
1150 *
1151 * @author Andreas Gohr <andi@splitbrain.org>
1152 * @see    utf8_deaccent()
1153 */
1154global $UTF8_UPPER_ACCENTS;
1155if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1156  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1157  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1158  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1159  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1160  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1161  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1162  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1163  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1164  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1165  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1166  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1167  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1168  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1169  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1170  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1171);
1172
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * Every entry is a Unicode codepoint (not a UTF-8 byte sequence).
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
          0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // U+2007 figure space, U+202F narrow no-break space, U+2060 word joiner,
  // U+FEFF byte order mark (the no-break space U+00A0 is already listed above)
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1254
1255// utf8 version of above data
1256global $UTF8_SPECIAL_CHARS2;
1257if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1258    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1259    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1260    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1261    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1262    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1263    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1264    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1265    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1266    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1267    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1268    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1269    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1270    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1271    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1272    '➷➸➹➺➻➼➽➾'.
1273    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1274    '�'.
1275    '�ﹼﹽ'.
1276    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1277    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1278    '����������������������������������������������������������������'.
1279    '   ⁠';
1280
1281/**
1282 * Romanization lookup table
1283 *
 * This lookup table provides a way to transform strings written in a script
 * that is not based on Latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-Latin to ASCII, and it works by simple character
 * replacement only. Peculiarities of individual languages are not supported.
1290 *
1291 * @author Andreas Gohr <andi@splitbrain.org>
1292 * @author Vitaly Blokhin <vitinfo@vitn.com>
1293 * @link   http://www.uconv.com/translit.htm
1294 * @author Bisqwit <bisqwit@iki.fi>
1295 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1296 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1297 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1298 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1299 * @author Arthit Suriyawongkul <arthit@gmail.com>
1300 * @author Denis Scheither <amorphis@uni-bremen.de>
1301 */
1302global $UTF8_ROMANIZATION;
1303if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1304  // scandinavian - differs from what we do in deaccent
1305  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1306
1307  //russian cyrillic
1308  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1309  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1310  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1311  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1312  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1313  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1314  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1315  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1316  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1317  // Ukrainian cyrillic
1318  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1319  // Georgian
1320  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1321  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1322  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1323  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1324  'ჰ'=>'xh',
1325  //Sanskrit
1326  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1327  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1328  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1329  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1330  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1331  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1332  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1333  //Hebrew
1334  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1335  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1336  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1337  'ש'=>'sh','ת'=>'t',
1338  //Arabic
1339  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1340  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1341  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1342  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1343
1344  // Japanese characters  (last update: 2008-05-09)
1345
1346  // Japanese hiragana
1347
1348  // 3 character syllables, っ doubles the consonant after
1349  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1350  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1351  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1352  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1353  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1354  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1355  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1356  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1357  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1358  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1359  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1360
  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1362  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1363  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1364
1365   // 2 character syllables - normal
1366  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1367  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1368  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1369  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1370  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1371  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1372  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1373  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1374  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1375  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1376  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1377  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1378  'うぇ'=>'we','うぃ'=>'wi',
1379  'いぇ'=>'ye',
1380
1381  // 2 character syllables, っ doubles the consonant after
1382  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1383  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1384  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1385  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1386  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1387  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1388  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1389  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1390  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1391  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1392  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1393
1394  // 1 character syllables
1395  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1396  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1397  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1398  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1399  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1400  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1401  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1402  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1403  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1404  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1405  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1406  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1407  'わ'=>'wa','を'=>'wo',
1408  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1409  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1410  // old characters
1411  'ゑ'=>'we','ゐ'=>'wi',
1412
1413  //  convert what's left (probably only kicks in when something's missing above)
1414  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1415  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1416
1417  // never seen one of those (disabled for the moment)
1418  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1419  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1420  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1421  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1422  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1423  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1424  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1425  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1426  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1427  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1428  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1429  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1430  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1431  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1432
1433  // 'spare' characters from other romanization systems
1434  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1435  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1436  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1437  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1438  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1439  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1440  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1441  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1442  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1443  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1444
1445
1446  // Japanese katakana
1447
1448  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with a macron, but we don't want that in our URLs)
1449  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1450  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1451  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1452  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1453  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1454  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1455  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1456  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1457  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1458  'ッティー'=>'ttii',
1459  'ッヂィー'=>'ddii',
1460
1461  // 3 character syllables - doubled vowels
1462  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1463  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1464  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1465  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1466  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1467  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1468  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1469  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1470  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1471  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1472  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1473  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1474  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1475  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1476  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1477  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1478  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1479  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1480  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1481  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1482  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1483  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1484  'ウェー'=>'wee','ウィー'=>'wii',
1485  'イェー'=>'yee',
1486  'ティー'=>'tii',
1487  'ヂィー'=>'dii',
1488
1489  // 3 character syllables - doubled consonants
1490  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1491  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1492  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1493  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1494  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1495  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1496  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1497  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1498  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1499  'ッティ'=>'tti',
1500  'ッヂィ'=>'ddi',
1501
1502  // 3 character syllables - doubled vowel and consonants
1503  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1504  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1505  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1506  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1507  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1508  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1509  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1510  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1511  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1512  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1513  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1514
1515  // 2 character syllables - normal
1516  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1517  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1518  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1519  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1520  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1521  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1522  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1523  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1524  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1525  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1526  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1527  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1528  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1529  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1530  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1531  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1532  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1533  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1534  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1535  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1536  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1537  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1538  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1539  'ウェ'=>'we','ウィ'=>'wi',
1540  'イェ'=>'ye',
1541  'ティ'=>'ti',
1542  'ヂィ'=>'di',
1543
1544  // 2 character syllables - doubled vowel
1545  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1546  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1547  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1548  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1549  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1550  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1551  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1552  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1553  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1554  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1555  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1556  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1557  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1558  'ワー'=>'waa','ヲー'=>'woo',
1559  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1560  'ヵー'=>'kaa','ヶー'=>'kee',
1561  // old characters
1562  'ヱー'=>'wee','ヰー'=>'wii',
1563
1564  // separate katakana 'n'
1565  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1566  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1567
1568  // 2 character syllables - doubled consonants
1569  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1570  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1571  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1572  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1573  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1574  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1575  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1576  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1577  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1578  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1579  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1580
1581  // 1 character syllables
1582  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1583  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1584  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1585  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1586  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1587  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1588  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1589  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1590  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1591  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1592  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1593  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1594  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1595  'ワ'=>'wa','ヲ'=>'wo',
1596  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1597  'ヵ'=>'ka','ヶ'=>'ke',
1598  // old characters
1599  'ヱ'=>'we','ヰ'=>'wi',
1600
1601  //  convert what's left (probably only kicks in when something's missing above)
1602  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1603  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1604
1605  // special characters
1606  '・'=>'_','、'=>'_',
1607  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1608
1609  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1610  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1611  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1612  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1613  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1614  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1615  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1616  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1617  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1618  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1619  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1620  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1621
1622  // "Greeklish"
1623  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1624  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1625
1626  // Thai
1627  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1628  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1629  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1630  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1631  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1632  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1633  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1634  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1635  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1636  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1637  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1638  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1639  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1640  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1641  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1642  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1643	'๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1644  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1645
1646  // Korean
1647  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1648  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1649  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1650  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1651  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1652  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1653);
1654
1655//Setup VIM: ex: et ts=2 enc=utf-8 :
1656
1657