xref: /dokuwiki/inc/utf8.php (revision 137ce95a340b36e962f90c44c9f6414dc2e0d1e4)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
/**
 * Decide whether the mbstring extension is used by the helpers below
 *
 * UTF8_MBSTRING may be defined by the including script beforehand. Otherwise
 * it is set to 1 when mb_substr() exists and UTF8_NOMBSTRING has not been
 * defined, and to 0 in every other case.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

// mb_substr() is called without an explicit encoding further down,
// so the internal encoding has to be UTF-8
if(UTF8_MBSTRING){
  mb_internal_encoding('UTF-8');
}
21
22
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are kept as they are.
 *
 * When $safe is true (the default) the name is returned untouched if it
 * consists solely of letters, digits and the characters / _ - . %
 * A name that was encoded before therefore passes through unchanged.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file filename to encode
 * @param  bool   $safe skip names that only contain the characters above
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  $needsNoEncoding = $safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file);
  if(!$needsNoEncoding){
    // urlencode() also escapes the directory separator, so restore it
    $file = str_replace('%2F','/',urlencode($file));
  }
  return $file;
}
44
/**
 * URL-decode a filename
 *
 * Thin wrapper around urldecode(), the counterpart of utf8_encodeFN()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
57
/**
 * Checks if a string contains 7bit ASCII only
 *
 * The check is done on bytes: any byte above 0x7F makes the string non-ASCII.
 * An empty string counts as ASCII.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return bool true when no byte has the high bit set
 */
function utf8_isASCII($str){
  // byte-wise pattern (no /u modifier) so malformed UTF-8 can not make it fail
  return !preg_match('/[\x80-\xFF]/', (string)$str);
}
69
/**
 * Strips all highbyte chars
 *
 * Every byte above 0x7F is removed, what remains is a pure ASCII7 string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str
 * @return string
 */
function utf8_strip($str){
  // byte-wise pattern (no /u modifier): works on malformed input as well
  return preg_replace('/[\x80-\xFF]+/', '', (string)$str);
}
86
/**
 * Tries to detect if a string is in Unicode encoding
 *
 * Only the structure is checked: every lead byte has to be followed by the
 * number of continuation bytes (10bbbbbb) it announces. Lead bytes for
 * sequences of up to six bytes are accepted; the decoded values themselves
 * are not inspected.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str
 * @return bool
 */
function utf8_check($Str) {
 $total = strlen($Str);
 $i = 0;
 while ($i < $total) {
  $lead = ord($Str[$i]);
  if ($lead < 0x80) {                      # 0bbbbbbb
   $i++;
   continue;
  }
  if (($lead & 0xE0) == 0xC0) $pending = 1;      # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0) $pending = 2;  # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0) $pending = 3;  # 11110bbb
  elseif (($lead & 0xFC) == 0xF8) $pending = 4;  # 111110bb
  elseif (($lead & 0xFE) == 0xFC) $pending = 5;  # 1111110b
  else return false;                       # not a valid lead byte

  # each announced continuation byte must exist and look like 10bbbbbb
  while ($pending-- > 0) {
   $i++;
   if ($i == $total) return false;
   if ((ord($Str[$i]) & 0xC0) != 0x80) return false;
  }
  $i++;
 }
 return true;
}
110
/**
 * Unicode aware replacement for strlen()
 *
 * Counts characters by counting every byte that is not a UTF-8 continuation
 * byte (10bbbbbb). For well-formed UTF-8 this equals the number of code
 * points. It replaces the former strlen(utf8_decode()) trick because
 * utf8_decode() is deprecated as of PHP 8.2.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @param  string $string UTF-8 encoded string
 * @return int number of characters
 */
function utf8_strlen($string){
  // drop all continuation bytes; one byte per character is left over
  return strlen(preg_replace('/[\x80-\xBF]+/', '', (string)$string));
}
125
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 * With mbstring enabled (UTF8_MBSTRING) this is mb_substr(); otherwise the
 * part is cut out with a PCRE pattern using the 'u' modifier.
 *
 * Notes on the PCRE fallback:
 *
 * - pcre only supports repetitions of less than 65536, so larger offsets
 *   and lengths are expressed as repeated groups of 65535 characters
 * - an empty string is returned where substr() would return false
 *   (e.g. offset beyond the end of the string)
 * - counting the characters is relatively expensive, so it is only done
 *   when needed (negative offset or a given length); it is delegated to
 *   utf8_strlen() so the counting logic lives in one place
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param  string  $str
 * @param  integer $offset number of UTF-8 characters offset (from left)
 * @param  integer $length (optional) length in UTF-8 characters from offset
 * @return string  the requested part, empty string if nothing matches
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        // only hand a length to mb_substr when the caller supplied one
        if( $length === null ){
            return mb_substr($str, $offset);
        }
        return mb_substr($str, $offset, $length);
    }

    // cast parameters to appropriate types to avoid multiple notices/warnings
    $str = (string)$str;
    $offset = (int)$offset;
    if ($length !== null) $length = (int)$length;

    // handle trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    $strlen = null;                               // character count, filled in lazily
    $offset_pattern = '';
    $length_pattern = '';

    // normalise -ve offsets (a tail anchored pattern would be much slower)
    if ($offset < 0) {
      $strlen = utf8_strlen($str);
      $offset = $strlen + $offset;
      if ($offset < 0) $offset = 0;
    }

    // establish a pattern for offset, a non-captured group equal in length to offset
    if ($offset > 0) {
      $Ox = (int)($offset/65535);
      $Oy = $offset%65535;

      if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
      $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
    } else {
      $offset_pattern = '^';                      // offset == 0; just anchor the pattern
    }

    // establish a pattern for length
    if ($length === null) {
      $length_pattern = '(.*)$';                  // the rest of the string
    } else {

      if ($strlen === null) $strlen = utf8_strlen($str);
      if ($offset > $strlen) return '';           // another trivial case

      if ($length > 0) {

        // reduce any length that would go past the end of the string
        $length = min($strlen-$offset, $length);

        $Lx = (int)($length/65535);
        $Ly = $length%65535;

        // +ve length requires ... a captured group of length characters
        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
        $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

      } else if ($length < 0) {

        if ($length < ($offset - $strlen)) return '';

        $Lx = (int)((-$length)/65535);
        $Ly = (-$length)%65535;

        // -ve length requires ... capture everything except a group of -length characters
        //                         anchored at the tail-end of the string
        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
        $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
      }
    }

    if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
    return $match[1];
}
227
/**
 * Unicode aware replacement for substr_replace()
 *
 * $start and $length are counted in characters. As with substr_replace(),
 * a negative $start counts from the end of the string and a negative
 * $length gives the number of characters from the end at which replacing
 * stops. Unlike substr_replace() the default $length is 0, i.e. the
 * replacement is inserted at $start.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string      UTF-8 encoded input
 * @param  string $replacement text to put in
 * @param  int    $start       character offset where replacing starts
 * @param  int    $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $start  = (int)$start;
  $length = (int)$length;

  // negative values are relative to the end, so the character count is needed
  if($start < 0 || $length < 0){
    $strlen = utf8_strlen($string);
    if($start < 0)  $start  = max(0, $strlen + $start);
    if($length < 0) $length = max(0, $strlen - $start + $length);
  }

  $ret = '';
  if($start>0) $ret .= utf8_substr($string, 0, $start);
  $ret .= $replacement;
  $ret .= utf8_substr($string, $start+$length);
  return $ret;
}
241
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist this is plain ltrim(). With a charlist every leading
 * character contained in it is removed; the characters are taken literally
 * (no ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  // quote charlist for use in a character class: \ - ] [ / and ^
  // (an unescaped leading ^ would negate the class)
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
257
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist this is plain rtrim(). With a charlist every trailing
 * character contained in it is removed; the characters are taken literally
 * (no ranges).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  // quote charlist for use in a character class: \ - ] [ / and ^
  // (an unescaped leading ^ would negate the class)
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/['.$charlist.']+$/u','',$str);
}
273
/**
 * Unicode aware replacement for trim()
 *
 * Falls back to trim() when no charlist is given, otherwise strips the
 * listed characters from the right and then from the left side.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str
 * @param  string $charlist characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist != ''){
    $str = utf8_rtrim($str,$charlist);
    return utf8_ltrim($str,$charlist);
  }
  return trim($str);
}
286
287
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses the mbstring extension when UTF8_MBSTRING is set, otherwise the
 * global $UTF8_UPPER_TO_LOWER lookup table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string
 * @return string
 */
function utf8_strtolower($string){
  if(!UTF8_MBSTRING){
    global $UTF8_UPPER_TO_LOWER;
    return strtr($string,$UTF8_UPPER_TO_LOWER);
  }
  return mb_strtolower($string,'utf-8');
}
303
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses the mbstring extension when UTF8_MBSTRING is set, otherwise the
 * global $UTF8_LOWER_TO_UPPER lookup table.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string
 * @return string
 */
function utf8_strtoupper($string){
  if(!UTF8_MBSTRING){
    global $UTF8_LOWER_TO_UPPER;
    return strtr($string,$UTF8_LOWER_TO_UPPER);
  }
  return mb_strtoupper($string,'utf-8');
}
319
/**
 * UTF-8 aware alternative to ucfirst
 * Make a string's first character uppercase
 *
 * A string that can not be split into characters (the 'u' pattern fails on
 * it) is returned unchanged instead of being lost.
 *
 * @author Harry Fuecks
 * @param  string $str
 * @return string with first character as upper case (if applicable)
 */
function utf8_ucfirst($str){
  switch ( utf8_strlen($str) ) {
    case 0:
        return '';
    case 1:
        return utf8_strtoupper($str);
    default:
        // split into first character and remainder
        if (!preg_match('/^(.{1})(.*)$/us', $str, $matches)) {
            return $str;
        }
        return utf8_strtoupper($matches[1]).$matches[2];
  }
}
339
/**
 * UTF-8 aware alternative to ucwords
 * Uppercase the first character of each word in a string
 *
 * A word is a run of characters that are none of form feed, horizontal tab,
 * vertical tab, linefeed, carriage return or space. Every word, together
 * with the whitespace in front of it, is handed to utf8_ucwords_callback().
 *
 * @author Harry Fuecks
 * @param  string $str
 * @return string with first char of each word uppercase
 * @see http://www.php.net/ucwords
 */
function utf8_ucwords($str) {
  // group 2: leading whitespace, group 3: first character of the word
  $wordRegex = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';
  $result = preg_replace_callback($wordRegex, 'utf8_ucwords_callback',$str);
  return $result;
}
357
/**
 * Callback function for preg_replace_callback call in utf8_ucwords
 * You don't need to call this yourself
 *
 * The word is rebuilt from the captured groups only: exactly the captured
 * whitespace and first character are cut off the match. Relying on ltrim()
 * for this went wrong because ltrim() strips a different set of characters
 * than the pattern treats as whitespace.
 *
 * @author Harry Fuecks
 * @param  array $matches [0] whole match, [2] leading whitespace, [3] first character of the word
 * @return string with first char of the word in uppercase
 * @see utf8_ucwords
 * @see utf8_strtoupper
 */
function utf8_ucwords_callback($matches) {
  $leadingws = $matches[2];
  $ucfirst = utf8_strtoupper($matches[3]);
  // both groups hold complete characters, so byte offsets are safe here
  $rest = (string)substr($matches[0], strlen($leadingws) + strlen($matches[3]));
  return $leadingws . $ucfirst . $rest;
}
374
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * $case selects which letters are handled: -1 lower case only, 1 upper
 * case only, 0 (default) both. The replacements come from the global
 * tables $UTF8_LOWER_ACCENTS and $UTF8_UPPER_ACCENTS.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @param  int    $case
 * @return string
 */
function utf8_deaccent($string,$case=0){
  $doLower = ($case <= 0);
  $doUpper = ($case >= 0);

  if($doLower){
    global $UTF8_LOWER_ACCENTS;
    $string = strtr($string,$UTF8_LOWER_ACCENTS);
  }
  if($doUpper){
    global $UTF8_UPPER_ACCENTS;
    $string = strtr($string,$UTF8_UPPER_ACCENTS);
  }
  return $string;
}
394
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is returned as is, anything else is translated with
 * the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    $string = strtr($string,$UTF8_ROMANIZATION);
  }
  return $string;
}
406
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * Besides the characters in the global $UTF8_SPECIAL_CHARS2 the byte range
 * 0x00 to 0x19 is stripped as well. The quoted special characters are
 * computed on the first call and kept for later calls.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS, $UTF8_SPECIAL_CHARS2;

  static $specials = null;
  if($specials === null){
    $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
  }

  // $additional goes into the character class unquoted
  $class = '['.$additional.'\x00-\x19'.$specials.']';
  return preg_replace('/'.$class.'/u',$repl,$string);
}
430
/**
 * This is an Unicode aware replacement for strpos
 *
 * The needle is located with the byte based strpos() and the byte position
 * is converted into a character position. Hits in front of the requested
 * character offset are skipped by searching again further right.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strpos()
 * @param  string  $haystack
 * @param  string  $needle
 * @param  integer $offset in characters
 * @return integer|false character position of the needle, false if not found
 */
function utf8_strpos($haystack, $needle, $offset=0){
    $byteShift = 0;

    do {
        $bytePos = strpos($haystack, $needle, $offset + $byteShift);
        if ($bytePos === false) {
            return false;
        }

        // character index of the hit
        $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

        // hit lies before the wanted offset: move the byte start position
        // by the number of extra bytes seen so far
        if ($charPos < $offset) {
            $byteShift = $bytePos - $charPos;
        }
    } while ($charPos < $offset);

    return $charPos;
}
459
460
461/**
462 * Encodes UTF-8 characters to HTML entities
463 *
464 * @author Tom N Harris <tnharris@whoopdedo.org>
465 * @author <vpribish at shopping dot com>
466 * @link   http://www.php.net/manual/en/function.utf8-decode.php
467 */
468function utf8_tohtml ($str) {
469    $ret = '';
470    foreach (utf8_to_unicode($str) as $cp) {
471        if ($cp < 0x80)
472            $ret .= chr($cp);
473        elseif ($cp < 0x100)
474            $ret .= "&#$cp;";
475        else
476            $ret .= '&#x'.dechex($cp).';';
477    }
478    return $ret;
479}
480
/**
 * Decodes HTML entities to UTF-8 characters
 *
 * Convert any &#..; entity to a codepoint.
 * By default only numeric entities are decoded. Pass any non-null value
 * (e.g. HTML_ENTITIES) as second parameter and named entities, including
 * &amp; &lt; etc. are handled as well. Doing both in one pass avoids the
 * problem that would occur if you had to decode "&amp;#38;&#38;amp;#38;"
 *
 * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
 * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
 * what it should be                   -> "&#38;&amp#38;"
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @param  string  $str      UTF-8 encoded string
 * @param  boolean $entities Flag controlling decoding of named entities.
 * @return UTF-8 encoded string with numeric (and named) entities replaced.
 */
function utf8_unhtml($str, $entities=null) {
    // one decoder instance is shared by all calls
    static $decoder = null;
    if ($decoder === null) {
        $decoder = new utf8_entity_decoder();
    }

    if ($entities !== null) {
        // named and numeric entities
        return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                     array(&$decoder, 'decode'), $str);
    }

    // numeric entities only
    return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                 'utf8_decode_numeric', $str);
}
/**
 * Callback turning one numeric entity match into its UTF-8 character
 *
 * @param  array $ent match array: [2] is 'x' or 'X' for hexadecimal
 *                    entities, [3] holds the digits
 * @return string UTF-8 encoded character
 * @see    utf8_unhtml
 */
function utf8_decode_numeric($ent) {
    $isHex = in_array($ent[2], array('X', 'x'));
    if ($isHex) {
        $cp = hexdec($ent[3]);
    } else {
        $cp = intval($ent[3]);
    }
    return unicode_to_utf8(array($cp));
}
/**
 * Table based decoder for named and numeric HTML entities
 *
 * Used as preg_replace_callback() target by utf8_unhtml().
 */
class utf8_entity_decoder {
    // maps an entity ('&amp;') to its UTF-8 encoded character
    var $table;

    /**
     * Build the entity lookup table
     *
     * A real __construct() is required: since PHP 8 a method named like
     * the class is no longer treated as constructor.
     */
    function __construct() {
        $table = get_html_translation_table(HTML_ENTITIES);
        $table = array_flip($table);
        $this->table = array_map(array($this,'makeutf8'), $table);
    }

    /**
     * Old style constructor, kept so existing explicit calls keep working
     */
    function utf8_entity_decoder() {
        $this->__construct();
    }

    /**
     * Make sure a character from the translation table is UTF-8
     *
     * Depending on the PHP version and settings the table holds UTF-8 or
     * single byte characters. ASCII and multi byte values are valid UTF-8
     * already; a single high byte is taken as code point and encoded.
     *
     * @param  string $c character from get_html_translation_table()
     * @return string UTF-8 encoded character
     */
    function makeutf8($c) {
        if (strlen($c) != 1 || ord($c) < 0x80) {
            return $c;
        }
        return unicode_to_utf8(array(ord($c)));
    }

    /**
     * Callback: decode a single entity match
     *
     * @param  array $ent match array: [0] whole entity, [1] '#' for numeric entities
     * @return string decoded character, unknown entities are returned unchanged
     */
    function decode($ent) {
        if ($ent[1] == '#') {
            return utf8_decode_numeric($ent);
        }
        if (array_key_exists($ent[0],$this->table)) {
            return $this->table[$ent[0]];
        }
        return $ent[0];
    }
}
542
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING. This covers illegal lead octets, broken
 * continuations, non-shortest forms, surrogates and values above 0x10FFFF.
 *
 * Without $strict illegal lead octets and misplaced octets are skipped,
 * while complete but illegal sequences are still added to the output.
 * A sequence that is cut off by the end of the string is dropped in
 * either mode.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or false if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return false;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: merge its six payload bits
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return false;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return false;
            }
        }
    }
    return $out;
}
714
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range (negative or above 0x10FFFF) and raises a PHP error at
 * level E_USER_WARNING. Without $strict such values are skipped.
 *
 * The result is built by plain string concatenation. No output buffer is
 * involved, so an early return can not leave one open.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or false if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    $result = '';

    foreach ($arr as $k => $cp) {

        # outside of the Unicode range
        if ($cp < 0 || $cp > 0x10ffff) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $result .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $result .= chr(0xc0 | ($cp >> 6));
            $result .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return false;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $result .= chr(0xe0 | ($cp >> 12));
            $result .= chr(0x80 | (($cp >> 6) & 0x003f));
            $result .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else {

            $result .= chr(0xf0 | ($cp >> 18));
            $result .= chr(0x80 | (($cp >> 12) & 0x3f));
            $result .= chr(0x80 | (($cp >> 6) & 0x3f));
            $result .= chr(0x80 | ($cp & 0x3f));
        }
    }

    return $result;
}
805
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_convert_encoding() when UTF8_MBSTRING is set. The fallback
 * encodes the code points from utf8_to_unicode() itself and writes
 * characters from U+10000 to U+10FFFF as surrogate pairs.
 *
 * @param  string $str UTF-8 encoded string (passed by reference, not modified)
 * @param  bool   $bom prepend the big endian byte order mark?
 * @return string UTF-16BE encoded string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  foreach($uni as $cp){
    if($cp >= 0x10000 && $cp <= 0x10FFFF){
      // does not fit into 16 bit: split into high and low surrogate
      $cp -= 0x10000;
      $out .= pack('n', 0xD800 | ($cp >> 10));
      $out .= pack('n', 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
821
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The string is read as big endian 16 bit units. A high surrogate followed
 * by a low surrogate is combined into one code point; all other units are
 * handed to unicode_to_utf8() as they are.
 *
 * @param  string $str UTF-16BE encoded string (passed by reference, not modified)
 * @return string UTF-8 encoded string
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $uni  = array();
  $high = null;                 // pending high surrogate, if any
  foreach($units as $unit){
    if($high !== null){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high = null;
        continue;
      }
      // unpaired high surrogate: pass it on unchanged
      $uni[] = $high;
      $high = null;
    }
    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
      continue;
    }
    $uni[] = $unit;
  }
  if($high !== null) $uni[] = $high;

  return unicode_to_utf8($uni);
}
831
/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is walked from left to right with an offset instead of being
 * shortened after every match, which avoids copying the remainder over
 * and over again.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param  string $str     string to search
 * @param  string $replace string to replace bad bytes with (defaults to '',
 *                         i.e. bad bytes are removed) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    $str    = (string)$str;
    $len    = strlen($str);
    $pos    = 0;
    $result = '';

    // every byte is matched by one of the alternatives, so each match
    // starts exactly at $pos
    while ($pos < $len && preg_match('/'.$UTF8_BAD.'/S', $str, $matches, 0, $pos)) {
        if ( !isset($matches[2])) {
            $result .= $matches[0];           // well-formed character
        } else {
            $result .= $replace;              // group 2 caught a bad byte
        }
        $pos += strlen($matches[0]);
    }
    return $result;
}
871
/**
 * adjust a byte index into a utf8 string to a utf8 character boundary
 *
 * An index at or below 0 yields 0, an index at or beyond the end yields
 * the byte length of the string. In between the index is moved while it
 * points at a continuation byte (10bbbbbb).
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = towards the start (current character)
 *                           true = towards the end (next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  if ($i <= 0) return 0;

  $byteCount = strlen($str);
  if ($i >= $byteCount) return $byteCount;

  $step = $next ? 1 : -1;

  // walk off continuation bytes, never leaving the string
  while ($i > 0 && $i < $byteCount && (ord($str[$i]) & 0xC0) == 0x80) {
    $i += $step;
  }

  return $i;
}
900
901// only needed if no mb_string available
902if(!UTF8_MBSTRING){
903  /**
904   * UTF-8 Case lookup table
905   *
   * This lookup table maps each lower case letter to its corresponding
   * upper case letter in UTF-8
908   *
909   * @author Andreas Gohr <andi@splitbrain.org>
910   */
911  global $UTF8_LOWER_TO_UPPER;
912  $UTF8_LOWER_TO_UPPER = array(
913    "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
914    "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
915    "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
916    "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
917    "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
918    "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
919    "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
920    "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
921    "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
922    "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
923    "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
924    "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
925    "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
926    "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
927    "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
928    "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
929    "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
930    "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
931    "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
932    "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
933    "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
934    "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
935    "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
936    "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
937    "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
938    "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
939    "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
940    "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
941    "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
942    "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
943    "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
944    "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
945    "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
946    "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
947    "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
948    "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
949    "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
950    "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
951    "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
952    "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
953    "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
954    "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
955    "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
956    "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
957    "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
958    "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
959    "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
960    "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
961    "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
962    "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
963    "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
964    "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
965    "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
966    "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
967    "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
968    "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
969    "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
970    "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
971    "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
972    "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
973    "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
974    "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
975    "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
976    "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
977    "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
978    "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
979    "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
980  );
981
982  /**
983   * UTF-8 Case lookup table
984   *
985   * This lookuptable defines the lower case letters to their correspponding
986   * upper case letter in UTF-8
987   *
988   * @author Andreas Gohr <andi@splitbrain.org>
989   */
990  global $UTF8_UPPER_TO_LOWER;
991  $UTF8_UPPER_TO_LOWER = array (
992    "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
993    "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
994    "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
995    "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
996    "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
997    "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
998    "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
999    "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1000    "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1001    "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1002    "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1003    "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1004    "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1005    "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1006    "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1007    "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1008    "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1009    "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1010    "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1011    "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1012    "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1013    "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1014    "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1015    "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1016    "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1017    "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1018    "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1019    "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1020    "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1021    "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1022    "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1023    "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1024    "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1025    "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1026    "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1027    "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1028    "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1029    "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1030    "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1031    "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1032    "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1033    "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1034    "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1035    "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1036    "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1037    "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1038    "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1039    "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1040    "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1041    "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1042    "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1043    "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1044    "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
1045    "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1046    "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1047    "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1048    "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1049    "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1050    "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1051    "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1052    "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1053    "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1054    "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1055    "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1056    "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1057    "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1058    "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1059  );
1060}; // end of case lookup tables
1061
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * Maps lower case accented letters (and a few other non-ASCII lower case
 * characters) to a replacement made of plain ASCII-7 letters. A replacement
 * may be longer than one character: the umlauts become 'ae', 'oe' and 'ue',
 * the sharp s becomes 'ss', and thorn, eth and the ae ligature become 'th',
 * 'dh' and 'ae'.
 *
 * These are lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
);
1089
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * Maps upper case accented letters to a replacement made of plain ASCII-7
 * letters. A replacement may be longer than one character, in which case
 * only its first letter is upper case: the umlauts become 'Ae', 'Oe' and
 * 'Ue', and thorn, eth and the AE ligature become 'Th', 'Dh' and 'Ae'.
 *
 * These are upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
);
1117
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of
 * non-alphanumeric characters in UTF-8. It's not perfect but should match
 * most cases of special chars.
 *
 * The entries are integer code points, listed in ascending order with each
 * code point appearing exactly once.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array. The space
 * 0x20 is! These chars are _not_ in the array either:
 * _ (0x5f), : (0x3a), . (0x2e), - (0x2d), * (0x2a)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
          0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b
);
1198
1199// utf8 version of above data
1200global $UTF8_SPECIAL_CHARS2;
1201$UTF8_SPECIAL_CHARS2 =
1202    "\x1A".' !"#$%&\'()+,/;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1203    '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1204    '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1205    '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1206    '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1207    '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1208    '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1209    '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1210    '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1211    '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1212    '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1213    '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1214    '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1215    '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1216    '➷➸➹➺➻➼➽➾'.
1217    ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1218    '�'.
1219    '�ﹼﹽ'.
1220    '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1221    '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1222    '����������������������������������������������������������������';
1223
1224/**
1225 * Romanization lookup table
1226 *
 * This lookup table provides a way to transform strings written in a language
 * different from the ones based upon Latin letters into plain ASCII.
1229 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-Latin to ASCII, and it works by simple character
 * replacement only. Specialities of each language are not supported.
1233 *
1234 * @author Andreas Gohr <andi@splitbrain.org>
1235 * @author Vitaly Blokhin <vitinfo@vitn.com>
1236 * @link   http://www.uconv.com/translit.htm
1237 * @author Bisqwit <bisqwit@iki.fi>
1238 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1239 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1240 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1241 * @link   http://www.btranslations.com/resources/romanization/korean.asp
1242 * @author Arthit Suriyawongkul <arthit@gmail.com>
1243 * @author Denis Scheither <amorphis@uni-bremen.de>
1244 */
1245global $UTF8_ROMANIZATION;
1246$UTF8_ROMANIZATION = array(
1247  // scandinavian - differs from what we do in deaccent
1248  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1249
1250  //russian cyrillic
1251  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1252  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1253  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1254  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1255  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1256  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1257  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1258  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1259  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1260  // Ukrainian cyrillic
1261  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1262  // Georgian
1263  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1264  'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1265  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1266  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1267  'ჰ'=>'xh',
1268  //Sanskrit
1269  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1270  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1271  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1272  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1273  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1274  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1275  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1276  //Hebrew
1277  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1278  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1279  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1280  'ש'=>'sh','ת'=>'t',
1281  //Arabic
1282  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1283  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1284  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1285  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1286
1287  // Japanese characters  (last update: 2008-05-09)
1288
1289  // Japanese hiragana
1290
1291  // 3 character syllables, っ doubles the consonant after
1292  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1293  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1294  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1295  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1296  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1297  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1298  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1299  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1300  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1301  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1302  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1303
  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1305  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1306  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1307
1308   // 2 character syllables - normal
1309  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1310  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1311  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1312  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1313  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1314  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1315  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1316  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1317  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1318  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1319  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1320  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1321  'うぇ'=>'we','うぃ'=>'wi',
1322  'いぇ'=>'ye',
1323
1324  // 2 character syllables, っ doubles the consonant after
1325  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1326  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1327  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1328  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1329  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1330  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1331  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1332  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1333  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1334  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1335  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1336
  // 1 character syllables
1338  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1339  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1340  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1341  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1342  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1343  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1344  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1345  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1346  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1347  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1348  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1349  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1350  'わ'=>'wa','を'=>'wo',
1351  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1352  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1353  // old characters
1354  'ゑ'=>'we','ゐ'=>'wi',
1355
1356  //  convert what's left (probably only kicks in when something's missing above)
1357  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1358  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1359
1360  // never seen one of those (disabled for the moment)
1361  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1362  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1363  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1364  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1365  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1366  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1367  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1368  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1369  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1370  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1371  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1372  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1373  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1374  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1375
1376  // 'spare' characters from other romanization systems
1377  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1378  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1379  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1380  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1381  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1382  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1383  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1384  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1385  //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1386  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1387
1388
1389  // Japanese katakana
1390
  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
1392  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1393  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1394  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1395  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1396  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1397  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1398  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1399  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1400  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1401  'ッティー'=>'ttii',
1402  'ッヂィー'=>'ddii',
1403
1404  // 3 character syllables - doubled vowels
1405  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1406  'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1407  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1408  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1409  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1410  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1411  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1412  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1413  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1414  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1415  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1416  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1417  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1418  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1419  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1420  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1421  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1422  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1423  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1424  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1425  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1426  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1427  'ウェー'=>'wee','ウィー'=>'wii',
1428  'イェー'=>'yee',
1429  'ティー'=>'tii',
1430  'ヂィー'=>'dii',
1431
1432  // 3 character syllables - doubled consonants
1433  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1434  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1435  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1436  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1437  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1438  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1439  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1440  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1441  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1442  'ッティ'=>'tti',
1443  'ッヂィ'=>'ddi',
1444
1445  // 3 character syllables - doubled vowel and consonants
1446  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1447  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1448  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1449  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1450  'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1451  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1452  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1453  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1454  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1455  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1456  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1457
1458  // 2 character syllables - normal
1459  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1460  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1461  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1462  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1463  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1464  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1465  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1466  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1467  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1468  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1469  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1470  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1471  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1472  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1473  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1474  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1475  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1476  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1477  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1478  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1479  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1480  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1481  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1482  'ウェ'=>'we','ウィ'=>'wi',
1483  'イェ'=>'ye',
1484  'ティ'=>'ti',
1485  'ヂィ'=>'di',
1486
1487  // 2 character syllables - doubled vowel
1488  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1489  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1490  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1491  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1492  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1493  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1494  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1495  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1496  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1497  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1498  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1499  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1500  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1501  'ワー'=>'waa','ヲー'=>'woo',
1502  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1503  'ヵー'=>'kaa','ヶー'=>'kee',
1504  // old characters
1505  'ヱー'=>'wee','ヰー'=>'wii',
1506
1507  // separate katakana 'n'
1508  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1509  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1510
1511  // 2 character syllables - doubled consonants
1512  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1513  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1514  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1515  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1516  'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1517  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1518  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1519  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1520  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1521  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1522  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1523
1524  // 1 character syllables
1525  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1526  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1527  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1528  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1529  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1530  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1531  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1532  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1533  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1534  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1535  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1536  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1537  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1538  'ワ'=>'wa','ヲ'=>'wo',
1539  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1540  'ヵ'=>'ka','ヶ'=>'ke',
1541  // old characters
1542  'ヱ'=>'we','ヰ'=>'wi',
1543
1544  //  convert what's left (probably only kicks in when something's missing above)
1545  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1546  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1547
1548  // special characters
1549  '・'=>'_','、'=>'_',
1550  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1551
1552  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1553  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1554  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1555  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1556  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1557  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1558  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1559  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1560  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1561  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1562  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1563  //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1564
1565  // "Greeklish"
1566  'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1567  'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1568
1569  // Thai
1570  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1571  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1572  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1573  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1574  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1575  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1576  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1577  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1578  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1579  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1580  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1581  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1582  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1583  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1584  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1585  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1586	'๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1587  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1588
1589  // Korean
1590  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1591  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1592  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1593  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1594  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1595  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1596);
1597
1598//Setup VIM: ex: et ts=2 enc=utf-8 :
1599
1600