xref: /dokuwiki/inc/utf8.php (revision 71726d7801bdcbf41dfdc79d244f09a0988529c0)
1<?php
2/**
3 * UTF8 helper functions
4 *
5 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9
/**
 * Detect mb_string support
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() exists and UTF8_NOMBSTRING
 * has not been defined, otherwise 0. A value defined earlier is kept.
 */
if(!defined('UTF8_MBSTRING')){
  define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}
20
21
/**
 * URL-encode a filename so that it may contain Unicode characters
 *
 * Slashes are kept unencoded.
 *
 * With $safe enabled (the default) a string that consists only of
 * ASCII letters, digits and the characters / _ - . % is returned
 * untouched. This makes it safe to run the function several times
 * over the same string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string $file the filename to encode
 * @param  bool   $safe leave strings alone that need no encoding
 * @return string
 */
function utf8_encodeFN($file,$safe=true){
  if(!$safe || !preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
    // encode everything, then bring the path separators back
    return str_replace('%2F','/',urlencode($file));
  }
  return $file;
}
43
/**
 * URL-decode a filename
 *
 * Thin wrapper that hands the name to urldecode() and returns the result.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string
 */
function utf8_decodeFN($file){
  return urldecode($file);
}
56
/**
 * Checks if a string contains 7bit ASCII only
 *
 * The test is done on bytes: any byte above 0x7F makes it fail.
 * An empty string counts as ASCII.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to test
 * @return bool   true when no high byte was found
 */
function utf8_isASCII($str){
  // no u modifier on purpose: the class has to match raw bytes
  return !preg_match('/[\x80-\xFF]/',$str);
}
68
/**
 * Strips all highbyte chars
 *
 * Every byte above 0x7F is removed, what remains is a pure
 * 7bit ASCII string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string
 */
function utf8_strip($str){
  // no u modifier on purpose: the class has to match raw bytes
  return preg_replace('/[\x80-\xFF]+/','',(string) $str);
}
85
/**
 * Tries to detect if a string looks like UTF-8
 *
 * Walks over the bytes and verifies that each lead byte (forms with
 * one to five continuation bytes are recognised) is followed by the
 * announced number of 10bbbbbb bytes. 7bit ASCII passes as well.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://www.php.net/manual/en/function.utf8-encode.php
 * @param  string $Str the bytes to inspect
 * @return bool   false as soon as a byte does not fit the scheme
 */
function utf8_check($Str) {
 $len = strlen($Str);
 $pos = 0;
 while ($pos < $len) {
  $lead = ord($Str[$pos]);

  if ($lead < 0x80) {                  # 0bbbbbbb
   $pos++;
   continue;
  }

  if     (($lead & 0xE0) == 0xC0) $trail = 1; # 110bbbbb
  elseif (($lead & 0xF0) == 0xE0) $trail = 2; # 1110bbbb
  elseif (($lead & 0xF8) == 0xF0) $trail = 3; # 11110bbb
  elseif (($lead & 0xFC) == 0xF8) $trail = 4; # 111110bb
  elseif (($lead & 0xFE) == 0xFC) $trail = 5; # 1111110b
  else return false;                          # Does not match any model

  # each announced trailing byte has to exist and look like 10bbbbbb
  for ($seen = 0; $seen < $trail; $seen++) {
   $pos++;
   if ($pos == $len) return false;
   if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
  }

  $pos++;
 }
 return true;
}
108
/**
 * Unicode aware replacement for strlen()
 *
 * The string is run through utf8_decode() and the bytes of the result
 * are counted. utf8_decode() turns characters outside of ISO-8859-1
 * into '?', which is alright for the purpose of counting.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string the UTF-8 string to measure
 * @return int    number of characters
 */
function utf8_strlen($string){
  $singleByte = utf8_decode($string);
  return strlen($singleByte);
}
123
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 *
 * With mb_string available the work is done by mb_substr(), always with
 * an explicit UTF-8 encoding. Otherwise small non-negative offsets and
 * lengths are served by a single regular expression; negative values and
 * values above the PCRE repeat limit use a slower technique that splits
 * the string into characters first.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    the UTF-8 string
 * @param  integer $offset number of UTF-8 characters offset (from left)
 * @param  integer $length (optional) length in UTF-8 characters from offset
 * @return mixed string or FALSE if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            // asking for as many characters as the string has always
            // reaches its end, whatever the offset is
            $length = mb_strlen($str, 'UTF-8');
        }
        // name the encoding instead of trusting the mb_string default
        return mb_substr($str, $offset, $length, 'UTF-8');
    }

    // PCRE does not compile {n} quantifiers above this value
    $maxRepeat = 65535;

    $useRegex = ( $offset >= 0 && $length >= 0 &&
                  $offset <= $maxRepeat &&
                  ( $length === null || $length <= $maxRepeat ) );

    if ( $useRegex ) {
        if ( $length === null ) {
            $quantifier = '*';
        } else {
            $strlen = strlen(utf8_decode($str));
            if ( $offset > $strlen ) {
                return '';
            }

            if ( ( $offset + $length ) > $strlen ) {
                // more requested than available: take the rest
                $quantifier = '*';
            } else {
                $quantifier = '{'.$length.'}';
            }
        }

        $pattern = '/^.{'.$offset.'}(.'.$quantifier.')/us';
        preg_match($pattern, $str, $matches);

        if ( isset($matches[1]) ) {
            return $matches[1];
        }
        return false;
    }

    // Handle negatives (and very large values) using different, slower technique
    // From: http://www.php.net/manual/en/function.substr.php#44838
    // The s modifier keeps newlines in the list of characters
    preg_match_all('/./us', $str, $ar);
    if( $length !== null ) {
        return join('',array_slice($ar[0],$offset,$length));
    }
    return join('',array_slice($ar[0],$offset));
}
181
182
/**
 * Unicode aware replacement for substr_replace()
 *
 * $start and $length are counted in characters. Negative values are
 * taken relative to the end of the string like substr_replace() does
 * and values reaching beyond the string are clipped to its end.
 *
 * Note that $length defaults to 0 here, which inserts $replacement
 * at $start without removing anything.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string $string      the UTF-8 string to work on
 * @param  string $replacement the text to put in
 * @param  int    $start       character position where the replacement starts
 * @param  int    $length      number of characters to replace
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
  $total = utf8_strlen($string);

  // bring the start position into the range 0..$total
  if($start < 0){
    $start = max(0, $total + $start);
  }elseif($start > $total){
    $start = $total;
  }

  // a negative length counts back from the end of the string
  if($length < 0){
    $length = max(0, $total - $start + $length);
  }elseif($length > $total - $start){
    $length = $total - $start;
  }

  $ret = '';
  if($start > 0) $ret .= utf8_substr($string, 0, $start);
  $ret .= $replacement;
  if($start + $length < $total) $ret .= utf8_substr($string, $start+$length);
  return $ret;
}
196
/**
 * Unicode aware replacement for explode
 *
 * The separator is quoted and used as pattern of a preg_split() call
 * running in UTF-8 mode. The optional $limit follows the rules of
 * explode(): a positive value returns at most that many elements with
 * the rest of the string in the last one, zero is treated as 1 and a
 * negative value drops that many elements from the end.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    explode();
 * @param  string $sep   the separator, must not be empty
 * @param  string $str   the UTF-8 string to split
 * @param  int    $limit (optional) see above, all parts when omitted
 * @return mixed  array of parts, FALSE for an empty separator or
 *                whatever preg_split() returns on failure
 */
function utf8_explode($sep, $str, $limit = null) {
  if ( $sep == '' ) {
    trigger_error('Empty delimiter',E_USER_WARNING);
    return FALSE;
  }

  $pattern = '!'.preg_quote($sep,'!').'!u';

  if ( $limit === null ) {
    return preg_split($pattern,$str);
  }

  if ( $limit >= 0 ) {
    // preg_split() treats 0 as "no limit", explode() as 1
    return preg_split($pattern,$str,max(1,$limit));
  }

  // negative limit: all parts except the last -$limit ones
  $parts = preg_split($pattern,$str);
  if ( !is_array($parts) ) return $parts;
  return array_slice($parts,0,$limit);
}
212
/**
 * Unicode aware replacement for str_replace()
 *
 * Search strings are quoted and turned into UTF-8 mode patterns for
 * preg_replace(). Replacement strings are escaped as well, so that
 * '$' and '\' in them are inserted literally instead of being read
 * as references to a match.
 *
 * @todo   support PHP5 count (fourth arg)
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    str_replace();
 * @param  mixed  $s   string or array of strings to search for
 * @param  mixed  $r   string or array of strings to replace with
 * @param  mixed  $str the UTF-8 subject(s)
 * @return mixed  result of the preg_replace() call
 */
function utf8_str_replace($s,$r,$str){
  if(!is_array($s)){
    $s = '!'.preg_quote($s,'!').'!u';
  }else{
    foreach ($s as $k => $v) {
      $s[$k] = '!'.preg_quote($v,'!').'!u';
    }
  }

  // protect the characters preg_replace() treats special in replacements
  if(!is_array($r)){
    $r = addcslashes($r,'\\$');
  }else{
    foreach ($r as $k => $v) {
      $r[$k] = addcslashes($v,'\\$');
    }
  }

  return preg_replace($s,$r,$str);
}
230
/**
 * Unicode aware replacement for ltrim()
 *
 * Without a charlist the call is passed on to ltrim(). Otherwise all
 * leading characters found in $charlist are removed. Each character of
 * the list stands for itself, ranges are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      the UTF-8 string to trim
 * @param  string $charlist (optional) the characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
  if($charlist == '') return ltrim($str);

  //quote charlist for use in a characterclass, a leading ^ would negate it
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  return preg_replace('/^['.$charlist.']+/u','',$str);
}
246
/**
 * Unicode aware replacement for rtrim()
 *
 * Without a charlist the call is passed on to rtrim(). Otherwise all
 * trailing characters found in $charlist are removed. Each character of
 * the list stands for itself, ranges are not supported.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      the UTF-8 string to trim
 * @param  string $charlist (optional) the characters to strip
 * @return string
 */
function  utf8_rtrim($str,$charlist=''){
  if($charlist == '') return rtrim($str);

  //quote charlist for use in a characterclass, a leading ^ would negate it
  $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\${1}',$charlist);

  // D modifier: $ has to mean the very end, not the spot before a final newline
  return preg_replace('/['.$charlist.']+$/uD','',$str);
}
262
/**
 * Unicode aware replacement for trim()
 *
 * Without a charlist the call is passed on to trim(). Otherwise the
 * characters of $charlist are stripped from both ends of the string
 * by utf8_rtrim() and utf8_ltrim().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      the UTF-8 string to trim
 * @param  string $charlist (optional) the characters to strip
 * @return string
 */
function  utf8_trim($str,$charlist='') {
  if($charlist == '') return trim($str);

  // the charlist has to travel along, else only whitespace gets trimmed
  return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}
275
276
/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available. Without it the string is
 * converted to code points, each one found in $UTF8_UPPER_TO_LOWER
 * is swapped for its partner and the result is encoded again.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string the UTF-8 string to convert
 * @return string
 */
function utf8_strtolower($string){
  if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

  global $UTF8_UPPER_TO_LOWER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most code points have no entry, so test before reading
    if(isset($UTF8_UPPER_TO_LOWER[$uni[$i]])){
      $uni[$i] = $UTF8_UPPER_TO_LOWER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
299
/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available. Without it the string is
 * converted to code points, each one found in $UTF8_LOWER_TO_UPPER
 * is swapped for its partner and the result is encoded again.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string the UTF-8 string to convert
 * @return string
 */
function utf8_strtoupper($string){
  if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

  global $UTF8_LOWER_TO_UPPER;
  $uni = utf8_to_unicode($string);
  $cnt = count($uni);
  for ($i=0; $i < $cnt; $i++){
    // most code points have no entry, so test before reading
    if(isset($UTF8_LOWER_TO_UPPER[$uni[$i]])){
      $uni[$i] = $UTF8_LOWER_TO_UPPER[$uni[$i]];
    }
  }
  return unicode_to_utf8($uni);
}
322
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * The replacements are taken from the global tables $UTF8_LOWER_ACCENTS
 * and $UTF8_UPPER_ACCENTS. Use the optional parameter to deaccent lower
 * case letters only ($case = -1) or upper case letters only ($case = 1).
 * Default is to deaccent both cases ($case = 0).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string the UTF-8 string to work on
 * @param  int    $case   which case to handle, see above
 * @return string
 */
function utf8_deaccent($string,$case=0){
  $wantLower = ($case <= 0);
  $wantUpper = ($case >= 0);

  if($wantLower){
    global $UTF8_LOWER_ACCENTS;
    $accented = array_keys($UTF8_LOWER_ACCENTS);
    $plain    = array_values($UTF8_LOWER_ACCENTS);
    $string   = str_replace($accented,$plain,$string);
  }

  if($wantUpper){
    global $UTF8_UPPER_ACCENTS;
    $accented = array_keys($UTF8_UPPER_ACCENTS);
    $plain    = array_values($UTF8_UPPER_ACCENTS);
    $string   = str_replace($accented,$plain,$string);
  }

  return $string;
}
342
/**
 * Romanize a non-latin string
 *
 * Strings made of 7bit ASCII only are returned as they are, everything
 * else is translated with the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string the UTF-8 string to romanize
 * @return string
 */
function utf8_romanize($string){
  if(!utf8_isASCII($string)){
    global $UTF8_ROMANIZATION;
    $string = strtr($string,$UTF8_ROMANIZATION);
  }
  return $string;
}
354
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The characters listed in $UTF8_SPECIAL_CHARS are stripped, plus the
 * bytes 0x00 to 0x19 (they are not included in $UTF8_SPECIAL_CHARS)
 * and whatever is passed in $additional.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string,$repl='',$additional=''){
  global $UTF8_SPECIAL_CHARS;

  // the quoted list is built on the first call and kept for later ones
  static $specials = null;
  if($specials === null){
    $encoded  = unicode_to_utf8($UTF8_SPECIAL_CHARS);
    $specials = preg_quote($encoded, '/');
  }

  $charclass = '['.$additional.'\x00-\x19'.$specials.']';
  return preg_replace('/'.$charclass.'/u',$repl,$string);
}
376
/**
 * This is an Unicode aware replacement for strpos
 *
 * Uses mb_string extension if available. Without it the haystack is
 * split at the needle and the length of the first part is returned.
 * For a non-zero offset the haystack is cut at that offset first and
 * the offset is added to the position found in the rest.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    strpos()
 * @param  string $haystack the UTF-8 string to search in
 * @param  string $needle   the UTF-8 string to search for
 * @param  int    $offset   (optional) character position to start at
 * @return mixed  character position of the first match or FALSE
 */
function utf8_strpos($haystack, $needle,$offset=0) {
  if(UTF8_MBSTRING) return mb_strpos($haystack,$needle,$offset,'utf-8');

  if(!$offset){
    $ar = utf8_explode($needle, $haystack);
    // utf8_explode() may hand back FALSE instead of an array
    if ( is_array($ar) && count($ar) > 1 ) {
       return utf8_strlen($ar[0]);
    }
    return false;
  }else{
    if ( !is_int($offset) ) {
      trigger_error('Offset must be an integer',E_USER_WARNING);
      return false;
    }

    $haystack = utf8_substr($haystack, $offset);

    if ( false !== ($pos = utf8_strpos($haystack,$needle))){
       return $pos + $offset;
    }
    return false;
  }
}
408
/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Every well formed two, three or four byte sequence is replaced by
 * a decimal numeric entity (&#NNN;) of its code point. ASCII bytes
 * and bytes that do not form such a sequence are copied unchanged.
 *
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str the UTF-8 string to encode
 * @return string
 */
function utf8_tohtml ($str) {
  $ret = '';
  $max = strlen($str);
  $i   = 0;

  while ($i < $max) {
    $lead = ord($str[$i]);

    // the lead byte tells how many continuation bytes follow
    if (($lead & 0xE0) == 0xC0) {          // 110x xxxx
      $extra = 1;
      $cp    = $lead & 0x1F;
    } elseif (($lead & 0xF0) == 0xE0) {    // 1110 xxxx
      $extra = 2;
      $cp    = $lead & 0x0F;
    } elseif (($lead & 0xF8) == 0xF0) {    // 1111 0xxx
      $extra = 3;
      $cp    = $lead & 0x07;
    } else {
      // ASCII or a byte that can not start a sequence
      $ret .= $str[$i];
      $i++;
      continue;
    }

    // add the six payload bits of each 10xx xxxx byte
    $complete = ($i + $extra < $max);
    for ($j = 1; $complete && $j <= $extra; $j++) {
      $cont = ord($str[$i + $j]);
      if (($cont & 0xC0) != 0x80) {
        $complete = false;
      } else {
        $cp = ($cp << 6) | ($cont & 0x3F);
      }
    }

    if (!$complete) {
      // broken or cut off sequence: keep the lead byte as it is
      $ret .= $str[$i];
      $i++;
      continue;
    }

    $ret .= '&#' . $cp . ';';
    $i   += $extra + 1;
  }

  return $ret;
}
435
/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING. This includes input that ends in the middle of
 * a multi-octet sequence. Without $strict, bad octets are skipped and
 * illegal code points are passed on.
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param  string  $str    UTF-8 encoded string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed array of unicode code points or FALSE if UTF-8 invalid
 * @see    unicode_to_utf8
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function utf8_to_unicode($str,$strict=false) {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } elseif($strict) {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState) {

                    /*
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        if($strict){
                            trigger_error(
                                    'utf8_to_unicode: Illegal sequence or codepoint '.
                                        'in UTF-8 at byte '.$i,
                                    E_USER_WARNING
                                );

                            return FALSE;
                        }

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } elseif($strict) {
                /**
                 *((0xC0 & (*in) != 0x80) && (mState != 0))
                 * Incomplete multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }

    if ($strict && $mState != 0) {
        // the input ended while continuation octets were still expected
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                '   sequence in UTF-8 at byte '.$len,
                E_USER_WARNING
            );

        return FALSE;
    }

    return $out;
}
607
/**
 * Takes an array of ints representing the Unicode characters and returns
 * a UTF-8 string. Astral planes are supported ie. the ints in the
 * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * array contains ints that represent surrogates or are outside the
 * Unicode range and raises a PHP error at level E_USER_WARNING.
 * Without $strict such values are left out of the result.
 *
 * Anything that is not an array results in an empty string.
 *
 * @param  array   $arr    unicode code points representing a string
 * @param  boolean $strict Check for invalid sequences?
 * @return mixed UTF-8 string or FALSE if array contains invalid code points
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see    utf8_to_unicode
 * @link   http://hsivonen.iki.fi/php-utf8/
 * @link   http://sourceforge.net/projects/phputf8/
 */
function unicode_to_utf8($arr,$strict=false) {
    if (!is_array($arr)) return '';

    // collected in a plain string, so an early return leaves nothing behind
    $result = '';

    foreach ($arr as $k => $cp) {

        # outside of the Unicode range (negative values included)
        if ($cp < 0 || $cp > 0x10ffff) {

            if($strict){
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );

                // out of range
                return FALSE;
            }

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $result .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $result .= chr(0xc0 | ($cp >> 6));
            $result .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            if($strict){
                trigger_error(
                    'unicode_to_utf8: Illegal surrogate '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                    );
                return FALSE;
            }

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $result .= chr(0xe0 | ($cp >> 12));
            $result .= chr(0x80 | (($cp >> 6) & 0x003f));
            $result .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else {

            $result .= chr(0xf0 | ($cp >> 18));
            $result .= chr(0x80 | (($cp >> 12) & 0x3f));
            $result .= chr(0x80 | (($cp >> 6) & 0x3f));
            $result .= chr(0x80 | ($cp & 0x3f));
        }
    }

    return $result;
}
698
/**
 * UTF-8 to UTF-16BE conversion.
 *
 * Uses mb_string extension if available. Without it the string is
 * decoded with utf8_to_unicode() and every code point is packed as
 * big endian 16bit unit. Code points above 0xFFFF (up to 0x10FFFF)
 * are written as surrogate pair.
 *
 * @param  string $str the UTF-8 string (passed by reference, not modified)
 * @param  bool   $bom prepend the byte order mark FE FF?
 * @return string
 */
function utf8_to_utf16be(&$str, $bom = false) {
  $out = $bom ? "\xFE\xFF" : '';
  if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

  $uni = utf8_to_unicode($str);
  foreach($uni as $cp){
    if($cp > 0xFFFF && $cp <= 0x10FFFF){
      // does not fit into 16 bits: split into high and low surrogate
      $cp  -= 0x10000;
      $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
    }else{
      $out .= pack('n',$cp);
    }
  }
  return $out;
}
714
/**
 * UTF-16BE to UTF-8 conversion.
 *
 * The string is unpacked into big endian 16bit units. A high surrogate
 * directly followed by a low surrogate is combined to one code point,
 * all other units are taken as code points themselves. The list is
 * then encoded by unicode_to_utf8().
 *
 * @param  string $str the UTF-16BE string (passed by reference, not modified)
 * @return string
 */
function utf16be_to_utf8(&$str) {
  $units = unpack('n*',$str);
  if(!is_array($units)) return '';

  $uni  = array();
  $high = null;   // high surrogate waiting for its partner
  foreach($units as $unit){
    if($high !== null){
      if($unit >= 0xDC00 && $unit <= 0xDFFF){
        $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
        $high  = null;
        continue;
      }
      // no partner followed: hand the lone surrogate on unchanged
      $uni[] = $high;
      $high  = null;
    }

    if($unit >= 0xD800 && $unit <= 0xDBFF){
      $high = $unit;
    }else{
      $uni[] = $unit;
    }
  }
  if($high !== null) $uni[] = $high;

  return unicode_to_utf8($uni);
}
724
/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is read from left to right. Each step matches either one
 * well formed UTF-8 character, which is copied, or a single byte that
 * fits none of the models, which is exchanged for $replace.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param  string $str     string to search
 * @param  string $replace string to replace bad bytes with (defaults to '',
 *                         which removes them) - use ASCII
 * @return string
 */
function utf8_bad_replace($str, $replace = '') {
    $UTF8_BAD =
     '([\x00-\x7F]'.                          # ASCII (including control chars)
     '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
     '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
     '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
     '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
     '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
     '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
     '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
     '|(.{1}))';                              # invalid byte

    $str    = (string) $str;
    $len    = strlen($str);
    $pos    = 0;
    $result = '';

    // move an offset through the string instead of cutting it shorter
    // after every match, this avoids copying the rest over and over
    while ($pos < $len && preg_match('/'.$UTF8_BAD.'/S', $str, $matches, 0, $pos)) {
        if ( !isset($matches[2])) {
            $result .= $matches[0];
        } else {
            $result .= $replace;
        }
        $pos += strlen($matches[0]);
    }

    return $result;
}
764
/**
 * Move a byte index of a UTF-8 string onto a character boundary
 *
 * Indexes at or below zero give 0, indexes at or behind the end of the
 * string give its length in bytes. Everything else is moved as long as
 * it points at a continuation byte (10xxxxxx).
 *
 * @param $str   string   utf8 character string
 * @param $i     int      byte index into $str
 * @param $next  bool     direction to search for boundary,
 *                           false = up (current character)
 *                           true = down (next character)
 *
 * @return int            byte index into $str now pointing to a utf8 character boundary
 *
 * @author       chris smith <chris@jalakai.co.uk>
 */
function utf8_correctIdx(&$str,$i,$next=false) {

  if ($i <= 0) return 0;

  $limit = strlen($str);
  if ($i>=$limit) return $limit;

  // +1 walks towards the next character, -1 back to the start of the current one
  $step = $next ? 1 : -1;

  while ($i > 0 && $i < $limit && ((ord($str[$i]) & 0xC0) == 0x80)) {
    $i += $step;
  }

  return $i;
}
793
// only needed if no mb_string available
if(!UTF8_MBSTRING){

  /**
   * UTF-8 Case lookup table
   *
   * This lookup table maps the code points of lower case letters (keys)
   * to the code points of their corresponding upper case letters (values).
   *
   * A few keys are listed more than once, always with the same value.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   * @see    utf8_strtoupper()
   */
  global $UTF8_LOWER_TO_UPPER;
  $UTF8_LOWER_TO_UPPER = array(
    0x0061=>0x0041, 0x03C6=>0x03A6, 0x0163=>0x0162, 0x00E5=>0x00C5, 0x0062=>0x0042,
    0x013A=>0x0139, 0x00E1=>0x00C1, 0x0142=>0x0141, 0x03CD=>0x038E, 0x0101=>0x0100,
    0x0491=>0x0490, 0x03B4=>0x0394, 0x015B=>0x015A, 0x0064=>0x0044, 0x03B3=>0x0393,
    0x00F4=>0x00D4, 0x044A=>0x042A, 0x0439=>0x0419, 0x0113=>0x0112, 0x043C=>0x041C,
    0x015F=>0x015E, 0x0144=>0x0143, 0x00EE=>0x00CE, 0x045E=>0x040E, 0x044F=>0x042F,
    0x03BA=>0x039A, 0x0155=>0x0154, 0x0069=>0x0049, 0x0073=>0x0053, 0x1E1F=>0x1E1E,
    0x0135=>0x0134, 0x0447=>0x0427, 0x03C0=>0x03A0, 0x0438=>0x0418, 0x00F3=>0x00D3,
    0x0440=>0x0420, 0x0454=>0x0404, 0x0435=>0x0415, 0x0449=>0x0429, 0x014B=>0x014A,
    0x0431=>0x0411, 0x0459=>0x0409, 0x1E03=>0x1E02, 0x00F6=>0x00D6, 0x00F9=>0x00D9,
    0x006E=>0x004E, 0x0451=>0x0401, 0x03C4=>0x03A4, 0x0443=>0x0423, 0x015D=>0x015C,
    0x0453=>0x0403, 0x03C8=>0x03A8, 0x0159=>0x0158, 0x0067=>0x0047, 0x00E4=>0x00C4,
    0x03AC=>0x0386, 0x03AE=>0x0389, 0x0167=>0x0166, 0x03BE=>0x039E, 0x0165=>0x0164,
    0x0117=>0x0116, 0x0109=>0x0108, 0x0076=>0x0056, 0x00FE=>0x00DE, 0x0157=>0x0156,
    0x00FA=>0x00DA, 0x1E61=>0x1E60, 0x1E83=>0x1E82, 0x00E2=>0x00C2, 0x0119=>0x0118,
    0x0146=>0x0145, 0x0070=>0x0050, 0x0151=>0x0150, 0x044E=>0x042E, 0x0129=>0x0128,
    0x03C7=>0x03A7, 0x013E=>0x013D, 0x0442=>0x0422, 0x007A=>0x005A, 0x0448=>0x0428,
    0x03C1=>0x03A1, 0x1E81=>0x1E80, 0x016D=>0x016C, 0x00F5=>0x00D5, 0x0075=>0x0055,
    0x0177=>0x0176, 0x00FC=>0x00DC, 0x1E57=>0x1E56, 0x03C3=>0x03A3, 0x043A=>0x041A,
    0x006D=>0x004D, 0x016B=>0x016A, 0x0171=>0x0170, 0x0444=>0x0424, 0x00EC=>0x00CC,
    0x0169=>0x0168, 0x03BF=>0x039F, 0x006B=>0x004B, 0x00F2=>0x00D2, 0x00E0=>0x00C0,
    0x0434=>0x0414, 0x03C9=>0x03A9, 0x1E6B=>0x1E6A, 0x00E3=>0x00C3, 0x044D=>0x042D,
    0x0436=>0x0416, 0x01A1=>0x01A0, 0x010D=>0x010C, 0x011D=>0x011C, 0x00F0=>0x00D0,
    0x013C=>0x013B, 0x045F=>0x040F, 0x045A=>0x040A, 0x00E8=>0x00C8, 0x03C5=>0x03A5,
    0x0066=>0x0046, 0x00FD=>0x00DD, 0x0063=>0x0043, 0x021B=>0x021A, 0x00EA=>0x00CA,
    0x03B9=>0x0399, 0x017A=>0x0179, 0x00EF=>0x00CF, 0x01B0=>0x01AF, 0x0065=>0x0045,
    0x03BB=>0x039B, 0x03B8=>0x0398, 0x03BC=>0x039C, 0x045C=>0x040C, 0x043F=>0x041F,
    0x044C=>0x042C, 0x00FE=>0x00DE, 0x00F0=>0x00D0, 0x1EF3=>0x1EF2, 0x0068=>0x0048,
    0x00EB=>0x00CB, 0x0111=>0x0110, 0x0433=>0x0413, 0x012F=>0x012E, 0x00E6=>0x00C6,
    0x0078=>0x0058, 0x0161=>0x0160, 0x016F=>0x016E, 0x03B1=>0x0391, 0x0457=>0x0407,
    0x0173=>0x0172, 0x00FF=>0x0178, 0x006F=>0x004F, 0x043B=>0x041B, 0x03B5=>0x0395,
    0x0445=>0x0425, 0x0121=>0x0120, 0x017E=>0x017D, 0x017C=>0x017B, 0x03B6=>0x0396,
    0x03B2=>0x0392, 0x03AD=>0x0388, 0x1E85=>0x1E84, 0x0175=>0x0174, 0x0071=>0x0051,
    0x0437=>0x0417, 0x1E0B=>0x1E0A, 0x0148=>0x0147, 0x0105=>0x0104, 0x0458=>0x0408,
    0x014D=>0x014C, 0x00ED=>0x00CD, 0x0079=>0x0059, 0x010B=>0x010A, 0x03CE=>0x038F,
    0x0072=>0x0052, 0x0430=>0x0410, 0x0455=>0x0405, 0x0452=>0x0402, 0x0127=>0x0126,
    0x0137=>0x0136, 0x012B=>0x012A, 0x03AF=>0x038A, 0x044B=>0x042B, 0x006C=>0x004C,
    0x03B7=>0x0397, 0x0125=>0x0124, 0x0219=>0x0218, 0x00FB=>0x00DB, 0x011F=>0x011E,
    0x043E=>0x041E, 0x1E41=>0x1E40, 0x03BD=>0x039D, 0x0107=>0x0106, 0x03CB=>0x03AB,
    0x0446=>0x0426, 0x00FE=>0x00DE, 0x00E7=>0x00C7, 0x03CA=>0x03AA, 0x0441=>0x0421,
    0x0432=>0x0412, 0x010F=>0x010E, 0x00F8=>0x00D8, 0x0077=>0x0057, 0x011B=>0x011A,
    0x0074=>0x0054, 0x006A=>0x004A, 0x045B=>0x040B, 0x0456=>0x0406, 0x0103=>0x0102,
    0x03BB=>0x039B, 0x00F1=>0x00D1, 0x043D=>0x041D, 0x03CC=>0x038C, 0x00E9=>0x00C9,
    0x00F0=>0x00D0, 0x0457=>0x0407, 0x0123=>0x0122,
  );

  /**
   * UTF-8 Case lookup table
   *
   * This lookup table maps the code points of upper case letters (keys)
   * to the code points of their corresponding lower case letters (values).
   * It is built by flipping $UTF8_LOWER_TO_UPPER; messages raised by
   * array_flip() are suppressed.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   * @see    utf8_strtolower()
   */
  global $UTF8_UPPER_TO_LOWER;
  $UTF8_UPPER_TO_LOWER = @array_flip($UTF8_LOWER_TO_UPPER);

} // end of case lookup tables
864
865
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented characters (keys) to replacements
 * from the ASCII-7 range (values). A replacement may be longer than
 * one letter. These are lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
$UTF8_LOWER_ACCENTS = array(
  'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
  'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
  'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
  'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
  'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
  'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
  'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
  'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
  'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
  'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
  'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
  'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
  'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
  'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
  'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
);
893
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented characters (keys) to replacements
 * from the ASCII-7 range (values). A replacement may be longer than
 * one letter. These are upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
$UTF8_UPPER_ACCENTS = array(
  'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
  'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
  'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
  'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
  'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
  'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
  'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
  'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
  'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
  'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
  'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
  'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
  'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
  'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
  'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
);
921
/**
 * UTF-8 list of common special characters
 *
 * Holds the Unicode code points of special characters (neither letter nor
 * digit) that are defined in the various local charsets. It is not a complete
 * list of all non-alphanumeric characters in Unicode and it is not perfect,
 * but it should match most cases of special chars.
 *
 * Not part of this list:
 *   - the control chars 0x00 to 0x19 (the space 0x20 _is_ included)
 *   - the chars _ (0x5f), : (0x3a), . (0x2e), - (0x2d) and * (0x2a)
 *
 * The values are kept in ascending order; the comments name the Unicode
 * block each run of code points belongs to.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
$UTF8_SPECIAL_CHARS = array(
  // Basic Latin
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021,
  0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,
  0x002b, 0x002c, 0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
  0x0040, 0x005b, 0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c,
  0x007d, 0x007e, 0x007f,
  // Latin-1 Supplement
  0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
  0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
  0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
  0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
  0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
  0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
  0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
  0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
  0x00d7, 0x00f7,
  // Spacing Modifier Letters
  0x02c7, 0x02d8, 0x02d9, 0x02da, 0x02db, 0x02dc, 0x02dd,
  // Combining Diacritical Marks
  0x0300, 0x0301, 0x0303, 0x0309, 0x0323,
  // Greek and Coptic
  0x0384, 0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5,
  0x03d6,
  // Hebrew
  0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7,
  0x05b8, 0x05b9, 0x05bb, 0x05bc, 0x05bd, 0x05be, 0x05bf, 0x05c0,
  0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4,
  // Arabic
  0x060c, 0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e,
  0x064f, 0x0650, 0x0651, 0x0652, 0x066a,
  // Thai
  0x0e3f,
  // General Punctuation
  0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015, 0x2017,
  0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021,
  0x2022, 0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044,
  // Currency Symbols
  0x20a7, 0x20aa, 0x20ab, 0x20ac,
  // Letterlike Symbols
  0x2116, 0x2118, 0x2122, 0x2126, 0x2135,
  // Arrows
  0x2190, 0x2191, 0x2192, 0x2193, 0x2194, 0x2195, 0x21b5, 0x21d0,
  0x21d1, 0x21d2, 0x21d3, 0x21d4,
  // Mathematical Operators
  0x2200, 0x2202, 0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209,
  0x220b, 0x220f, 0x2211, 0x2212, 0x2215, 0x2217, 0x2219, 0x221a,
  0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229, 0x222a, 0x222b,
  0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5,
  0x22c5,
  // Miscellaneous Technical
  0x2310, 0x2320, 0x2321, 0x2329, 0x232a,
  // Enclosed Alphanumerics
  0x2469,
  // Box Drawing
  0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524,
  0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553, 0x2554,
  0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c,
  0x255d, 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564,
  0x2565, 0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c,
  // Block Elements
  0x2580, 0x2584, 0x2588, 0x258c, 0x2590, 0x2591, 0x2592, 0x2593,
  // Geometric Shapes
  0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  // Miscellaneous Symbols
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666,
  // Dingbats
  0x2701, 0x2702, 0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709,
  0x270c, 0x270d, 0x270e, 0x270f, 0x2710, 0x2711, 0x2712, 0x2713,
  0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719, 0x271a, 0x271b,
  0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c,
  0x272d, 0x272e, 0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734,
  0x2735, 0x2736, 0x2737, 0x2738, 0x2739, 0x273a, 0x273b, 0x273c,
  0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742, 0x2743, 0x2744,
  0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a,
  0x275b, 0x275c, 0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764,
  0x2765, 0x2766, 0x2767, 0x277f, 0x2789, 0x2793, 0x2794, 0x2798,
  0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e, 0x279f, 0x27a0,
  0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1,
  0x27b2, 0x27b3, 0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9,
  0x27ba, 0x27bb, 0x27bc, 0x27bd, 0x27be,
  // Private Use Area
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db,
  0xf8dc, 0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3,
  0xf8e4, 0xf8e5, 0xf8e6, 0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb,
  0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0, 0xf8f1, 0xf8f2, 0xf8f3,
  0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa, 0xf8fb,
  0xf8fc, 0xf8fd, 0xf8fe,
  // Arabic Presentation Forms-B
  0xfe7c, 0xfe7d,
);
989
/**
 * Romanization lookup table
 *
 * Maps characters (or short character sequences) of non-latin scripts to a
 * plain ASCII replacement. Keys are UTF-8 strings, values are ASCII strings
 * (possibly empty, e.g. for the cyrillic hard sign).
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple string replacement
 * only. Specialities of each language are not supported.
 *
 * Duplicate keys: several kana sequences are listed more than once with
 * alternative romanizations. PHP keeps only the value of the LAST occurrence
 * of a key in an array literal, so that is the one which is effective.
 *
 * NOTE(review): the Thai vowel entries whose key contains '–' carry that dash
 * literally in the key; presumably it was meant as a placeholder for the
 * consonant, in which case these entries never match real text - confirm.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 */
global $UTF8_ROMANIZATION;
$UTF8_ROMANIZATION = array(
  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'\'','Ь'=>'\'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  // 'კ' (kan) used to be mapped to 'p', colliding with 'პ' (par); it is 'k'
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese hiragana
  // (the former duplicate 'い'=>'yi' was dropped: it overrode 'い'=>'i')
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
  'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
  'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
  'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
  'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
  'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
  'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
  'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
  'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
  'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
  'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
  'よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
  'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
  'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
  'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
  'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
  'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
  'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
  'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
  'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
  'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
  'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
  'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
  'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
  'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
  'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
  'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
  'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
  'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
  'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
  'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
  'じゅ'=>'zyu',
  // Japanese katakana
  // (the former duplicate 'イ'=>'yi' was dropped: it overrode 'イ'=>'i')
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
  'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
  'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
  'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
  'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
  'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
  'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
  'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
  'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
  'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
  'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
  'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
  'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
  'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
  'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
  'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
  'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
  'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
  'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
  'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
  'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
  'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
  'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
  'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
  'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
  'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
  'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
  'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
  'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
  'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
  'ジョ'=>'zyo','ジュ'=>'zyu',

  // "Greeklish"
  // delta used to be mapped to 'E'/'e'; it is 'D'/'d'
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
  '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
  'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
  'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
  '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
  'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
  'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
  'เ–ียว'=>'iao',

  // Korean
  // 'ㅛ' used to be mapped to 'oy' (the value of 'ㅚ'); it is 'yo'
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1155
1156//Setup VIM: ex: et ts=2 enc=utf-8 :
1157
1158