<?php
// {{{ license

/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
//
// +----------------------------------------------------------------------+
// | This library is free software; you can redistribute it and/or modify |
// | it under the terms of the GNU Lesser General Public License as       |
// | published by the Free Software Foundation; either version 2.1 of the |
// | License, or (at your option) any later version.                      |
// |                                                                      |
// | This library is distributed in the hope that it will be useful, but  |
// | WITHOUT ANY WARRANTY; without even the implied warranty of           |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU     |
// | Lesser General Public License for more details.                      |
// |                                                                      |
// | You should have received a copy of the GNU Lesser General Public     |
// | License along with this library; if not, write to the Free Software  |
// | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  |
// | USA.                                                                 |
// +----------------------------------------------------------------------+
//

// }}}

/**
 * Encode/decode Internationalized Domain Names.
 *
 * The class converts internationalized domain names (see RFC 3490 for details)
 * between their original (localized) form and their encoded (ACE) form as it
 * is used in the DNS (Domain Name System).
 *
 * The class provides two public methods, encode() and decode(). You may pass
 * complete domain names, simple strings and complete email addresses, e.g.
 * any of the following notations:
 *
 * - www.nörgler.com
 * - xn--nrgler-wxa
 * - xn--brse-5qa.xn--knrz-1ra.info
 *
 * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
 * array. Unicode output is available in the same formats.
 * You can select your preferred format via {@link set_parameter()}.
 *
 * ACE input and output is always expected to be ASCII.
 *
 * @author  Matthias Sommerfeld <mso@phlylabs.de>
 * @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
 * @version 0.5.1
 *
 */
class idna_convert
{
    /**
     * Holds all relevant mapping tables, loaded from a separate file
     * (npdata.ser next to this file) on construct.
     * See RFC3454 for details
     *
     * @var array
     * @access private
     */
    var $NP = array();

    // Internal settings, do not mess with them
    var $_punycode_prefix = 'xn--';
    var $_invalid_ucs = 0x80000000;
    var $_max_ucs = 0x10FFFF;
    var $_base = 36;
    var $_tmin = 1;
    var $_tmax = 26;
    var $_skew = 38;
    var $_damp = 700;
    var $_initial_bias = 72;
    var $_initial_n = 0x80;
    var $_sbase = 0xAC00;
    var $_lbase = 0x1100;
    var $_vbase = 0x1161;
    var $_tbase = 0x11A7;
    var $_lcount = 19;
    var $_vcount = 21;
    var $_tcount = 28;
    var $_ncount = 588;   // _vcount * _tcount
    var $_scount = 11172; // _lcount * _tcount * _vcount
    var $_error = false;

    // Set by the constructor to _sbase + _lcount * _vcount * _tcount. Declared
    // explicitly because creating undeclared (dynamic) properties is deprecated
    // as of PHP 8.2. It is not read anywhere in this file.
    var $slast = 0;

    // See {@link set_parameter()} for details of how to change the following
    // settings from within your script / application
    var $_api_encoding = 'utf8';   // Default input charset is UTF-8
    var $_allow_overlong = false;  // Overlong UTF-8 encodings are forbidden
    var $_strict_mode = false;     // Behave strict or not

    /**
     * The constructor: loads the Nameprep tables and applies initial options.
     *
     * @param mixed Array of Option => Value pairs (see {@link set_parameter()}),
     *              anything else is ignored
     */
    function __construct($options = false)
    {
        $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;

        // The tables come from a data file shipped alongside this class.
        // NOTE: unserialize() must only ever see that trusted local file.
        $data_file = dirname(__FILE__).'/npdata.ser';
        $tables = is_readable($data_file) ? unserialize(file_get_contents($data_file)) : false;
        if (is_array($tables)) {
            $this->NP = $tables;
        } else {
            $this->_error('Could not load the Nameprep tables from '.$data_file);
        }

        // If parameters are given, pass these to the respective method
        if (is_array($options)) {
            return $this->set_parameter($options);
        }
        return true;
    }

    /**
     * Sets a new option value. Available options and values:
     * [encoding - Format of Unicode strings handed to encode() and returned by
     *         decode(): 'utf8' for UTF-8, 'ucs4_string' and 'ucs4_array'
     *         respectively for UCS4]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *         to allow this, set this parameter to true, else to false;
     *         default is false.]
     * [strict - true: strict mode, good for registration purposes - Causes errors
     *         on failures; false: loose mode, ideal for "wildlife" applications
     *         by silently ignoring errors and returning the original input instead]
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function set_parameter($option, $value = false)
    {
        if (!is_array($option)) {
            $option = array($option => $value);
        }
        foreach ($option as $k => $v) {
            switch ($k) {
                case 'encoding':
                    switch ($v) {
                        case 'utf8':
                        case 'ucs4_string':
                        case 'ucs4_array':
                            $this->_api_encoding = $v;
                            break;
                        default:
                            $this->_error('Set Parameter: Unknown parameter '.$v.' for option '.$k);
                            return false;
                    }
                    break;
                case 'overlong':
                    $this->_allow_overlong = ($v) ? true : false;
                    break;
                case 'strict':
                    $this->_strict_mode = ($v) ? true : false;
                    break;
                default:
                    $this->_error('Set Parameter: Unknown option '.$k);
                    return false;
            }
        }
        return true;
    }

    /**
     * Decode a given ACE domain name
     *
     * Labels that cannot be decoded are passed through unchanged.
     *
     * @param    string   Domain name (ACE string)
     * [@param    string   Desired output encoding, see {@link set_parameter}]
     * @return   mixed    Decoded Domain name (UTF-8 string, UCS-4 string or
     *                    UCS-4 array), false on error
     * @access   public
     */
    function decode($input, $one_time_encoding = false)
    {
        // Validate the optional one-off output encoding up front
        if ($one_time_encoding) {
            switch ($one_time_encoding) {
                case 'utf8':
                case 'ucs4_string':
                case 'ucs4_array':
                    break;
                default:
                    $this->_error('Unknown encoding '.$one_time_encoding);
                    return false;
            }
        }
        // Make sure to drop any newline characters around
        $input = trim($input);

        // Matches labels that start with the Punycode prefix
        $prefix_pattern = '!^'.preg_quote($this->_punycode_prefix, '!').'!';

        // Negotiate input and try to determine, whether it is a plain string,
        // an email address or something like a complete URL.
        // strpos() is deliberately used loosely: an '@' at offset 0 does not
        // count as an email address and falls through to the next branches.
        if (strpos($input, '@')) { // Maybe it is an email address
            // No no in strict mode
            if ($this->_strict_mode) {
                $this->_error('Only simple domain name parts can be handled in strict mode');
                return false;
            }
            list ($email_pref, $input) = explode('@', $input, 2);
            // Domain part: decode every label carrying the prefix
            $arr = explode('.', $input);
            foreach ($arr as $k => $v) {
                if (preg_match($prefix_pattern, $v)) {
                    $conv = $this->_decode($v);
                    if ($conv) $arr[$k] = $conv;
                }
            }
            $input = join('.', $arr);
            // Local part: same treatment
            $arr = explode('.', $email_pref);
            foreach ($arr as $k => $v) {
                if (preg_match($prefix_pattern, $v)) {
                    $conv = $this->_decode($v);
                    if ($conv) $arr[$k] = $conv;
                }
            }
            $email_pref = join('.', $arr);
            $return = $email_pref . '@' . $input;
        } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
            // No no in strict mode
            if ($this->_strict_mode) {
                $this->_error('Only simple domain name parts can be handled in strict mode');
                return false;
            }
            $parsed = parse_url($input);
            if (isset($parsed['host'])) {
                $arr = explode('.', $parsed['host']);
                foreach ($arr as $k => $v) {
                    $conv = $this->_decode($v);
                    if ($conv) $arr[$k] = $conv;
                }
                $parsed['host'] = join('.', $arr);
                // Reassemble the URL around the decoded host
                $return =
                        (empty($parsed['scheme']) ? '' : $parsed['scheme'].(strtolower($parsed['scheme']) == 'mailto' ? ':' : '://'))
                        .(empty($parsed['user']) ? '' : $parsed['user'].(empty($parsed['pass']) ? '' : ':'.$parsed['pass']).'@')
                        .$parsed['host']
                        .(empty($parsed['port']) ? '' : ':'.$parsed['port'])
                        .(empty($parsed['path']) ? '' : $parsed['path'])
                        .(empty($parsed['query']) ? '' : '?'.$parsed['query'])
                        .(empty($parsed['fragment']) ? '' : '#'.$parsed['fragment']);
            } else { // parse_url seems to have failed, try without it
                $arr = explode('.', $input);
                foreach ($arr as $k => $v) {
                    $conv = $this->_decode($v);
                    $arr[$k] = ($conv) ? $conv : $v;
                }
                $return = join('.', $arr);
            }
        } else { // Otherwise we consider it being a pure domain name string
            $return = $this->_decode($input);
            if (!$return) $return = $input;
        }
        // The output is UTF-8 by default, other output formats need conversion here
        // If one time encoding is given, use this, else the objects property
        switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
            case 'utf8':
                return $return;
            case 'ucs4_string':
                return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
            case 'ucs4_array':
                return $this->_utf8_to_ucs4($return);
            default:
                $this->_error('Unsupported output format');
                return false;
        }
    }

    /**
     * Encode a given UTF-8 domain name
     *
     * The input is split at '.', '/', ':', '?' and '@' (ideographic and
     * fullwidth dots are converted to '.' first); each part is encoded on its
     * own and parts that cannot be encoded are passed through as UTF-8.
     *
     * @param    mixed    Domain name (UTF-8 string, UCS-4 string or UCS-4 array)
     * [@param    string   Desired input encoding, see {@link set_parameter}]
     * @return   mixed    Encoded Domain name (ACE string), false on error
     * @access   public
     */
    function encode($decoded, $one_time_encoding = false)
    {
        // Forcing conversion of input to UCS4 array
        // If one time encoding is given, use this, else the objects property
        switch ($one_time_encoding ? $one_time_encoding : $this->_api_encoding) {
            case 'utf8':
                $decoded = $this->_utf8_to_ucs4($decoded);
                break;
            case 'ucs4_string':
                $decoded = $this->_ucs4_string_to_ucs4($decoded);
                break;
            case 'ucs4_array':
                break;
            default:
                $this->_error('Unsupported input format: '.($one_time_encoding ? $one_time_encoding : $this->_api_encoding));
                return false;
        }

        // No input, no output, what else did you expect?
        // (This also covers a failed conversion above, which yields false.)
        if (empty($decoded)) return '';

        // Anchors for iteration
        $last_begin = 0;
        // Output string
        $output = '';
        foreach ($decoded as $k => $v) {
            // Make sure to use just the plain dot
            switch ($v) {
                case 0x3002:
                case 0xFF0E:
                case 0xFF61:
                    $decoded[$k] = 0x2E;
                    // Right, no break here, the above are converted to dots anyway
                // Stumbling across an anchoring character
                case 0x2E:
                case 0x2F:
                case 0x3A:
                case 0x3F:
                case 0x40:
                    // Neither email addresses nor URLs allowed in strict mode
                    if ($this->_strict_mode) {
                        $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
                        return false;
                    }

                    // Skip first char
                    // NOTE(review): a separator at index 0 is dropped from the
                    // output entirely, not only its (empty) preceding part -
                    // confirm whether that is intended.
                    if ($k) {
                        $part = array_slice($decoded, $last_begin, $k - $last_begin);
                        $encoded = $this->_encode($part);
                        if ($encoded) {
                            $output .= $encoded;
                        } else {
                            $output .= $this->_ucs4_to_utf8($part);
                        }
                        $output .= chr($decoded[$k]);
                    }
                    $last_begin = $k + 1;
            }
        }
        // Catch the rest of the string
        if ($last_begin) {
            $inp_len = sizeof($decoded);
            $part = array_slice($decoded, $last_begin, $inp_len - $last_begin);
            $encoded = $this->_encode($part);
            if ($encoded) {
                $output .= $encoded;
            } else {
                $output .= $this->_ucs4_to_utf8($part);
            }
            return $output;
        }

        // No separator found at all: treat the input as one single label
        if ($output = $this->_encode($decoded)) {
            return $output;
        }

        return $this->_ucs4_to_utf8($decoded);
    }

    /**
     * Use this method to get the last error occurred
     * @param    void
     * @return   string   The last error that occurred (false if there was none)
     * @access   public
     */
    function get_last_error()
    {
        return $this->_error;
    }

    /**
     * The actual decoding algorithm (Punycode, RFC 3492 section 6.2)
     *
     * @param    string   A single label starting with the Punycode prefix
     * @return   mixed    The decoded label as UTF-8 string, false on error
     * @access   private
     */
    function _decode($encoded)
    {
        $prefix_len = strlen($this->_punycode_prefix);

        // We do need to find the Punycode prefix
        if (strncmp($encoded, $this->_punycode_prefix, $prefix_len) !== 0) {
            $this->_error('This is not a punycode string');
            return false;
        }
        // If nothing left after removing the prefix, it is hopeless
        if (strlen($encoded) <= $prefix_len) {
            $this->_error('The given encoded string was empty');
            return false;
        }
        // Find last occurrence of the delimiter; everything between the prefix
        // and that delimiter are basic code points which are copied literally
        $decoded = array();
        $delim_pos = strrpos($encoded, '-');
        if ($delim_pos > $prefix_len) {
            for ($k = $prefix_len; $k < $delim_pos; ++$k) {
                $decoded[] = ord($encoded[$k]);
            }
        }
        $deco_len = count($decoded);
        $enco_len = strlen($encoded);

        // Wandering through the strings; init
        $is_first = true;
        $bias     = $this->_initial_bias;
        $idx      = 0;
        $char     = $this->_initial_n;

        for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
            // Decode one generalized variable-length integer into $idx
            for ($old_idx = $idx, $w = 1, $k = $this->_base; 1; $k += $this->_base) {
                // RFC 3492: running out of input in the middle of an integer
                // is an error (without this check we would read past the end)
                if ($enco_idx >= $enco_len) {
                    $this->_error('The given encoded string is malformed (unexpected end of input)');
                    return false;
                }
                $digit = $this->_decode_digit($encoded[$enco_idx++]);
                if ($digit >= $this->_base) {
                    $this->_error('The given encoded string is malformed (invalid digit)');
                    return false;
                }
                $idx += $digit * $w;
                $t = ($k <= $bias) ? $this->_tmin :
                        (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
                if ($digit < $t) break;
                $w = (int) ($w * ($this->_base - $t));
            }
            $bias     = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
            $is_first = false;
            // $idx encodes both the code point increment and the insert position
            $char    += (int) ($idx / ($deco_len + 1));
            $idx     %= ($deco_len + 1);
            if ($deco_len > 0) {
                // Make room for the decoded char
                for ($i = $deco_len; $i > $idx; $i--) {
                    $decoded[$i] = $decoded[($i - 1)];
                }
            }
            $decoded[$idx++] = $char;
        }
        return $this->_ucs4_to_utf8($decoded);
    }

    /**
     * The actual encoding algorithm (Nameprep followed by Punycode,
     * RFC 3492 section 6.3)
     *
     * @param    array    A single label as UCS-4 array
     * @return   mixed    The encoded label (ASCII string), false if the label
     *                    cannot or need not be encoded
     * @access   private
     */
    function _encode($decoded)
    {
        // We cannot encode a domain name containing the Punycode prefix
        $extract    = strlen($this->_punycode_prefix);
        $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
        $check_deco = array_slice($decoded, 0, $extract);

        if ($check_pref == $check_deco) {
            $this->_error('This is already a punycode string');
            return false;
        }
        // We will not try to encode strings consisting of basic code points only
        $encodable = false;
        foreach ($decoded as $v) {
            if ($v > 0x7a) {
                $encodable = true;
                break;
            }
        }
        if (!$encodable) {
            $this->_error('The given string does not contain encodable chars');
            return false;
        }

        // Do NAMEPREP
        $decoded = $this->_nameprep($decoded);
        if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed

        $deco_len = count($decoded);
        if (!$deco_len) return false; // Empty array

        $codecount = 0; // How many chars have been consumed

        $encoded = '';
        // Copy all basic code points to output. Per RFC 3492 every code point
        // below 0x80 is "basic"; copying only a subset would leave the others
        // unconsumed and the loop below would never terminate.
        for ($i = 0; $i < $deco_len; ++$i) {
            if ($decoded[$i] < 0x80) {
                $encoded .= chr($decoded[$i]);
                $codecount++;
            }
        }
        if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones

        // Start with the prefix; copy it to output
        $encoded = $this->_punycode_prefix.$encoded;

        // If we have basic code points in output, add an hyphen to the end
        if ($codecount) $encoded .= '-';

        // Now find and encode all non-basic code points
        $is_first = true;
        $cur_code = $this->_initial_n;
        $bias     = $this->_initial_bias;
        $delta    = 0;
        while ($codecount < $deco_len) {
            // Find the smallest code point >= the current code point and
            // remember the last occurrence of it in the input
            for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
                if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
                    $next_code = $decoded[$i];
                }
            }

            $delta += ($next_code - $cur_code) * ($codecount + 1);
            $cur_code = $next_code;

            // Scan input again and encode all characters whose code point is $cur_code
            for ($i = 0; $i < $deco_len; $i++) {
                if ($decoded[$i] < $cur_code) {
                    $delta++;
                } elseif ($decoded[$i] == $cur_code) {
                    // Emit $delta as a generalized variable-length integer
                    for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
                        $t = ($k <= $bias) ? $this->_tmin :
                                (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
                        if ($q < $t) break;
                        $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval()
                        $q = (int) (($q - $t) / ($this->_base - $t));
                    }
                    $encoded .= $this->_encode_digit($q);
                    $bias = $this->_adapt($delta, $codecount + 1, $is_first);
                    $codecount++;
                    $delta = 0;
                    $is_first = false;
                }
            }
            $delta++;
            $cur_code++;
        }
        return $encoded;
    }

    /**
     * Adapt the bias according to the current code point and position
     * (bias adaptation function of RFC 3492 section 6.1)
     * @access   private
     */
    function _adapt($delta, $npoints, $is_first)
    {
        $delta = intval($is_first ? ($delta / $this->_damp) : ($delta / 2));
        $delta += intval($delta / $npoints);
        for ($k = 0; $delta > (($this->_base - $this->_tmin) * $this->_tmax) / 2; $k += $this->_base) {
            $delta = intval($delta / ($this->_base - $this->_tmin));
        }
        return intval($k + ($this->_base - $this->_tmin + 1) * $delta / ($delta + $this->_skew));
    }

    /**
     * Encoding a certain digit: 0..25 map to 'a'..'z', 26..35 map to '0'..'9'
     * @access   private
     */
    function _encode_digit($d)
    {
        return chr($d + 22 + ($d < 26 ? 75 : 0));
    }

    /**
     * Decode a certain digit
     *
     * @param    string   A single character
     * @return   integer  '0'..'9' => 26..35, 'A'..'Z' and 'a'..'z' => 0..25,
     *                    the base (36) for everything else (= invalid digit)
     * @access   private
     */
    function _decode_digit($cp)
    {
        $cp = ord($cp);
        // Explicit range checks: PHP integers are signed, so the unsigned
        // "cp - 48 < 10" trick from the RFC's C code misclassifies low bytes.
        if ($cp >= 48 && $cp <= 57) return $cp - 22;
        if ($cp >= 65 && $cp <= 90) return $cp - 65;
        if ($cp >= 97 && $cp <= 122) return $cp - 97;
        return $this->_base;
    }

    /**
     * Internal error handling method: remembers the message for
     * {@link get_last_error()}
     * @access   private
     */
    function _error($error = '')
    {
        $this->_error = $error;
    }

    /**
     * Do Nameprep according to RFC3491 and RFC3454
     * @param    array    Unicode Characters
     * @return   mixed    Unicode Characters, Nameprep'd (array); false on
     *                    prohibited input
     * @access   private
     */
    function _nameprep($input)
    {
        $output = array();
        //
        // Mapping
        // Walking through the input array, performing the required steps on each of
        // the input chars and putting the result into the output array
        // While mapping required chars we apply the canonical ordering
        foreach ($input as $v) {
            // Map to nothing == skip that code point
            if (in_array($v, $this->NP['map_nothing'])) continue;

            // Try to find prohibited input
            if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
                $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                return false;
            }
            foreach ($this->NP['prohibit_ranges'] as $range) {
                if ($range[0] <= $v && $v <= $range[1]) {
                    $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                    return false;
                }
            }
            //
            // Hangul syllable decomposition
            if (0xAC00 <= $v && $v <= 0xD7AF) {
                foreach ($this->_hangul_decompose($v) as $out) {
                    $output[] = (int) $out;
                }
            // There's a decomposition mapping for that code point
            } elseif (isset($this->NP['replacemaps'][$v])) {
                foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
                    $output[] = (int) $out;
                }
            } else {
                $output[] = (int) $v;
            }
        }
        // Before applying any Combining, try to rearrange any Hangul syllables
        $output = $this->_hangul_compose($output);
        //
        // Combine code points
        //
        // NOTE(review): the slice handed to _combine() ends before the current
        // element and the unset() below targets index $out_len rather than the
        // last used slot. This is kept as found - verify against test vectors
        // before touching it.
        $last_class   = 0;
        $last_starter = 0;
        $out_len      = count($output);
        for ($i = 0; $i < $out_len; ++$i) {
            $class = $this->_get_combining_class($output[$i]);
            if ((!$last_class || $last_class > $class) && $class) {
                // Try to match
                $seq_len = $i - $last_starter;
                $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
                // On match: Replace the last starter with the composed character and remove
                // the now redundant non-starter(s)
                if ($out) {
                    $output[$last_starter] = $out;
                    // The composed result is a single code point. (This used
                    // to be count($out), which is 1 for an integer on old PHP
                    // but a TypeError as of PHP 8.)
                    if (1 != $seq_len) {
                        for ($j = $i + 1; $j < $out_len; ++$j) {
                            $output[$j - 1] = $output[$j];
                        }
                        unset($output[$out_len]);
                    }
                    // Rewind the for loop by one, since there can be more possible compositions
                    $i--;
                    $out_len--;
                    $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i - 1]);
                    continue;
                }
            }
            // The current class is 0
            if (!$class) $last_starter = $i;
            $last_class = $class;
        }
        return $output;
    }

    /**
     * Decomposes a Hangul syllable
     * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
     * @param    integer  32bit UCS4 code point
     * @return   array    Either Hangul Syllable decomposed or original 32bit value as one value array
     * @access   private
     */
    function _hangul_decompose($char)
    {
        $sindex = (int) $char - $this->_sbase;
        if ($sindex < 0 || $sindex >= $this->_scount) {
            return array($char);
        }
        $result = array();
        // The casts must cover the whole expression, the divisions yield floats
        $result[] = (int) ($this->_lbase + $sindex / $this->_ncount);
        $result[] = (int) ($this->_vbase + ($sindex % $this->_ncount) / $this->_tcount);
        $T = intval($this->_tbase + $sindex % $this->_tcount);
        // A trailing consonant is only present if the index is not zero
        if ($T != $this->_tbase) $result[] = $T;
        return $result;
    }

    /**
     * Composes a Hangul syllable
     * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
     * @param    array    Decomposed UCS4 sequence
     * @return   array    UCS4 sequence with syllables composed
     * @access   private
     */
    function _hangul_compose($input)
    {
        $inp_len = count($input);
        if (!$inp_len) return array();
        $result = array();
        $last = (int) $input[0];
        $result[] = $last; // copy first char from input to output

        for ($i = 1; $i < $inp_len; ++$i) {
            $char = (int) $input[$i];
            $sindex = $last - $this->_sbase;
            $lindex = $last - $this->_lbase;
            $vindex = $char - $this->_vbase;
            $tindex = $char - $this->_tbase;
            // Find out, whether two current characters are LV and T.
            // The bounds on the T index are exclusive: index 0 (U+11A7) is no
            // trailing consonant and index _tcount lies beyond the T range.
            if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
                    && 0 < $tindex && $tindex < $this->_tcount) {
                // create syllable of form LVT
                $last += $tindex;
                $result[(count($result) - 1)] = $last; // reset last
                continue; // discard char
            }
            // Find out, whether two current characters form L and V
            if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
                // create syllable of form LV
                $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
                $result[(count($result) - 1)] = $last; // reset last
                continue; // discard char
            }
            // if neither case was true, just add the character
            $last = $char;
            $result[] = $char;
        }
        return $result;
    }

    /**
     * Returns the combining class of a certain wide char
     * @param    integer  Wide char to check (32bit integer)
     * @return   integer  Combining class if found, else 0
     * @access   private
     */
    function _get_combining_class($char)
    {
        return isset($this->NP['norm_combcls'][$char]) ? $this->NP['norm_combcls'][$char] : 0;
    }

    /**
     * Applies the canonical ordering of a decomposed UCS4 sequence
     * @param    array    Decomposed UCS4 sequence
     * @return   array    Ordered UCS4 sequence
     * @access   private
     */
    function _apply_cannonical_ordering($input)
    {
        $swap = true;
        $size = count($input);
        // Repeat the pass until no element had to be moved any more
        while ($swap) {
            $swap = false;
            $last = $this->_get_combining_class(intval($input[0]));
            for ($i = 0; $i < $size - 1; ++$i) {
                $next = $this->_get_combining_class(intval($input[$i + 1]));
                if ($next != 0 && $last > $next) {
                    // Move item leftward until it fits
                    for ($j = $i + 1; $j > 0; --$j) {
                        if ($this->_get_combining_class(intval($input[$j - 1])) <= $next) break;
                        $t = intval($input[$j]);
                        $input[$j] = intval($input[$j - 1]);
                        $input[$j - 1] = $t;
                        $swap = true;
                    }
                    // Reentering the loop looking at the old character again
                    $next = $last;
                }
                $last = $next;
            }
        }
        return $input;
    }

    /**
     * Do composition of a sequence of starter and non-starter
     *
     * Looks for an entry of the 'replacemaps' table whose target sequence
     * equals the given sequence.
     *
     * @param    array    UCS4 Decomposed sequence
     * @return   mixed    The code point mapping to that sequence (integer),
     *                    false if there is none
     * @access   private
     */
    function _combine($input)
    {
        $inp_len = count($input);
        foreach ($this->NP['replacemaps'] as $np_src => $np_target) {
            // Cheap rejections first: differing first element or length
            if ($np_target[0] != $input[0]) continue;
            if (count($np_target) != $inp_len) continue;
            $hit = false;
            foreach ($input as $k2 => $v2) {
                if ($v2 == $np_target[$k2]) {
                    $hit = true;
                } else {
                    $hit = false;
                    break;
                }
            }
            if ($hit) return $np_src;
        }
        return false;
    }

    /**
     * This converts an UTF-8 encoded string to its UCS-4 representation
     * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
     * each of the "chars". This is due to PHP not being able to handle strings with
     * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
     * The following UTF-8 encodings are supported:
     * bytes bits  representation
     * 1        7  0xxxxxxx
     * 2       11  110xxxxx 10xxxxxx
     * 3       16  1110xxxx 10xxxxxx 10xxxxxx
     * 4       21  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 5       26  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 6       31  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * Each x represents a bit that can be used to store character data.
     * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
     *
     * @param    string   UTF-8 encoded string
     * @return   mixed    Array of code points (integers), false on malformed input
     * @access   private
     */
    function _utf8_to_ucs4($input)
    {
        $output  = array();
        $out_len = 0;
        $inp_len = strlen($input);
        $mode    = 'next'; // 'next': expecting a start byte, 'add': expecting continuation bytes
        $test    = 'none'; // 'range': the next continuation byte must pass the range check
        for ($k = 0; $k < $inp_len; ++$k) {
            $v = ord($input[$k]); // Extract byte from input string

            if ($v < 128) { // We found an ASCII char - put into string as is
                $output[$out_len] = $v;
                ++$out_len;
                if ('add' == $mode) {
                    $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    return false;
                }
                continue;
            }
            if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
                $start_byte = $v;
                $mode = 'add';
                $test = 'range';
                if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
                    $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
                    $v = ($v - 192) << 6;
                } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                    $next_byte = 1;
                    $v = ($v - 224) << 12;
                } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 2;
                    $v = ($v - 240) << 18;
                } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 3;
                    $v = ($v - 248) << 24;
                } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 4;
                    $v = ($v - 252) << 30;
                } else {
                    $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
                    return false;
                }
                // Store the bits of the start byte; the continuation bytes
                // are added to this slot afterwards
                if ('add' == $mode) {
                    $output[$out_len] = (int) $v;
                    ++$out_len;
                    continue;
                }
            }
            if ('add' == $mode) {
                // The first continuation byte decides about illegal ranges
                if (!$this->_allow_overlong && $test == 'range') {
                    $test = 'none';
                    if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                        $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                        return false;
                    }
                }
                if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
                    $v = ($v - 128) << ($next_byte * 6);
                    $output[($out_len - 1)] += $v;
                    --$next_byte;
                } else {
                    $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    return false;
                }
                if ($next_byte < 0) {
                    $mode = 'next';
                }
            }
        } // for
        return $output;
    }

    /**
     * Convert UCS-4 array into UTF-8 string
     * See _utf8_to_ucs4() for details
     *
     * @param    array    Code points (integers)
     * @return   mixed    UTF-8 encoded string, false on error
     * @access   private
     */
    function _ucs4_to_utf8($input)
    {
        $output = '';
        $k = 0;
        foreach ($input as $v) {
            ++$k;
            if ($v < 128) { // 7bit are transferred literally
                $output .= chr($v);
            } elseif ($v < (1 << 11)) { // 2 bytes
                $output .= chr(192 + ($v >> 6)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 16)) { // 3 bytes
                $output .= chr(224 + ($v >> 12)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 21)) { // 4 bytes
                $output .= chr(240 + ($v >> 18)) . chr(128 + (($v >> 12) & 63))
                         . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } elseif ($v < (1 << 26)) { // 5 bytes
                $output .= chr(248 + ($v >> 24)) . chr(128 + (($v >> 18) & 63))
                         . chr(128 + (($v >> 12) & 63)) . chr(128 + (($v >> 6) & 63))
                         . chr(128 + ($v & 63));
            } elseif ($v < (1 << 31)) { // 6 bytes
                $output .= chr(252 + ($v >> 30)) . chr(128 + (($v >> 24) & 63))
                         . chr(128 + (($v >> 18) & 63)) . chr(128 + (($v >> 12) & 63))
                         . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } else {
                $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
                return false;
            }
        }
        return $output;
    }

    /**
     * Convert UCS-4 array into UCS-4 string (4 bytes per code point,
     * most significant byte first)
     *
     * @access   private
     */
    function _ucs4_to_ucs4_string($input)
    {
        $output = '';
        // Take array values and split output to 4 bytes per value
        // The bit mask is 255, which reads &11111111
        foreach ($input as $v) {
            $output .= chr(($v >> 24) & 255).chr(($v >> 16) & 255).chr(($v >> 8) & 255).chr($v & 255);
        }
        return $output;
    }

    /**
     * Convert UCS-4 string into UCS-4 array
     *
     * @param    string   UCS-4 string, 4 bytes per code point
     * @return   mixed    Array of code points, false if the length is no
     *                    multiple of 4
     * @access   private
     */
    function _ucs4_string_to_ucs4($input)
    {
        $output = array();
        $inp_len = strlen($input);
        // Input length must be dividable by 4
        if ($inp_len % 4) {
            $this->_error('Input UCS4 string is broken');
            return false;
        }
        // Empty input - return empty output
        if (!$inp_len) return $output;
        for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) {
            // Increment output position every 4 input bytes
            if (!($i % 4)) {
                $out_len++;
                $output[$out_len] = 0;
            }
            $output[$out_len] += ord($input[$i]) << (8 * (3 - ($i % 4)));
        }
        return $output;
    }
}

/**
* Adapter class for aligning the API of idna_convert with that of Net_IDNA
* @author  Matthias Sommerfeld <mso@phlylabs.de>
*/
class Net_IDNA_php4 extends idna_convert
{
    /**
     * Sets a new option value; forwards to {@link idna_convert::set_parameter()}.
     * Available options and values:
     * [encoding - Format of Unicode strings handed to encode() and returned by
     *         decode(): 'utf8' for UTF-8, 'ucs4_string' and 'ucs4_array'
     *         respectively for UCS4]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *         to allow this, set this parameter to true, else to false;
     *         default is false.]
     * [strict - true: strict mode, good for registration purposes - Causes errors
     *         on failures; false: loose mode, ideal for "wildlife" applications
     *         by silently ignoring errors and returning the original input instead]
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function setParams($option, $param = false)
    {
        // This class IS the converter (it extends idna_convert), so the call
        // goes to the inherited method directly.
        return $this->set_parameter($option, $param);
    }
}