1<?php 2/** 3 * Website: http://sourceforge.net/projects/simplehtmldom/ 4 * Additional projects: http://sourceforge.net/projects/debugobject/ 5 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) 6 * 7 * Licensed under The MIT License 8 * See the LICENSE file in the project root for more information. 9 * 10 * Authors: 11 * S.C. Chen 12 * John Schlick 13 * Rus Carroll 14 * logmanoriginal 15 * 16 * Contributors: 17 * Yousuke Kumakura 18 * Vadim Voituk 19 * Antcs 20 * 21 * Version Rev. 1.9 (290) 22 */ 23 24if (strpos(@ini_get('disable_functions'), 'set_time_limit') === false) { 25 @set_time_limit(0); 26} 27ini_set('max_execution_time', 0); 28 29define('HDOM_TYPE_ELEMENT', 1); 30define('HDOM_TYPE_COMMENT', 2); 31define('HDOM_TYPE_TEXT', 3); 32define('HDOM_TYPE_ENDTAG', 4); 33define('HDOM_TYPE_ROOT', 5); 34define('HDOM_TYPE_UNKNOWN', 6); 35define('HDOM_QUOTE_DOUBLE', 0); 36define('HDOM_QUOTE_SINGLE', 1); 37define('HDOM_QUOTE_NO', 3); 38define('HDOM_INFO_BEGIN', 0); 39define('HDOM_INFO_END', 1); 40define('HDOM_INFO_QUOTE', 2); 41define('HDOM_INFO_SPACE', 3); 42define('HDOM_INFO_TEXT', 4); 43define('HDOM_INFO_INNER', 5); 44define('HDOM_INFO_OUTER', 6); 45define('HDOM_INFO_ENDSPACE', 7); 46 47defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8'); 48defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n"); 49defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' '); 50defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000); 51define('HDOM_SMARTY_AS_TEXT', 1); 52 53function file_get_html( 54 $url, 55 $use_include_path = false, 56 $context = null, 57 $offset = 0, 58 $maxLen = -1, 59 $lowercase = true, 60 $forceTagsClosed = true, 61 $target_charset = DEFAULT_TARGET_CHARSET, 62 $stripRN = true, 63 $defaultBRText = DEFAULT_BR_TEXT, 64 $defaultSpanText = DEFAULT_SPAN_TEXT) 65{ 66 if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; } 67 68 $dom = new simple_html_dom( 69 null, 70 $lowercase, 71 $forceTagsClosed, 72 
$target_charset, 73 $stripRN, 74 $defaultBRText, 75 $defaultSpanText 76 ); 77 78 /** 79 * For sourceforge users: uncomment the next line and comment the 80 * retrieve_url_contents line 2 lines down if it is not already done. 81 */ 82 $contents = file_get_contents( 83 $url, 84 $use_include_path, 85 $context, 86 $offset, 87 $maxLen 88 ); 89 // $contents = retrieve_url_contents($url); 90 91 if (empty($contents) || strlen($contents) > $maxLen) { 92 $dom->clear(); 93 return false; 94 } 95 96 return $dom->load($contents, $lowercase, $stripRN); 97} 98 99function str_get_html( 100 $str, 101 $lowercase = true, 102 $forceTagsClosed = true, 103 $target_charset = DEFAULT_TARGET_CHARSET, 104 $stripRN = true, 105 $defaultBRText = DEFAULT_BR_TEXT, 106 $defaultSpanText = DEFAULT_SPAN_TEXT) 107{ 108 $dom = new simple_html_dom( 109 null, 110 $lowercase, 111 $forceTagsClosed, 112 $target_charset, 113 $stripRN, 114 $defaultBRText, 115 $defaultSpanText 116 ); 117 118 if (empty($str) || strlen($str) > MAX_FILE_SIZE) { 119 $dom->clear(); 120 return false; 121 } 122 123 return $dom->load($str, $lowercase, $stripRN); 124} 125 126function dump_html_tree($node, $show_attr = true, $deep = 0) 127{ 128 $node->dump($node); 129} 130 131class simple_html_dom_node 132{ 133 public $nodetype = HDOM_TYPE_TEXT; 134 public $tag = 'text'; 135 public $attr = array(); 136 public $children = array(); 137 public $nodes = array(); 138 public $parent = null; 139 public $_ = array(); 140 public $tag_start = 0; 141 private $dom = null; 142 143 function __construct($dom) 144 { 145 $this->dom = $dom; 146 $dom->nodes[] = $this; 147 } 148 149 function __destruct() 150 { 151 $this->clear(); 152 } 153 154 function __toString() 155 { 156 return $this->outertext(); 157 } 158 159 function clear() 160 { 161 $this->dom = null; 162 $this->nodes = null; 163 $this->parent = null; 164 $this->children = null; 165 } 166 167 function dump($show_attr = true, $depth = 0) 168 { 169 echo str_repeat("\t", $depth) . 
$this->tag; 170 171 if ($show_attr && count($this->attr) > 0) { 172 echo '('; 173 foreach ($this->attr as $k => $v) { 174 echo "[$k]=>\"$v\", "; 175 } 176 echo ')'; 177 } 178 179 echo "\n"; 180 181 if ($this->nodes) { 182 foreach ($this->nodes as $node) { 183 $node->dump($show_attr, $depth + 1); 184 } 185 } 186 } 187 188 function dump_node($echo = true) 189 { 190 $string = $this->tag; 191 192 if (count($this->attr) > 0) { 193 $string .= '('; 194 foreach ($this->attr as $k => $v) { 195 $string .= "[$k]=>\"$v\", "; 196 } 197 $string .= ')'; 198 } 199 200 if (count($this->_) > 0) { 201 $string .= ' $_ ('; 202 foreach ($this->_ as $k => $v) { 203 if (is_array($v)) { 204 $string .= "[$k]=>("; 205 foreach ($v as $k2 => $v2) { 206 $string .= "[$k2]=>\"$v2\", "; 207 } 208 $string .= ')'; 209 } else { 210 $string .= "[$k]=>\"$v\", "; 211 } 212 } 213 $string .= ')'; 214 } 215 216 if (isset($this->text)) { 217 $string .= " text: ({$this->text})"; 218 } 219 220 $string .= ' HDOM_INNER_INFO: '; 221 222 if (isset($node->_[HDOM_INFO_INNER])) { 223 $string .= "'" . $node->_[HDOM_INFO_INNER] . "'"; 224 } else { 225 $string .= ' NULL '; 226 } 227 228 $string .= ' children: ' . count($this->children); 229 $string .= ' nodes: ' . count($this->nodes); 230 $string .= ' tag_start: ' . $this->tag_start; 231 $string .= "\n"; 232 233 if ($echo) { 234 echo $string; 235 return; 236 } else { 237 return $string; 238 } 239 } 240 241 function parent($parent = null) 242 { 243 // I am SURE that this doesn't work properly. 244 // It fails to unset the current node from it's current parents nodes or 245 // children list first. 
246 if ($parent !== null) { 247 $this->parent = $parent; 248 $this->parent->nodes[] = $this; 249 $this->parent->children[] = $this; 250 } 251 252 return $this->parent; 253 } 254 255 function has_child() 256 { 257 return !empty($this->children); 258 } 259 260 function children($idx = -1) 261 { 262 if ($idx === -1) { 263 return $this->children; 264 } 265 266 if (isset($this->children[$idx])) { 267 return $this->children[$idx]; 268 } 269 270 return null; 271 } 272 273 function first_child() 274 { 275 if (count($this->children) > 0) { 276 return $this->children[0]; 277 } 278 return null; 279 } 280 281 function last_child() 282 { 283 if (count($this->children) > 0) { 284 return end($this->children); 285 } 286 return null; 287 } 288 289 function next_sibling() 290 { 291 if ($this->parent === null) { 292 return null; 293 } 294 295 $idx = array_search($this, $this->parent->children, true); 296 297 if ($idx !== false && isset($this->parent->children[$idx + 1])) { 298 return $this->parent->children[$idx + 1]; 299 } 300 301 return null; 302 } 303 304 function prev_sibling() 305 { 306 if ($this->parent === null) { 307 return null; 308 } 309 310 $idx = array_search($this, $this->parent->children, true); 311 312 if ($idx !== false && $idx > 0) { 313 return $this->parent->children[$idx - 1]; 314 } 315 316 return null; 317 } 318 319 function find_ancestor_tag($tag) 320 { 321 global $debug_object; 322 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 323 324 if ($this->parent === null) { 325 return null; 326 } 327 328 $ancestor = $this->parent; 329 330 while (!is_null($ancestor)) { 331 if (is_object($debug_object)) { 332 $debug_object->debug_log(2, 'Current tag is: ' . 
$ancestor->tag); 333 } 334 335 if ($ancestor->tag === $tag) { 336 break; 337 } 338 339 $ancestor = $ancestor->parent; 340 } 341 342 return $ancestor; 343 } 344 345 function innertext() 346 { 347 if (isset($this->_[HDOM_INFO_INNER])) { 348 return $this->_[HDOM_INFO_INNER]; 349 } 350 351 if (isset($this->_[HDOM_INFO_TEXT])) { 352 return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 353 } 354 355 $ret = ''; 356 357 foreach ($this->nodes as $n) { 358 $ret .= $n->outertext(); 359 } 360 361 return $ret; 362 } 363 364 function outertext() 365 { 366 global $debug_object; 367 368 if (is_object($debug_object)) { 369 $text = ''; 370 371 if ($this->tag === 'text') { 372 if (!empty($this->text)) { 373 $text = ' with text: ' . $this->text; 374 } 375 } 376 377 $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); 378 } 379 380 if ($this->tag === 'root') { 381 return $this->innertext(); 382 } 383 384 // todo: What is the use of this callback? Remove? 385 if ($this->dom && $this->dom->callback !== null) { 386 call_user_func_array($this->dom->callback, array($this)); 387 } 388 389 if (isset($this->_[HDOM_INFO_OUTER])) { 390 return $this->_[HDOM_INFO_OUTER]; 391 } 392 393 if (isset($this->_[HDOM_INFO_TEXT])) { 394 return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 395 } 396 397 $ret = ''; 398 399 if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) { 400 $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); 401 } 402 403 if (isset($this->_[HDOM_INFO_INNER])) { 404 // todo: <br> should either never have HDOM_INFO_INNER or always 405 if ($this->tag !== 'br') { 406 $ret .= $this->_[HDOM_INFO_INNER]; 407 } 408 } elseif ($this->nodes) { 409 foreach ($this->nodes as $n) { 410 $ret .= $this->convert_text($n->outertext()); 411 } 412 } 413 414 if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) { 415 $ret .= '</' . $this->tag . 
'>'; 416 } 417 418 return $ret; 419 } 420 421 function text() 422 { 423 if (isset($this->_[HDOM_INFO_INNER])) { 424 return $this->_[HDOM_INFO_INNER]; 425 } 426 427 switch ($this->nodetype) { 428 case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 429 case HDOM_TYPE_COMMENT: return ''; 430 case HDOM_TYPE_UNKNOWN: return ''; 431 } 432 433 if (strcasecmp($this->tag, 'script') === 0) { return ''; } 434 if (strcasecmp($this->tag, 'style') === 0) { return ''; } 435 436 $ret = ''; 437 438 // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed 439 // for some span tags, and some p tags) $this->nodes is set to NULL. 440 // NOTE: This indicates that there is a problem where it's set to NULL 441 // without a clear happening. 442 // WHY is this happening? 443 if (!is_null($this->nodes)) { 444 foreach ($this->nodes as $n) { 445 // Start paragraph after a blank line 446 if ($n->tag === 'p') { 447 $ret = trim($ret) . "\n\n"; 448 } 449 450 $ret .= $this->convert_text($n->text()); 451 452 // If this node is a span... add a space at the end of it so 453 // multiple spans don't run into each other. This is plaintext 454 // after all. 455 if ($n->tag === 'span') { 456 $ret .= $this->dom->default_span_text; 457 } 458 } 459 } 460 return $ret; 461 } 462 463 function xmltext() 464 { 465 $ret = $this->innertext(); 466 $ret = str_ireplace('<![CDATA[', '', $ret); 467 $ret = str_replace(']]>', '', $ret); 468 return $ret; 469 } 470 471 function makeup() 472 { 473 // text, comment, unknown 474 if (isset($this->_[HDOM_INFO_TEXT])) { 475 return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 476 } 477 478 $ret = '<' . $this->tag; 479 $i = -1; 480 481 foreach ($this->attr as $key => $val) { 482 ++$i; 483 484 // skip removed attribute 485 if ($val === null || $val === false) { continue; } 486 487 $ret .= @$this->_[HDOM_INFO_SPACE][$i][0]; 488 489 //no value attr: nowrap, checked selected... 
490 if ($val === true) { 491 $ret .= $key; 492 } else { 493 switch (@$this->_[HDOM_INFO_QUOTE][$i]) 494 { 495 case HDOM_QUOTE_DOUBLE: $quote = '"'; break; 496 case HDOM_QUOTE_SINGLE: $quote = '\''; break; 497 default: $quote = ''; 498 } 499 500 $ret .= $key 501 . @$this->_[HDOM_INFO_SPACE][$i][1] 502 . '=' 503 . @$this->_[HDOM_INFO_SPACE][$i][2] 504 . $quote 505 . $val 506 . $quote; 507 } 508 } 509 510 $ret = $this->dom->restore_noise($ret); 511 return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; 512 } 513 514 function find($selector, $idx = null, $lowercase = false) 515 { 516 $selectors = $this->parse_selector($selector); 517 if (($count = count($selectors)) === 0) { return array(); } 518 $found_keys = array(); 519 520 // find each selector 521 for ($c = 0; $c < $count; ++$c) { 522 // The change on the below line was documented on the sourceforge 523 // code tracker id 2788009 524 // used to be: if (($levle=count($selectors[0]))===0) return array(); 525 if (($levle = count($selectors[$c])) === 0) { return array(); } 526 if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); } 527 528 $head = array($this->_[HDOM_INFO_BEGIN] => 1); 529 $cmd = ' '; // Combinator 530 531 // handle descendant selectors, no recursive! 532 for ($l = 0; $l < $levle; ++$l) { 533 $ret = array(); 534 535 foreach ($head as $k => $v) { 536 $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k]; 537 //PaperG - Pass this optional parameter on to the seek function. 
538 $n->seek($selectors[$c][$l], $ret, $cmd, $lowercase); 539 } 540 541 $head = $ret; 542 $cmd = $selectors[$c][$l][4]; // Next Combinator 543 } 544 545 foreach ($head as $k => $v) { 546 if (!isset($found_keys[$k])) { 547 $found_keys[$k] = 1; 548 } 549 } 550 } 551 552 // sort keys 553 ksort($found_keys); 554 555 $found = array(); 556 foreach ($found_keys as $k => $v) { 557 $found[] = $this->dom->nodes[$k]; 558 } 559 560 // return nth-element or array 561 if (is_null($idx)) { return $found; } 562 elseif ($idx < 0) { $idx = count($found) + $idx; } 563 return (isset($found[$idx])) ? $found[$idx] : null; 564 } 565 566 protected function seek($selector, &$ret, $parent_cmd, $lowercase = false) 567 { 568 global $debug_object; 569 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 570 571 list($tag, $id, $class, $attributes, $cmb) = $selector; 572 $nodes = array(); 573 574 if ($parent_cmd === ' ') { // Descendant Combinator 575 // Find parent closing tag if the current element doesn't have a closing 576 // tag (i.e. void element) 577 $end = (!empty($this->_[HDOM_INFO_END])) ? 
$this->_[HDOM_INFO_END] : 0; 578 if ($end == 0) { 579 $parent = $this->parent; 580 while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) { 581 $end -= 1; 582 $parent = $parent->parent; 583 } 584 $end += $parent->_[HDOM_INFO_END]; 585 } 586 587 // Get list of target nodes 588 $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1; 589 $nodes_count = $end - $nodes_start; 590 $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true); 591 } elseif ($parent_cmd === '>') { // Child Combinator 592 $nodes = $this->children; 593 } elseif ($parent_cmd === '+' 594 && $this->parent 595 && in_array($this, $this->parent->children)) { // Next-Sibling Combinator 596 $index = array_search($this, $this->parent->children, true) + 1; 597 if ($index < count($this->parent->children)) 598 $nodes[] = $this->parent->children[$index]; 599 } elseif ($parent_cmd === '~' 600 && $this->parent 601 && in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator 602 $index = array_search($this, $this->parent->children, true); 603 $nodes = array_slice($this->parent->children, $index); 604 } 605 606 // Go throgh each element starting at this element until the end tag 607 // Note: If this element is a void tag, any previous void element is 608 // skipped. 609 foreach($nodes as $node) { 610 $pass = true; 611 612 // Skip root nodes 613 if(!$node->parent) { 614 $pass = false; 615 } 616 617 // Skip if node isn't a child node (i.e. 
text nodes) 618 if($pass && !in_array($node, $node->parent->children, true)) { 619 $pass = false; 620 } 621 622 // Skip if tag doesn't match 623 if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') { 624 $pass = false; 625 } 626 627 // Skip if ID doesn't exist 628 if ($pass && $id !== '' && !isset($node->attr['id'])) { 629 $pass = false; 630 } 631 632 // Check if ID matches 633 if ($pass && $id !== '' && isset($node->attr['id'])) { 634 // Note: Only consider the first ID (as browsers do) 635 $node_id = explode(' ', trim($node->attr['id']))[0]; 636 637 if($id !== $node_id) { $pass = false; } 638 } 639 640 // Check if all class(es) exist 641 if ($pass && $class !== '' && is_array($class) && !empty($class)) { 642 if (isset($node->attr['class'])) { 643 $node_classes = explode(' ', $node->attr['class']); 644 645 if ($lowercase) { 646 $node_classes = array_map('strtolower', $node_classes); 647 } 648 649 foreach($class as $c) { 650 if(!in_array($c, $node_classes)) { 651 $pass = false; 652 break; 653 } 654 } 655 } else { 656 $pass = false; 657 } 658 } 659 660 // Check attributes 661 if ($pass 662 && $attributes !== '' 663 && is_array($attributes) 664 && !empty($attributes)) { 665 foreach($attributes as $a) { 666 list ( 667 $att_name, 668 $att_expr, 669 $att_val, 670 $att_inv, 671 $att_case_sensitivity 672 ) = $a; 673 674 // Handle indexing attributes (i.e. "[2]") 675 /** 676 * Note: This is not supported by the CSS Standard but adds 677 * the ability to select items compatible to XPath (i.e. 678 * the 3rd element within it's parent). 679 * 680 * Note: This doesn't conflict with the CSS Standard which 681 * doesn't work on numeric attributes anyway. 
682 */ 683 if (is_numeric($att_name) 684 && $att_expr === '' 685 && $att_val === '') { 686 $count = 0; 687 688 // Find index of current element in parent 689 foreach ($node->parent->children as $c) { 690 if ($c->tag === $node->tag) ++$count; 691 if ($c === $node) break; 692 } 693 694 // If this is the correct node, continue with next 695 // attribute 696 if ($count === (int)$att_name) continue; 697 } 698 699 // Check attribute availability 700 if ($att_inv) { // Attribute should NOT be set 701 if (isset($node->attr[$att_name])) { 702 $pass = false; 703 break; 704 } 705 } else { // Attribute should be set 706 // todo: "plaintext" is not a valid CSS selector! 707 if ($att_name !== 'plaintext' 708 && !isset($node->attr[$att_name])) { 709 $pass = false; 710 break; 711 } 712 } 713 714 // Continue with next attribute if expression isn't defined 715 if ($att_expr === '') continue; 716 717 // If they have told us that this is a "plaintext" 718 // search then we want the plaintext of the node - right? 719 // todo "plaintext" is not a valid CSS selector! 720 if ($att_name === 'plaintext') { 721 $nodeKeyValue = $node->text(); 722 } else { 723 $nodeKeyValue = $node->attr[$att_name]; 724 } 725 726 if (is_object($debug_object)) { 727 $debug_object->debug_log(2, 728 'testing node: ' 729 . $node->tag 730 . ' for attribute: ' 731 . $att_name 732 . $att_expr 733 . $att_val 734 . ' where nodes value is: ' 735 . $nodeKeyValue 736 ); 737 } 738 739 // If lowercase is set, do a case insensitive test of 740 // the value of the selector. 741 if ($lowercase) { 742 $check = $this->match( 743 $att_expr, 744 strtolower($att_val), 745 strtolower($nodeKeyValue), 746 $att_case_sensitivity 747 ); 748 } else { 749 $check = $this->match( 750 $att_expr, 751 $att_val, 752 $nodeKeyValue, 753 $att_case_sensitivity 754 ); 755 } 756 757 if (is_object($debug_object)) { 758 $debug_object->debug_log(2, 759 'after match: ' 760 . ($check ? 
'true' : 'false') 761 ); 762 } 763 764 if (!$check) { 765 $pass = false; 766 break; 767 } 768 } 769 } 770 771 // Found a match. Add to list and clear node 772 if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1; 773 unset($node); 774 } 775 // It's passed by reference so this is actually what this function returns. 776 if (is_object($debug_object)) { 777 $debug_object->debug_log(1, 'EXIT - ret: ', $ret); 778 } 779 } 780 781 protected function match($exp, $pattern, $value, $case_sensitivity) 782 { 783 global $debug_object; 784 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 785 786 if ($case_sensitivity === 'i') { 787 $pattern = strtolower($pattern); 788 $value = strtolower($value); 789 } 790 791 switch ($exp) { 792 case '=': 793 return ($value === $pattern); 794 case '!=': 795 return ($value !== $pattern); 796 case '^=': 797 return preg_match('/^' . preg_quote($pattern, '/') . '/', $value); 798 case '$=': 799 return preg_match('/' . preg_quote($pattern, '/') . '$/', $value); 800 case '*=': 801 return preg_match('/' . preg_quote($pattern, '/') . '/', $value); 802 case '|=': 803 /** 804 * [att|=val] 805 * 806 * Represents an element with the att attribute, its value 807 * either being exactly "val" or beginning with "val" 808 * immediately followed by "-" (U+002D). 809 */ 810 return strpos($value, $pattern) === 0; 811 case '~=': 812 /** 813 * [att~=val] 814 * 815 * Represents an element with the att attribute whose value is a 816 * whitespace-separated list of words, one of which is exactly 817 * "val". If "val" contains whitespace, it will never represent 818 * anything (since the words are separated by spaces). Also if 819 * "val" is the empty string, it will never represent anything. 
820 */ 821 return in_array($pattern, explode(' ', trim($value)), true); 822 } 823 return false; 824 } 825 826 protected function parse_selector($selector_string) 827 { 828 global $debug_object; 829 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 830 831 /** 832 * Pattern of CSS selectors, modified from mootools (https://mootools.net/) 833 * 834 * Paperg: Add the colon to the attribute, so that it properly finds 835 * <tag attr:ibute="something" > like google does. 836 * 837 * Note: if you try to look at this attribute, you MUST use getAttribute 838 * since $dom->x:y will fail the php syntax check. 839 * 840 * Notice the \[ starting the attribute? and the @? following? This 841 * implies that an attribute can begin with an @ sign that is not 842 * captured. This implies that an html attribute specifier may start 843 * with an @ sign that is NOT captured by the expression. Farther study 844 * is required to determine of this should be documented or removed. 845 * 846 * Matches selectors in this order: 847 * 848 * [0] - full match 849 * 850 * [1] - tag name 851 * ([\w:\*-]*) 852 * Matches the tag name consisting of zero or more words, colons, 853 * asterisks and hyphens. 854 * 855 * [2] - id name 856 * (?:\#([\w-]+)) 857 * Optionally matches a id name, consisting of an "#" followed by 858 * the id name (one or more words and hyphens). 859 * 860 * [3] - class names (including dots) 861 * (?:\.([\w\.-]+))? 862 * Optionally matches a list of classs, consisting of an "." 863 * followed by the class name (one or more words and hyphens) 864 * where multiple classes can be chained (i.e. ".foo.bar.baz") 865 * 866 * [4] - attributes 867 * ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)? 
868 * Optionally matches the attributes list 869 * 870 * [5] - separator 871 * ([\/, >+~]+) 872 * Matches the selector list separator 873 */ 874 // phpcs:ignore Generic.Files.LineLength 875 $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is"; 876 877 preg_match_all( 878 $pattern, 879 trim($selector_string) . ' ', // Add final ' ' as pseudo separator 880 $matches, 881 PREG_SET_ORDER 882 ); 883 884 if (is_object($debug_object)) { 885 $debug_object->debug_log(2, 'Matches Array: ', $matches); 886 } 887 888 $selectors = array(); 889 $result = array(); 890 891 foreach ($matches as $m) { 892 $m[0] = trim($m[0]); 893 894 // Skip NoOps 895 if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; } 896 897 // Convert to lowercase 898 if ($this->dom->lowercase) { 899 $m[1] = strtolower($m[1]); 900 } 901 902 // Extract classes 903 if ($m[3] !== '') { $m[3] = explode('.', $m[3]); } 904 905 /* Extract attributes (pattern based on the pattern above!) 906 907 * [0] - full match 908 * [1] - attribute name 909 * [2] - attribute expression 910 * [3] - attribute value 911 * [4] - case sensitivity 912 * 913 * Note: Attributes can be negated with a "!" prefix to their name 914 */ 915 if($m[4] !== '') { 916 preg_match_all( 917 "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is", 918 trim($m[4]), 919 $attributes, 920 PREG_SET_ORDER 921 ); 922 923 // Replace element by array 924 $m[4] = array(); 925 926 foreach($attributes as $att) { 927 // Skip empty matches 928 if(trim($att[0]) === '') { continue; } 929 930 $inverted = (isset($att[1][0]) && $att[1][0] === '!'); 931 $m[4][] = array( 932 $inverted ? substr($att[1], 1) : $att[1], // Name 933 (isset($att[2])) ? $att[2] : '', // Expression 934 (isset($att[3])) ? $att[3] : '', // Value 935 $inverted, // Inverted Flag 936 (isset($att[4])) ? 
strtolower($att[4]) : '', // Case-Sensitivity 937 ); 938 } 939 } 940 941 // Sanitize Separator 942 if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator 943 $m[5] = ' '; 944 } else { // Other Separator 945 $m[5] = trim($m[5]); 946 } 947 948 // Clear Separator if it's a Selector List 949 if ($is_list = ($m[5] === ',')) { $m[5] = ''; } 950 951 // Remove full match before adding to results 952 array_shift($m); 953 $result[] = $m; 954 955 if ($is_list) { // Selector List 956 $selectors[] = $result; 957 $result = array(); 958 } 959 } 960 961 if (count($result) > 0) { $selectors[] = $result; } 962 return $selectors; 963 } 964 965 function __get($name) 966 { 967 if (isset($this->attr[$name])) { 968 return $this->convert_text($this->attr[$name]); 969 } 970 switch ($name) { 971 case 'outertext': return $this->outertext(); 972 case 'innertext': return $this->innertext(); 973 case 'plaintext': return $this->text(); 974 case 'xmltext': return $this->xmltext(); 975 default: return array_key_exists($name, $this->attr); 976 } 977 } 978 979 function __set($name, $value) 980 { 981 global $debug_object; 982 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 983 984 switch ($name) { 985 case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; 986 case 'innertext': 987 if (isset($this->_[HDOM_INFO_TEXT])) { 988 return $this->_[HDOM_INFO_TEXT] = $value; 989 } 990 return $this->_[HDOM_INFO_INNER] = $value; 991 } 992 993 if (!isset($this->attr[$name])) { 994 $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); 995 $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 996 } 997 998 $this->attr[$name] = $value; 999 } 1000 1001 function __isset($name) 1002 { 1003 switch ($name) { 1004 case 'outertext': return true; 1005 case 'innertext': return true; 1006 case 'plaintext': return true; 1007 } 1008 //no value attr: nowrap, checked selected... 1009 return (array_key_exists($name, $this->attr)) ? 
true : isset($this->attr[$name]); 1010 } 1011 1012 function __unset($name) 1013 { 1014 if (isset($this->attr[$name])) { unset($this->attr[$name]); } 1015 } 1016 1017 function convert_text($text) 1018 { 1019 global $debug_object; 1020 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1021 1022 $converted_text = $text; 1023 1024 $sourceCharset = ''; 1025 $targetCharset = ''; 1026 1027 if ($this->dom) { 1028 $sourceCharset = strtoupper($this->dom->_charset); 1029 $targetCharset = strtoupper($this->dom->_target_charset); 1030 } 1031 1032 if (is_object($debug_object)) { 1033 $debug_object->debug_log(3, 1034 'source charset: ' 1035 . $sourceCharset 1036 . ' target charaset: ' 1037 . $targetCharset 1038 ); 1039 } 1040 1041 if (!empty($sourceCharset) 1042 && !empty($targetCharset) 1043 && (strcasecmp($sourceCharset, $targetCharset) != 0)) { 1044 // Check if the reported encoding could have been incorrect and the text is actually already UTF-8 1045 if ((strcasecmp($targetCharset, 'UTF-8') == 0) 1046 && ($this->is_utf8($text))) { 1047 $converted_text = $text; 1048 } else { 1049 $converted_text = iconv($sourceCharset, $targetCharset, $text); 1050 } 1051 } 1052 1053 // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output. 
1054 if ($targetCharset === 'UTF-8') { 1055 if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") { 1056 $converted_text = substr($converted_text, 3); 1057 } 1058 1059 if (substr($converted_text, -3) === "\xef\xbb\xbf") { 1060 $converted_text = substr($converted_text, 0, -3); 1061 } 1062 } 1063 1064 return $converted_text; 1065 } 1066 1067 static function is_utf8($str) 1068 { 1069 $c = 0; $b = 0; 1070 $bits = 0; 1071 $len = strlen($str); 1072 for($i = 0; $i < $len; $i++) { 1073 $c = ord($str[$i]); 1074 if($c > 128) { 1075 if(($c >= 254)) { return false; } 1076 elseif($c >= 252) { $bits = 6; } 1077 elseif($c >= 248) { $bits = 5; } 1078 elseif($c >= 240) { $bits = 4; } 1079 elseif($c >= 224) { $bits = 3; } 1080 elseif($c >= 192) { $bits = 2; } 1081 else { return false; } 1082 if(($i + $bits) > $len) { return false; } 1083 while($bits > 1) { 1084 $i++; 1085 $b = ord($str[$i]); 1086 if($b < 128 || $b > 191) { return false; } 1087 $bits--; 1088 } 1089 } 1090 } 1091 return true; 1092 } 1093 1094 function get_display_size() 1095 { 1096 global $debug_object; 1097 1098 $width = -1; 1099 $height = -1; 1100 1101 if ($this->tag !== 'img') { 1102 return false; 1103 } 1104 1105 // See if there is aheight or width attribute in the tag itself. 1106 if (isset($this->attr['width'])) { 1107 $width = $this->attr['width']; 1108 } 1109 1110 if (isset($this->attr['height'])) { 1111 $height = $this->attr['height']; 1112 } 1113 1114 // Now look for an inline style. 1115 if (isset($this->attr['style'])) { 1116 // Thanks to user gnarf from stackoverflow for this regular expression. 
1117 $attributes = array(); 1118 1119 preg_match_all( 1120 '/([\w-]+)\s*:\s*([^;]+)\s*;?/', 1121 $this->attr['style'], 1122 $matches, 1123 PREG_SET_ORDER 1124 ); 1125 1126 foreach ($matches as $match) { 1127 $attributes[$match[1]] = $match[2]; 1128 } 1129 1130 // If there is a width in the style attributes: 1131 if (isset($attributes['width']) && $width == -1) { 1132 // check that the last two characters are px (pixels) 1133 if (strtolower(substr($attributes['width'], -2)) === 'px') { 1134 $proposed_width = substr($attributes['width'], 0, -2); 1135 // Now make sure that it's an integer and not something stupid. 1136 if (filter_var($proposed_width, FILTER_VALIDATE_INT)) { 1137 $width = $proposed_width; 1138 } 1139 } 1140 } 1141 1142 // If there is a width in the style attributes: 1143 if (isset($attributes['height']) && $height == -1) { 1144 // check that the last two characters are px (pixels) 1145 if (strtolower(substr($attributes['height'], -2)) == 'px') { 1146 $proposed_height = substr($attributes['height'], 0, -2); 1147 // Now make sure that it's an integer and not something stupid. 1148 if (filter_var($proposed_height, FILTER_VALIDATE_INT)) { 1149 $height = $proposed_height; 1150 } 1151 } 1152 } 1153 1154 } 1155 1156 // Future enhancement: 1157 // Look in the tag to see if there is a class or id specified that has 1158 // a height or width attribute to it. 1159 1160 // Far future enhancement 1161 // Look at all the parent tags of this image to see if they specify a 1162 // class or id that has an img selector that specifies a height or width 1163 // Note that in this case, the class or id will have the img subselector 1164 // for it to apply to the image. 1165 1166 // ridiculously far future development 1167 // If the class or id is specified in a SEPARATE css file thats not on 1168 // the page, go get it and do what we were just doing for the ones on 1169 // the page. 
$result = array(
            'height' => $height,
            'width' => $width
        );

        return $result;
    }

    /**
     * Render this node's outer text and optionally write it to a file.
     *
     * @param string $filepath Target path; nothing is written when it is ''.
     * @return string The value of $this->outertext().
     */
    function save($filepath = '')
    {
        $ret = $this->outertext();

        if ($filepath !== '') {
            // LOCK_EX: take an exclusive lock while writing.
            file_put_contents($filepath, $ret, LOCK_EX);
        }

        return $ret;
    }

    /**
     * Add one or more class names to the "class" attribute.
     *
     * @param string|array $class Space-separated string or array of names.
     *                            Any other type is only reported to the
     *                            global debug object (if one is installed).
     * @return void
     */
    function addClass($class)
    {
        // Fix: without this declaration $debug_object was an undefined local,
        // so the invalid-type branch raised a warning and never logged.
        global $debug_object;

        if (is_string($class)) {
            $class = explode(' ', $class);
        }

        if (is_array($class)) {
            foreach ($class as $c) {
                if (isset($this->class)) {
                    if ($this->hasClass($c)) {
                        continue; // already present, do not duplicate
                    } else {
                        $this->class .= ' ' . $c;
                    }
                } else {
                    $this->class = $c;
                }
            }
        } else {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
            }
        }
    }

    /**
     * Test whether a single class name is present on this node.
     *
     * The comparison is an exact (strict) match against the space-separated
     * tokens of the "class" attribute.
     *
     * @param string $class Class name to look for.
     * @return bool False when the name is absent, the node has no class
     *              attribute, or $class is not a string.
     */
    function hasClass($class)
    {
        // Fix: see addClass(); $debug_object must be imported from global scope.
        global $debug_object;

        if (is_string($class)) {
            if (isset($this->class)) {
                return in_array($class, explode(' ', $this->class), true);
            }
        } else {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
            }
        }

        return false;
    }

    /**
     * Remove class names from this node.
     *
     * @param string|array|null $class Names to remove (space-separated string
     *                                 or array). Null removes the whole
     *                                 "class" attribute.
     * @return void
     */
    function removeClass($class = null)
    {
        if (!isset($this->class)) {
            return; // nothing to remove
        }

        if (is_null($class)) {
            $this->removeAttribute('class');
            return;
        }

        if (is_string($class)) {
            $class = explode(' ', $class);
        }

        if (is_array($class)) {
            $class = array_diff(explode(' ', $this->class), $class);
            if (empty($class)) {
                // No names left: drop the attribute instead of leaving it empty.
                $this->removeAttribute('class');
            } else {
                $this->class = implode(' ', $class);
            }
        }
    }

    /**
     * @return array The node's attribute array ($this->attr).
     */
    function getAllAttributes()
    {
        return $this->attr;
    }

    /**
     * Read an attribute; delegates to __get().
     *
     * @param string $name Attribute name.
     * @return mixed Whatever __get($name) returns.
     */
    function getAttribute($name)
    {
        return $this->__get($name);
    }

    /**
     * Write an attribute; delegates to __set().
     *
     * @param string $name  Attribute name.
     * @param mixed  $value New value.
     * @return void
     */
    function setAttribute($name, $value)
    {
        $this->__set($name, $value);
    }

    /**
     * Test for an attribute; delegates to __isset().
     *
     * @param string $name Attribute name.
     * @return mixed Whatever __isset($name) returns.
     */
    function hasAttribute($name)
    {
        return $this->__isset($name);
    }

    /**
     * Remove an attribute by assigning null to it through __set().
     *
     * @param string $name Attribute name.
     * @return void
     */
    function removeAttribute($name)
    {
        $this->__set($name, null);
    }

    /**
     * Detach this node from its parent. Does nothing without a parent.
     *
     * @return void
     */
    function remove()
    {
        if ($this->parent) {
            $this->parent->removeChild($this);
        }
    }

    /**
     * Remove a child element (and everything below it) from this node and
     * from the owning document's node list.
     *
     * The node is only removed when it is found in this node's $nodes list,
     * this node's $children list and the document's $nodes list.
     *
     * @param simple_html_dom_node $node Child to remove.
     * @return void
     */
    function removeChild($node)
    {
        $nidx = array_search($node, $this->nodes, true);
        $cidx = array_search($node, $this->children, true);
        $didx = array_search($node, $this->dom->nodes, true);

        if ($nidx === false || $cidx === false || $didx === false) {
            return;
        }

        // Remove element descendants first. foreach walks a snapshot of the
        // array, so the recursive call may safely modify $node->children.
        foreach ($node->children as $child) {
            $node->removeChild($child);
        }

        // Whatever is left in $node->nodes (entries that are not element
        // children) is unregistered from the node and from the document.
        foreach ($node->nodes as $entity) {
            $enidx = array_search($entity, $node->nodes, true);
            $edidx = array_search($entity, $node->dom->nodes, true);

            if ($enidx !== false && $edidx !== false) {
                unset($node->nodes[$enidx]);
                unset($node->dom->nodes[$edidx]);
            }
        }

        unset($this->nodes[$nidx]);
        unset($this->children[$cidx]);
        unset($this->dom->nodes[$didx]);

        // Fix: unset() leaves holes in the key sequence. Re-index so that
        // positional lookups keep working after a removal.
        // NOTE(review): presumably children($idx) / first_child() index these
        // lists by position - their implementation is outside this span.
        $this->nodes = array_values($this->nodes);
        $this->children = array_values($this->children);

        $node->clear();
    }

    /**
     * @param string $id Element id.
     * @return mixed First match of the selector "#$id".
     */
    function getElementById($id)
    {
        return $this->find("#$id", 0);
    }

    /**
     * @param string   $id  Element id.
     * @param int|null $idx Index forwarded to find(); null by default.
     * @return mixed Result of find("#$id", $idx).
     */
    function getElementsById($id, $idx = null)
    {
        return $this->find("#$id", $idx);
    }

    /**
     * @param string $name Tag name (used as selector).
     * @return mixed First match of the selector.
     */
    function getElementByTagName($name)
    {
        return $this->find($name, 0);
    }

    /**
     * @param string   $name Tag name (used as selector).
     * @param int|null $idx  Index forwarded to find(); null by default.
     * @return mixed Result of find($name, $idx).
     */
    function getElementsByTagName($name, $idx = null)
    {
        return $this->find($name, $idx);
    }

    /** DOM-style alias of parent(). */
    function parentNode()
    {
        return $this->parent();
    }

    /**
     * DOM-style alias of children().
     *
     * @param int $idx Index forwarded to children(); -1 by default.
     */
    function childNodes($idx = -1)
    {
        return $this->children($idx);
    }

    /** DOM-style alias of first_child(). */
    function firstChild()
    {
        return $this->first_child();
    }

    /** DOM-style alias of last_child(). */
    function lastChild()
    {
        return $this->last_child();
    }

    /** DOM-style alias of next_sibling(). */
    function nextSibling()
    {
        return $this->next_sibling();
    }

    /** DOM-style alias of prev_sibling(). */
    function previousSibling()
    {
        return $this->prev_sibling();
    }

    function hasChildNodes()
{
        return $this->has_child();
    }

    function nodeName()
    {
        return $this->tag;
    }

    function appendChild($node)
    {
        $node->parent($this);
        return $node;
    }

}

/**
 * HTML document: turns an HTML string into a tree of simple_html_dom_node
 * objects and offers selector based lookups on the root node.
 */
class simple_html_dom
{
    public $root = null;
    public $nodes = array();
    public $callback = null;
    public $lowercase = false;
    public $original_size;
    public $size;

    // Parser state: byte offset into $doc, the document text and the
    // character at the current offset (null once the end is reached).
    protected $pos;
    protected $doc;
    protected $char;

    protected $cursor;
    protected $parent;
    // Map of placeholder key => original text removed by remove_noise().
    protected $noise = array();
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/>';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';

    public $_charset = '';
    public $_target_charset = '';

    protected $default_br_text = '';

    public $default_span_text = '';

    // Whether parse_attr() strips \r and \n from attribute values. Set by
    // load() from its $stripRN argument. (Previously undeclared, so the
    // read in parse_attr() always resolved to null through __get().)
    protected $strip_rn = true;

    protected $self_closing_tags = array(
        'area' => 1,
        'base' => 1,
        'br' => 1,
        'col' => 1,
        'embed' => 1,
        'hr' => 1,
        'img' => 1,
        'input' => 1,
        'link' => 1,
        'meta' => 1,
        'param' => 1,
        'source' => 1,
        'track' => 1,
        'wbr' => 1
    );
    protected $block_tags = array(
        'body' => 1,
        'div' => 1,
        'form' => 1,
        'root' => 1,
        'span' => 1,
        'table' => 1
    );
    // Start tag => set of open parent tags it implicitly closes.
    protected $optional_closing_tags = array(
        // Not optional, see
        // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
        'b' => array('b' => 1),
        'dd' => array('dd' => 1, 'dt' => 1),
        // Not optional, see
        // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
        'dl' => array('dd' => 1, 'dt' => 1),
        'dt' => array('dd' => 1, 'dt' => 1),
        'li' => array('li' => 1),
        'optgroup' => array('optgroup' => 1, 'option' => 1),
        'option' => array('optgroup' => 1, 'option' => 1),
        'p' => array('p' => 1),
        'rp' => array('rp' => 1, 'rt' => 1),
        'rt' => array('rp' => 1, 'rt' => 1),
        'td' => array('td' => 1, 'th' => 1),
        'th' => array('td' => 1, 'th' => 1),
        'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
    );

    /**
     * @param string|null $str             HTML text, an http(s) URL or a path
     *                                     to an existing file. Nothing is
     *                                     loaded when empty.
     * @param bool        $lowercase       Lower-case tag and attribute names.
     * @param bool        $forceTagsClosed When false, the optional-closing-tag
     *                                     table is emptied.
     * @param string      $target_charset  Stored in $_target_charset.
     * @param bool        $stripRN         Forwarded to load().
     * @param string      $defaultBRText   Forwarded to load().
     * @param string      $defaultSpanText Forwarded to load().
     * @param int         $options         Forwarded to load().
     */
    function __construct(
        $str = null,
        $lowercase = true,
        $forceTagsClosed = true,
        $target_charset = DEFAULT_TARGET_CHARSET,
        $stripRN = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT,
        $options = 0)
    {
        // Forcing tags to be closed implies that we don't trust the html, but
        // it can lead to parsing errors if we SHOULD trust the html.
        //
        // Fix: this used to assign to the misspelled, undeclared property
        // "optional_closing_array" and therefore had no effect. It also has
        // to happen before parsing, so it now precedes the load calls.
        if (!$forceTagsClosed) {
            $this->optional_closing_tags = array();
        }

        $this->_target_charset = $target_charset;

        if ($str) {
            // Fix: https URLs are recognised in addition to http URLs.
            if (preg_match('/^https?:\/\//i', $str) || is_file($str)) {
                $this->load_file($str);
            } else {
                $this->load(
                    $str,
                    $lowercase,
                    $stripRN,
                    $defaultBRText,
                    $defaultSpanText,
                    $options
                );
            }
        }
    }

    function __destruct()
    {
        $this->clear();
    }

    /**
     * Parse an HTML string into this document, replacing previous content.
     *
     * @param string $str             HTML text.
     * @param bool   $lowercase       Lower-case tag and attribute names.
     * @param bool   $stripRN         Replace \r and \n in the document with
     *                                spaces before parsing.
     * @param string $defaultBRText   Inner text assigned to <br> nodes.
     * @param string $defaultSpanText Stored in $default_span_text.
     * @param int    $options         Bit mask; HDOM_SMARTY_AS_TEXT also hides
     *                                "{...}" blocks from the parser.
     * @return simple_html_dom $this, so calls can be chained.
     */
    function load(
        $str,
        $lowercase = true,
        $stripRN = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT,
        $options = 0)
    {
        // prepare
        $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

        // Fix: remember the setting for parse_attr(), which reads it.
        $this->strip_rn = $stripRN;

        // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
        // Script tags removal now precedes style tag removal.
        // strip out <script> tags
        $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
        $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");

        // strip out the \r \n's if we are told to.
        if ($stripRN) {
            $this->doc = str_replace("\r", ' ', $this->doc);
            $this->doc = str_replace("\n", ' ', $this->doc);

            // set the length of content since we have changed it.
            $this->size = strlen($this->doc);
        }

        // strip out cdata
        $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
        // strip out comments
        $this->remove_noise("'<!--(.*?)-->'is");
        // strip out <style> tags
        $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
        $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
        // strip out preformatted tags
        $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
        // strip out server side scripts
        $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

        if ($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
            $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
        }

        // parsing
        $this->parse();
        // end
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        $this->parse_charset();

        // make load function chainable
        return $this;
    }

    /**
     * Read a file or URL with file_get_contents() and load its content.
     * All arguments are forwarded to file_get_contents() unchanged.
     *
     * @return simple_html_dom|false $this on success (previously nothing was
     *                               returned), false when reading failed.
     */
    function load_file()
    {
        $args = func_get_args();
        $doc = call_user_func_array('file_get_contents', $args);

        if ($doc === false) {
            return false;
        }

        return $this->load($doc, true);
    }

    /**
     * Store a callback in $callback.
     *
     * @param callable $function_name
     * @return void
     */
    function set_callback($function_name)
    {
        $this->callback = $function_name;
    }

    /** Clear the stored callback. */
    function remove_callback()
    {
        $this->callback = null;
    }

    /**
     * Render the document and optionally write it to a file.
     *
     * @param string $filepath Target path; nothing is written when it is ''.
     * @return string The root node's inner text.
     */
    function save($filepath = '')
    {
        $ret = $this->root->innertext();
        if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); }
        return $ret;
    }

    /**
     * Run a selector against the root node.
     *
     * @param string   $selector  Selector expression.
     * @param int|null $idx       Forwarded to the root node's find().
     * @param bool     $lowercase Forwarded to the root node's find().
     * @return mixed Result of $this->root->find().
     */
    function find($selector, $idx = null, $lowercase = false)
    {
        return $this->root->find($selector, $idx, $lowercase);
    }

    /**
     * Release all nodes and parser buffers held by this document.
     *
     * @return void
     */
    function clear()
    {
        if (isset($this->nodes)) {
            foreach ($this->nodes as $n) {
                $n->clear();
                $n = null;
            }
        }

        // This add next line is documented in the sourceforge repository.
        // 2977248 as a fix for ongoing memory leaks that occur even with the
        // use of clear.
        // Note: this class declares no $children property and no __isset(),
        // so this branch only runs if such a property was added dynamically.
        if (isset($this->children)) {
            foreach ($this->children as $n) {
                $n->clear();
                $n = null;
            }
        }

        if (isset($this->parent)) {
            $this->parent->clear();
            unset($this->parent);
        }

        if (isset($this->root)) {
            $this->root->clear();
            unset($this->root);
        }

        unset($this->doc);
        unset($this->noise);
    }

    /**
     * Dump the tree starting at the root node.
     *
     * @param bool $show_attr Forwarded to the root node's dump().
     * @return void
     */
    function dump($show_attr = true)
    {
        $this->root->dump($show_attr);
    }

    /**
     * Reset the parser state and create a fresh root node for $str.
     *
     * @param string $str             HTML text (trimmed before use).
     * @param bool   $lowercase       Stored in $lowercase.
     * @param string $defaultBRText   Stored in $default_br_text.
     * @param string $defaultSpanText Stored in $default_span_text.
     * @return void
     */
    protected function prepare(
        $str, $lowercase = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT)
    {
        $this->clear();

        $this->doc = trim($str);
        $this->size = strlen($this->doc);
        $this->original_size = $this->size; // original size of the html
        $this->pos = 0;
        $this->cursor = 1;
        $this->noise = array();
        $this->nodes = array();
        $this->lowercase = $lowercase;
        $this->default_br_text = $defaultBRText;
        $this->default_span_text = $defaultSpanText;
        $this->root = new simple_html_dom_node($this);
        $this->root->tag = 'root';
        $this->root->_[HDOM_INFO_BEGIN] = -1;
        $this->root->nodetype = HDOM_TYPE_ROOT;
        $this->parent = $this->root;
        if ($this->size > 0) { $this->char = $this->doc[0]; }
    }

    /**
     * Main parse loop: alternates between text runs and tags until
     * read_tag() reports that no further tag follows.
     *
     * @return bool Always true.
     */
    protected function parse()
    {
        while (true) {
            // Read next tag if there is no text between current position and the
            // next opening tag.
            if (($s = $this->copy_until_char('<')) === '') {
                if ($this->read_tag()) {
                    continue;
                } else {
                    return true;
                }
            }

            // Add a text node for text between tags
            $node = new simple_html_dom_node($this);
            ++$this->cursor;
            $node->_[HDOM_INFO_TEXT] = $s;
            $this->link_nodes($node, false);
        }
    }

    /**
     * Determine the document charset and store it in $_charset.
     *
     * Sources are tried in this order: the content type reported by
     * get_last_retrieve_url_contents_content_type() (if that function
     * exists), <meta http-equiv=Content-Type>, <meta charset>,
     * mb_detect_encoding() (if available), and finally UTF-8.
     * ISO-8859-1 / latin1 / latin-1 are reported as CP1252.
     *
     * @return string The detected charset.
     */
    protected function parse_charset()
    {
        global $debug_object;

        $charset = null;

        if (function_exists('get_last_retrieve_url_contents_content_type')) {
            $contentTypeHeader = get_last_retrieve_url_contents_content_type();
            $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
            if ($success) {
                $charset = $matches[1];
                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'header content-type found charset of: '
                        . $charset
                    );
                }
            }
        }

        if (empty($charset)) {
            // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
            $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

            if (!empty($el)) {
                $fullvalue = $el->content;
                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'meta content-type tag found'
                        . $fullvalue
                    );
                }

                if (!empty($fullvalue)) {
                    $success = preg_match(
                        '/charset=(.+)/i',
                        $fullvalue,
                        $matches
                    );

                    if ($success) {
                        $charset = $matches[1];
                    } else {
                        // If there is a meta tag, and they don't specify the
                        // character set, research says that it's typically
                        // ISO-8859-1
                        if (is_object($debug_object)) {
                            $debug_object->debug_log(2,
                                'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
                            );
                        }

                        $charset = 'ISO-8859-1';
                    }
                }
            }
        }

        if (empty($charset)) {
            // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
            if ($meta = $this->root->find('meta[charset]', 0)) {
                $charset = $meta->charset;
                if (is_object($debug_object)) {
                    $debug_object->debug_log(2, 'meta charset: ' . $charset);
                }
            }
        }

        if (empty($charset)) {
            // Try to guess the charset based on the content
            // Requires Multibyte String (mbstring) support (optional)
            if (function_exists('mb_detect_encoding')) {
                /**
                 * mb_detect_encoding() is not intended to distinguish between
                 * charsets, especially single-byte charsets. Its primary
                 * purpose is to detect which multibyte encoding is in use,
                 * i.e. UTF-8, UTF-16, shift-JIS, etc.
                 *
                 * -- https://bugs.php.net/bug.php?id=38138
                 *
                 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
                 * always result in CP1251/ISO-8859-5 and vice versa.
                 *
                 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
                 * to stay compatible.
                 */
                $encoding = mb_detect_encoding(
                    $this->doc,
                    array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
                );

                if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
                    // Due to a limitation of mb_detect_encoding
                    // 'CP1251'/'ISO-8859-5' will be detected as
                    // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
                    // which case we can simply assume it is the other charset.
                    if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
                        $encoding = 'CP1251';
                    }
                }

                if ($encoding !== false) {
                    $charset = $encoding;
                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2, 'mb_detect: ' . $charset);
                    }
                }
            }
        }

        if (empty($charset)) {
            // Assume it's UTF-8 as it is the most likely charset to be used
            $charset = 'UTF-8';
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'No match found, assume ' . $charset);
            }
        }

        // Since CP1252 is a superset, if we get one of its subsets, we want
        // it instead.
        if ((strtolower($charset) == 'iso-8859-1')
            || (strtolower($charset) == 'latin1')
            || (strtolower($charset) == 'latin-1')) {
            // Fix: log before overwriting, otherwise the message always
            // reads "replacing CP1252 with CP1252".
            if (is_object($debug_object)) {
                $debug_object->debug_log(2,
                    'replacing ' . $charset . ' with CP1252 as its a superset'
                );
            }
            $charset = 'CP1252';
        }

        if (is_object($debug_object)) {
            $debug_object->debug_log(1, 'EXIT - ' . $charset);
        }

        return $this->_charset = $charset;
    }

    /**
     * Parse one tag (end tag, comment/doctype, or start tag with its
     * attributes) at the current position and link the resulting node.
     *
     * @return bool False when the current character is not '<' (no further
     *              tag); true otherwise.
     */
    protected function read_tag()
    {
        // Set end position if no further tags found
        if ($this->char !== '<') {
            $this->root->_[HDOM_INFO_END] = $this->cursor;
            return false;
        }

        $begin_tag_pos = $this->pos;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        // end tag
        if ($this->char === '/') {
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

            // Skip whitespace in end tags (i.e. in "</ html>")
            $this->skip($this->token_blank);
            $tag = $this->copy_until_char('>');

            // Skip attributes in end tags
            if (($pos = strpos($tag, ' ')) !== false) {
                $tag = substr($tag, 0, $pos);
            }

            $parent_lower = strtolower($this->parent->tag);
            $tag_lower = strtolower($tag);

            // The end tag is supposed to close the parent tag. Handle situations
            // when it doesn't
            if ($parent_lower !== $tag_lower) {
                // Parent tag does not have to be closed necessarily (optional closing tag)
                // Current tag is a block tag, so it may close an ancestor
                if (isset($this->optional_closing_tags[$parent_lower])
                    && isset($this->block_tags[$tag_lower])) {

                    $this->parent->_[HDOM_INFO_END] = 0;
                    $org_parent = $this->parent;

                    // Traverse ancestors to find a matching opening tag
                    // Stop at root node
                    while (($this->parent->parent)
                        && strtolower($this->parent->tag) !== $tag_lower
                    ) {
                        $this->parent = $this->parent->parent;
                    }

                    // If we don't have a match add current tag as text node
                    if (strtolower($this->parent->tag) !== $tag_lower) {
                        $this->parent = $org_parent; // restore original parent

                        if ($this->parent->parent) {
                            $this->parent = $this->parent->parent;
                        }

                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
                        return $this->as_text_node($tag);
                    }
                } elseif (($this->parent->parent)
                    && isset($this->block_tags[$tag_lower])
                ) {
                    // Grandparent exists and current tag is a block tag, so our
                    // parent doesn't have an end tag
                    $this->parent->_[HDOM_INFO_END] = 0; // No end tag
                    $org_parent = $this->parent;

                    // Traverse ancestors to find a matching opening tag
                    // Stop at root node
                    while (($this->parent->parent)
                        && strtolower($this->parent->tag) !== $tag_lower
                    ) {
                        $this->parent = $this->parent->parent;
                    }

                    // If we don't have a match add current tag as text node
                    if (strtolower($this->parent->tag) !== $tag_lower) {
                        $this->parent = $org_parent; // restore original parent
                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
                        return $this->as_text_node($tag);
                    }
                } elseif (($this->parent->parent)
                    && strtolower($this->parent->parent->tag) === $tag_lower
                ) { // Grandparent exists and current tag closes it
                    $this->parent->_[HDOM_INFO_END] = 0;
                    $this->parent = $this->parent->parent;
                } else { // Random tag, add as text node
                    return $this->as_text_node($tag);
                }
            }

            // Set end position of parent tag to current cursor position
            $this->parent->_[HDOM_INFO_END] = $this->cursor;

            if ($this->parent->parent) {
                $this->parent = $this->parent->parent;
            }

            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        // start tag
        $node = new simple_html_dom_node($this);
        $node->_[HDOM_INFO_BEGIN] = $this->cursor;
        ++$this->cursor;
        $tag = $this->copy_until($this->token_slash); // Get tag name
        $node->tag_start = $begin_tag_pos;

        // doctype, cdata & comments...
        // <!DOCTYPE html>
        // <![CDATA[ ... ]]>
        // <!-- Comment -->
        if (isset($tag[0]) && $tag[0] === '!') {
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

            if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
                $node->nodetype = HDOM_TYPE_COMMENT;
                $node->tag = 'comment';
            } else { // Could be doctype or CDATA but we don't care
                $node->nodetype = HDOM_TYPE_UNKNOWN;
                $node->tag = 'unknown';
            }

            if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }

            $this->link_nodes($node, true);
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        // The start tag cannot contain another start tag, if so add as text
        // i.e. "<<html>"
        // (The former "$pos = strpos(...) !== false" only stored a boolean
        // that was never read; the condition itself is unchanged.)
        if (strpos($tag, '<') !== false) {
            $tag = '<' . substr($tag, 0, -1);
            $node->_[HDOM_INFO_TEXT] = $tag;
            $this->link_nodes($node, false);
            $this->char = $this->doc[--$this->pos]; // prev
            return true;
        }

        // Handle invalid tag names (i.e. "<html#doc>")
        if (!preg_match('/^\w[\w:-]*$/', $tag)) {
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

            // Next char is the beginning of a new tag, don't touch it.
            if ($this->char === '<') {
                $this->link_nodes($node, false);
                return true;
            }

            // Next char closes current tag, add and be done with it.
            if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
            $this->link_nodes($node, false);
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        // begin tag, add new node
        $node->nodetype = HDOM_TYPE_ELEMENT;
        $tag_lower = strtolower($tag);
        $node->tag = ($this->lowercase) ? $tag_lower : $tag;

        // handle optional closing tags
        if (isset($this->optional_closing_tags[$tag_lower])) {
            // Traverse ancestors to close all optional closing tags
            while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            $node->parent = $this->parent;
        }

        $guard = 0; // prevent infinity loop

        // [0] Space between tag and first attribute
        $space = array($this->copy_skip($this->token_blank), '', '');

        // attributes
        do {
            // Everything until the first equal sign should be the attribute name
            $name = $this->copy_until($this->token_equal);

            if ($name === '' && $this->char !== null && $space[0] === '') {
                break;
            }

            if ($guard === $this->pos) { // Escape infinite loop
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                continue;
            }

            $guard = $this->pos;

            // handle endless '<'
            // Out of bounds before the tag ended
            if ($this->pos >= $this->size - 1 && $this->char !== '>') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->_[HDOM_INFO_END] = 0;
                $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
                $node->tag = 'text';
                $this->link_nodes($node, false);
                return true;
            }

            // handle mismatch '<'
            // Attributes cannot start after opening tag
            if ($this->doc[$this->pos - 1] == '<') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->tag = 'text';
                $node->attr = array();
                $node->_[HDOM_INFO_END] = 0;
                $node->_[HDOM_INFO_TEXT] = substr(
                    $this->doc,
                    $begin_tag_pos,
                    $this->pos - $begin_tag_pos - 1
                );
                $this->pos -= 2;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->link_nodes($node, false);
                return true;
            }

            if ($name !== '/' && $name !== '') { // this is a attribute name
                // [1] Whitespace after attribute name
                $space[1] = $this->copy_skip($this->token_blank);

                $name = $this->restore_noise($name); // might be a noisy name

                if ($this->lowercase) { $name = strtolower($name); }

                if ($this->char === '=') { // attribute with value
                    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                    $this->parse_attr($node, $name, $space); // get attribute value
                } else {
                    //no value attr: nowrap, checked selected...
                    $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                    $node->attr[$name] = true;
                    if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
                }

                $node->_[HDOM_INFO_SPACE][] = $space;

                // prepare for next attribute
                $space = array(
                    $this->copy_skip($this->token_blank),
                    '',
                    ''
                );
            } else { // no more attributes
                break;
            }
        } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended

        $this->link_nodes($node, true);
        $node->_[HDOM_INFO_ENDSPACE] = $space[0];

        // handle empty tags (i.e. "<div/>")
        if ($this->copy_until_char('>') === '/') {
            $node->_[HDOM_INFO_ENDSPACE] .= '/';
            $node->_[HDOM_INFO_END] = 0;
        } else {
            // reset parent
            if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
                $this->parent = $node;
            }
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        // If it's a BR tag, we need to set its text to the default text.
        // This way when we see it in plaintext, we can generate formatting that the user wants.
        // since a br tag never has sub nodes, this works well.
        if ($node->tag === 'br') {
            $node->_[HDOM_INFO_INNER] = $this->default_br_text;
        }

        return true;
    }

    /**
     * Read one attribute value at the current position and store it on the
     * node. The value is still consumed for an attribute name the node
     * already has, but the first value is kept.
     *
     * @param simple_html_dom_node $node  Node being built.
     * @param string               $name  Attribute name.
     * @param array                $space Whitespace triple; slot [2]
     *                                    receives the blanks between "=" and
     *                                    the value (by reference).
     * @return void
     */
    protected function parse_attr($node, $name, &$space)
    {
        $is_duplicate = isset($node->attr[$name]);

        if (!$is_duplicate) { // Copy whitespace between "=" and value
            $space[2] = $this->copy_skip($this->token_blank);
        }

        switch ($this->char) {
            case '"':
                $quote_type = HDOM_QUOTE_DOUBLE;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $value = $this->copy_until_char('"');
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                break;
            case '\'':
                $quote_type = HDOM_QUOTE_SINGLE;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $value = $this->copy_until_char('\'');
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                break;
            default:
                $quote_type = HDOM_QUOTE_NO;
                $value = $this->copy_until($this->token_attr);
        }

        $value = $this->restore_noise($value);

        // PaperG: Attributes should not have \r or \n in them, that counts as
        // html whitespace.

        // giterlizzi: Fix for DokuWiki Bootstrap Template
        // ($strip_rn is now a declared property that load() sets.)
        if ($this->strip_rn) {
            $value = str_replace("\r", '', $value);
            $value = str_replace("\n", '', $value);
        }

        // PaperG: If this is a "class" selector, lets get rid of the preceding
        // and trailing space since some people leave it in the multi class case.
        if ($name === 'class') {
            $value = trim($value);
        }

        if (!$is_duplicate) {
            $node->_[HDOM_INFO_QUOTE][] = $quote_type;
            $node->attr[$name] = $value;
        }
    }

    /**
     * Attach a node to the current parent.
     *
     * @param simple_html_dom_node $node     Node to attach.
     * @param bool                 $is_child Also add it to the parent's
     *                                       $children list.
     * @return void
     */
    protected function link_nodes(&$node, $is_child)
    {
        $node->parent = $this->parent;
        $this->parent->nodes[] = $node;
        if ($is_child) {
            $this->parent->children[] = $node;
        }
    }

    /**
     * Add an unmatched end tag to the tree as the text "</tag>".
     *
     * @param string $tag Tag name.
     * @return bool Always true.
     */
    protected function as_text_node($tag)
    {
        $node = new simple_html_dom_node($this);
        ++$this->cursor;
        $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    /**
     * Advance past any run of the given characters.
     *
     * @param string $chars Characters to skip.
     * @return void
     */
    protected function skip($chars)
    {
        $this->pos += strspn($this->doc, $chars, $this->pos);
        $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    }

    /**
     * Advance past any run of the given characters and return that run.
     *
     * @param string $chars Characters to skip.
     * @return string The skipped text ('' when nothing was skipped).
     */
    protected function copy_skip($chars)
    {
        $pos = $this->pos;
        $len = strspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        if ($len === 0) { return ''; }
        return substr($this->doc, $pos, $len);
    }

    /**
     * Advance to the first occurrence of any of the given characters (or
     * the end of the document) and return the text that was passed.
     *
     * @param string $chars Stop characters.
     * @return string
     */
    protected function copy_until($chars)
    {
        $pos = $this->pos;
        $len = strcspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return substr($this->doc, $pos, $len);
    }

    /**
     * Advance to the next occurrence of a single character and return the
     * text that was passed. When the character does not occur again, the
     * rest of the document is returned and the parser is moved to the end.
     *
     * @param string $char Stop character.
     * @return string
     */
    protected function copy_until_char($char)
    {
        if ($this->char === null) { return ''; }

        if (($pos = strpos($this->doc, $char, $this->pos)) === false) {
            $ret = substr($this->doc, $this->pos, $this->size - $this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $ret;
        }

        if ($pos === $this->pos) { return ''; }

        $pos_old = $this->pos;
        $this->char = $this->doc[$pos];
        $this->pos = $pos;
        return substr($this->doc, $pos_old, $pos - $pos_old);
    }

    /**
     * Replace every match of $pattern in the document with a placeholder
     * key and remember the replaced text in $noise.
     *
     * @param string $pattern    PCRE pattern with one capture group.
     * @param bool   $remove_tag True replaces the whole match, false only
     *                           the first capture group.
     * @return void
     */
    protected function remove_noise($pattern, $remove_tag = false)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        $count = preg_match_all(
            $pattern,
            $this->doc,
            $matches,
            PREG_SET_ORDER | PREG_OFFSET_CAPTURE
        );

        // Walk backwards so earlier match offsets stay valid while replacing.
        for ($i = $count - 1; $i > -1; --$i) {
            $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'key is: ' . $key);
            }

            $idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch
            $this->noise[$key] = $matches[$i][$idx][0];
            $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
        }

        // reset the length of content
        $this->size = strlen($this->doc);

        if ($this->size > 0) {
            $this->char = $this->doc[0];
        }
    }

    /**
     * Replace noise placeholder keys in $text with the text they stand for.
     *
     * @param string $text Text that may contain "___noise___" keys.
     * @return string Text with known keys restored. Unknown keys are
     *                prefixed with 'UNDEFINED NOISE FOR KEY: ', a marker
     *                without a complete key becomes 'NO NUMERIC NOISE KEY'.
     */
    function restore_noise($text)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        // Fix: everything before $offset has been dealt with for good. The
        // "undefined key" message contains the key (and with it the
        // "___noise___" marker) again, so searching from the start each time
        // found the same marker forever and grew $text without bound.
        $offset = 0;

        while (($pos = strpos($text, '___noise___', $offset)) !== false) {
            // Sometimes there is a broken piece of markup, and we don't GET the
            // pos+11 etc... token which indicates a problem outside of us...
            if (strlen($text) > $pos + 15) {
                // A key is the marker followed by five characters.
                $key = '___noise___' . substr($text, $pos + 11, 5);

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2, 'located key of: ' . $key);
                }

                if (isset($this->noise[$key])) {
                    // $offset is left alone: the restored text is searched
                    // again, so keys nested inside it are restored too.
                    $text = substr($text, 0, $pos)
                        . $this->noise[$key]
                        . substr($text, $pos + 16);
                } else {
                    $message = 'UNDEFINED NOISE FOR KEY: ' . $key;
                    $text = substr($text, 0, $pos)
                        . $message
                        . substr($text, $pos + 16);
                    // Continue behind the message to prevent an infinite loop.
                    $offset = $pos + strlen($message);
                }
            } else {
                // There is no valid key being given back to us... We must get
                // rid of the ___noise___ or we will have a problem.
                $message = 'NO NUMERIC NOISE KEY';
                $text = substr($text, 0, $pos)
                    . $message
                    . substr($text, $pos + 11);
                $offset = $pos + strlen($message);
            }
        }
        return $text;
    }

    /**
     * Find the first stored noise entry that contains $text.
     *
     * @param string $text Text to look for.
     * @return string|null The noise entry, or null when none contains $text.
     */
    function search_noise($text)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        foreach ($this->noise as $noiseElement) {
            if (strpos($noiseElement, $text) !== false) {
                return $noiseElement;
            }
        }

        return null;
    }

    function __toString()
    {
        return $this->root->innertext();
    }

    /**
     * Virtual read-only properties: outertext, innertext, plaintext,
     * charset and target_charset. Any other name yields null.
     *
     * @param string $name Property name.
     * @return mixed
     */
    function __get($name)
    {
        switch ($name) {
            case 'outertext':
                return $this->root->innertext();
            case 'innertext':
                return $this->root->innertext();
            case 'plaintext':
                return $this->root->text();
            case 'charset':
                return $this->_charset;
            case 'target_charset':
                return $this->_target_charset;
        }
    }

    /**
     * @param int $idx Forwarded to the root node's childNodes().
     */
    function childNodes($idx = -1)
    {
        return $this->root->childNodes($idx);
    }

    function firstChild()
    {
        return $this->root->first_child();
    }

    function lastChild()
    {
        return $this->root->last_child();
    }

    /**
     * Build a detached element by parsing "<$name>$value</$name>".
     *
     * @param string      $name  Tag name.
     * @param string|null $value Inner HTML.
     * @return mixed First child of the parsed snippet, or null when
     *               str_get_html() rejects the snippet.
     */
    function createElement($name, $value = null)
    {
        $dom = @str_get_html("<$name>$value</$name>");

        // Fix: str_get_html() returns false for input it refuses; calling a
        // method on that value was a fatal error.
        if (!is_object($dom)) {
            return null;
        }

        return $dom->firstChild();
    }

    /**
     * Build a detached text node by parsing $value.
     *
     * @param string $value Text content.
     * @return mixed Last node of the parsed snippet, or null when
     *               str_get_html() rejects the input (e.g. an empty string).
     */
    function createTextNode($value)
    {
        $dom = @str_get_html($value);

        // Fix: see createElement(); additionally end() needs a real
        // variable because it takes its argument by reference.
        if (!is_object($dom)) {
            return null;
        }

        $nodes = $dom->nodes;
        return end($nodes);
    }

    function getElementById($id)
    {
        return $this->find("#$id", 0);
    }

    function getElementsById($id, $idx = null)
    {
        return $this->find("#$id", $idx);
    }

    function getElementByTagName($name)
    {
        return $this->find($name, 0);
    }

    /**
     * @param string   $name Tag name (used as selector).
     * @param int|null $idx  Index forwarded to find(). Fix: the default is
     *                       now null, as in getElementsById(), instead of -1.
     * @return mixed Result of find($name, $idx).
     */
    function getElementsByTagName($name, $idx = null)
    {
        return $this->find($name, $idx);
    }

    /**
     * camelCase alias of load_file().
     *
     * Fix: the arguments are now spread onto load_file() (the whole array
     * used to be passed as one argument) and its result is returned.
     *
     * @return simple_html_dom|false See load_file().
     */
    function loadFile()
    {
        $args = func_get_args();
        return call_user_func_array(array($this, 'load_file'), $args);
    }
}