<?php
namespace imapmarkers; // Added by: Kai Thoene <k.git.thoene@gmx.net>
/**
 * Website: http://sourceforge.net/projects/simplehtmldom/
 * Additional projects: http://sourceforge.net/projects/debugobject/
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
 *
 * Licensed under The MIT License
 * See the LICENSE file in the project root for more information.
 *
 * Authors:
 *   S.C. Chen
 *   John Schlick
 *   Rus Carroll
 *   logmanoriginal
 *
 * Contributors:
 *   Yousuke Kumakura
 *   Vadim Voituk
 *   Antcs
 *
 * Version Rev. 1.9.1 (291)
 */

// Node types (simple_html_dom_node::$nodetype).
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
// Attribute quoting styles (stored per attribute under HDOM_INFO_QUOTE).
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
// Keys of the per-node info array simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Overridable defaults: define any of these before including this file.
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
define('HDOM_SMARTY_AS_TEXT', 1);

/**
 * Read a file or URL with file_get_contents() and parse it into a DOM.
 *
 * The first five parameters are forwarded to file_get_contents(); a
 * non-positive $maxLen is replaced by MAX_FILE_SIZE. The remaining parameters
 * are forwarded to the simple_html_dom constructor / load().
 *
 * @return mixed false when the content is empty or longer than $maxLen,
 *               otherwise whatever simple_html_dom::load() returns.
 */
function file_get_html(
    $url,
    $use_include_path = false,
    $context = null,
    $offset = 0,
    $maxLen = -1,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT) {
    if ($maxLen <= 0) {
        $maxLen = MAX_FILE_SIZE;
    }

    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    /**
     * For sourceforge users: uncomment the next line and comment the
     * retrieve_url_contents line 2 lines down if it is not already done.
     */
    $contents = file_get_contents(
        $url,
        $use_include_path,
        $context,
        $offset,
        $maxLen
    );
    // $contents = retrieve_url_contents($url);

    if (empty($contents) || strlen($contents) > $maxLen) {
        $dom->clear();
        return false;
    }

    return $dom->load($contents, $lowercase, $stripRN);
}

/**
 * Parse an HTML string into a DOM.
 *
 * @return mixed false when $str is empty or longer than MAX_FILE_SIZE bytes,
 *               otherwise whatever simple_html_dom::load() returns.
 */
function str_get_html(
    $str,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT) {
    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
        $dom->clear();
        return false;
    }

    return $dom->load($str, $lowercase, $stripRN);
}

/**
 * Echo the tag tree below $node (one tag per line, tab-indented).
 *
 * @param simple_html_dom_node $node      Node to start at.
 * @param bool                 $show_attr Also print each node's attributes.
 * @param int                  $deep      Indentation depth of $node itself.
 */
function dump_html_tree($node, $show_attr = true, $deep = 0) {
    // Forward the caller's options; the node itself is not an argument of dump().
    $node->dump($show_attr, $deep);
}

/**
 * A single node (element, text, comment, ...) of a parsed document.
 *
 * Attributes are exposed as magic properties through __get/__set/__isset,
 * together with the virtual properties outertext, innertext, plaintext and
 * xmltext.
 */
class simple_html_dom_node {
    public $nodetype = HDOM_TYPE_TEXT;
    public $tag = 'text';
    public $attr = array();
    public $children = array();
    public $nodes = array();
    public $parent = null;
    // Per-node info array, indexed by the HDOM_INFO_* constants.
    public $_ = array();
    public $tag_start = 0;
    private $dom = null;

    /** Attach the node to $dom and register it in $dom->nodes. */
    function __construct($dom) {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }

    function __destruct() {
        $this->clear();
    }

    function __toString() {
        return $this->outertext();
    }

    /** Drop the references to the DOM, parent and child nodes. */
    function clear() {
        $this->dom = null;
        $this->nodes = null;
        $this->parent = null;
        $this->children = null;
    }

    /**
     * Echo this node's tag (optionally with attributes) and, recursively,
     * all nodes below it, indented by one tab per level.
     */
    function dump($show_attr = true, $depth = 0) {
        echo str_repeat("\t", $depth) . $this->tag;

        if ($show_attr && count($this->attr) > 0) {
            echo '(';
            foreach ($this->attr as $k => $v) {
                echo "[$k]=>\"$v\", ";
            }
            echo ')';
        }

        echo "\n";

        if ($this->nodes) {
            foreach ($this->nodes as $node) {
                $node->dump($show_attr, $depth + 1);
            }
        }
    }

    /**
     * Build a one-line debug description of this node.
     *
     * @param bool $echo Echo the description (default) instead of returning it.
     * @return string|null The description when $echo is false, otherwise null.
     */
    function dump_node($echo = true) {
        $string = $this->tag;

        if (count($this->attr) > 0) {
            $string .= '(';
            foreach ($this->attr as $k => $v) {
                $string .= "[$k]=>\"$v\", ";
            }
            $string .= ')';
        }

        if (count($this->_) > 0) {
            $string .= ' $_ (';
            foreach ($this->_ as $k => $v) {
                if (is_array($v)) {
                    $string .= "[$k]=>(";
                    foreach ($v as $k2 => $v2) {
                        $string .= "[$k2]=>\"$v2\", ";
                    }
                    $string .= ')';
                } else {
                    $string .= "[$k]=>\"$v\", ";
                }
            }
            $string .= ')';
        }

        if (isset($this->text)) {
            $string .= " text: ({$this->text})";
        }

        $string .= ' HDOM_INNER_INFO: ';

        // Report this node's own inner info (previously read an undefined
        // variable, so the value was always reported as NULL).
        if (isset($this->_[HDOM_INFO_INNER])) {
            $string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
        } else {
            $string .= ' NULL ';
        }

        $string .= ' children: ' . count($this->children);
        $string .= ' nodes: ' . count($this->nodes);
        $string .= ' tag_start: ' . $this->tag_start;
        $string .= "\n";

        if ($echo) {
            echo $string;
            return;
        } else {
            return $string;
        }
    }

    /**
     * Get the parent node, or set it when $parent is given.
     *
     * @return simple_html_dom_node|null The (new) parent.
     */
    function parent($parent = null) {
        // I am SURE that this doesn't work properly.
        // It fails to unset the current node from its current parent's nodes
        // or children list first.
        if ($parent !== null) {
            $this->parent = $parent;
            $this->parent->nodes[] = $this;
            $this->parent->children[] = $this;
        }

        return $this->parent;
    }

    function has_child() {
        return !empty($this->children);
    }

    /**
     * @param int $idx Child index, or -1 for the whole children array.
     * @return mixed The children array, one child, or null if $idx is unset.
     */
    function children($idx = -1) {
        if ($idx === -1) {
            return $this->children;
        }

        if (isset($this->children[$idx])) {
            return $this->children[$idx];
        }

        return null;
    }

    function first_child() {
        if (count($this->children) > 0) {
            return $this->children[0];
        }
        return null;
    }

    function last_child() {
        if (count($this->children) > 0) {
            return end($this->children);
        }
        return null;
    }

    /** @return simple_html_dom_node|null The following child of the parent. */
    function next_sibling() {
        if ($this->parent === null) {
            return null;
        }

        $idx = array_search($this, $this->parent->children, true);

        if ($idx !== false && isset($this->parent->children[$idx + 1])) {
            return $this->parent->children[$idx + 1];
        }

        return null;
    }

    /** @return simple_html_dom_node|null The preceding child of the parent. */
    function prev_sibling() {
        if ($this->parent === null) {
            return null;
        }

        $idx = array_search($this, $this->parent->children, true);

        if ($idx !== false && $idx > 0) {
            return $this->parent->children[$idx - 1];
        }

        return null;
    }

    /**
     * Walk up the parent chain until a node with tag $tag is found.
     *
     * @return simple_html_dom_node|null The ancestor, or null if none matches.
     */
    function find_ancestor_tag($tag) {
        global $debug_object;
        if (is_object($debug_object)) {
            $debug_object->debug_log_entry(1);
        }

        if ($this->parent === null) {
            return null;
        }

        $ancestor = $this->parent;

        while (!is_null($ancestor)) {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
            }

            if ($ancestor->tag === $tag) {
                break;
            }

            $ancestor = $ancestor->parent;
        }

        return $ancestor;
    }

    /**
     * Inner markup: an explicitly set inner text, the node's own text, or the
     * concatenated outertext of all nodes below it.
     */
    function innertext() {
        if (isset($this->_[HDOM_INFO_INNER])) {
            return $this->_[HDOM_INFO_INNER];
        }

        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '';

        foreach ($this->nodes as $n) {
            $ret .= $n->outertext();
        }

        return $ret;
    }

    /**
     * Full markup of the node: opening tag, content and (if recorded) the
     * closing tag. An explicitly set outer text takes precedence.
     */
    function outertext() {
        global $debug_object;

        if (is_object($debug_object)) {
            $text = '';

            if ($this->tag === 'text') {
                if (!empty($this->text)) {
                    $text = ' with text: ' . $this->text;
                }
            }

            $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
        }

        if ($this->tag === 'root') {
            return $this->innertext();
        }

        // todo: What is the use of this callback? Remove?
        if ($this->dom && $this->dom->callback !== null) {
            call_user_func_array($this->dom->callback, array($this));
        }

        if (isset($this->_[HDOM_INFO_OUTER])) {
            return $this->_[HDOM_INFO_OUTER];
        }

        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '';

        if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
            $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
        }

        if (isset($this->_[HDOM_INFO_INNER])) {
            // todo: <br> should either never have HDOM_INFO_INNER or always
            if ($this->tag !== 'br') {
                $ret .= $this->_[HDOM_INFO_INNER];
            }
        } elseif ($this->nodes) {
            foreach ($this->nodes as $n) {
                $ret .= $this->convert_text($n->outertext());
            }
        }

        if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
            $ret .= '</' . $this->tag . '>';
        }

        return $ret;
    }

    /**
     * Plain text of the node. Comments, unknown nodes, <script> and <style>
     * yield an empty string.
     */
    function text() {
        if (isset($this->_[HDOM_INFO_INNER])) {
            return $this->_[HDOM_INFO_INNER];
        }

        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT:
                return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT:
                return '';
            case HDOM_TYPE_UNKNOWN:
                return '';
        }

        if (strcasecmp($this->tag, 'script') === 0) {
            return '';
        }
        if (strcasecmp($this->tag, 'style') === 0) {
            return '';
        }

        $ret = '';

        // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
        // for some span tags, and some p tags) $this->nodes is set to NULL.
        // NOTE: This indicates that there is a problem where it's set to NULL
        // without a clear happening.
        // WHY is this happening?
        if (!is_null($this->nodes)) {
            foreach ($this->nodes as $n) {
                // Start paragraph after a blank line
                if ($n->tag === 'p') {
                    $ret = trim($ret) . "\n\n";
                }

                $ret .= $this->convert_text($n->text());

                // If this node is a span... add a space at the end of it so
                // multiple spans don't run into each other. This is plaintext
                // after all.
                if ($n->tag === 'span') {
                    $ret .= $this->dom->default_span_text;
                }
            }
        }
        return $ret;
    }

    /** innertext() with the CDATA markers "<![CDATA[" and "]]>" removed. */
    function xmltext() {
        $ret = $this->innertext();
        $ret = str_ireplace('<![CDATA[', '', $ret);
        $ret = str_replace(']]>', '', $ret);
        return $ret;
    }

    /**
     * Rebuild the opening tag from the tag name, the attributes and the
     * recorded spacing/quoting info. Text-like nodes return their text.
     */
    function makeup() {
        // text, comment, unknown
        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '<' . $this->tag;
        $i = -1;

        foreach ($this->attr as $key => $val) {
            // $i indexes the spacing/quoting info, which runs parallel to attr
            ++$i;

            // skip removed attribute
            if ($val === null || $val === false) {
                continue;
            }

            $ret .= $this->_[HDOM_INFO_SPACE][$i][0];

            //no value attr: nowrap, checked selected...
            if ($val === true) {
                $ret .= $key;
            } else {
                switch ($this->_[HDOM_INFO_QUOTE][$i]) {
                    case HDOM_QUOTE_DOUBLE:
                        $quote = '"';
                        break;
                    case HDOM_QUOTE_SINGLE:
                        $quote = '\'';
                        break;
                    default:
                        $quote = '';
                }

                $ret .= $key
                . $this->_[HDOM_INFO_SPACE][$i][1]
                . '='
                . $this->_[HDOM_INFO_SPACE][$i][2]
                . $quote
                . $val
                . $quote;
            }
        }

        $ret = $this->dom->restore_noise($ret);
        return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
    }

    /**
     * Find nodes below this node by CSS-like selector.
     *
     * @param string   $selector  Selector (or comma separated selector list).
     * @param int|null $idx       null for all matches; otherwise the index of
     *                            the match to return (negative counts from
     *                            the end).
     * @param bool     $lowercase Passed on to seek() for case-insensitive
     *                            class/attribute value matching.
     * @return mixed Array of nodes when $idx is null, otherwise one node or
     *               null when there is no match at that index.
     */
    function find($selector, $idx = null, $lowercase = false) {
        $selectors = $this->parse_selector($selector);
        if (($count = count($selectors)) === 0) {
            return array();
        }
        $found_keys = array();

        // find each selector
        for ($c = 0; $c < $count; ++$c) {
            // The change on the below line was documented on the sourceforge
            // code tracker id 2788009
            // used to be: if (($levle=count($selectors[0]))===0) return array();
            if (($level = count($selectors[$c])) === 0) {
                return array();
            }
            if (!isset($this->_[HDOM_INFO_BEGIN])) {
                return array();
            }

            $head = array($this->_[HDOM_INFO_BEGIN] => 1);
            $cmd = ' '; // Combinator

            // handle descendant selectors, no recursive!
            for ($l = 0; $l < $level; ++$l) {
                $ret = array();

                foreach ($head as $k => $v) {
                    $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
                    //PaperG - Pass this optional parameter on to the seek function.
                    $n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
                }

                $head = $ret;
                $cmd = $selectors[$c][$l][4]; // Next Combinator
            }

            foreach ($head as $k => $v) {
                if (!isset($found_keys[$k])) {
                    $found_keys[$k] = 1;
                }
            }
        }

        // sort keys
        ksort($found_keys);

        $found = array();
        foreach ($found_keys as $k => $v) {
            $found[] = $this->dom->nodes[$k];
        }

        // return nth-element or array
        if (is_null($idx)) {
            return $found;
        } elseif ($idx < 0) {
            $idx = count($found) + $idx;
        }
        return (isset($found[$idx])) ? $found[$idx] : null;
    }

    /**
     * Collect the nodes that match one parsed selector, relative to this node.
     *
     * @param array  $selector   One entry produced by parse_selector():
     *                           array(tag, id, classes, attributes, combinator).
     * @param array  $ret        Output: matching keys of $dom->nodes => 1.
     * @param string $parent_cmd Combinator linking this node to the candidates
     *                           (' ', '>', '+' or '~').
     * @param bool   $lowercase  Compare class names and attribute values
     *                           case-insensitively.
     */
    protected function seek($selector, &$ret, $parent_cmd, $lowercase = false) {
        global $debug_object;
        if (is_object($debug_object)) {
            $debug_object->debug_log_entry(1);
        }

        list($tag, $id, $class, $attributes, $cmb) = $selector;
        $nodes = array();

        if ($parent_cmd === ' ') { // Descendant Combinator
            // Find parent closing tag if the current element doesn't have a closing
            // tag (i.e. void element)
            $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
            if ($end == 0) {
                $parent = $this->parent;
                // Test for null first so a missing parent is never dereferenced.
                while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                    $end -= 1;
                    $parent = $parent->parent;
                }
                if ($parent !== null) {
                    $end += $parent->_[HDOM_INFO_END];
                }
            }

            // Get list of target nodes
            $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
            $nodes_count = $end - $nodes_start;
            $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
        } elseif ($parent_cmd === '>') { // Child Combinator
            $nodes = $this->children;
        } elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
            // Re-index so that positions stay valid even if children were
            // removed, and compare by identity: a loose comparison of node
            // objects would recurse through the whole tree.
            $siblings = array_values($this->parent->children);
            $index = array_search($this, $siblings, true);

            if ($index !== false) {
                if ($parent_cmd === '+') { // Next-Sibling Combinator
                    if (isset($siblings[$index + 1])) {
                        $nodes[] = $siblings[$index + 1];
                    }
                } else { // Subsequent Sibling Combinator
                    // Only the siblings AFTER this node, never the node itself.
                    $nodes = array_slice($siblings, $index + 1);
                }
            }
        }

        // Go through each element starting at this element until the end tag
        // Note: If this element is a void tag, any previous void element is
        // skipped.
        foreach ($nodes as $node) {
            $pass = true;

            // Skip root nodes
            if (!$node->parent) {
                $pass = false;
            }

            // Handle 'text' selector
            if ($pass && $tag === 'text' && $node->tag === 'text') {
                $ret[array_search($node, $this->dom->nodes, true)] = 1;
                unset($node);
                continue;
            }

            // Skip if node isn't a child node (i.e. text nodes)
            if ($pass && !in_array($node, $node->parent->children, true)) {
                $pass = false;
            }

            // Skip if tag doesn't match
            if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
                $pass = false;
            }

            // Skip if ID doesn't exist
            if ($pass && $id !== '' && !isset($node->attr['id'])) {
                $pass = false;
            }

            // Check if ID matches
            if ($pass && $id !== '' && isset($node->attr['id'])) {
                // Note: Only consider the first ID (as browsers do)
                $node_id = explode(' ', trim($node->attr['id']))[0];

                if ($id !== $node_id) {
                    $pass = false;
                }
            }

            // Check if all class(es) exist
            if ($pass && $class !== '' && is_array($class) && !empty($class)) {
                if (isset($node->attr['class'])) {
                    $node_classes = explode(' ', $node->attr['class']);

                    if ($lowercase) {
                        $node_classes = array_map('strtolower', $node_classes);
                    }

                    foreach ($class as $c) {
                        if (!in_array($c, $node_classes, true)) {
                            $pass = false;
                            break;
                        }
                    }
                } else {
                    $pass = false;
                }
            }

            // Check attributes
            if ($pass
                && $attributes !== ''
                && is_array($attributes)
                && !empty($attributes)) {
                foreach ($attributes as $a) {
                    list(
                        $att_name,
                        $att_expr,
                        $att_val,
                        $att_inv,
                        $att_case_sensitivity
                    ) = $a;

                    // Handle indexing attributes (i.e. "[2]")
                    /**
                     * Note: This is not supported by the CSS Standard but adds
                     * the ability to select items compatible to XPath (i.e.
                     * the 3rd element within its parent).
                     *
                     * Note: This doesn't conflict with the CSS Standard which
                     * doesn't work on numeric attributes anyway.
                     */
                    if (is_numeric($att_name)
                        && $att_expr === ''
                        && $att_val === '') {
                        $count = 0;

                        // Find index of current element in parent
                        foreach ($node->parent->children as $c) {
                            if ($c->tag === $node->tag) {
                                ++$count;
                            }
                            if ($c === $node) {
                                break;
                            }
                        }

                        // If this is the correct node, continue with next
                        // attribute
                        if ($count === (int) $att_name) {
                            continue;
                        }
                    }

                    // Check attribute availability
                    if ($att_inv) { // Attribute should NOT be set
                        if (isset($node->attr[$att_name])) {
                            $pass = false;
                            break;
                        }
                    } else { // Attribute should be set
                        // todo: "plaintext" is not a valid CSS selector!
                        if ($att_name !== 'plaintext'
                            && !isset($node->attr[$att_name])) {
                            $pass = false;
                            break;
                        }
                    }

                    // Continue with next attribute if expression isn't defined
                    if ($att_expr === '') {
                        continue;
                    }

                    // If they have told us that this is a "plaintext"
                    // search then we want the plaintext of the node - right?
                    // todo "plaintext" is not a valid CSS selector!
                    if ($att_name === 'plaintext') {
                        $nodeKeyValue = $node->text();
                    } else {
                        $nodeKeyValue = $node->attr[$att_name];
                    }

                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2,
                            'testing node: '
                            . $node->tag
                            . ' for attribute: '
                            . $att_name
                            . $att_expr
                            . $att_val
                            . ' where nodes value is: '
                            . $nodeKeyValue
                        );
                    }

                    // If lowercase is set, do a case insensitive test of
                    // the value of the selector.
                    if ($lowercase) {
                        $check = $this->match(
                            $att_expr,
                            strtolower($att_val),
                            strtolower($nodeKeyValue),
                            $att_case_sensitivity
                        );
                    } else {
                        $check = $this->match(
                            $att_expr,
                            $att_val,
                            $nodeKeyValue,
                            $att_case_sensitivity
                        );
                    }

                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2,
                            'after match: '
                            . ($check ? 'true' : 'false')
                        );
                    }

                    if (!$check) {
                        $pass = false;
                        break;
                    }
                }
            }

            // Found a match. Add to list and clear node
            if ($pass) {
                $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
            }
            unset($node);
        }
        // It's passed by reference so this is actually what this function returns.
        if (is_object($debug_object)) {
            $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
        }
    }

    /**
     * Test an attribute value against a selector expression.
     *
     * @param string $exp              Operator: =, !=, ^=, $=, *=, |= or ~=.
     * @param string $pattern          Value given in the selector.
     * @param string $value            Value found on the node.
     * @param string $case_sensitivity 'i' to compare case-insensitively.
     * @return bool|int Truthy when the value matches; false for an unknown
     *                  operator.
     */
    protected function match($exp, $pattern, $value, $case_sensitivity) {
        global $debug_object;
        if (is_object($debug_object)) {
            $debug_object->debug_log_entry(1);
        }

        if ($case_sensitivity === 'i') {
            $pattern = strtolower($pattern);
            $value = strtolower($value);
        }

        switch ($exp) {
            case '=':
                return ($value === $pattern);
            case '!=':
                return ($value !== $pattern);
            case '^=':
                return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
            case '$=':
                return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
            case '*=':
                return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
            case '|=':
                /**
                 * [att|=val]
                 *
                 * Represents an element with the att attribute, its value
                 * either being exactly "val" or beginning with "val"
                 * immediately followed by "-" (U+002D).
                 */
                return $value === $pattern
                    || strpos($value, $pattern . '-') === 0;
            case '~=':
                /**
                 * [att~=val]
                 *
                 * Represents an element with the att attribute whose value is a
                 * whitespace-separated list of words, one of which is exactly
                 * "val". If "val" contains whitespace, it will never represent
                 * anything (since the words are separated by spaces). Also if
                 * "val" is the empty string, it will never represent anything.
                 */
                return in_array($pattern, explode(' ', trim($value)), true);
        }
        return false;
    }

    /**
     * Split a selector string into a list of selector chains.
     *
     * @param string $selector_string Selector or comma separated selector list.
     * @return array One entry per selector in the list; each entry is a list
     *               of array(tag, id, classes, attributes, combinator), where
     *               classes and attributes are arrays when present and ''
     *               otherwise.
     */
    protected function parse_selector($selector_string) {
        global $debug_object;
        if (is_object($debug_object)) {
            $debug_object->debug_log_entry(1);
        }

        /**
         * Pattern of CSS selectors, modified from mootools (https://mootools.net/)
         *
         * Paperg: Add the colon to the attribute, so that it properly finds
         * <tag attr:ibute="something" > like google does.
         *
         * Note: if you try to look at this attribute, you MUST use getAttribute
         * since $dom->x:y will fail the php syntax check.
         *
         * Notice the \[ starting the attribute? and the @? following? This
         * implies that an attribute can begin with an @ sign that is not
         * captured. This implies that an html attribute specifier may start
         * with an @ sign that is NOT captured by the expression. Further study
         * is required to determine if this should be documented or removed.
         *
         * Matches selectors in this order:
         *
         * [0] - full match
         *
         * [1] - tag name
         *     ([\w:\*-]*)
         *     Matches the tag name consisting of zero or more words, colons,
         *     asterisks and hyphens.
         *
         * [2] - id name
         *     (?:\#([\w-]+))
         *     Optionally matches an id name, consisting of an "#" followed by
         *     the id name (one or more words and hyphens).
         *
         * [3] - class names (including dots)
         *     (?:\.([\w\.-]+))?
         *     Optionally matches a list of classes, consisting of an "."
         *     followed by the class name (one or more words and hyphens)
         *     where multiple classes can be chained (i.e. ".foo.bar.baz")
         *
         * [4] - attributes
         *     ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?
         *     Optionally matches the attributes list
         *
         * [5] - separator
         *     ([\/, >+~]+)
         *     Matches the selector list separator
         */
        // phpcs:ignore Generic.Files.LineLength
        $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

        preg_match_all(
            $pattern,
            trim($selector_string) . ' ', // Add final ' ' as pseudo separator
            $matches,
            PREG_SET_ORDER
        );

        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Matches Array: ', $matches);
        }

        $selectors = array();
        $result = array();

        foreach ($matches as $m) {
            $m[0] = trim($m[0]);

            // Skip NoOps
            if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') {
                continue;
            }

            // Convert to lowercase
            if ($this->dom->lowercase) {
                $m[1] = strtolower($m[1]);
            }

            // Extract classes
            if ($m[3] !== '') {
                $m[3] = explode('.', $m[3]);
            }

            /* Extract attributes (pattern based on the pattern above!)

             * [0] - full match
             * [1] - attribute name
             * [2] - attribute expression
             * [3] - attribute value
             * [4] - case sensitivity
             *
             * Note: Attributes can be negated with a "!" prefix to their name
             */
            if ($m[4] !== '') {
                preg_match_all(
                    "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
                    trim($m[4]),
                    $attributes,
                    PREG_SET_ORDER
                );

                // Replace element by array
                $m[4] = array();

                foreach ($attributes as $att) {
                    // Skip empty matches
                    if (trim($att[0]) === '') {
                        continue;
                    }

                    $inverted = (isset($att[1][0]) && $att[1][0] === '!');
                    $m[4][] = array(
                        $inverted ? substr($att[1], 1) : $att[1], // Name
                        (isset($att[2])) ? $att[2] : '', // Expression
                        (isset($att[3])) ? $att[3] : '', // Value
                        $inverted, // Inverted Flag
                        (isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
                    );
                }
            }

            // Sanitize Separator
            if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator
                $m[5] = ' ';
            } else { // Other Separator
                $m[5] = trim($m[5]);
            }

            // Clear Separator if it's a Selector List
            if ($is_list = ($m[5] === ',')) {
                $m[5] = '';
            }

            // Remove full match before adding to results
            array_shift($m);
            $result[] = $m;

            if ($is_list) { // Selector List
                $selectors[] = $result;
                $result = array();
            }
        }

        if (count($result) > 0) {
            $selectors[] = $result;
        }
        return $selectors;
    }

    /**
     * Magic read access: a set attribute (charset-converted), one of the
     * virtual properties outertext/innertext/plaintext/xmltext, or otherwise
     * a bool telling whether the attribute key exists at all.
     */
    function __get($name) {
        if (isset($this->attr[$name])) {
            return $this->convert_text($this->attr[$name]);
        }
        switch ($name) {
            case 'outertext':
                return $this->outertext();
            case 'innertext':
                return $this->innertext();
            case 'plaintext':
                return $this->text();
            case 'xmltext':
                return $this->xmltext();
            default:
                return array_key_exists($name, $this->attr);
        }
    }

    /**
     * Magic write access: overrides the outer/inner text for the names
     * 'outertext' and 'innertext', otherwise sets the attribute $name.
     */
    function __set($name, $value) {
        global $debug_object;
        if (is_object($debug_object)) {
            $debug_object->debug_log_entry(1);
        }

        switch ($name) {
            case 'outertext':
                return $this->_[HDOM_INFO_OUTER] = $value;
            case 'innertext':
                if (isset($this->_[HDOM_INFO_TEXT])) {
                    return $this->_[HDOM_INFO_TEXT] = $value;
                }
                return $this->_[HDOM_INFO_INNER] = $value;
        }

        // New attribute: record default spacing and quoting for makeup().
        if (!isset($this->attr[$name])) {
            $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
            $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        }

        $this->attr[$name] = $value;
    }

    /** True for outertext/innertext/plaintext and for any existing attribute key. */
    function __isset($name) {
        switch ($name) {
            case 'outertext':
                return true;
            case 'innertext':
                return true;
            case 'plaintext':
                return true;
        }
        //no value attr: nowrap, checked selected...
        return array_key_exists($name, $this->attr);
    }

    function __unset($name) {
        if (isset($this->attr[$name])) {
            unset($this->attr[$name]);
        }
    }

    /**
     * Convert $text from the document charset to the target charset (both
     * taken from the owning DOM) and strip a UTF-8 BOM from either end.
     */
    function convert_text($text) {
        global $debug_object;
        if (is_object($debug_object)) {
            $debug_object->debug_log_entry(1);
        }

        $converted_text = $text;

        $sourceCharset = '';
        $targetCharset = '';

        if ($this->dom) {
            $sourceCharset = strtoupper($this->dom->_charset);
            $targetCharset = strtoupper($this->dom->_target_charset);
        }

        if (is_object($debug_object)) {
            $debug_object->debug_log(3,
                'source charset: '
                . $sourceCharset
                . ' target charaset: '
                . $targetCharset
            );
        }

        if (!empty($sourceCharset)
            && !empty($targetCharset)
            && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
            // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
            if ((strcasecmp($targetCharset, 'UTF-8') == 0)
                && ($this->is_utf8($text))) {
                $converted_text = $text;
            } else {
                $converted_text = iconv($sourceCharset, $targetCharset, $text);
            }
        }

        // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
        if ($targetCharset === 'UTF-8') {
            if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
                $converted_text = substr($converted_text, 3);
            }

            if (substr($converted_text, -3) === "\xef\xbb\xbf") {
                $converted_text = substr($converted_text, 0, -3);
            }
        }

        return $converted_text;
    }

    /**
     * Check whether $str consists only of ASCII bytes and structurally valid
     * multi-byte sequences (lead byte followed by the right number of
     * continuation bytes 0x80-0xBF).
     *
     * @param string $str
     * @return bool
     */
    static function is_utf8($str) {
        $c = 0;
        $b = 0;
        $bits = 0;
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $c = ord($str[$i]);
            // Every byte >= 0x80 is non-ASCII; 0x80 itself is a continuation
            // byte and therefore not a valid lead byte.
            if ($c >= 128) {
                if (($c >= 254)) {
                    return false;
                } elseif ($c >= 252) {
                    $bits = 6;
                } elseif ($c >= 248) {
                    $bits = 5;
                } elseif ($c >= 240) {
                    $bits = 4;
                } elseif ($c >= 224) {
                    $bits = 3;
                } elseif ($c >= 192) {
                    $bits = 2;
                } else {
                    return false;
                }
                if (($i + $bits) > $len) {
                    return false;
                }
                while ($bits > 1) {
                    $i++;
                    $b = ord($str[$i]);
                    if ($b < 128 || $b > 191) {
                        return false;
                    }
                    $bits--;
                }
            }
        }
        return true;
    }

    /**
     * Determine the display size of an <img> from its width/height attributes
     * or, failing that, from pixel values in its inline style.
     *
     * @return array|false array('height' => ..., 'width' => ...) with -1 for
     *                     an unknown dimension; false if this is not an <img>.
     */
    function get_display_size() {
        $width = -1;
        $height = -1;

        if ($this->tag !== 'img') {
            return false;
        }

        // See if there is a height or width attribute in the tag itself.
        if (isset($this->attr['width'])) {
            $width = $this->attr['width'];
        }

        if (isset($this->attr['height'])) {
            $height = $this->attr['height'];
        }

        // Now look for an inline style.
        if (isset($this->attr['style'])) {
            // Thanks to user gnarf from stackoverflow for this regular expression.
            $attributes = array();

            preg_match_all(
                '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
                $this->attr['style'],
                $matches,
                PREG_SET_ORDER
            );

            foreach ($matches as $match) {
                $attributes[$match[1]] = $match[2];
            }

            // If there is a width in the style attributes:
            if (isset($attributes['width']) && $width == -1) {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['width'], -2)) === 'px') {
                    $proposed_width = substr($attributes['width'], 0, -2);
                    // Make sure it is an integer. Compare against false
                    // explicitly so that a valid "0" is not rejected.
                    if (filter_var($proposed_width, FILTER_VALIDATE_INT) !== false) {
                        $width = $proposed_width;
                    }
                }
            }

            // If there is a height in the style attributes:
            if (isset($attributes['height']) && $height == -1) {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['height'], -2)) === 'px') {
                    $proposed_height = substr($attributes['height'], 0, -2);
                    // Same integer check as for the width.
                    if (filter_var($proposed_height, FILTER_VALIDATE_INT) !== false) {
                        $height = $proposed_height;
                    }
                }
            }

        }

        // Future enhancement:
        // Look in the tag to see if there is a class or id specified that has
        // a height or width attribute to it.

        // Far future enhancement
        // Look at all the parent tags of this image to see if they specify a
        // class or id that has an img selector that specifies a height or width
        // Note that in this case, the class or id will have the img subselector
        // for it to apply to the image.

        // ridiculously far future development
        // If the class or id is specified in a SEPARATE css file that's not on
        // the page, go get it and do what we were just doing for the ones on
        // the page.

        $result = array(
            'height' => $height,
            'width' => $width
        );

        return $result;
    }

    /**
     * Return the outertext and, if $filepath is not empty, also write it to
     * that file (with an exclusive lock).
     */
    function save($filepath = '') {
        $ret = $this->outertext();

        if ($filepath !== '') {
            file_put_contents($filepath, $ret, LOCK_EX);
        }

        return $ret;
    }

    /**
     * Add one or more classes to the class attribute, skipping those already
     * present.
     *
     * @param string|array $class Space separated class names or an array.
     */
    function addClass($class) {
        global $debug_object;

        if (is_string($class)) {
            $class = explode(' ', $class);
        }

        if (is_array($class)) {
            foreach ($class as $c) {
                if (isset($this->class)) {
                    if ($this->hasClass($c)) {
                        continue;
                    } else {
                        $this->class .= ' ' . $c;
                    }
                } else {
                    $this->class = $c;
                }
            }
        } else {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
            }
        }
    }

    /**
     * @param string $class A single class name.
     * @return bool True if the class attribute contains exactly that name.
     */
    function hasClass($class) {
        global $debug_object;

        if (is_string($class)) {
            if (isset($this->class)) {
                return in_array($class, explode(' ', $this->class), true);
            }
        } else {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
            }
        }

        return false;
    }

    /**
     * Remove the given classes, or the whole class attribute when $class is
     * null or no class remains afterwards.
     *
     * @param string|array|null $class Space separated names, an array, or null.
     */
    function removeClass($class = null) {
        if (!isset($this->class)) {
            return;
        }

        if (is_null($class)) {
            $this->removeAttribute('class');
            return;
        }

        if (is_string($class)) {
            $class = explode(' ', $class);
        }

        if (is_array($class)) {
            $class = array_diff(explode(' ', $this->class), $class);
            if (empty($class)) {
                $this->removeAttribute('class');
            } else {
                $this->class = implode(' ', $class);
            }
        }
    }

    function getAllAttributes() {
        return $this->attr;
    }

    function getAttribute($name) {
        return $this->__get($name);
    }

    function setAttribute($name, $value) {
        $this->__set($name, $value);
    }

    function hasAttribute($name) {
        return $this->__isset($name);
    }

    /** Mark the attribute as removed by setting it to null (see makeup()). */
    function removeAttribute($name) {
        $this->__set($name, null);
    }

    /** Detach this node from its parent, if it has one. */
    function remove() {
        if ($this->parent) {
            $this->parent->removeChild($this);
        }
    }

    /**
     * Remove $node, its children and its nodes from this node and from the
     * DOM's node list, then clear it. Does nothing unless $node is found in
     * this node's nodes and children as well as in the DOM's nodes.
     */
    function removeChild($node) {
        $nidx = array_search($node, $this->nodes, true);
        $cidx = array_search($node, $this->children, true);
        $didx = array_search($node, $this->dom->nodes, true);

        if ($nidx !== false && $cidx !== false && $didx !== false) {

            foreach ($node->children as $child) {
                $node->removeChild($child);
            }

            foreach ($node->nodes as $entity) {
                $enidx = array_search($entity, $node->nodes, true);
                $edidx = array_search($entity, $node->dom->nodes, true);

                if ($enidx !== false && $edidx !== false) {
                    unset($node->nodes[$enidx]);
                    unset($node->dom->nodes[$edidx]);
                }
            }

            unset($this->nodes[$nidx]);
            unset($this->children[$cidx]);
            unset($this->dom->nodes[$didx]);

            $node->clear();

        }
    }

    // DOM-style aliases for the methods above.

    function getElementById($id) {
        return $this->find("#$id", 0);
    }

    function getElementsById($id, $idx = null) {
        return $this->find("#$id", $idx);
    }

    function getElementByTagName($name) {
        return $this->find($name, 0);
    }

    function getElementsByTagName($name, $idx = null) {
        return $this->find($name, $idx);
    }

    function parentNode() {
        return $this->parent();
    }

    function childNodes($idx = -1) {
        return $this->children($idx);
    }

    function firstChild() {
        return $this->first_child();
    }

    function lastChild() {
        return $this->last_child();
    }

    function nextSibling() {
        return $this->next_sibling();
    }

    function previousSibling() {
        return $this->prev_sibling();
    }

    function hasChildNodes() {
        return $this->has_child();
    }

    function nodeName() {
        return $this->tag;
    }

    /** Make this node the parent of $node and return $node. */
    function appendChild($node) {
        $node->parent($this);
        return $node;
    }

}

class simple_html_dom {
    public $root = null;
    public $nodes = array();
    public $callback = null;
    public $lowercase = false;
    public $original_size;
    public $size;

    protected $pos;
    protected $doc;
    protected $char;

    protected $cursor;
    protected $parent;
    protected $noise = array();
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/>';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';

    public $_charset = '';
    public $_target_charset = '';

    protected $default_br_text = '';

    public $default_span_text = '';

    protected $self_closing_tags = array(
        'area' => 1,
        'base' => 1,
        'br' => 1,
        'col' => 1,
        'embed' => 1,
        'hr' => 1,
        'img' => 1,
        'input' => 1,
        'link' => 1,
        'meta' => 1,
        'param' => 1,
        'source' => 1,
        'track' => 1,
        'wbr' => 1
    );
    protected $block_tags = array(
        'body' => 1,
        'div' => 1,
        'form' => 1,
        'root' => 1,
        'span' => 1,
        'table' => 1
    );
    protected $optional_closing_tags = array(
        // Not optional, see
        // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
        'b' => array('b' => 1),
        'dd' => array('dd' => 1, 'dt' => 1),
        // Not optional, see
        // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
        'dl' => array('dd' => 1, 'dt' => 1),
        'dt' => array('dd' => 1, 'dt' => 1),
        'li' => array('li' => 1),
        'optgroup' => array('optgroup' => 1, 'option' => 1),
        'option' => array('optgroup' => 1, 'option' => 1),
        'p' => array('p' => 1),
        'rp' => array('rp' => 1, 'rt' => 1),
        'rt' => array('rp' => 1, 'rt' => 1),
        'td' => array('td' => 1, 'th' => 1),
        'th' => array('td' => 1, 'th' => 1),
        'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
    );

    function __construct(
        $str = null,
        $lowercase = true,
        $forceTagsClosed = true,
        $target_charset = DEFAULT_TARGET_CHARSET,
        $stripRN = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT,
        $options = 0) {
        if ($str) {
            // NOTE(review): only "http://" is recognised as a URL here, so an
            // "https://" string is parsed as markup - confirm that is intended.
            if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
                $this->load_file($str);
            } else {
                $this->load(
                    $str,
                    $lowercase,
                    $stripRN,
                    $defaultBRText,
                    $defaultSpanText,
                    $options
                );
            }
        }
        // Forcing tags to be closed implies that we don't trust the html, but
        // it can lead to parsing errors if we SHOULD trust the html.
        // NOTE(review): no property named optional_closing_array is declared
        // in the visible part of this class; presumably optional_closing_tags
        // was meant, in which case this assignment has no effect - confirm
        // against the parser code before changing.
        if (!$forceTagsClosed) {
            $this->optional_closing_array = array();
        }

        $this->_target_charset = $target_charset;
    }

    function __destruct() {
        $this->clear();
    }

    function load(
        $str,
        $lowercase = true,
        $stripRN = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT,
        $options = 0) {
        global $debug_object;

        // prepare
        $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

        // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
        // Script tags removal now precedes style tag removal.
        // strip out <script> tags
        $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
        $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");

        // strip out the \r \n's if we are told to.
        if ($stripRN) {
            $this->doc = str_replace("\r", ' ', $this->doc);
            $this->doc = str_replace("\n", ' ', $this->doc);

            // set the length of content since we have changed it.
1539 $this->size = strlen($this->doc); 1540 } 1541 1542 // strip out cdata 1543 $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1544 // strip out comments 1545 $this->remove_noise("'<!--(.*?)-->'is"); 1546 // strip out <style> tags 1547 $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); 1548 $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); 1549 // strip out preformatted tags 1550 $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); 1551 // strip out server side scripts 1552 $this->remove_noise("'(<\?)(.*?)(\?>)'s", true); 1553 1554 if ($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts 1555 $this->remove_noise("'(\{\w)(.*?)(\})'s", true); 1556 } 1557 1558 // parsing 1559 $this->parse(); 1560 // end 1561 $this->root->_[HDOM_INFO_END] = $this->cursor; 1562 $this->parse_charset(); 1563 1564 // make load function chainable 1565 return $this; 1566 } 1567 1568 function load_file() { 1569 $args = func_get_args(); 1570 1571 if (($doc = call_user_func_array('file_get_contents', $args)) !== false) { 1572 $this->load($doc, true); 1573 } else { 1574 return false; 1575 } 1576 } 1577 1578 function set_callback($function_name) { 1579 $this->callback = $function_name; 1580 } 1581 1582 function remove_callback() { 1583 $this->callback = null; 1584 } 1585 1586 function save($filepath = '') { 1587 $ret = $this->root->innertext(); 1588 if ($filepath !== '') { 1589 file_put_contents($filepath, $ret, LOCK_EX); 1590 } 1591 return $ret; 1592 } 1593 1594 function find($selector, $idx = null, $lowercase = false) { 1595 return $this->root->find($selector, $idx, $lowercase); 1596 } 1597 1598 function clear() { 1599 if (isset($this->nodes)) { 1600 foreach ($this->nodes as $n) { 1601 $n->clear(); 1602 $n = null; 1603 } 1604 } 1605 1606 // This add next line is documented in the sourceforge repository. 1607 // 2977248 as a fix for ongoing memory leaks that occur even with the 1608 // use of clear. 
1609 if (isset($this->children)) { 1610 foreach ($this->children as $n) { 1611 $n->clear(); 1612 $n = null; 1613 } 1614 } 1615 1616 if (isset($this->parent)) { 1617 $this->parent->clear(); 1618 unset($this->parent); 1619 } 1620 1621 if (isset($this->root)) { 1622 $this->root->clear(); 1623 unset($this->root); 1624 } 1625 1626 unset($this->doc); 1627 unset($this->noise); 1628 } 1629 1630 function dump($show_attr = true) { 1631 $this->root->dump($show_attr); 1632 } 1633 1634 protected function prepare( 1635 $str, $lowercase = true, 1636 $defaultBRText = DEFAULT_BR_TEXT, 1637 $defaultSpanText = DEFAULT_SPAN_TEXT) { 1638 $this->clear(); 1639 1640 $this->doc = trim($str); 1641 $this->size = strlen($this->doc); 1642 $this->original_size = $this->size; // original size of the html 1643 $this->pos = 0; 1644 $this->cursor = 1; 1645 $this->noise = array(); 1646 $this->nodes = array(); 1647 $this->lowercase = $lowercase; 1648 $this->default_br_text = $defaultBRText; 1649 $this->default_span_text = $defaultSpanText; 1650 $this->root = new simple_html_dom_node($this); 1651 $this->root->tag = 'root'; 1652 $this->root->_[HDOM_INFO_BEGIN] = -1; 1653 $this->root->nodetype = HDOM_TYPE_ROOT; 1654 $this->parent = $this->root; 1655 if ($this->size > 0) { 1656 $this->char = $this->doc[0]; 1657 } 1658 } 1659 1660 protected function parse() { 1661 while (true) { 1662 // Read next tag if there is no text between current position and the 1663 // next opening tag. 
1664 if (($s = $this->copy_until_char('<')) === '') { 1665 if ($this->read_tag()) { 1666 continue; 1667 } else { 1668 return true; 1669 } 1670 } 1671 1672 // Add a text node for text between tags 1673 $node = new simple_html_dom_node($this); 1674 ++$this->cursor; 1675 $node->_[HDOM_INFO_TEXT] = $s; 1676 $this->link_nodes($node, false); 1677 } 1678 } 1679 1680 protected function parse_charset() { 1681 global $debug_object; 1682 1683 $charset = null; 1684 1685 if (function_exists('get_last_retrieve_url_contents_content_type')) { 1686 $contentTypeHeader = get_last_retrieve_url_contents_content_type(); 1687 $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); 1688 if ($success) { 1689 $charset = $matches[1]; 1690 if (is_object($debug_object)) { 1691 $debug_object->debug_log(2, 1692 'header content-type found charset of: ' 1693 . $charset 1694 ); 1695 } 1696 } 1697 } 1698 1699 if (empty($charset)) { 1700 // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type 1701 $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true); 1702 1703 if (!empty($el)) { 1704 $fullvalue = $el->content; 1705 if (is_object($debug_object)) { 1706 $debug_object->debug_log(2, 1707 'meta content-type tag found' 1708 . $fullvalue 1709 ); 1710 } 1711 1712 if (!empty($fullvalue)) { 1713 $success = preg_match( 1714 '/charset=(.+)/i', 1715 $fullvalue, 1716 $matches 1717 ); 1718 1719 if ($success) { 1720 $charset = $matches[1]; 1721 } else { 1722 // If there is a meta tag, and they don't specify the 1723 // character set, research says that it's typically 1724 // ISO-8859-1 1725 if (is_object($debug_object)) { 1726 $debug_object->debug_log(2, 1727 'meta content-type tag couldn\'t be parsed. using iso-8859 default.' 
1728 ); 1729 } 1730 1731 $charset = 'ISO-8859-1'; 1732 } 1733 } 1734 } 1735 } 1736 1737 if (empty($charset)) { 1738 // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration 1739 if ($meta = $this->root->find('meta[charset]', 0)) { 1740 $charset = $meta->charset; 1741 if (is_object($debug_object)) { 1742 $debug_object->debug_log(2, 'meta charset: ' . $charset); 1743 } 1744 } 1745 } 1746 1747 if (empty($charset)) { 1748 // Try to guess the charset based on the content 1749 // Requires Multibyte String (mbstring) support (optional) 1750 if (function_exists('mb_detect_encoding')) { 1751 /** 1752 * mb_detect_encoding() is not intended to distinguish between 1753 * charsets, especially single-byte charsets. Its primary 1754 * purpose is to detect which multibyte encoding is in use, 1755 * i.e. UTF-8, UTF-16, shift-JIS, etc. 1756 * 1757 * -- https://bugs.php.net/bug.php?id=38138 1758 * 1759 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will 1760 * always result in CP1251/ISO-8859-5 and vice versa. 1761 * 1762 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1 1763 * to stay compatible. 1764 */ 1765 $encoding = mb_detect_encoding( 1766 $this->doc, 1767 array('UTF-8', 'CP1252', 'ISO-8859-1') 1768 ); 1769 1770 if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') { 1771 // Due to a limitation of mb_detect_encoding 1772 // 'CP1251'/'ISO-8859-5' will be detected as 1773 // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in 1774 // which case we can simply assume it is the other charset. 1775 if (!@iconv('CP1252', 'UTF-8', $this->doc)) { 1776 $encoding = 'CP1251'; 1777 } 1778 } 1779 1780 if ($encoding !== false) { 1781 $charset = $encoding; 1782 if (is_object($debug_object)) { 1783 $debug_object->debug_log(2, 'mb_detect: ' . 
$charset); 1784 } 1785 } 1786 } 1787 } 1788 1789 if (empty($charset)) { 1790 // Assume it's UTF-8 as it is the most likely charset to be used 1791 $charset = 'UTF-8'; 1792 if (is_object($debug_object)) { 1793 $debug_object->debug_log(2, 'No match found, assume ' . $charset); 1794 } 1795 } 1796 1797 // Since CP1252 is a superset, if we get one of it's subsets, we want 1798 // it instead. 1799 if ((strtolower($charset) == 'iso-8859-1') 1800 || (strtolower($charset) == 'latin1') 1801 || (strtolower($charset) == 'latin-1')) { 1802 $charset = 'CP1252'; 1803 if (is_object($debug_object)) { 1804 $debug_object->debug_log(2, 1805 'replacing ' . $charset . ' with CP1252 as its a superset' 1806 ); 1807 } 1808 } 1809 1810 if (is_object($debug_object)) { 1811 $debug_object->debug_log(1, 'EXIT - ' . $charset); 1812 } 1813 1814 return $this->_charset = $charset; 1815 } 1816 1817 protected function read_tag() { 1818 // Set end position if no further tags found 1819 if ($this->char !== '<') { 1820 $this->root->_[HDOM_INFO_END] = $this->cursor; 1821 return false; 1822 } 1823 1824 $begin_tag_pos = $this->pos; 1825 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1826 1827 // end tag 1828 if ($this->char === '/') { 1829 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1830 1831 // Skip whitespace in end tags (i.e. in "</ html>") 1832 $this->skip($this->token_blank); 1833 $tag = $this->copy_until_char('>'); 1834 1835 // Skip attributes in end tags 1836 if (($pos = strpos($tag, ' ')) !== false) { 1837 $tag = substr($tag, 0, $pos); 1838 } 1839 1840 $parent_lower = strtolower($this->parent->tag); 1841 $tag_lower = strtolower($tag); 1842 1843 // The end tag is supposed to close the parent tag. 
Handle situations 1844 // when it doesn't 1845 if ($parent_lower !== $tag_lower) { 1846 // Parent tag does not have to be closed necessarily (optional closing tag) 1847 // Current tag is a block tag, so it may close an ancestor 1848 if (isset($this->optional_closing_tags[$parent_lower]) 1849 && isset($this->block_tags[$tag_lower])) { 1850 1851 $this->parent->_[HDOM_INFO_END] = 0; 1852 $org_parent = $this->parent; 1853 1854 // Traverse ancestors to find a matching opening tag 1855 // Stop at root node 1856 while (($this->parent->parent) 1857 && strtolower($this->parent->tag) !== $tag_lower 1858 ) { 1859 $this->parent = $this->parent->parent; 1860 } 1861 1862 // If we don't have a match add current tag as text node 1863 if (strtolower($this->parent->tag) !== $tag_lower) { 1864 $this->parent = $org_parent; // restore origonal parent 1865 1866 if ($this->parent->parent) { 1867 $this->parent = $this->parent->parent; 1868 } 1869 1870 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1871 return $this->as_text_node($tag); 1872 } 1873 } elseif (($this->parent->parent) 1874 && isset($this->block_tags[$tag_lower]) 1875 ) { 1876 // Grandparent exists and current tag is a block tag, so our 1877 // parent doesn't have an end tag 1878 $this->parent->_[HDOM_INFO_END] = 0; // No end tag 1879 $org_parent = $this->parent; 1880 1881 // Traverse ancestors to find a matching opening tag 1882 // Stop at root node 1883 while (($this->parent->parent) 1884 && strtolower($this->parent->tag) !== $tag_lower 1885 ) { 1886 $this->parent = $this->parent->parent; 1887 } 1888 1889 // If we don't have a match add current tag as text node 1890 if (strtolower($this->parent->tag) !== $tag_lower) { 1891 $this->parent = $org_parent; // restore origonal parent 1892 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1893 return $this->as_text_node($tag); 1894 } 1895 } elseif (($this->parent->parent) 1896 && strtolower($this->parent->parent->tag) === $tag_lower 1897 ) { // Grandparent exists and current tag 
closes it 1898 $this->parent->_[HDOM_INFO_END] = 0; 1899 $this->parent = $this->parent->parent; 1900 } else { // Random tag, add as text node 1901 return $this->as_text_node($tag); 1902 } 1903 } 1904 1905 // Set end position of parent tag to current cursor position 1906 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1907 1908 if ($this->parent->parent) { 1909 $this->parent = $this->parent->parent; 1910 } 1911 1912 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1913 return true; 1914 } 1915 1916 // start tag 1917 $node = new simple_html_dom_node($this); 1918 $node->_[HDOM_INFO_BEGIN] = $this->cursor; 1919 ++$this->cursor; 1920 $tag = $this->copy_until($this->token_slash); // Get tag name 1921 $node->tag_start = $begin_tag_pos; 1922 1923 // doctype, cdata & comments... 1924 // <!DOCTYPE html> 1925 // <![CDATA[ ... ]]> 1926 // <!-- Comment --> 1927 if (isset($tag[0]) && $tag[0] === '!') { 1928 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); 1929 1930 if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--") 1931 $node->nodetype = HDOM_TYPE_COMMENT; 1932 $node->tag = 'comment'; 1933 } else { // Could be doctype or CDATA but we don't care 1934 $node->nodetype = HDOM_TYPE_UNKNOWN; 1935 $node->tag = 'unknown'; 1936 } 1937 1938 if ($this->char === '>') { 1939 $node->_[HDOM_INFO_TEXT] .= '>'; 1940 } 1941 1942 $this->link_nodes($node, true); 1943 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1944 return true; 1945 } 1946 1947 // The start tag cannot contain another start tag, if so add as text 1948 // i.e. "<<html>" 1949 if ($pos = strpos($tag, '<') !== false) { 1950 $tag = '<' . substr($tag, 0, -1); 1951 $node->_[HDOM_INFO_TEXT] = $tag; 1952 $this->link_nodes($node, false); 1953 $this->char = $this->doc[--$this->pos]; // prev 1954 return true; 1955 } 1956 1957 // Handle invalid tag names (i.e. 
"<html#doc>") 1958 if (!preg_match('/^\w[\w:-]*$/', $tag)) { 1959 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); 1960 1961 // Next char is the beginning of a new tag, don't touch it. 1962 if ($this->char === '<') { 1963 $this->link_nodes($node, false); 1964 return true; 1965 } 1966 1967 // Next char closes current tag, add and be done with it. 1968 if ($this->char === '>') { 1969 $node->_[HDOM_INFO_TEXT] .= '>'; 1970 } 1971 $this->link_nodes($node, false); 1972 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1973 return true; 1974 } 1975 1976 // begin tag, add new node 1977 $node->nodetype = HDOM_TYPE_ELEMENT; 1978 $tag_lower = strtolower($tag); 1979 $node->tag = ($this->lowercase) ? $tag_lower : $tag; 1980 1981 // handle optional closing tags 1982 if (isset($this->optional_closing_tags[$tag_lower])) { 1983 // Traverse ancestors to close all optional closing tags 1984 while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { 1985 $this->parent->_[HDOM_INFO_END] = 0; 1986 $this->parent = $this->parent->parent; 1987 } 1988 $node->parent = $this->parent; 1989 } 1990 1991 $guard = 0; // prevent infinity loop 1992 1993 // [0] Space between tag and first attribute 1994 $space = array($this->copy_skip($this->token_blank), '', ''); 1995 1996 // attributes 1997 do { 1998 // Everything until the first equal sign should be the attribute name 1999 $name = $this->copy_until($this->token_equal); 2000 2001 if ($name === '' && $this->char !== null && $space[0] === '') { 2002 break; 2003 } 2004 2005 if ($guard === $this->pos) { // Escape infinite loop 2006 $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 2007 continue; 2008 } 2009 2010 $guard = $this->pos; 2011 2012 // handle endless '<' 2013 // Out of bounds before the tag ended 2014 if ($this->pos >= $this->size - 1 && $this->char !== '>') { 2015 $node->nodetype = HDOM_TYPE_TEXT; 2016 $node->_[HDOM_INFO_END] = 0; 2017 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name; 2018 $node->tag = 'text'; 2019 $this->link_nodes($node, false); 2020 return true; 2021 } 2022 2023 // handle mismatch '<' 2024 // Attributes cannot start after opening tag 2025 if ($this->doc[$this->pos - 1] == '<') { 2026 $node->nodetype = HDOM_TYPE_TEXT; 2027 $node->tag = 'text'; 2028 $node->attr = array(); 2029 $node->_[HDOM_INFO_END] = 0; 2030 $node->_[HDOM_INFO_TEXT] = substr( 2031 $this->doc, 2032 $begin_tag_pos, 2033 $this->pos - $begin_tag_pos - 1 2034 ); 2035 $this->pos -= 2; 2036 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2037 $this->link_nodes($node, false); 2038 return true; 2039 } 2040 2041 if ($name !== '/' && $name !== '') { // this is a attribute name 2042 // [1] Whitespace after attribute name 2043 $space[1] = $this->copy_skip($this->token_blank); 2044 2045 $name = $this->restore_noise($name); // might be a noisy name 2046 2047 if ($this->lowercase) { 2048 $name = strtolower($name); 2049 } 2050 2051 if ($this->char === '=') { // attribute with value 2052 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2053 $this->parse_attr($node, $name, $space); // get attribute value 2054 } else { 2055 //no value attr: nowrap, checked selected... 
2056 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 2057 $node->attr[$name] = true; 2058 if ($this->char != '>') { 2059 $this->char = $this->doc[--$this->pos]; 2060 } // prev 2061 } 2062 2063 $node->_[HDOM_INFO_SPACE][] = $space; 2064 2065 // prepare for next attribute 2066 $space = array( 2067 $this->copy_skip($this->token_blank), 2068 '', 2069 '' 2070 ); 2071 } else { // no more attributes 2072 break; 2073 } 2074 } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended 2075 2076 $this->link_nodes($node, true); 2077 $node->_[HDOM_INFO_ENDSPACE] = $space[0]; 2078 2079 // handle empty tags (i.e. "<div/>") 2080 if ($this->copy_until_char('>') === '/') { 2081 $node->_[HDOM_INFO_ENDSPACE] .= '/'; 2082 $node->_[HDOM_INFO_END] = 0; 2083 } else { 2084 // reset parent 2085 if (!isset($this->self_closing_tags[strtolower($node->tag)])) { 2086 $this->parent = $node; 2087 } 2088 } 2089 2090 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2091 2092 // If it's a BR tag, we need to set it's text to the default text. 2093 // This way when we see it in plaintext, we can generate formatting that the user wants. 2094 // since a br tag never has sub nodes, this works well. 2095 if ($node->tag === 'br') { 2096 $node->_[HDOM_INFO_INNER] = $this->default_br_text; 2097 } 2098 2099 return true; 2100 } 2101 2102 protected function parse_attr($node, $name, &$space) { 2103 $is_duplicate = isset($node->attr[$name]); 2104 2105 if (!$is_duplicate) // Copy whitespace between "=" and value 2106 $space[2] = $this->copy_skip($this->token_blank); 2107 2108 switch ($this->char) { 2109 case '"': 2110 $quote_type = HDOM_QUOTE_DOUBLE; 2111 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2112 $value = $this->copy_until_char('"'); 2113 $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 2114 break; 2115 case '\'': 2116 $quote_type = HDOM_QUOTE_SINGLE; 2117 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2118 $value = $this->copy_until_char('\''); 2119 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2120 break; 2121 default: 2122 $quote_type = HDOM_QUOTE_NO; 2123 $value = $this->copy_until($this->token_attr); 2124 } 2125 2126 $value = $this->restore_noise($value); 2127 2128 // PaperG: Attributes should not have \r or \n in them, that counts as 2129 // html whitespace. 2130 $value = str_replace("\r", '', $value); 2131 $value = str_replace("\n", '', $value); 2132 2133 // PaperG: If this is a "class" selector, lets get rid of the preceeding 2134 // and trailing space since some people leave it in the multi class case. 2135 if ($name === 'class') { 2136 $value = trim($value); 2137 } 2138 2139 if (!$is_duplicate) { 2140 $node->_[HDOM_INFO_QUOTE][] = $quote_type; 2141 $node->attr[$name] = $value; 2142 } 2143 } 2144 2145 protected function link_nodes(&$node, $is_child) { 2146 $node->parent = $this->parent; 2147 $this->parent->nodes[] = $node; 2148 if ($is_child) { 2149 $this->parent->children[] = $node; 2150 } 2151 } 2152 2153 protected function as_text_node($tag) { 2154 $node = new simple_html_dom_node($this); 2155 ++$this->cursor; 2156 $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; 2157 $this->link_nodes($node, false); 2158 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2159 return true; 2160 } 2161 2162 protected function skip($chars) { 2163 $this->pos += strspn($this->doc, $chars, $this->pos); 2164 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2165 } 2166 2167 protected function copy_skip($chars) { 2168 $pos = $this->pos; 2169 $len = strspn($this->doc, $chars, $pos); 2170 $this->pos += $len; 2171 $this->char = ($this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 2172 if ($len === 0) { 2173 return ''; 2174 } 2175 return substr($this->doc, $pos, $len); 2176 } 2177 2178 protected function copy_until($chars) { 2179 $pos = $this->pos; 2180 $len = strcspn($this->doc, $chars, $pos); 2181 $this->pos += $len; 2182 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2183 return substr($this->doc, $pos, $len); 2184 } 2185 2186 protected function copy_until_char($char) { 2187 if ($this->char === null) { 2188 return ''; 2189 } 2190 2191 if (($pos = strpos($this->doc, $char, $this->pos)) === false) { 2192 $ret = substr($this->doc, $this->pos, $this->size - $this->pos); 2193 $this->char = null; 2194 $this->pos = $this->size; 2195 return $ret; 2196 } 2197 2198 if ($pos === $this->pos) { 2199 return ''; 2200 } 2201 2202 $pos_old = $this->pos; 2203 $this->char = $this->doc[$pos]; 2204 $this->pos = $pos; 2205 return substr($this->doc, $pos_old, $pos - $pos_old); 2206 } 2207 2208 protected function remove_noise($pattern, $remove_tag = false) { 2209 global $debug_object; 2210 if (is_object($debug_object)) { 2211 $debug_object->debug_log_entry(1); 2212 } 2213 2214 $count = preg_match_all( 2215 $pattern, 2216 $this->doc, 2217 $matches, 2218 PREG_SET_ORDER | PREG_OFFSET_CAPTURE 2219 ); 2220 2221 for ($i = $count - 1; $i > -1; --$i) { 2222 $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000); 2223 2224 if (is_object($debug_object)) { 2225 $debug_object->debug_log(2, 'key is: ' . $key); 2226 } 2227 2228 $idx = ($remove_tag) ? 
0 : 1; // 0 = entire match, 1 = submatch 2229 $this->noise[$key] = $matches[$i][$idx][0]; 2230 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 2231 } 2232 2233 // reset the length of content 2234 $this->size = strlen($this->doc); 2235 2236 if ($this->size > 0) { 2237 $this->char = $this->doc[0]; 2238 } 2239 } 2240 2241 function restore_noise($text) { 2242 global $debug_object; 2243 if (is_object($debug_object)) { 2244 $debug_object->debug_log_entry(1); 2245 } 2246 2247 while (($pos = strpos($text, '___noise___')) !== false) { 2248 // Sometimes there is a broken piece of markup, and we don't GET the 2249 // pos+11 etc... token which indicates a problem outside of us... 2250 2251 // todo: "___noise___1000" (or any number with four or more digits) 2252 // in the DOM causes an infinite loop which could be utilized by 2253 // malicious software 2254 if (strlen($text) > $pos + 15) { 2255 $key = '___noise___' 2256 . $text[$pos + 11] 2257 . $text[$pos + 12] 2258 . $text[$pos + 13] 2259 . $text[$pos + 14] 2260 . $text[$pos + 15]; 2261 2262 if (is_object($debug_object)) { 2263 $debug_object->debug_log(2, 'located key of: ' . $key); 2264 } 2265 2266 if (isset($this->noise[$key])) { 2267 $text = substr($text, 0, $pos) 2268 . $this->noise[$key] 2269 . substr($text, $pos + 16); 2270 } else { 2271 // do this to prevent an infinite loop. 2272 $text = substr($text, 0, $pos) 2273 . 'UNDEFINED NOISE FOR KEY: ' 2274 . $key 2275 . substr($text, $pos + 16); 2276 } 2277 } else { 2278 // There is no valid key being given back to us... We must get 2279 // rid of the ___noise___ or we will have a problem. 2280 $text = substr($text, 0, $pos) 2281 . 'NO NUMERIC NOISE KEY' 2282 . 
substr($text, $pos + 11); 2283 } 2284 } 2285 return $text; 2286 } 2287 2288 function search_noise($text) { 2289 global $debug_object; 2290 if (is_object($debug_object)) { 2291 $debug_object->debug_log_entry(1); 2292 } 2293 2294 foreach ($this->noise as $noiseElement) { 2295 if (strpos($noiseElement, $text) !== false) { 2296 return $noiseElement; 2297 } 2298 } 2299 } 2300 2301 function __toString() { 2302 return $this->root->innertext(); 2303 } 2304 2305 function __get($name) { 2306 switch ($name) { 2307 case 'outertext': 2308 return $this->root->innertext(); 2309 case 'innertext': 2310 return $this->root->innertext(); 2311 case 'plaintext': 2312 return $this->root->text(); 2313 case 'charset': 2314 return $this->_charset; 2315 case 'target_charset': 2316 return $this->_target_charset; 2317 } 2318 } 2319 2320 function childNodes($idx = -1) { 2321 return $this->root->childNodes($idx); 2322 } 2323 2324 function firstChild() { 2325 return $this->root->first_child(); 2326 } 2327 2328 function lastChild() { 2329 return $this->root->last_child(); 2330 } 2331 2332 function createElement($name, $value = null) { 2333 return @str_get_html("<$name>$value</$name>")->firstChild(); 2334 } 2335 2336 function createTextNode($value) { 2337 return @end(str_get_html($value)->nodes); 2338 } 2339 2340 function getElementById($id) { 2341 return $this->find("#$id", 0); 2342 } 2343 2344 function getElementsById($id, $idx = null) { 2345 return $this->find("#$id", $idx); 2346 } 2347 2348 function getElementByTagName($name) { 2349 return $this->find($name, 0); 2350 } 2351 2352 function getElementsByTagName($name, $idx = -1) { 2353 return $this->find($name, $idx); 2354 } 2355 2356 function loadFile() { 2357 $args = func_get_args(); 2358 $this->load_file($args); 2359 } 2360}