1<?php 2/** 3 * Website: http://sourceforge.net/projects/simplehtmldom/ 4 * Additional projects: http://sourceforge.net/projects/debugobject/ 5 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) 6 * 7 * Licensed under The MIT License 8 * See the LICENSE file in the project root for more information. 9 * 10 * Authors: 11 * S.C. Chen 12 * John Schlick 13 * Rus Carroll 14 * logmanoriginal 15 * 16 * Contributors: 17 * Yousuke Kumakura 18 * Vadim Voituk 19 * Antcs 20 * James Collins (nomadjimbob) 21 * 22 * Based on Version Rev. 1.9.1 (291) 23 * Version 1.9.1.1 24 */ 25 26define('HDOM_TYPE_ELEMENT', 1); 27define('HDOM_TYPE_COMMENT', 2); 28define('HDOM_TYPE_TEXT', 3); 29define('HDOM_TYPE_ENDTAG', 4); 30define('HDOM_TYPE_ROOT', 5); 31define('HDOM_TYPE_UNKNOWN', 6); 32define('HDOM_QUOTE_DOUBLE', 0); 33define('HDOM_QUOTE_SINGLE', 1); 34define('HDOM_QUOTE_NO', 3); 35define('HDOM_INFO_BEGIN', 0); 36define('HDOM_INFO_END', 1); 37define('HDOM_INFO_QUOTE', 2); 38define('HDOM_INFO_SPACE', 3); 39define('HDOM_INFO_TEXT', 4); 40define('HDOM_INFO_INNER', 5); 41define('HDOM_INFO_OUTER', 6); 42define('HDOM_INFO_ENDSPACE', 7); 43 44defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8'); 45defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n"); 46defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' '); 47defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000); 48define('HDOM_SMARTY_AS_TEXT', 1); 49 50function file_get_html( 51 $url, 52 $use_include_path = false, 53 $context = null, 54 $offset = 0, 55 $maxLen = -1, 56 $lowercase = true, 57 $forceTagsClosed = true, 58 $target_charset = DEFAULT_TARGET_CHARSET, 59 $stripRN = true, 60 $defaultBRText = DEFAULT_BR_TEXT, 61 $defaultSpanText = DEFAULT_SPAN_TEXT) 62{ 63 if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; } 64 65 $dom = new simple_html_dom( 66 null, 67 $lowercase, 68 $forceTagsClosed, 69 $target_charset, 70 $stripRN, 71 $defaultBRText, 72 $defaultSpanText 73 ); 74 75 /** 76 * For sourceforge users: uncomment the next line and comment the 77 * retrieve_url_contents line 2 lines down if it is not already done. 78 */ 79 $contents = file_get_contents( 80 $url, 81 $use_include_path, 82 $context, 83 $offset, 84 $maxLen 85 ); 86 // $contents = retrieve_url_contents($url); 87 88 if (empty($contents) || strlen($contents) > $maxLen) { 89 $dom->clear(); 90 return false; 91 } 92 93 return $dom->load($contents, $lowercase, $stripRN); 94} 95 96function str_get_html( 97 $str, 98 $lowercase = true, 99 $forceTagsClosed = true, 100 $target_charset = DEFAULT_TARGET_CHARSET, 101 $stripRN = true, 102 $defaultBRText = DEFAULT_BR_TEXT, 103 $defaultSpanText = DEFAULT_SPAN_TEXT) 104{ 105 $dom = new simple_html_dom( 106 null, 107 $lowercase, 108 $forceTagsClosed, 109 $target_charset, 110 $stripRN, 111 $defaultBRText, 112 $defaultSpanText 113 ); 114 115 if (empty($str) || strlen($str) > MAX_FILE_SIZE) { 116 $dom->clear(); 117 return false; 118 } 119 120 return $dom->load($str, $lowercase, $stripRN); 121} 122 123function dump_html_tree($node, $show_attr = true, $deep = 0) 124{ 125 $node->dump($node); 126} 127 128class simple_html_dom_node 129{ 130 public $nodetype = HDOM_TYPE_TEXT; 131 public $tag = 'text'; 132 public $attr = array(); 133 public $children = array(); 134 public $nodes = array(); 135 public $parent = null; 136 public $_ = array(); 137 public $tag_start = 0; 138 private $dom = null; 139 140 function __construct($dom) 141 { 142 $this->dom = $dom; 143 $dom->nodes[] = $this; 144 } 145 146 function __destruct() 147 { 148 $this->clear(); 149 } 150 151 function __toString() 152 { 153 return $this->outertext(); 154 } 155 156 function clear() 157 { 158 $this->dom = null; 159 $this->nodes = null; 160 $this->parent = null; 161 $this->children = null; 162 } 163 164 function dump($show_attr = true, $depth = 0) 165 { 166 echo str_repeat("\t", $depth) . $this->tag; 167 168 if ($show_attr && count($this->attr) > 0) { 169 echo '('; 170 foreach ($this->attr as $k => $v) { 171 echo "[$k]=>\"$v\", "; 172 } 173 echo ')'; 174 } 175 176 echo "\n"; 177 178 if ($this->nodes) { 179 foreach ($this->nodes as $node) { 180 $node->dump($show_attr, $depth + 1); 181 } 182 } 183 } 184 185 function dump_node($echo = true) 186 { 187 $string = $this->tag; 188 189 if (count($this->attr) > 0) { 190 $string .= '('; 191 foreach ($this->attr as $k => $v) { 192 $string .= "[$k]=>\"$v\", "; 193 } 194 $string .= ')'; 195 } 196 197 if (count($this->_) > 0) { 198 $string .= ' $_ ('; 199 foreach ($this->_ as $k => $v) { 200 if (is_array($v)) { 201 $string .= "[$k]=>("; 202 foreach ($v as $k2 => $v2) { 203 $string .= "[$k2]=>\"$v2\", "; 204 } 205 $string .= ')'; 206 } else { 207 $string .= "[$k]=>\"$v\", "; 208 } 209 } 210 $string .= ')'; 211 } 212 213 if (isset($this->text)) { 214 $string .= " text: ({$this->text})"; 215 } 216 217 $string .= ' HDOM_INNER_INFO: '; 218 219 if (isset($node->_[HDOM_INFO_INNER])) { 220 $string .= "'" . $node->_[HDOM_INFO_INNER] . "'"; 221 } else { 222 $string .= ' NULL '; 223 } 224 225 $string .= ' children: ' . count($this->children); 226 $string .= ' nodes: ' . count($this->nodes); 227 $string .= ' tag_start: ' . $this->tag_start; 228 $string .= "\n"; 229 230 if ($echo) { 231 echo $string; 232 return; 233 } else { 234 return $string; 235 } 236 } 237 238 function parent($parent = null) 239 { 240 // I am SURE that this doesn't work properly. 241 // It fails to unset the current node from it's current parents nodes or 242 // children list first. 243 if ($parent !== null) { 244 $this->parent = $parent; 245 $this->parent->nodes[] = $this; 246 $this->parent->children[] = $this; 247 } 248 249 return $this->parent; 250 } 251 252 function has_child() 253 { 254 return !empty($this->children); 255 } 256 257 function children($idx = -1) 258 { 259 if ($idx === -1) { 260 return $this->children; 261 } 262 263 if (isset($this->children[$idx])) { 264 return $this->children[$idx]; 265 } 266 267 return null; 268 } 269 270 function first_child() 271 { 272 if (count($this->children) > 0) { 273 return $this->children[0]; 274 } 275 return null; 276 } 277 278 function last_child() 279 { 280 if (count($this->children) > 0) { 281 return end($this->children); 282 } 283 return null; 284 } 285 286 function next_sibling() 287 { 288 if ($this->parent === null) { 289 return null; 290 } 291 292 $idx = array_search($this, $this->parent->children, true); 293 294 if ($idx !== false && isset($this->parent->children[$idx + 1])) { 295 return $this->parent->children[$idx + 1]; 296 } 297 298 return null; 299 } 300 301 function prev_sibling() 302 { 303 if ($this->parent === null) { 304 return null; 305 } 306 307 $idx = array_search($this, $this->parent->children, true); 308 309 if ($idx !== false && $idx > 0) { 310 return $this->parent->children[$idx - 1]; 311 } 312 313 return null; 314 } 315 316 function find_ancestor_tag($tag) 317 { 318 global $debug_object; 319 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 320 321 if ($this->parent === null) { 322 return null; 323 } 324 325 $ancestor = $this->parent; 326 327 while (!is_null($ancestor)) { 328 if (is_object($debug_object)) { 329 $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag); 330 } 331 332 if ($ancestor->tag === $tag) { 333 break; 334 } 335 336 $ancestor = $ancestor->parent; 337 } 338 339 return $ancestor; 340 } 341 342 function innertext() 343 { 344 if (isset($this->_[HDOM_INFO_INNER])) { 345 return $this->_[HDOM_INFO_INNER]; 346 } 347 348 if (isset($this->_[HDOM_INFO_TEXT])) { 349 return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 350 } 351 352 $ret = ''; 353 354 foreach ($this->nodes as $n) { 355 $ret .= $n->outertext(); 356 } 357 358 return $ret; 359 } 360 361 function outertext() 362 { 363 global $debug_object; 364 365 if (is_object($debug_object)) { 366 $text = ''; 367 368 if ($this->tag === 'text') { 369 if (!empty($this->text)) { 370 $text = ' with text: ' . $this->text; 371 } 372 } 373 374 $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); 375 } 376 377 if ($this->tag === 'root') { 378 return $this->innertext(); 379 } 380 381 // todo: What is the use of this callback? Remove? 382 if ($this->dom && $this->dom->callback !== null) { 383 call_user_func_array($this->dom->callback, array($this)); 384 } 385 386 if (isset($this->_[HDOM_INFO_OUTER])) { 387 return $this->_[HDOM_INFO_OUTER]; 388 } 389 390 if (isset($this->_[HDOM_INFO_TEXT])) { 391 return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 392 } 393 394 $ret = ''; 395 396 if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) { 397 $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); 398 } 399 400 if (isset($this->_[HDOM_INFO_INNER])) { 401 // todo: <br> should either never have HDOM_INFO_INNER or always 402 if ($this->tag !== 'br') { 403 $ret .= $this->_[HDOM_INFO_INNER]; 404 } 405 } elseif ($this->nodes) { 406 foreach ($this->nodes as $n) { 407 $ret .= $this->convert_text($n->outertext()); 408 } 409 } 410 411 if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) { 412 $ret .= '</' . $this->tag . '>'; 413 } 414 415 return $ret; 416 } 417 418 function text() 419 { 420 if (isset($this->_[HDOM_INFO_INNER])) { 421 return $this->_[HDOM_INFO_INNER]; 422 } 423 424 switch ($this->nodetype) { 425 case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 426 case HDOM_TYPE_COMMENT: return ''; 427 case HDOM_TYPE_UNKNOWN: return ''; 428 } 429 430 if (strcasecmp($this->tag, 'script') === 0) { return ''; } 431 if (strcasecmp($this->tag, 'style') === 0) { return ''; } 432 433 $ret = ''; 434 435 // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed 436 // for some span tags, and some p tags) $this->nodes is set to NULL. 437 // NOTE: This indicates that there is a problem where it's set to NULL 438 // without a clear happening. 439 // WHY is this happening? 440 if (!is_null($this->nodes)) { 441 foreach ($this->nodes as $n) { 442 // Start paragraph after a blank line 443 if ($n->tag === 'p') { 444 $ret = trim($ret) . "\n\n"; 445 } 446 447 $ret .= $this->convert_text($n->text()); 448 449 // If this node is a span... add a space at the end of it so 450 // multiple spans don't run into each other. This is plaintext 451 // after all. 452 if ($n->tag === 'span') { 453 $ret .= $this->dom->default_span_text; 454 } 455 } 456 } 457 return $ret; 458 } 459 460 function xmltext() 461 { 462 $ret = $this->innertext(); 463 $ret = str_ireplace('<![CDATA[', '', $ret); 464 $ret = str_replace(']]>', '', $ret); 465 return $ret; 466 } 467 468 function makeup() 469 { 470 // text, comment, unknown 471 if (isset($this->_[HDOM_INFO_TEXT])) { 472 return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 473 } 474 475 $ret = '<' . $this->tag; 476 $i = -1; 477 478 foreach ($this->attr as $key => $val) { 479 ++$i; 480 481 // skip removed attribute 482 if ($val === null || $val === false) { continue; } 483 484 $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; 485 486 //no value attr: nowrap, checked selected... 487 if ($val === true) { 488 $ret .= $key; 489 } else { 490 switch ($this->_[HDOM_INFO_QUOTE][$i]) 491 { 492 case HDOM_QUOTE_DOUBLE: $quote = '"'; break; 493 case HDOM_QUOTE_SINGLE: $quote = '\''; break; 494 default: $quote = ''; 495 } 496 497 $ret .= $key 498 . $this->_[HDOM_INFO_SPACE][$i][1] 499 . '=' 500 . $this->_[HDOM_INFO_SPACE][$i][2] 501 . $quote 502 . $val 503 . $quote; 504 } 505 } 506 507 $ret = $this->dom->restore_noise($ret); 508 return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; 509 } 510 511 function find($selector, $idx = null, $lowercase = false) 512 { 513 $selectors = $this->parse_selector($selector); 514 if (($count = count($selectors)) === 0) { return array(); } 515 $found_keys = array(); 516 517 // find each selector 518 for ($c = 0; $c < $count; ++$c) { 519 // The change on the below line was documented on the sourceforge 520 // code tracker id 2788009 521 // used to be: if (($levle=count($selectors[0]))===0) return array(); 522 if (($levle = count($selectors[$c])) === 0) { return array(); } 523 if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); } 524 525 $head = array($this->_[HDOM_INFO_BEGIN] => 1); 526 $cmd = ' '; // Combinator 527 528 // handle descendant selectors, no recursive! 529 for ($l = 0; $l < $levle; ++$l) { 530 $ret = array(); 531 532 foreach ($head as $k => $v) { 533 $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k]; 534 //PaperG - Pass this optional parameter on to the seek function. 535 $n->seek($selectors[$c][$l], $ret, $cmd, $lowercase); 536 } 537 538 $head = $ret; 539 $cmd = $selectors[$c][$l][4]; // Next Combinator 540 } 541 542 foreach ($head as $k => $v) { 543 if (!isset($found_keys[$k])) { 544 $found_keys[$k] = 1; 545 } 546 } 547 } 548 549 // sort keys 550 ksort($found_keys); 551 552 $found = array(); 553 foreach ($found_keys as $k => $v) { 554 $found[] = $this->dom->nodes[$k]; 555 } 556 557 // return nth-element or array 558 if (is_null($idx)) { return $found; } 559 elseif ($idx < 0) { $idx = count($found) + $idx; } 560 return (isset($found[$idx])) ? $found[$idx] : null; 561 } 562 563 protected function seek($selector, &$ret, $parent_cmd, $lowercase = false) 564 { 565 global $debug_object; 566 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 567 568 list($tag, $id, $class, $attributes, $cmb) = $selector; 569 $nodes = array(); 570 571 if ($parent_cmd === ' ') { // Descendant Combinator 572 // Find parent closing tag if the current element doesn't have a closing 573 // tag (i.e. void element) 574 $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; 575 if ($end == 0) { 576 $parent = $this->parent; 577 while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) { 578 $end -= 1; 579 $parent = $parent->parent; 580 } 581 $end += $parent->_[HDOM_INFO_END]; 582 } 583 584 // Get list of target nodes 585 $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1; 586 $nodes_count = $end - $nodes_start; 587 $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true); 588 } elseif ($parent_cmd === '>') { // Child Combinator 589 $nodes = $this->children; 590 } elseif ($parent_cmd === '+' 591 && $this->parent 592 && in_array($this, $this->parent->children)) { // Next-Sibling Combinator 593 $index = array_search($this, $this->parent->children, true) + 1; 594 if ($index < count($this->parent->children)) 595 $nodes[] = $this->parent->children[$index]; 596 } elseif ($parent_cmd === '~' 597 && $this->parent 598 && in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator 599 $index = array_search($this, $this->parent->children, true); 600 $nodes = array_slice($this->parent->children, $index); 601 } 602 603 // Go throgh each element starting at this element until the end tag 604 // Note: If this element is a void tag, any previous void element is 605 // skipped. 606 foreach($nodes as $node) { 607 $pass = true; 608 609 // Skip root nodes 610 if(!$node->parent) { 611 $pass = false; 612 } 613 614 // Handle 'text' selector 615 if($pass && $tag === 'text' && $node->tag === 'text') { 616 $ret[array_search($node, $this->dom->nodes, true)] = 1; 617 unset($node); 618 continue; 619 } 620 621 // Skip if node isn't a child node (i.e. text nodes) 622 if($pass && !in_array($node, $node->parent->children, true)) { 623 $pass = false; 624 } 625 626 // Skip if tag doesn't match 627 if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') { 628 $pass = false; 629 } 630 631 // Skip if ID doesn't exist 632 if ($pass && $id !== '' && !isset($node->attr['id'])) { 633 $pass = false; 634 } 635 636 // Check if ID matches 637 if ($pass && $id !== '' && isset($node->attr['id'])) { 638 // Note: Only consider the first ID (as browsers do) 639 $node_id = explode(' ', trim($node->attr['id']))[0]; 640 641 if($id !== $node_id) { $pass = false; } 642 } 643 644 // Check if all class(es) exist 645 if ($pass && $class !== '' && is_array($class) && !empty($class)) { 646 if (isset($node->attr['class'])) { 647 $node_classes = explode(' ', $node->attr['class']); 648 649 if ($lowercase) { 650 $node_classes = array_map('strtolower', $node_classes); 651 } 652 653 foreach($class as $c) { 654 if(!in_array($c, $node_classes)) { 655 $pass = false; 656 break; 657 } 658 } 659 } else { 660 $pass = false; 661 } 662 } 663 664 // Check attributes 665 if ($pass 666 && $attributes !== '' 667 && is_array($attributes) 668 && !empty($attributes)) { 669 foreach($attributes as $a) { 670 list ( 671 $att_name, 672 $att_expr, 673 $att_val, 674 $att_inv, 675 $att_case_sensitivity 676 ) = $a; 677 678 // Handle indexing attributes (i.e. "[2]") 679 /** 680 * Note: This is not supported by the CSS Standard but adds 681 * the ability to select items compatible to XPath (i.e. 682 * the 3rd element within it's parent). 683 * 684 * Note: This doesn't conflict with the CSS Standard which 685 * doesn't work on numeric attributes anyway. 686 */ 687 if (is_numeric($att_name) 688 && $att_expr === '' 689 && $att_val === '') { 690 $count = 0; 691 692 // Find index of current element in parent 693 foreach ($node->parent->children as $c) { 694 if ($c->tag === $node->tag) ++$count; 695 if ($c === $node) break; 696 } 697 698 // If this is the correct node, continue with next 699 // attribute 700 if ($count === (int)$att_name) continue; 701 } 702 703 // Check attribute availability 704 if ($att_inv) { // Attribute should NOT be set 705 if (isset($node->attr[$att_name])) { 706 $pass = false; 707 break; 708 } 709 } else { // Attribute should be set 710 // todo: "plaintext" is not a valid CSS selector! 711 if ($att_name !== 'plaintext' 712 && !isset($node->attr[$att_name])) { 713 $pass = false; 714 break; 715 } 716 } 717 718 // Continue with next attribute if expression isn't defined 719 if ($att_expr === '') continue; 720 721 // If they have told us that this is a "plaintext" 722 // search then we want the plaintext of the node - right? 723 // todo "plaintext" is not a valid CSS selector! 724 if ($att_name === 'plaintext') { 725 $nodeKeyValue = $node->text(); 726 } else { 727 $nodeKeyValue = $node->attr[$att_name]; 728 } 729 730 if (is_object($debug_object)) { 731 $debug_object->debug_log(2, 732 'testing node: ' 733 . $node->tag 734 . ' for attribute: ' 735 . $att_name 736 . $att_expr 737 . $att_val 738 . ' where nodes value is: ' 739 . $nodeKeyValue 740 ); 741 } 742 743 // If lowercase is set, do a case insensitive test of 744 // the value of the selector. 745 if ($lowercase) { 746 $check = $this->match( 747 $att_expr, 748 strtolower($att_val), 749 strtolower($nodeKeyValue), 750 $att_case_sensitivity 751 ); 752 } else { 753 $check = $this->match( 754 $att_expr, 755 $att_val, 756 $nodeKeyValue, 757 $att_case_sensitivity 758 ); 759 } 760 761 if (is_object($debug_object)) { 762 $debug_object->debug_log(2, 763 'after match: ' 764 . ($check ? 'true' : 'false') 765 ); 766 } 767 768 if (!$check) { 769 $pass = false; 770 break; 771 } 772 } 773 } 774 775 // Found a match. Add to list and clear node 776 if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1; 777 unset($node); 778 } 779 // It's passed by reference so this is actually what this function returns. 780 if (is_object($debug_object)) { 781 $debug_object->debug_log(1, 'EXIT - ret: ', $ret); 782 } 783 } 784 785 protected function match($exp, $pattern, $value, $case_sensitivity) 786 { 787 global $debug_object; 788 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 789 790 if ($case_sensitivity === 'i') { 791 $pattern = strtolower($pattern); 792 $value = strtolower($value); 793 } 794 795 switch ($exp) { 796 case '=': 797 return ($value === $pattern); 798 case '!=': 799 return ($value !== $pattern); 800 case '^=': 801 return preg_match('/^' . preg_quote($pattern, '/') . '/', $value); 802 case '$=': 803 return preg_match('/' . preg_quote($pattern, '/') . '$/', $value); 804 case '*=': 805 return preg_match('/' . preg_quote($pattern, '/') . '/', $value); 806 case '|=': 807 /** 808 * [att|=val] 809 * 810 * Represents an element with the att attribute, its value 811 * either being exactly "val" or beginning with "val" 812 * immediately followed by "-" (U+002D). 813 */ 814 return strpos($value, $pattern) === 0; 815 case '~=': 816 /** 817 * [att~=val] 818 * 819 * Represents an element with the att attribute whose value is a 820 * whitespace-separated list of words, one of which is exactly 821 * "val". If "val" contains whitespace, it will never represent 822 * anything (since the words are separated by spaces). Also if 823 * "val" is the empty string, it will never represent anything. 824 */ 825 return in_array($pattern, explode(' ', trim($value)), true); 826 } 827 return false; 828 } 829 830 protected function parse_selector($selector_string) 831 { 832 global $debug_object; 833 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 834 835 /** 836 * Pattern of CSS selectors, modified from mootools (https://mootools.net/) 837 * 838 * Paperg: Add the colon to the attribute, so that it properly finds 839 * <tag attr:ibute="something" > like google does. 840 * 841 * Note: if you try to look at this attribute, you MUST use getAttribute 842 * since $dom->x:y will fail the php syntax check. 843 * 844 * Notice the \[ starting the attribute? and the @? following? This 845 * implies that an attribute can begin with an @ sign that is not 846 * captured. This implies that an html attribute specifier may start 847 * with an @ sign that is NOT captured by the expression. Farther study 848 * is required to determine of this should be documented or removed. 849 * 850 * Matches selectors in this order: 851 * 852 * [0] - full match 853 * 854 * [1] - tag name 855 * ([\w:\*-]*) 856 * Matches the tag name consisting of zero or more words, colons, 857 * asterisks and hyphens. 858 * 859 * [2] - id name 860 * (?:\#([\w-]+)) 861 * Optionally matches a id name, consisting of an "#" followed by 862 * the id name (one or more words and hyphens). 863 * 864 * [3] - class names (including dots) 865 * (?:\.([\w\.-]+))? 866 * Optionally matches a list of classs, consisting of an "." 867 * followed by the class name (one or more words and hyphens) 868 * where multiple classes can be chained (i.e. ".foo.bar.baz") 869 * 870 * [4] - attributes 871 * ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)? 872 * Optionally matches the attributes list 873 * 874 * [5] - separator 875 * ([\/, >+~]+) 876 * Matches the selector list separator 877 */ 878 // phpcs:ignore Generic.Files.LineLength 879 $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is"; 880 881 preg_match_all( 882 $pattern, 883 trim($selector_string) . ' ', // Add final ' ' as pseudo separator 884 $matches, 885 PREG_SET_ORDER 886 ); 887 888 if (is_object($debug_object)) { 889 $debug_object->debug_log(2, 'Matches Array: ', $matches); 890 } 891 892 $selectors = array(); 893 $result = array(); 894 895 foreach ($matches as $m) { 896 $m[0] = trim($m[0]); 897 898 // Skip NoOps 899 if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; } 900 901 // Convert to lowercase 902 if ($this->dom->lowercase) { 903 $m[1] = strtolower($m[1]); 904 } 905 906 // Extract classes 907 if ($m[3] !== '') { $m[3] = explode('.', $m[3]); } 908 909 /* Extract attributes (pattern based on the pattern above!) 910 911 * [0] - full match 912 * [1] - attribute name 913 * [2] - attribute expression 914 * [3] - attribute value 915 * [4] - case sensitivity 916 * 917 * Note: Attributes can be negated with a "!" prefix to their name 918 */ 919 if($m[4] !== '') { 920 preg_match_all( 921 "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is", 922 trim($m[4]), 923 $attributes, 924 PREG_SET_ORDER 925 ); 926 927 // Replace element by array 928 $m[4] = array(); 929 930 foreach($attributes as $att) { 931 // Skip empty matches 932 if(trim($att[0]) === '') { continue; } 933 934 $inverted = (isset($att[1][0]) && $att[1][0] === '!'); 935 $m[4][] = array( 936 $inverted ? substr($att[1], 1) : $att[1], // Name 937 (isset($att[2])) ? $att[2] : '', // Expression 938 (isset($att[3])) ? $att[3] : '', // Value 939 $inverted, // Inverted Flag 940 (isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity 941 ); 942 } 943 } 944 945 // Sanitize Separator 946 if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator 947 $m[5] = ' '; 948 } else { // Other Separator 949 $m[5] = trim($m[5]); 950 } 951 952 // Clear Separator if it's a Selector List 953 if ($is_list = ($m[5] === ',')) { $m[5] = ''; } 954 955 // Remove full match before adding to results 956 array_shift($m); 957 $result[] = $m; 958 959 if ($is_list) { // Selector List 960 $selectors[] = $result; 961 $result = array(); 962 } 963 } 964 965 if (count($result) > 0) { $selectors[] = $result; } 966 return $selectors; 967 } 968 969 function __get($name) 970 { 971 if (isset($this->attr[$name])) { 972 return $this->convert_text($this->attr[$name]); 973 } 974 switch ($name) { 975 case 'outertext': return $this->outertext(); 976 case 'innertext': return $this->innertext(); 977 case 'plaintext': return $this->text(); 978 case 'xmltext': return $this->xmltext(); 979 default: return array_key_exists($name, $this->attr); 980 } 981 } 982 983 function __set($name, $value) 984 { 985 global $debug_object; 986 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 987 988 switch ($name) { 989 case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; 990 case 'innertext': 991 if (isset($this->_[HDOM_INFO_TEXT])) { 992 return $this->_[HDOM_INFO_TEXT] = $value; 993 } 994 return $this->_[HDOM_INFO_INNER] = $value; 995 } 996 997 if (!isset($this->attr[$name])) { 998 $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); 999 $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 1000 } 1001 1002 $this->attr[$name] = $value; 1003 } 1004 1005 function __isset($name) 1006 { 1007 switch ($name) { 1008 case 'outertext': return true; 1009 case 'innertext': return true; 1010 case 'plaintext': return true; 1011 } 1012 //no value attr: nowrap, checked selected... 1013 return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); 1014 } 1015 1016 function __unset($name) 1017 { 1018 if (isset($this->attr[$name])) { unset($this->attr[$name]); } 1019 } 1020 1021 function convert_text($text) 1022 { 1023 global $debug_object; 1024 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1025 1026 $converted_text = $text; 1027 1028 $sourceCharset = ''; 1029 $targetCharset = ''; 1030 1031 if ($this->dom) { 1032 $sourceCharset = strtoupper($this->dom->_charset); 1033 $targetCharset = strtoupper($this->dom->_target_charset); 1034 } 1035 1036 if (is_object($debug_object)) { 1037 $debug_object->debug_log(3, 1038 'source charset: ' 1039 . $sourceCharset 1040 . ' target charaset: ' 1041 . $targetCharset 1042 ); 1043 } 1044 1045 if (!empty($sourceCharset) 1046 && !empty($targetCharset) 1047 && (strcasecmp($sourceCharset, $targetCharset) != 0)) { 1048 // Check if the reported encoding could have been incorrect and the text is actually already UTF-8 1049 if ((strcasecmp($targetCharset, 'UTF-8') == 0) 1050 && ($this->is_utf8($text))) { 1051 $converted_text = $text; 1052 } else { 1053 $converted_text = iconv($sourceCharset, $targetCharset, $text); 1054 } 1055 } 1056 1057 // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output. 1058 if ($targetCharset === 'UTF-8') { 1059 if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") { 1060 $converted_text = substr($converted_text, 3); 1061 } 1062 1063 if (substr($converted_text, -3) === "\xef\xbb\xbf") { 1064 $converted_text = substr($converted_text, 0, -3); 1065 } 1066 } 1067 1068 return $converted_text; 1069 } 1070 1071 static function is_utf8($str) 1072 { 1073 $c = 0; $b = 0; 1074 $bits = 0; 1075 $len = strlen($str); 1076 for($i = 0; $i < $len; $i++) { 1077 $c = ord($str[$i]); 1078 if($c > 128) { 1079 if(($c >= 254)) { return false; } 1080 elseif($c >= 252) { $bits = 6; } 1081 elseif($c >= 248) { $bits = 5; } 1082 elseif($c >= 240) { $bits = 4; } 1083 elseif($c >= 224) { $bits = 3; } 1084 elseif($c >= 192) { $bits = 2; } 1085 else { return false; } 1086 if(($i + $bits) > $len) { return false; } 1087 while($bits > 1) { 1088 $i++; 1089 $b = ord($str[$i]); 1090 if($b < 128 || $b > 191) { return false; } 1091 $bits--; 1092 } 1093 } 1094 } 1095 return true; 1096 } 1097 1098 function get_display_size() 1099 { 1100 global $debug_object; 1101 1102 $width = -1; 1103 $height = -1; 1104 1105 if ($this->tag !== 'img') { 1106 return false; 1107 } 1108 1109 // See if there is aheight or width attribute in the tag itself. 1110 if (isset($this->attr['width'])) { 1111 $width = $this->attr['width']; 1112 } 1113 1114 if (isset($this->attr['height'])) { 1115 $height = $this->attr['height']; 1116 } 1117 1118 // Now look for an inline style. 1119 if (isset($this->attr['style'])) { 1120 // Thanks to user gnarf from stackoverflow for this regular expression. 1121 $attributes = array(); 1122 1123 preg_match_all( 1124 '/([\w-]+)\s*:\s*([^;]+)\s*;?/', 1125 $this->attr['style'], 1126 $matches, 1127 PREG_SET_ORDER 1128 ); 1129 1130 foreach ($matches as $match) { 1131 $attributes[$match[1]] = $match[2]; 1132 } 1133 1134 // If there is a width in the style attributes: 1135 if (isset($attributes['width']) && $width == -1) { 1136 // check that the last two characters are px (pixels) 1137 if (strtolower(substr($attributes['width'], -2)) === 'px') { 1138 $proposed_width = substr($attributes['width'], 0, -2); 1139 // Now make sure that it's an integer and not something stupid. 1140 if (filter_var($proposed_width, FILTER_VALIDATE_INT)) { 1141 $width = $proposed_width; 1142 } 1143 } 1144 } 1145 1146 // If there is a width in the style attributes: 1147 if (isset($attributes['height']) && $height == -1) { 1148 // check that the last two characters are px (pixels) 1149 if (strtolower(substr($attributes['height'], -2)) == 'px') { 1150 $proposed_height = substr($attributes['height'], 0, -2); 1151 // Now make sure that it's an integer and not something stupid. 1152 if (filter_var($proposed_height, FILTER_VALIDATE_INT)) { 1153 $height = $proposed_height; 1154 } 1155 } 1156 } 1157 1158 } 1159 1160 // Future enhancement: 1161 // Look in the tag to see if there is a class or id specified that has 1162 // a height or width attribute to it. 1163 1164 // Far future enhancement 1165 // Look at all the parent tags of this image to see if they specify a 1166 // class or id that has an img selector that specifies a height or width 1167 // Note that in this case, the class or id will have the img subselector 1168 // for it to apply to the image. 1169 1170 // ridiculously far future development 1171 // If the class or id is specified in a SEPARATE css file thats not on 1172 // the page, go get it and do what we were just doing for the ones on 1173 // the page. 1174 1175 $result = array( 1176 'height' => $height, 1177 'width' => $width 1178 ); 1179 1180 return $result; 1181 } 1182 1183 function save($filepath = '') 1184 { 1185 $ret = $this->outertext(); 1186 1187 if ($filepath !== '') { 1188 file_put_contents($filepath, $ret, LOCK_EX); 1189 } 1190 1191 return $ret; 1192 } 1193 1194 function addClass($class) 1195 { 1196 if (is_string($class)) { 1197 $class = explode(' ', $class); 1198 } 1199 1200 if (is_array($class)) { 1201 foreach($class as $c) { 1202 if (isset($this->class)) { 1203 if ($this->hasClass($c)) { 1204 continue; 1205 } else { 1206 $this->class .= ' ' . $c; 1207 } 1208 } else { 1209 $this->class = $c; 1210 } 1211 } 1212 } else { 1213 if (is_object($debug_object)) { 1214 $debug_object->debug_log(2, 'Invalid type: ', gettype($class)); 1215 } 1216 } 1217 } 1218 1219 function hasClass($class) 1220 { 1221 if (is_string($class)) { 1222 if (isset($this->class)) { 1223 return in_array($class, explode(' ', $this->class), true); 1224 } 1225 } else { 1226 if (is_object($debug_object)) { 1227 $debug_object->debug_log(2, 'Invalid type: ', gettype($class)); 1228 } 1229 } 1230 1231 return false; 1232 } 1233 1234 function removeClass($class = null) 1235 { 1236 if (!isset($this->class)) { 1237 return; 1238 } 1239 1240 if (is_null($class)) { 1241 $this->removeAttribute('class'); 1242 return; 1243 } 1244 1245 if (is_string($class)) { 1246 $class = explode(' ', $class); 1247 } 1248 1249 if (is_array($class)) { 1250 $class = array_diff(explode(' ', $this->class), $class); 1251 if (empty($class)) { 1252 $this->removeAttribute('class'); 1253 } else { 1254 $this->class = implode(' ', $class); 1255 } 1256 } 1257 } 1258 1259 function getAllAttributes() 1260 { 1261 return $this->attr; 1262 } 1263 1264 function getAttribute($name) 1265 { 1266 return $this->__get($name); 1267 } 1268 1269 function setAttribute($name, $value) 1270 { 1271 $this->__set($name, $value); 1272 } 1273 1274 function hasAttribute($name) 1275 { 1276 return $this->__isset($name); 1277 } 1278 1279 function removeAttribute($name) 1280 { 1281 $this->__set($name, null); 1282 } 1283 1284 function remove() 1285 { 1286 if ($this->parent) { 1287 $this->parent->removeChild($this); 1288 } 1289 } 1290 1291 function removeChild($node) 1292 { 1293 $nidx = array_search($node, $this->nodes, true); 1294 $cidx = array_search($node, $this->children, true); 1295 $didx = array_search($node, $this->dom->nodes, true); 1296 1297 if ($nidx !== false && $cidx !== false && $didx !== false) { 1298 1299 foreach($node->children as $child) { 1300 $node->removeChild($child); 1301 } 1302 1303 foreach($node->nodes as $entity) { 1304 $enidx = array_search($entity, $node->nodes, true); 1305 $edidx = array_search($entity, $node->dom->nodes, true); 1306 1307 if ($enidx !== false && $edidx !== false) { 1308 unset($node->nodes[$enidx]); 1309 unset($node->dom->nodes[$edidx]); 1310 } 1311 } 1312 1313 unset($this->nodes[$nidx]); 1314 unset($this->children[$cidx]); 1315 unset($this->dom->nodes[$didx]); 1316 1317 $node->clear(); 1318 1319 } 1320 } 1321 1322 function getElementById($id) 1323 { 1324 return $this->find("#$id", 0); 1325 } 1326 1327 function getElementsById($id, $idx = null) 1328 { 1329 return $this->find("#$id", $idx); 1330 } 1331 1332 function getElementByTagName($name) 1333 { 1334 return $this->find($name, 0); 1335 } 1336 1337 function getElementsByTagName($name, $idx = null) 1338 { 1339 return $this->find($name, $idx); 1340 } 1341 1342 function parentNode() 1343 { 1344 return $this->parent(); 1345 } 1346 1347 function childNodes($idx = -1) 1348 { 1349 return $this->children($idx); 1350 } 1351 1352 function firstChild() 1353 { 1354 return $this->first_child(); 1355 } 1356 1357 function lastChild() 1358 { 1359 return $this->last_child(); 1360 } 1361 1362 function nextSibling() 1363 { 1364 return $this->next_sibling(); 1365 } 1366 1367 function previousSibling() 1368 { 1369 return $this->prev_sibling(); 1370 } 1371 1372 function hasChildNodes() 1373 { 1374 return $this->has_child(); 1375 } 1376 1377 function nodeName() 1378 { 1379 return $this->tag; 1380 } 1381 1382 function appendChild($node) 1383 { 1384 $node->parent($this); 1385 return $node; 1386 } 1387 1388} 1389 1390class simple_html_dom 1391{ 1392 public $root = null; 1393 public $nodes = array(); 1394 public $callback = null; 1395 public $lowercase = false; 1396 public $original_size; 1397 public $size; 1398 1399 public $stripRNAttrValues = true; // added option to ignore RN in attr values - nomadjimbob 1400 1401 protected $pos; 1402 protected $doc; 1403 protected $char; 1404 1405 protected $cursor; 1406 protected $parent; 1407 protected $noise = array(); 1408 protected $token_blank = " \t\r\n"; 1409 protected $token_equal = ' =/>'; 1410 protected $token_slash = " />\r\n\t"; 1411 protected $token_attr = ' >'; 1412 1413 public $_charset = ''; 1414 public $_target_charset = ''; 1415 1416 protected $default_br_text = ''; 1417 1418 public $default_span_text = ''; 1419 1420 protected $self_closing_tags = array( 1421 'area' => 1, 1422 'base' => 1, 1423 'br' => 1, 1424 'col' => 1, 1425 'embed' => 1, 1426 'hr' => 1, 1427 'img' => 1, 1428 'input' => 1, 1429 'link' => 1, 1430 'meta' => 1, 1431 'param' => 1, 1432 'source' => 1, 1433 'track' => 1, 1434 'wbr' => 1 1435 ); 1436 protected $block_tags = array( 1437 'body' => 1, 1438 'div' => 1, 1439 'form' => 1, 1440 'root' => 1, 1441 'span' => 1, 1442 'table' => 1 1443 ); 1444 protected $optional_closing_tags = array( 1445 // Not optional, see 1446 // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element 1447 'b' => array('b' => 1), 1448 'dd' => array('dd' => 1, 'dt' => 1), 1449 // Not optional, see 1450 // https://www.w3.org/TR/html/grouping-content.html#the-dl-element 1451 'dl' => array('dd' => 1, 'dt' => 1), 1452 'dt' => array('dd' => 1, 'dt' => 1), 1453 'li' => array('li' => 1), 1454 'optgroup' => array('optgroup' => 1, 'option' => 1), 1455 'option' => array('optgroup' => 1, 'option' => 1), 1456 'p' => array('p' => 1), 1457 'rp' => array('rp' => 1, 'rt' => 1), 1458 'rt' => array('rp' => 1, 'rt' => 1), 1459 'td' => array('td' => 1, 'th' => 1), 1460 'th' => array('td' => 1, 'th' => 1), 1461 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1), 1462 ); 1463 1464 function __construct( 1465 $str = null, 1466 $lowercase = true, 1467 $forceTagsClosed = true, 1468 $target_charset = DEFAULT_TARGET_CHARSET, 1469 $stripRN = true, 1470 $defaultBRText = DEFAULT_BR_TEXT, 1471 $defaultSpanText = DEFAULT_SPAN_TEXT, 1472 $options = 0) 1473 { 1474 if ($str) { 1475 if (preg_match('/^http:\/\//i', $str) || is_file($str)) { 1476 $this->load_file($str); 1477 } else { 1478 $this->load( 1479 $str, 1480 $lowercase, 1481 $stripRN, 1482 $defaultBRText, 1483 $defaultSpanText, 1484 $options 1485 ); 1486 } 1487 } 1488 // Forcing tags to be closed implies that we don't trust the html, but 1489 // it can lead to parsing errors if we SHOULD trust the html. 1490 if (!$forceTagsClosed) { 1491 $this->optional_closing_array = array(); 1492 } 1493 1494 $this->_target_charset = $target_charset; 1495 } 1496 1497 function __destruct() 1498 { 1499 $this->clear(); 1500 } 1501 1502 function load( 1503 $str, 1504 $lowercase = true, 1505 $stripRN = true, 1506 $defaultBRText = DEFAULT_BR_TEXT, 1507 $defaultSpanText = DEFAULT_SPAN_TEXT, 1508 $options = 0) 1509 { 1510 global $debug_object; 1511 1512 // prepare 1513 $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText); 1514 1515 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1516 // Script tags removal now preceeds style tag removal. 1517 // strip out <script> tags 1518 $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"); 1519 $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"); 1520 1521 // strip out the \r \n's if we are told to. 1522 if ($stripRN) { 1523 $this->doc = str_replace("\r", ' ', $this->doc); 1524 $this->doc = str_replace("\n", ' ', $this->doc); 1525 1526 // set the length of content since we have changed it. 1527 $this->size = strlen($this->doc); 1528 } 1529 1530 // strip out cdata 1531 $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1532 // strip out comments 1533 $this->remove_noise("'<!--(.*?)-->'is"); 1534 // strip out <style> tags 1535 $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); 1536 $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); 1537 // strip out preformatted tags 1538 $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); 1539 // strip out server side scripts 1540 $this->remove_noise("'(<\?)(.*?)(\?>)'s", true); 1541 1542 if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts 1543 $this->remove_noise("'(\{\w)(.*?)(\})'s", true); 1544 } 1545 1546 // parsing 1547 $this->parse(); 1548 // end 1549 $this->root->_[HDOM_INFO_END] = $this->cursor; 1550 $this->parse_charset(); 1551 1552 // make load function chainable 1553 return $this; 1554 } 1555 1556 function load_file() 1557 { 1558 $args = func_get_args(); 1559 1560 if(($doc = call_user_func_array('file_get_contents', $args)) !== false) { 1561 $this->load($doc, true); 1562 } else { 1563 return false; 1564 } 1565 } 1566 1567 function set_callback($function_name) 1568 { 1569 $this->callback = $function_name; 1570 } 1571 1572 function remove_callback() 1573 { 1574 $this->callback = null; 1575 } 1576 1577 function save($filepath = '') 1578 { 1579 $ret = $this->root->innertext(); 1580 if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); } 1581 return $ret; 1582 } 1583 1584 function find($selector, $idx = null, $lowercase = false) 1585 { 1586 return $this->root->find($selector, $idx, $lowercase); 1587 } 1588 1589 function clear() 1590 { 1591 if (isset($this->nodes)) { 1592 foreach ($this->nodes as $n) { 1593 $n->clear(); 1594 $n = null; 1595 } 1596 } 1597 1598 // This add next line is documented in the sourceforge repository. 1599 // 2977248 as a fix for ongoing memory leaks that occur even with the 1600 // use of clear. 1601 if (isset($this->children)) { 1602 foreach ($this->children as $n) { 1603 $n->clear(); 1604 $n = null; 1605 } 1606 } 1607 1608 if (isset($this->parent)) { 1609 $this->parent->clear(); 1610 unset($this->parent); 1611 } 1612 1613 if (isset($this->root)) { 1614 $this->root->clear(); 1615 unset($this->root); 1616 } 1617 1618 unset($this->doc); 1619 unset($this->noise); 1620 } 1621 1622 function dump($show_attr = true) 1623 { 1624 $this->root->dump($show_attr); 1625 } 1626 1627 protected function prepare( 1628 $str, $lowercase = true, 1629 $defaultBRText = DEFAULT_BR_TEXT, 1630 $defaultSpanText = DEFAULT_SPAN_TEXT) 1631 { 1632 $this->clear(); 1633 1634 $this->doc = trim($str); 1635 $this->size = strlen($this->doc); 1636 $this->original_size = $this->size; // original size of the html 1637 $this->pos = 0; 1638 $this->cursor = 1; 1639 $this->noise = array(); 1640 $this->nodes = array(); 1641 $this->lowercase = $lowercase; 1642 $this->default_br_text = $defaultBRText; 1643 $this->default_span_text = $defaultSpanText; 1644 $this->root = new simple_html_dom_node($this); 1645 $this->root->tag = 'root'; 1646 $this->root->_[HDOM_INFO_BEGIN] = -1; 1647 $this->root->nodetype = HDOM_TYPE_ROOT; 1648 $this->parent = $this->root; 1649 if ($this->size > 0) { $this->char = $this->doc[0]; } 1650 } 1651 1652 protected function parse() 1653 { 1654 while (true) { 1655 // Read next tag if there is no text between current position and the 1656 // next opening tag. 1657 if (($s = $this->copy_until_char('<')) === '') { 1658 if($this->read_tag()) { 1659 continue; 1660 } else { 1661 return true; 1662 } 1663 } 1664 1665 // Add a text node for text between tags 1666 $node = new simple_html_dom_node($this); 1667 ++$this->cursor; 1668 $node->_[HDOM_INFO_TEXT] = $s; 1669 $this->link_nodes($node, false); 1670 } 1671 } 1672 1673 protected function parse_charset() 1674 { 1675 global $debug_object; 1676 1677 $charset = null; 1678 1679 if (function_exists('get_last_retrieve_url_contents_content_type')) { 1680 $contentTypeHeader = get_last_retrieve_url_contents_content_type(); 1681 $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); 1682 if ($success) { 1683 $charset = $matches[1]; 1684 if (is_object($debug_object)) { 1685 $debug_object->debug_log(2, 1686 'header content-type found charset of: ' 1687 . $charset 1688 ); 1689 } 1690 } 1691 } 1692 1693 if (empty($charset)) { 1694 // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type 1695 $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true); 1696 1697 if (!empty($el)) { 1698 $fullvalue = $el->content; 1699 if (is_object($debug_object)) { 1700 $debug_object->debug_log(2, 1701 'meta content-type tag found' 1702 . $fullvalue 1703 ); 1704 } 1705 1706 if (!empty($fullvalue)) { 1707 $success = preg_match( 1708 '/charset=(.+)/i', 1709 $fullvalue, 1710 $matches 1711 ); 1712 1713 if ($success) { 1714 $charset = $matches[1]; 1715 } else { 1716 // If there is a meta tag, and they don't specify the 1717 // character set, research says that it's typically 1718 // ISO-8859-1 1719 if (is_object($debug_object)) { 1720 $debug_object->debug_log(2, 1721 'meta content-type tag couldn\'t be parsed. using iso-8859 default.' 1722 ); 1723 } 1724 1725 $charset = 'ISO-8859-1'; 1726 } 1727 } 1728 } 1729 } 1730 1731 if (empty($charset)) { 1732 // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration 1733 if ($meta = $this->root->find('meta[charset]', 0)) { 1734 $charset = $meta->charset; 1735 if (is_object($debug_object)) { 1736 $debug_object->debug_log(2, 'meta charset: ' . $charset); 1737 } 1738 } 1739 } 1740 1741 if (empty($charset)) { 1742 // Try to guess the charset based on the content 1743 // Requires Multibyte String (mbstring) support (optional) 1744 if (function_exists('mb_detect_encoding')) { 1745 /** 1746 * mb_detect_encoding() is not intended to distinguish between 1747 * charsets, especially single-byte charsets. Its primary 1748 * purpose is to detect which multibyte encoding is in use, 1749 * i.e. UTF-8, UTF-16, shift-JIS, etc. 1750 * 1751 * -- https://bugs.php.net/bug.php?id=38138 1752 * 1753 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will 1754 * always result in CP1251/ISO-8859-5 and vice versa. 1755 * 1756 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1 1757 * to stay compatible. 1758 */ 1759 $encoding = mb_detect_encoding( 1760 $this->doc, 1761 array( 'UTF-8', 'CP1252', 'ISO-8859-1' ) 1762 ); 1763 1764 if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') { 1765 // Due to a limitation of mb_detect_encoding 1766 // 'CP1251'/'ISO-8859-5' will be detected as 1767 // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in 1768 // which case we can simply assume it is the other charset. 1769 if (!@iconv('CP1252', 'UTF-8', $this->doc)) { 1770 $encoding = 'CP1251'; 1771 } 1772 } 1773 1774 if ($encoding !== false) { 1775 $charset = $encoding; 1776 if (is_object($debug_object)) { 1777 $debug_object->debug_log(2, 'mb_detect: ' . $charset); 1778 } 1779 } 1780 } 1781 } 1782 1783 if (empty($charset)) { 1784 // Assume it's UTF-8 as it is the most likely charset to be used 1785 $charset = 'UTF-8'; 1786 if (is_object($debug_object)) { 1787 $debug_object->debug_log(2, 'No match found, assume ' . $charset); 1788 } 1789 } 1790 1791 // Since CP1252 is a superset, if we get one of it's subsets, we want 1792 // it instead. 1793 if ((strtolower($charset) == 'iso-8859-1') 1794 || (strtolower($charset) == 'latin1') 1795 || (strtolower($charset) == 'latin-1')) { 1796 $charset = 'CP1252'; 1797 if (is_object($debug_object)) { 1798 $debug_object->debug_log(2, 1799 'replacing ' . $charset . ' with CP1252 as its a superset' 1800 ); 1801 } 1802 } 1803 1804 if (is_object($debug_object)) { 1805 $debug_object->debug_log(1, 'EXIT - ' . $charset); 1806 } 1807 1808 return $this->_charset = $charset; 1809 } 1810 1811 protected function read_tag() 1812 { 1813 // Set end position if no further tags found 1814 if ($this->char !== '<') { 1815 $this->root->_[HDOM_INFO_END] = $this->cursor; 1816 return false; 1817 } 1818 1819 $begin_tag_pos = $this->pos; 1820 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1821 1822 // end tag 1823 if ($this->char === '/') { 1824 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1825 1826 // Skip whitespace in end tags (i.e. in "</ html>") 1827 $this->skip($this->token_blank); 1828 $tag = $this->copy_until_char('>'); 1829 1830 // Skip attributes in end tags 1831 if (($pos = strpos($tag, ' ')) !== false) { 1832 $tag = substr($tag, 0, $pos); 1833 } 1834 1835 $parent_lower = strtolower($this->parent->tag); 1836 $tag_lower = strtolower($tag); 1837 1838 // The end tag is supposed to close the parent tag. Handle situations 1839 // when it doesn't 1840 if ($parent_lower !== $tag_lower) { 1841 // Parent tag does not have to be closed necessarily (optional closing tag) 1842 // Current tag is a block tag, so it may close an ancestor 1843 if (isset($this->optional_closing_tags[$parent_lower]) 1844 && isset($this->block_tags[$tag_lower])) { 1845 1846 $this->parent->_[HDOM_INFO_END] = 0; 1847 $org_parent = $this->parent; 1848 1849 // Traverse ancestors to find a matching opening tag 1850 // Stop at root node 1851 while (($this->parent->parent) 1852 && strtolower($this->parent->tag) !== $tag_lower 1853 ){ 1854 $this->parent = $this->parent->parent; 1855 } 1856 1857 // If we don't have a match add current tag as text node 1858 if (strtolower($this->parent->tag) !== $tag_lower) { 1859 $this->parent = $org_parent; // restore origonal parent 1860 1861 if ($this->parent->parent) { 1862 $this->parent = $this->parent->parent; 1863 } 1864 1865 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1866 return $this->as_text_node($tag); 1867 } 1868 } elseif (($this->parent->parent) 1869 && isset($this->block_tags[$tag_lower]) 1870 ) { 1871 // Grandparent exists and current tag is a block tag, so our 1872 // parent doesn't have an end tag 1873 $this->parent->_[HDOM_INFO_END] = 0; // No end tag 1874 $org_parent = $this->parent; 1875 1876 // Traverse ancestors to find a matching opening tag 1877 // Stop at root node 1878 while (($this->parent->parent) 1879 && strtolower($this->parent->tag) !== $tag_lower 1880 ) { 1881 $this->parent = $this->parent->parent; 1882 } 1883 1884 // If we don't have a match add current tag as text node 1885 if (strtolower($this->parent->tag) !== $tag_lower) { 1886 $this->parent = $org_parent; // restore origonal parent 1887 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1888 return $this->as_text_node($tag); 1889 } 1890 } elseif (($this->parent->parent) 1891 && strtolower($this->parent->parent->tag) === $tag_lower 1892 ) { // Grandparent exists and current tag closes it 1893 $this->parent->_[HDOM_INFO_END] = 0; 1894 $this->parent = $this->parent->parent; 1895 } else { // Random tag, add as text node 1896 return $this->as_text_node($tag); 1897 } 1898 } 1899 1900 // Set end position of parent tag to current cursor position 1901 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1902 1903 if ($this->parent->parent) { 1904 $this->parent = $this->parent->parent; 1905 } 1906 1907 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1908 return true; 1909 } 1910 1911 // start tag 1912 $node = new simple_html_dom_node($this); 1913 $node->_[HDOM_INFO_BEGIN] = $this->cursor; 1914 ++$this->cursor; 1915 $tag = $this->copy_until($this->token_slash); // Get tag name 1916 $node->tag_start = $begin_tag_pos; 1917 1918 // doctype, cdata & comments... 1919 // <!DOCTYPE html> 1920 // <![CDATA[ ... ]]> 1921 // <!-- Comment --> 1922 if (isset($tag[0]) && $tag[0] === '!') { 1923 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); 1924 1925 if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--") 1926 $node->nodetype = HDOM_TYPE_COMMENT; 1927 $node->tag = 'comment'; 1928 } else { // Could be doctype or CDATA but we don't care 1929 $node->nodetype = HDOM_TYPE_UNKNOWN; 1930 $node->tag = 'unknown'; 1931 } 1932 1933 if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1934 1935 $this->link_nodes($node, true); 1936 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1937 return true; 1938 } 1939 1940 // The start tag cannot contain another start tag, if so add as text 1941 // i.e. "<<html>" 1942 if ($pos = strpos($tag, '<') !== false) { 1943 $tag = '<' . substr($tag, 0, -1); 1944 $node->_[HDOM_INFO_TEXT] = $tag; 1945 $this->link_nodes($node, false); 1946 $this->char = $this->doc[--$this->pos]; // prev 1947 return true; 1948 } 1949 1950 // Handle invalid tag names (i.e. "<html#doc>") 1951 if (!preg_match('/^\w[\w:-]*$/', $tag)) { 1952 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); 1953 1954 // Next char is the beginning of a new tag, don't touch it. 1955 if ($this->char === '<') { 1956 $this->link_nodes($node, false); 1957 return true; 1958 } 1959 1960 // Next char closes current tag, add and be done with it. 1961 if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1962 $this->link_nodes($node, false); 1963 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1964 return true; 1965 } 1966 1967 // begin tag, add new node 1968 $node->nodetype = HDOM_TYPE_ELEMENT; 1969 $tag_lower = strtolower($tag); 1970 $node->tag = ($this->lowercase) ? $tag_lower : $tag; 1971 1972 // handle optional closing tags 1973 if (isset($this->optional_closing_tags[$tag_lower])) { 1974 // Traverse ancestors to close all optional closing tags 1975 while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { 1976 $this->parent->_[HDOM_INFO_END] = 0; 1977 $this->parent = $this->parent->parent; 1978 } 1979 $node->parent = $this->parent; 1980 } 1981 1982 $guard = 0; // prevent infinity loop 1983 1984 // [0] Space between tag and first attribute 1985 $space = array($this->copy_skip($this->token_blank), '', ''); 1986 1987 // attributes 1988 do { 1989 // Everything until the first equal sign should be the attribute name 1990 $name = $this->copy_until($this->token_equal); 1991 1992 if ($name === '' && $this->char !== null && $space[0] === '') { 1993 break; 1994 } 1995 1996 if ($guard === $this->pos) { // Escape infinite loop 1997 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1998 continue; 1999 } 2000 2001 $guard = $this->pos; 2002 2003 // handle endless '<' 2004 // Out of bounds before the tag ended 2005 if ($this->pos >= $this->size - 1 && $this->char !== '>') { 2006 $node->nodetype = HDOM_TYPE_TEXT; 2007 $node->_[HDOM_INFO_END] = 0; 2008 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name; 2009 $node->tag = 'text'; 2010 $this->link_nodes($node, false); 2011 return true; 2012 } 2013 2014 // handle mismatch '<' 2015 // Attributes cannot start after opening tag 2016 if ($this->doc[$this->pos - 1] == '<') { 2017 $node->nodetype = HDOM_TYPE_TEXT; 2018 $node->tag = 'text'; 2019 $node->attr = array(); 2020 $node->_[HDOM_INFO_END] = 0; 2021 $node->_[HDOM_INFO_TEXT] = substr( 2022 $this->doc, 2023 $begin_tag_pos, 2024 $this->pos - $begin_tag_pos - 1 2025 ); 2026 $this->pos -= 2; 2027 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2028 $this->link_nodes($node, false); 2029 return true; 2030 } 2031 2032 if ($name !== '/' && $name !== '') { // this is a attribute name 2033 // [1] Whitespace after attribute name 2034 $space[1] = $this->copy_skip($this->token_blank); 2035 2036 $name = $this->restore_noise($name); // might be a noisy name 2037 2038 if ($this->lowercase) { $name = strtolower($name); } 2039 2040 if ($this->char === '=') { // attribute with value 2041 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2042 $this->parse_attr($node, $name, $space); // get attribute value 2043 } else { 2044 //no value attr: nowrap, checked selected... 2045 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 2046 $node->attr[$name] = true; 2047 if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev 2048 } 2049 2050 $node->_[HDOM_INFO_SPACE][] = $space; 2051 2052 // prepare for next attribute 2053 $space = array( 2054 $this->copy_skip($this->token_blank), 2055 '', 2056 '' 2057 ); 2058 } else { // no more attributes 2059 break; 2060 } 2061 } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended 2062 2063 $this->link_nodes($node, true); 2064 $node->_[HDOM_INFO_ENDSPACE] = $space[0]; 2065 2066 // handle empty tags (i.e. "<div/>") 2067 if ($this->copy_until_char('>') === '/') { 2068 $node->_[HDOM_INFO_ENDSPACE] .= '/'; 2069 $node->_[HDOM_INFO_END] = 0; 2070 } else { 2071 // reset parent 2072 if (!isset($this->self_closing_tags[strtolower($node->tag)])) { 2073 $this->parent = $node; 2074 } 2075 } 2076 2077 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2078 2079 // If it's a BR tag, we need to set it's text to the default text. 2080 // This way when we see it in plaintext, we can generate formatting that the user wants. 2081 // since a br tag never has sub nodes, this works well. 2082 if ($node->tag === 'br') { 2083 $node->_[HDOM_INFO_INNER] = $this->default_br_text; 2084 } 2085 2086 return true; 2087 } 2088 2089 protected function parse_attr($node, $name, &$space) 2090 { 2091 $is_duplicate = isset($node->attr[$name]); 2092 2093 if (!$is_duplicate) // Copy whitespace between "=" and value 2094 $space[2] = $this->copy_skip($this->token_blank); 2095 2096 switch ($this->char) { 2097 case '"': 2098 $quote_type = HDOM_QUOTE_DOUBLE; 2099 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2100 $value = $this->copy_until_char('"'); 2101 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2102 break; 2103 case '\'': 2104 $quote_type = HDOM_QUOTE_SINGLE; 2105 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2106 $value = $this->copy_until_char('\''); 2107 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2108 break; 2109 default: 2110 $quote_type = HDOM_QUOTE_NO; 2111 $value = $this->copy_until($this->token_attr); 2112 } 2113 2114 $value = $this->restore_noise($value); 2115 2116 // PaperG: Attributes should not have \r or \n in them, that counts as 2117 // html whitespace. 2118 2119 // Added $stripRNAttrValues option for DokuWiki - nomadjimbob 2120 if($this->stripRNAttrValues) { 2121 $value = str_replace("\r", '', $value); 2122 $value = str_replace("\n", '', $value); 2123 } 2124 2125 // PaperG: If this is a "class" selector, lets get rid of the preceeding 2126 // and trailing space since some people leave it in the multi class case. 2127 if ($name === 'class') { 2128 $value = trim($value); 2129 } 2130 2131 if (!$is_duplicate) { 2132 $node->_[HDOM_INFO_QUOTE][] = $quote_type; 2133 $node->attr[$name] = $value; 2134 } 2135 } 2136 2137 protected function link_nodes(&$node, $is_child) 2138 { 2139 $node->parent = $this->parent; 2140 $this->parent->nodes[] = $node; 2141 if ($is_child) { 2142 $this->parent->children[] = $node; 2143 } 2144 } 2145 2146 protected function as_text_node($tag) 2147 { 2148 $node = new simple_html_dom_node($this); 2149 ++$this->cursor; 2150 $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; 2151 $this->link_nodes($node, false); 2152 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2153 return true; 2154 } 2155 2156 protected function skip($chars) 2157 { 2158 $this->pos += strspn($this->doc, $chars, $this->pos); 2159 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2160 } 2161 2162 protected function copy_skip($chars) 2163 { 2164 $pos = $this->pos; 2165 $len = strspn($this->doc, $chars, $pos); 2166 $this->pos += $len; 2167 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2168 if ($len === 0) { return ''; } 2169 return substr($this->doc, $pos, $len); 2170 } 2171 2172 protected function copy_until($chars) 2173 { 2174 $pos = $this->pos; 2175 $len = strcspn($this->doc, $chars, $pos); 2176 $this->pos += $len; 2177 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2178 return substr($this->doc, $pos, $len); 2179 } 2180 2181 protected function copy_until_char($char) 2182 { 2183 if ($this->char === null) { return ''; } 2184 2185 if (($pos = strpos($this->doc, $char, $this->pos)) === false) { 2186 $ret = substr($this->doc, $this->pos, $this->size - $this->pos); 2187 $this->char = null; 2188 $this->pos = $this->size; 2189 return $ret; 2190 } 2191 2192 if ($pos === $this->pos) { return ''; } 2193 2194 $pos_old = $this->pos; 2195 $this->char = $this->doc[$pos]; 2196 $this->pos = $pos; 2197 return substr($this->doc, $pos_old, $pos - $pos_old); 2198 } 2199 2200 protected function remove_noise($pattern, $remove_tag = false) 2201 { 2202 global $debug_object; 2203 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2204 2205 $count = preg_match_all( 2206 $pattern, 2207 $this->doc, 2208 $matches, 2209 PREG_SET_ORDER | PREG_OFFSET_CAPTURE 2210 ); 2211 2212 for ($i = $count - 1; $i > -1; --$i) { 2213 $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000); 2214 2215 if (is_object($debug_object)) { 2216 $debug_object->debug_log(2, 'key is: ' . $key); 2217 } 2218 2219 $idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch 2220 $this->noise[$key] = $matches[$i][$idx][0]; 2221 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 2222 } 2223 2224 // reset the length of content 2225 $this->size = strlen($this->doc); 2226 2227 if ($this->size > 0) { 2228 $this->char = $this->doc[0]; 2229 } 2230 } 2231 2232 function restore_noise($text) 2233 { 2234 global $debug_object; 2235 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2236 2237 while (($pos = strpos($text, '___noise___')) !== false) { 2238 // Sometimes there is a broken piece of markup, and we don't GET the 2239 // pos+11 etc... token which indicates a problem outside of us... 2240 2241 // todo: "___noise___1000" (or any number with four or more digits) 2242 // in the DOM causes an infinite loop which could be utilized by 2243 // malicious software 2244 if (strlen($text) > $pos + 15) { 2245 $key = '___noise___' 2246 . $text[$pos + 11] 2247 . $text[$pos + 12] 2248 . $text[$pos + 13] 2249 . $text[$pos + 14] 2250 . $text[$pos + 15]; 2251 2252 if (is_object($debug_object)) { 2253 $debug_object->debug_log(2, 'located key of: ' . $key); 2254 } 2255 2256 if (isset($this->noise[$key])) { 2257 $text = substr($text, 0, $pos) 2258 . $this->noise[$key] 2259 . substr($text, $pos + 16); 2260 } else { 2261 // do this to prevent an infinite loop. 2262 $text = substr($text, 0, $pos) 2263 . 'UNDEFINED NOISE FOR KEY: ' 2264 . $key 2265 . substr($text, $pos + 16); 2266 } 2267 } else { 2268 // There is no valid key being given back to us... We must get 2269 // rid of the ___noise___ or we will have a problem. 2270 $text = substr($text, 0, $pos) 2271 . 'NO NUMERIC NOISE KEY' 2272 . substr($text, $pos + 11); 2273 } 2274 } 2275 return $text; 2276 } 2277 2278 function search_noise($text) 2279 { 2280 global $debug_object; 2281 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2282 2283 foreach($this->noise as $noiseElement) { 2284 if (strpos($noiseElement, $text) !== false) { 2285 return $noiseElement; 2286 } 2287 } 2288 } 2289 2290 function __toString() 2291 { 2292 return $this->root->innertext(); 2293 } 2294 2295 function __get($name) 2296 { 2297 switch ($name) { 2298 case 'outertext': 2299 return $this->root->innertext(); 2300 case 'innertext': 2301 return $this->root->innertext(); 2302 case 'plaintext': 2303 return $this->root->text(); 2304 case 'charset': 2305 return $this->_charset; 2306 case 'target_charset': 2307 return $this->_target_charset; 2308 } 2309 } 2310 2311 function childNodes($idx = -1) 2312 { 2313 return $this->root->childNodes($idx); 2314 } 2315 2316 function firstChild() 2317 { 2318 return $this->root->first_child(); 2319 } 2320 2321 function lastChild() 2322 { 2323 return $this->root->last_child(); 2324 } 2325 2326 function createElement($name, $value = null) 2327 { 2328 return @str_get_html("<$name>$value</$name>")->firstChild(); 2329 } 2330 2331 function createTextNode($value) 2332 { 2333 return @end(str_get_html($value)->nodes); 2334 } 2335 2336 function getElementById($id) 2337 { 2338 return $this->find("#$id", 0); 2339 } 2340 2341 function getElementsById($id, $idx = null) 2342 { 2343 return $this->find("#$id", $idx); 2344 } 2345 2346 function getElementByTagName($name) 2347 { 2348 return $this->find($name, 0); 2349 } 2350 2351 function getElementsByTagName($name, $idx = -1) 2352 { 2353 return $this->find($name, $idx); 2354 } 2355 2356 function loadFile() 2357 { 2358 $args = func_get_args(); 2359 $this->load_file($args); 2360 } 2361} 2362