<?php
/**
 * Website: http://sourceforge.net/projects/simplehtmldom/
 * Additional projects: http://sourceforge.net/projects/debugobject/
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
 *
 * Licensed under The MIT License
 * See the LICENSE file in the project root for more information.
 *
 * Authors:
 *   S.C. Chen
 *   John Schlick
 *   Rus Carroll
 *   logmanoriginal
 *
 * Contributors:
 *   Yousuke Kumakura
 *   Vadim Voituk
 *   Antcs
 *
 * Version Rev. 1.9.1 (291)
 */

// Node types
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
// Attribute quoting styles (stored per attribute in $node->_[HDOM_INFO_QUOTE])
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
// Keys of the per-node bookkeeping array $node->_
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Defaults; each may be defined by the application before including this file
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
define('HDOM_SMARTY_AS_TEXT', 1);

/**
 * Read a document with file_get_contents() and load it into a new
 * simple_html_dom object.
 *
 * The first five parameters are forwarded to file_get_contents(); a
 * non-positive $maxLen is replaced by MAX_FILE_SIZE. The remaining parameters
 * are forwarded to the simple_html_dom constructor.
 *
 * @param string $url Path or URL to read
 * @param bool $use_include_path See file_get_contents()
 * @param resource|null $context See file_get_contents()
 * @param int $offset See file_get_contents()
 * @param int $maxLen Maximum number of bytes to read (<= 0: MAX_FILE_SIZE)
 * @param bool $lowercase Passed to the DOM constructor and to load()
 * @param bool $forceTagsClosed Passed to the DOM constructor
 * @param string $target_charset Passed to the DOM constructor
 * @param bool $stripRN Passed to the DOM constructor and to load()
 * @param string $defaultBRText Passed to the DOM constructor
 * @param string $defaultSpanText Passed to the DOM constructor
 * @return mixed The result of simple_html_dom::load(), or false if nothing
 * could be read or the content is longer than $maxLen
 */
function file_get_html(
    $url,
    $use_include_path = false,
    $context = null,
    $offset = 0,
    $maxLen = -1,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    /**
     * For sourceforge users: uncomment the next line and comment the
     * retrieve_url_contents line 2 lines down if it is not already done.
     */
    $contents = file_get_contents(
        $url,
        $use_include_path,
        $context,
        $offset,
        $maxLen
    );
    // $contents = retrieve_url_contents($url);

    // file_get_contents() returns false on failure, which empty() also catches
    if (empty($contents) || strlen($contents) > $maxLen) {
        $dom->clear();
        return false;
    }

    return $dom->load($contents, $lowercase, $stripRN);
}

/**
 * Load a string into a new simple_html_dom object.
 *
 * @param string $str Document source
 * @param bool $lowercase Passed to the DOM constructor and to load()
 * @param bool $forceTagsClosed Passed to the DOM constructor
 * @param string $target_charset Passed to the DOM constructor
 * @param bool $stripRN Passed to the DOM constructor and to load()
 * @param string $defaultBRText Passed to the DOM constructor
 * @param string $defaultSpanText Passed to the DOM constructor
 * @return mixed The result of simple_html_dom::load(), or false if $str is
 * empty or longer than MAX_FILE_SIZE
 */
function str_get_html(
    $str,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
        $dom->clear();
        return false;
    }

    return $dom->load($str, $lowercase, $stripRN);
}

/**
 * Echo the tree below $node (see simple_html_dom_node::dump()).
 *
 * @param simple_html_dom_node $node Node to start at
 * @param bool $show_attr Also print the attributes of each node
 * @param int $deep Indentation depth of the first printed node
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
    // Forward the options; previously $node itself was passed as $show_attr
    // and both $show_attr and $deep were silently ignored.
    $node->dump($show_attr, $deep);
}

/**
 * A single node (element, text, comment, ...) of a parsed document.
 *
 * Attributes are exposed as magic properties ($node->href) next to the
 * virtual properties outertext, innertext, plaintext and xmltext.
 */
class simple_html_dom_node
{
    /** @var int One of the HDOM_TYPE_* constants */
    public $nodetype = HDOM_TYPE_TEXT;
    /** @var string Tag name ('text' for text nodes, 'root' for the root) */
    public $tag = 'text';
    /** @var array Attribute name => value */
    public $attr = array();
    /** @var array|null Child nodes as maintained by parent()/removeChild() */
    public $children = array();
    /** @var array|null Nodes below this node as maintained by parent()/removeChild() */
    public $nodes = array();
    /** @var simple_html_dom_node|null */
    public $parent = null;
    /** @var array Bookkeeping data indexed by the HDOM_INFO_* constants */
    public $_ = array();
    public $tag_start = 0;
    /** @var simple_html_dom|null Owning document */
    private $dom = null;

    /**
     * Create the node and register it in the node list of $dom.
     *
     * @param simple_html_dom $dom Owning document
     */
    function __construct($dom)
    {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }

    function __destruct()
    {
        $this->clear();
    }

    function __toString()
    {
        return $this->outertext();
    }

    /**
     * Drop the references to the document and to related nodes.
     *
     * Note: $nodes and $children are null (not empty arrays) afterwards.
     */
    function clear()
    {
        $this->dom = null;
        $this->nodes = null;
        $this->parent = null;
        $this->children = null;
    }

    /**
     * Echo this node and, recursively, all of its nodes, one per line and
     * indented by one tab per level.
     *
     * @param bool $show_attr Also print the attributes
     * @param int $depth Indentation depth of this node
     * @return void
     */
    function dump($show_attr = true, $depth = 0)
    {
        echo str_repeat("\t", $depth) . $this->tag;

        if ($show_attr && count($this->attr) > 0) {
            echo '(';
            foreach ($this->attr as $k => $v) {
                echo "[$k]=>\"$v\", ";
            }
            echo ')';
        }

        echo "\n";

        if ($this->nodes) {
            foreach ($this->nodes as $node) {
                $node->dump($show_attr, $depth + 1);
            }
        }
    }

    /**
     * Build a one-line debug description of this node.
     *
     * @param bool $echo Echo the description instead of returning it
     * @return string|void The description if $echo is false
     */
    function dump_node($echo = true)
    {
        $string = $this->tag;

        if (count($this->attr) > 0) {
            $string .= '(';
            foreach ($this->attr as $k => $v) {
                $string .= "[$k]=>\"$v\", ";
            }
            $string .= ')';
        }

        if (count($this->_) > 0) {
            $string .= ' $_ (';
            foreach ($this->_ as $k => $v) {
                if (is_array($v)) {
                    $string .= "[$k]=>(";
                    foreach ($v as $k2 => $v2) {
                        $string .= "[$k2]=>\"$v2\", ";
                    }
                    $string .= ')';
                } else {
                    $string .= "[$k]=>\"$v\", ";
                }
            }
            $string .= ')';
        }

        if (isset($this->text)) {
            $string .= " text: ({$this->text})";
        }

        $string .= ' HDOM_INNER_INFO: ';

        // Read from $this; the undefined variable $node was used here before,
        // so this always reported NULL.
        if (isset($this->_[HDOM_INFO_INNER])) {
            $string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
        } else {
            $string .= ' NULL ';
        }

        $string .= ' children: ' . count($this->children);
        $string .= ' nodes: ' . count($this->nodes);
        $string .= ' tag_start: ' . $this->tag_start;
        $string .= "\n";

        if ($echo) {
            echo $string;
            return;
        } else {
            return $string;
        }
    }

    /**
     * Get the parent node, or set it when $parent is given.
     *
     * @param simple_html_dom_node|null $parent New parent (optional)
     * @return simple_html_dom_node|null The (new) parent
     */
    function parent($parent = null)
    {
        // I am SURE that this doesn't work properly.
        // It fails to unset the current node from its current parent's nodes
        // or children list first.
        if ($parent !== null) {
            $this->parent = $parent;
            $this->parent->nodes[] = $this;
            $this->parent->children[] = $this;
        }

        return $this->parent;
    }

    /**
     * @return bool True if this node has at least one child
     */
    function has_child()
    {
        return !empty($this->children);
    }

    /**
     * Get all children, or the child at position $idx.
     *
     * @param int $idx Index of the child, -1 for the complete list
     * @return array|simple_html_dom_node|null
     */
    function children($idx = -1)
    {
        if ($idx === -1) {
            return $this->children;
        }

        if (isset($this->children[$idx])) {
            return $this->children[$idx];
        }

        return null;
    }

    /**
     * @return simple_html_dom_node|null First child or null if there is none
     */
    function first_child()
    {
        // empty() also covers a cleared node (children === null); reset()
        // does not depend on key 0 still existing after removeChild().
        if (!empty($this->children)) {
            return reset($this->children);
        }
        return null;
    }

    /**
     * @return simple_html_dom_node|null Last child or null if there is none
     */
    function last_child()
    {
        if (!empty($this->children)) {
            return end($this->children);
        }
        return null;
    }

    /**
     * @return simple_html_dom_node|null The child following this node in the
     * parent's children list, or null
     */
    function next_sibling()
    {
        if ($this->parent === null) {
            return null;
        }

        $idx = array_search($this, $this->parent->children, true);

        if ($idx !== false && isset($this->parent->children[$idx + 1])) {
            return $this->parent->children[$idx + 1];
        }

        return null;
    }

    /**
     * @return simple_html_dom_node|null The child preceding this node in the
     * parent's children list, or null
     */
    function prev_sibling()
    {
        if ($this->parent === null) {
            return null;
        }

        $idx = array_search($this, $this->parent->children, true);

        if ($idx !== false && $idx > 0) {
            return $this->parent->children[$idx - 1];
        }

        return null;
    }

    /**
     * Walk up the parent chain until a node with the given tag is found.
     *
     * @param string $tag Tag name to look for (compared with ===)
     * @return simple_html_dom_node|null The closest matching ancestor or null
     */
    function find_ancestor_tag($tag)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        if ($this->parent === null) {
            return null;
        }

        $ancestor = $this->parent;

        while (!is_null($ancestor)) {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
            }

            if ($ancestor->tag === $tag) {
                break;
            }

            $ancestor = $ancestor->parent;
        }

        return $ancestor;
    }

    /**
     * Get the inner markup of this node.
     *
     * @return string An explicitly assigned inner text, else the (noise
     * restored) text of a text-like node, else the concatenated outertext of
     * all nodes below this one
     */
    function innertext()
    {
        if (isset($this->_[HDOM_INFO_INNER])) {
            return $this->_[HDOM_INFO_INNER];
        }

        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '';

        foreach ($this->nodes as $n) {
            $ret .= $n->outertext();
        }

        return $ret;
    }

    /**
     * Get the markup of this node including its own tag.
     *
     * @return string
     */
    function outertext()
    {
        global $debug_object;

        if (is_object($debug_object)) {
            $text = '';

            if ($this->tag === 'text') {
                if (!empty($this->text)) {
                    $text = ' with text: ' . $this->text;
                }
            }

            $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
        }

        if ($this->tag === 'root') {
            return $this->innertext();
        }

        // todo: What is the use of this callback? Remove?
        if ($this->dom && $this->dom->callback !== null) {
            call_user_func_array($this->dom->callback, array($this));
        }

        if (isset($this->_[HDOM_INFO_OUTER])) {
            return $this->_[HDOM_INFO_OUTER];
        }

        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '';

        // Render the opening tag. The isset()/empty() guards avoid
        // "undefined index" notices for nodes without begin information.
        if ($this->dom
            && isset($this->_[HDOM_INFO_BEGIN])
            && !empty($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])) {
            $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
        }

        if (isset($this->_[HDOM_INFO_INNER])) {
            // todo: <br> should either never have HDOM_INFO_INNER or always
            if ($this->tag !== 'br') {
                $ret .= $this->_[HDOM_INFO_INNER];
            }
        } elseif ($this->nodes) {
            foreach ($this->nodes as $n) {
                $ret .= $this->convert_text($n->outertext());
            }
        }

        if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
            $ret .= '</' . $this->tag . '>';
        }

        return $ret;
    }

    /**
     * Get the plain text of this node (the "plaintext" property).
     *
     * Comments, unknown nodes, script and style elements yield ''.
     *
     * @return string
     */
    function text()
    {
        if (isset($this->_[HDOM_INFO_INNER])) {
            return $this->_[HDOM_INFO_INNER];
        }

        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT: return '';
            case HDOM_TYPE_UNKNOWN: return '';
        }

        if (strcasecmp($this->tag, 'script') === 0) { return ''; }
        if (strcasecmp($this->tag, 'style') === 0) { return ''; }

        $ret = '';

        // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
        // for some span tags, and some p tags) $this->nodes is set to NULL.
        // NOTE: This indicates that there is a problem where it's set to NULL
        // without a clear happening.
        // WHY is this happening?
        if (!is_null($this->nodes)) {
            foreach ($this->nodes as $n) {
                // Start paragraph after a blank line
                if ($n->tag === 'p') {
                    $ret = trim($ret) . "\n\n";
                }

                $ret .= $this->convert_text($n->text());

                // If this node is a span... add a space at the end of it so
                // multiple spans don't run into each other. This is plaintext
                // after all.
                if ($n->tag === 'span') {
                    $ret .= $this->dom->default_span_text;
                }
            }
        }
        return $ret;
    }

    /**
     * Get the inner text with CDATA section markers removed.
     *
     * @return string
     */
    function xmltext()
    {
        $ret = $this->innertext();
        $ret = str_ireplace('<![CDATA[', '', $ret);
        $ret = str_replace(']]>', '', $ret);
        return $ret;
    }

    /**
     * Build the opening tag of this node from its tag name and attributes,
     * using the recorded spacing and quoting of each attribute.
     *
     * @return string The tag, or the stored text for text-like nodes
     */
    function makeup()
    {
        // text, comment, unknown
        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '<' . $this->tag;
        $i = -1;

        foreach ($this->attr as $key => $val) {
            // $i is the position of the attribute; it indexes the recorded
            // spacing/quoting and therefore counts skipped attributes too.
            ++$i;

            // skip removed attribute
            if ($val === null || $val === false) { continue; }

            $ret .= $this->_[HDOM_INFO_SPACE][$i][0];

            //no value attr: nowrap, checked selected...
            if ($val === true) {
                $ret .= $key;
            } else {
                switch ($this->_[HDOM_INFO_QUOTE][$i])
                {
                    case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                    case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                    default: $quote = '';
                }

                $ret .= $key
                . $this->_[HDOM_INFO_SPACE][$i][1]
                . '='
                . $this->_[HDOM_INFO_SPACE][$i][2]
                . $quote
                . $val
                . $quote;
            }
        }

        $ret = $this->dom->restore_noise($ret);
        return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
    }

    /**
     * Find nodes below this node by CSS selector.
     *
     * @param string $selector Selector or comma separated selector list
     * @param int|null $idx Null to get all matches as array, otherwise the
     * index of the match to return (negative values count from the end)
     * @param bool $lowercase Compare class names and attribute values
     * case-insensitively
     * @return array|simple_html_dom_node|null
     */
    function find($selector, $idx = null, $lowercase = false)
    {
        $selectors = $this->parse_selector($selector);
        if (($count = count($selectors)) === 0) { return array(); }
        $found_keys = array();

        // find each selector
        for ($c = 0; $c < $count; ++$c) {
            // The change on the below line was documented on the sourceforge
            // code tracker id 2788009
            // used to be: if (($levle=count($selectors[0]))===0) return array();
            if (($levle = count($selectors[$c])) === 0) { return array(); }
            if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

            $head = array($this->_[HDOM_INFO_BEGIN] => 1);
            $cmd = ' '; // Combinator

            // handle descendant selectors, no recursive!
            for ($l = 0; $l < $levle; ++$l) {
                $ret = array();

                foreach ($head as $k => $v) {
                    $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
                    //PaperG - Pass this optional parameter on to the seek function.
                    $n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
                }

                $head = $ret;
                $cmd = $selectors[$c][$l][4]; // Next Combinator
            }

            foreach ($head as $k => $v) {
                if (!isset($found_keys[$k])) {
                    $found_keys[$k] = 1;
                }
            }
        }

        // sort keys
        ksort($found_keys);

        $found = array();
        foreach ($found_keys as $k => $v) {
            $found[] = $this->dom->nodes[$k];
        }

        // return nth-element or array
        if (is_null($idx)) { return $found; }
        elseif ($idx < 0) { $idx = count($found) + $idx; }
        return (isset($found[$idx])) ? $found[$idx] : null;
    }

    /**
     * Collect the nodes matching one compound selector, relative to this
     * node and the given combinator.
     *
     * @param array $selector One entry produced by parse_selector():
     * array(tag, id, classes, attributes, combinator)
     * @param array $ret Result set, passed by reference. The key of each
     * match (its index in the document's node list) is set to 1.
     * @param string $parent_cmd Combinator linking this node to the
     * candidates: ' ' (descendant), '>' (child), '+' (next sibling) or
     * '~' (subsequent siblings)
     * @param bool $lowercase Compare class names and attribute values
     * case-insensitively
     * @return void
     */
    protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        list($tag, $id, $class, $attributes, $cmb) = $selector;
        $nodes = array();

        if ($parent_cmd === ' ') { // Descendant Combinator
            // Find parent closing tag if the current element doesn't have a closing
            // tag (i.e. void element)
            $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
            if ($end == 0) {
                $parent = $this->parent;
                // Test for null first so a missing parent is never dereferenced
                while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                    $end -= 1;
                    $parent = $parent->parent;
                }
                if ($parent !== null) {
                    $end += $parent->_[HDOM_INFO_END];
                }
            }

            // Get list of target nodes
            $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
            $nodes_count = $end - $nodes_start;
            $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
        } elseif ($parent_cmd === '>') { // Child Combinator
            $nodes = $this->children;
        } elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
            // Identity search only: a loose in_array() on node objects compares
            // them recursively and may report a different, equal looking node.
            $index = array_search($this, $this->parent->children, true);

            if ($index !== false) {
                if ($parent_cmd === '+') { // Next-Sibling Combinator
                    if ($index + 1 < count($this->parent->children)) {
                        $nodes[] = $this->parent->children[$index + 1];
                    }
                } else { // Subsequent Sibling Combinator
                    // Only elements after this one; an element is not its
                    // own sibling.
                    $nodes = array_slice($this->parent->children, $index + 1);
                }
            }
        }

        // Go through each element starting at this element until the end tag
        // Note: If this element is a void tag, any previous void element is
        // skipped.
        foreach($nodes as $node) {
            $pass = true;

            // Skip root nodes
            if(!$node->parent) {
                $pass = false;
            }

            // Handle 'text' selector
            if($pass && $tag === 'text' && $node->tag === 'text') {
                $ret[array_search($node, $this->dom->nodes, true)] = 1;
                unset($node);
                continue;
            }

            // Skip if node isn't a child node (i.e. text nodes)
            if($pass && !in_array($node, $node->parent->children, true)) {
                $pass = false;
            }

            // Skip if tag doesn't match
            if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
                $pass = false;
            }

            // Skip if ID doesn't exist
            if ($pass && $id !== '' && !isset($node->attr['id'])) {
                $pass = false;
            }

            // Check if ID matches
            if ($pass && $id !== '' && isset($node->attr['id'])) {
                // Note: Only consider the first ID (as browsers do)
                $node_id = explode(' ', trim($node->attr['id']))[0];

                if($id !== $node_id) { $pass = false; }
            }

            // Check if all class(es) exist
            if ($pass && $class !== '' && is_array($class) && !empty($class)) {
                if (isset($node->attr['class'])) {
                    $node_classes = explode(' ', $node->attr['class']);

                    if ($lowercase) {
                        $node_classes = array_map('strtolower', $node_classes);
                    }

                    foreach($class as $c) {
                        // Strict: loose comparison treats numeric looking
                        // class names such as "10" and "1e1" as equal.
                        if(!in_array($c, $node_classes, true)) {
                            $pass = false;
                            break;
                        }
                    }
                } else {
                    $pass = false;
                }
            }

            // Check attributes
            if ($pass
                && $attributes !== ''
                && is_array($attributes)
                && !empty($attributes)) {
                foreach($attributes as $a) {
                    list (
                        $att_name,
                        $att_expr,
                        $att_val,
                        $att_inv,
                        $att_case_sensitivity
                    ) = $a;

                    // Handle indexing attributes (i.e. "[2]")
                    /**
                     * Note: This is not supported by the CSS Standard but adds
                     * the ability to select items compatible to XPath (i.e.
                     * the 3rd element within its parent).
                     *
                     * Note: This doesn't conflict with the CSS Standard which
                     * doesn't work on numeric attributes anyway.
                     */
                    if (is_numeric($att_name)
                        && $att_expr === ''
                        && $att_val === '') {
                        $count = 0;

                        // Find index of current element in parent
                        foreach ($node->parent->children as $c) {
                            if ($c->tag === $node->tag) ++$count;
                            if ($c === $node) break;
                        }

                        // If this is the correct node, continue with next
                        // attribute
                        if ($count === (int)$att_name) continue;
                    }

                    // Check attribute availability
                    if ($att_inv) { // Attribute should NOT be set
                        if (isset($node->attr[$att_name])) {
                            $pass = false;
                            break;
                        }
                    } else { // Attribute should be set
                        // todo: "plaintext" is not a valid CSS selector!
                        if ($att_name !== 'plaintext'
                            && !isset($node->attr[$att_name])) {
                            $pass = false;
                            break;
                        }
                    }

                    // Continue with next attribute if expression isn't defined
                    if ($att_expr === '') continue;

                    // If they have told us that this is a "plaintext"
                    // search then we want the plaintext of the node - right?
                    // todo "plaintext" is not a valid CSS selector!
                    if ($att_name === 'plaintext') {
                        $nodeKeyValue = $node->text();
                    } else {
                        $nodeKeyValue = $node->attr[$att_name];
                    }

                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2,
                            'testing node: '
                            . $node->tag
                            . ' for attribute: '
                            . $att_name
                            . $att_expr
                            . $att_val
                            . ' where nodes value is: '
                            . $nodeKeyValue
                        );
                    }

                    // If lowercase is set, do a case insensitive test of
                    // the value of the selector.
                    if ($lowercase) {
                        $check = $this->match(
                            $att_expr,
                            strtolower($att_val),
                            strtolower($nodeKeyValue),
                            $att_case_sensitivity
                        );
                    } else {
                        $check = $this->match(
                            $att_expr,
                            $att_val,
                            $nodeKeyValue,
                            $att_case_sensitivity
                        );
                    }

                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2,
                            'after match: '
                            . ($check ? 'true' : 'false')
                        );
                    }

                    if (!$check) {
                        $pass = false;
                        break;
                    }
                }
            }

            // Found a match. Add to list and clear node
            if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
            unset($node);
        }
        // It's passed by reference so this is actually what this function returns.
        if (is_object($debug_object)) {
            $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
        }
    }

    /**
     * Test an attribute value against a selector value.
     *
     * @param string $exp Operator: =, !=, ^=, $=, *=, |= or ~=
     * @param string $pattern Value given in the selector
     * @param string $value Value of the node
     * @param string $case_sensitivity 'i' to compare case-insensitively
     * @return bool|int Truthy if the value matches; false for unknown
     * operators
     */
    protected function match($exp, $pattern, $value, $case_sensitivity)
    {
        global $debug_object;
        if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

        if ($case_sensitivity === 'i') {
            $pattern = strtolower($pattern);
            $value = strtolower($value);
        }

        switch ($exp) {
            case '=':
                return ($value === $pattern);
            case '!=':
                return ($value !== $pattern);
            case '^=':
                return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
            case '$=':
                return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
            case '*=':
                return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
            case '|=':
                /**
                 * [att|=val]
                 *
                 * Represents an element with the att attribute, its value
                 * either being exactly "val" or beginning with "val"
                 * immediately followed by "-" (U+002D).
                 */
                // A plain prefix test would also accept e.g. "english" for "en"
                return $value === $pattern
                    || strpos($value, $pattern . '-') === 0;
            case '~=':
                /**
                 * [att~=val]
                 *
                 * Represents an element with the att attribute whose value is a
                 * whitespace-separated list of words, one of which is exactly
                 * "val". If "val" contains whitespace, it will never represent
                 * anything (since the words are separated by spaces). Also if
                 * "val" is the empty string, it will never represent anything.
                 */
                return in_array($pattern, explode(' ', trim($value)), true);
        }
        return false;
    }

    /**
     * Split a selector string into selector lists and compound selectors.
     *
     * @param string $selector_string Selector as passed to find()
     * @return array One entry per comma separated selector; each entry is a
     * list of array(tag, id, classes, attributes, combinator), where classes
     * is '' or a list of names and attributes is '' or a list of
     * array(name, expression, value, inverted, case-sensitivity)
     */
    protected function parse_selector($selector_string)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        /**
         * Pattern of CSS selectors, modified from mootools (https://mootools.net/)
         *
         * Paperg: Add the colon to the attribute, so that it properly finds
         * <tag attr:ibute="something" > like google does.
         *
         * Note: if you try to look at this attribute, you MUST use getAttribute
         * since $dom->x:y will fail the php syntax check.
         *
         * Notice the \[ starting the attribute? and the @? following? This
         * implies that an html attribute specifier may start with an @ sign
         * that is NOT captured by the expression. Further study is required
         * to determine if this should be documented or removed.
         *
         * Matches selectors in this order:
         *
         * [0] - full match
         *
         * [1] - tag name
         *     ([\w:\*-]*)
         *     Matches the tag name consisting of zero or more words, colons,
         *     asterisks and hyphens.
         *
         * [2] - id name
         *     (?:\#([\w-]+))?
         *     Optionally matches an id name, consisting of an "#" followed by
         *     the id name (one or more words and hyphens).
         *
         * [3] - class names (including dots)
         *     (?:|\.([\w\.-]+))?
         *     Optionally matches a list of classes, consisting of an "."
         *     followed by the class name (one or more words and hyphens)
         *     where multiple classes can be chained (i.e. ".foo.bar.baz")
         *
         * [4] - attributes
         *     ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?
         *     Optionally matches the attributes list
         *
         * [5] - separator
         *     ([\/, >+~]+)
         *     Matches the selector list separator
         */
        // phpcs:ignore Generic.Files.LineLength
        $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

        preg_match_all(
            $pattern,
            trim($selector_string) . ' ', // Add final ' ' as pseudo separator
            $matches,
            PREG_SET_ORDER
        );

        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Matches Array: ', $matches);
        }

        $selectors = array();
        $result = array();

        foreach ($matches as $m) {
            $m[0] = trim($m[0]);

            // Skip NoOps
            if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; }

            // Convert to lowercase
            if ($this->dom->lowercase) {
                $m[1] = strtolower($m[1]);
            }

            // Extract classes
            if ($m[3] !== '') { $m[3] = explode('.', $m[3]); }

            /* Extract attributes (pattern based on the pattern above!)

             * [0] - full match
             * [1] - attribute name
             * [2] - attribute expression
             * [3] - attribute value
             * [4] - case sensitivity
             *
             * Note: Attributes can be negated with a "!" prefix to their name
             */
            if($m[4] !== '') {
                preg_match_all(
                    "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
                    trim($m[4]),
                    $attributes,
                    PREG_SET_ORDER
                );

                // Replace element by array
                $m[4] = array();

                foreach($attributes as $att) {
                    // Skip empty matches
                    if(trim($att[0]) === '') { continue; }

                    $inverted = (isset($att[1][0]) && $att[1][0] === '!');
                    $m[4][] = array(
                        $inverted ? substr($att[1], 1) : $att[1], // Name
                        (isset($att[2])) ? $att[2] : '', // Expression
                        (isset($att[3])) ? $att[3] : '', // Value
                        $inverted, // Inverted Flag
                        (isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
                    );
                }
            }

            // Sanitize Separator
            if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator
                $m[5] = ' ';
            } else { // Other Separator
                $m[5] = trim($m[5]);
            }

            // Clear Separator if it's a Selector List
            if ($is_list = ($m[5] === ',')) { $m[5] = ''; }

            // Remove full match before adding to results
            array_shift($m);
            $result[] = $m;

            if ($is_list) { // Selector List
                $selectors[] = $result;
                $result = array();
            }
        }

        if (count($result) > 0) { $selectors[] = $result; }
        return $selectors;
    }

    /**
     * Magic getter: attributes first, then the virtual text properties.
     *
     * @param string $name Attribute or virtual property name
     * @return mixed The (charset converted) attribute value, the requested
     * text, or for anything else whether an attribute of that name exists
     */
    function __get($name)
    {
        if (isset($this->attr[$name])) {
            return $this->convert_text($this->attr[$name]);
        }
        switch ($name) {
            case 'outertext': return $this->outertext();
            case 'innertext': return $this->innertext();
            case 'plaintext': return $this->text();
            case 'xmltext': return $this->xmltext();
            default: return array_key_exists($name, $this->attr);
        }
    }

    /**
     * Magic setter: overrides outertext/innertext or sets an attribute.
     *
     * @param string $name Attribute or virtual property name
     * @param mixed $value New value
     */
    function __set($name, $value)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        switch ($name) {
            case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
            case 'innertext':
                if (isset($this->_[HDOM_INFO_TEXT])) {
                    return $this->_[HDOM_INFO_TEXT] = $value;
                }
                return $this->_[HDOM_INFO_INNER] = $value;
        }

        // New attribute: record default spacing and quoting for makeup()
        if (!isset($this->attr[$name])) {
            $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
            $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        }

        $this->attr[$name] = $value;
    }

    /**
     * Magic isset: true for outertext/innertext/plaintext and for every
     * existing attribute key.
     *
     * @param string $name Attribute or virtual property name
     * @return bool
     */
    function __isset($name)
    {
        switch ($name) {
            case 'outertext': return true;
            case 'innertext': return true;
            case 'plaintext': return true;
        }
        //no value attr: nowrap, checked selected...
        return array_key_exists($name, $this->attr);
    }

    function __unset($name)
    {
        if (isset($this->attr[$name])) { unset($this->attr[$name]); }
    }

    /**
     * Convert text from the document charset to the target charset and
     * strip a UTF-8 byte order mark from both ends.
     *
     * @param string $text Text to convert
     * @return string|false Converted text (the result of iconv() when a
     * conversion took place)
     */
    function convert_text($text)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        $converted_text = $text;

        $sourceCharset = '';
        $targetCharset = '';

        if ($this->dom) {
            $sourceCharset = strtoupper($this->dom->_charset);
            $targetCharset = strtoupper($this->dom->_target_charset);
        }

        if (is_object($debug_object)) {
            $debug_object->debug_log(3,
                'source charset: '
                . $sourceCharset
                . ' target charaset: '
                . $targetCharset
            );
        }

        if (!empty($sourceCharset)
            && !empty($targetCharset)
            && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
            // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
            if ((strcasecmp($targetCharset, 'UTF-8') == 0)
                && ($this->is_utf8($text))) {
                $converted_text = $text;
            } else {
                $converted_text = iconv($sourceCharset, $targetCharset, $text);
            }
        }

        // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
        if ($targetCharset === 'UTF-8') {
            if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
                $converted_text = substr($converted_text, 3);
            }

            if (substr($converted_text, -3) === "\xef\xbb\xbf") {
                $converted_text = substr($converted_text, 0, -3);
            }
        }

        return $converted_text;
    }

    /**
     * Check whether a string consists of well-formed UTF-8 style byte
     * sequences (lead byte followed by the right number of continuation
     * bytes). Sequences of up to six bytes are accepted.
     *
     * @param string $str Bytes to check
     * @return bool
     */
    static function is_utf8($str)
    {
        $c = 0; $b = 0;
        $bits = 0;
        $len = strlen($str);
        for($i = 0; $i < $len; $i++) {
            $c = ord($str[$i]);
            // Everything from 0x80 on is non-ASCII. 0x80 itself is a
            // continuation byte and was let through by the former "> 128".
            if($c >= 128) {
                if(($c >= 254)) { return false; }
                elseif($c >= 252) { $bits = 6; }
                elseif($c >= 248) { $bits = 5; }
                elseif($c >= 240) { $bits = 4; }
                elseif($c >= 224) { $bits = 3; }
                elseif($c >= 192) { $bits = 2; }
                else { return false; }
                if(($i + $bits) > $len) { return false; }
                while($bits > 1) {
                    $i++;
                    $b = ord($str[$i]);
                    if($b < 128 || $b > 191) { return false; }
                    $bits--;
                }
            }
        }
        return true;
    }

    /**
     * Determine the display size of an img element from its width/height
     * attributes or, failing that, from pixel values in its inline style.
     *
     * @return array|false array('height' => ..., 'width' => ...) with -1 for
     * unknown values, or false if this node is not an img element
     */
    function get_display_size()
    {
        $width = -1;
        $height = -1;

        if ($this->tag !== 'img') {
            return false;
        }

        // See if there is a height or width attribute in the tag itself.
        if (isset($this->attr['width'])) {
            $width = $this->attr['width'];
        }

        if (isset($this->attr['height'])) {
            $height = $this->attr['height'];
        }

        // Now look for an inline style.
        if (isset($this->attr['style'])) {
            // Thanks to user gnarf from stackoverflow for this regular expression.
            $attributes = array();

            preg_match_all(
                '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
                $this->attr['style'],
                $matches,
                PREG_SET_ORDER
            );

            foreach ($matches as $match) {
                $attributes[$match[1]] = $match[2];
            }

            // If there is a width in the style attributes:
            if (isset($attributes['width']) && $width == -1) {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['width'], -2)) === 'px') {
                    $proposed_width = substr($attributes['width'], 0, -2);
                    // Now make sure that it's an integer and not something stupid.
                    // filter_var() returns false on failure; "0" is a valid integer.
                    if (filter_var($proposed_width, FILTER_VALIDATE_INT) !== false) {
                        $width = $proposed_width;
                    }
                }
            }

            // If there is a height in the style attributes:
            if (isset($attributes['height']) && $height == -1) {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['height'], -2)) == 'px') {
                    $proposed_height = substr($attributes['height'], 0, -2);
                    // Now make sure that it's an integer and not something stupid.
                    if (filter_var($proposed_height, FILTER_VALIDATE_INT) !== false) {
                        $height = $proposed_height;
                    }
                }
            }

        }

        // Future enhancement:
        // Look in the tag to see if there is a class or id specified that has
        // a height or width attribute to it.

        // Far future enhancement
        // Look at all the parent tags of this image to see if they specify a
        // class or id that has an img selector that specifies a height or width
        // Note that in this case, the class or id will have the img subselector
        // for it to apply to the image.

        // ridiculously far future development
        // If the class or id is specified in a SEPARATE css file that's not on
        // the page, go get it and do what we were just doing for the ones on
        // the page.

        $result = array(
            'height' => $height,
            'width' => $width
        );

        return $result;
    }

    /**
     * Render this node and optionally write the result to a file.
     *
     * @param string $filepath Target file; '' to skip writing
     * @return string The outertext of this node
     */
    function save($filepath = '')
    {
        $ret = $this->outertext();

        if ($filepath !== '') {
            file_put_contents($filepath, $ret, LOCK_EX);
        }

        return $ret;
    }

    /**
     * Add one or more classes to the class attribute.
     *
     * @param string|array $class Space separated names or list of names
     * @return void
     */
    function addClass($class)
    {
        global $debug_object;

        if (is_string($class)) {
            $class = explode(' ', $class);
        }

        if (is_array($class)) {
            foreach($class as $c) {
                if (isset($this->class)) {
                    if ($this->hasClass($c)) {
                        continue;
                    } else {
                        $this->class .= ' ' . $c;
                    }
                } else {
                    $this->class = $c;
                }
            }
        } else {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
            }
        }
    }

    /**
     * Check whether the class attribute contains the given class name.
     *
     * @param string $class Single class name
     * @return bool
     */
    function hasClass($class)
    {
        global $debug_object;

        if (is_string($class)) {
            if (isset($this->class)) {
                return in_array($class, explode(' ', $this->class), true);
            }
        } else {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
            }
        }

        return false;
    }

    /**
     * Remove classes from the class attribute.
     *
     * @param string|array|null $class Space separated names or list of
     * names; null removes the class attribute altogether
     * @return void
     */
    function removeClass($class = null)
    {
        if (!isset($this->class)) {
            return;
        }

        if (is_null($class)) {
            $this->removeAttribute('class');
            return;
        }

        if (is_string($class)) {
            $class = explode(' ', $class);
        }

        if (is_array($class)) {
            $class = array_diff(explode(' ', $this->class), $class);
            if (empty($class)) {
                $this->removeAttribute('class');
            } else {
                $this->class = implode(' ', $class);
            }
        }
    }

    function getAllAttributes()
    {
        return $this->attr;
    }

    function getAttribute($name)
    {
        return $this->__get($name);
    }

    function setAttribute($name, $value)
    {
        $this->__set($name, $value);
    }

    function hasAttribute($name)
    {
        return $this->__isset($name);
    }

    /**
     * Mark an attribute as removed by setting it to null (makeup() skips
     * null attributes).
     */
    function removeAttribute($name)
    {
        $this->__set($name, null);
    }

    /**
     * Remove this node from its parent (no-op without parent).
     */
    function remove()
    {
        if ($this->parent) {
            $this->parent->removeChild($this);
        }
    }

    /**
     * Remove a child node, everything below it, and the corresponding
     * entries of the document's node list. Nothing happens unless $node is
     * found in this node's nodes, its children and the document's nodes.
     *
     * @param simple_html_dom_node $node Child to remove
     * @return void
     */
    function removeChild($node)
    {
        $nidx = array_search($node, $this->nodes, true);
        $cidx = array_search($node, $this->children, true);
        $didx = array_search($node, $this->dom->nodes, true);

        if ($nidx !== false && $cidx !== false && $didx !== false) {

            foreach($node->children as $child) {
                $node->removeChild($child);
            }

            // Remaining entries are nodes that are not children (e.g. text)
            foreach($node->nodes as $entity) {
                $enidx = array_search($entity, $node->nodes, true);
                $edidx = array_search($entity, $node->dom->nodes, true);

                if ($enidx !== false && $edidx !== false) {
                    unset($node->nodes[$enidx]);
                    unset($node->dom->nodes[$edidx]);
                }
            }

            unset($this->nodes[$nidx]);
            unset($this->children[$cidx]);
            unset($this->dom->nodes[$didx]);

            $node->clear();

        }
    }

    // DOM style aliases

    function getElementById($id)
    {
        return $this->find("#$id", 0);
    }

    function getElementsById($id, $idx = null)
    {
        return $this->find("#$id", $idx);
    }

    function getElementByTagName($name)
    {
        return $this->find($name, 0);
    }

    function getElementsByTagName($name, $idx = null)
    {
        return $this->find($name, $idx);
    }

    function parentNode()
    {
        return $this->parent();
    }

    function childNodes($idx = -1)
    {
        return $this->children($idx);
    }

    function firstChild()
    {
        return $this->first_child();
    }

    function lastChild()
    {
        return $this->last_child();
    }

    function nextSibling()
    {
        return $this->next_sibling();
    }

    function previousSibling()
    {
        return $this->prev_sibling();
    }

    function hasChildNodes()
    {
        return $this->has_child();
    }

    function nodeName()
    {
        return $this->tag;
    }

    /**
     * Make $node a child of this node (see parent()).
     *
     * @param simple_html_dom_node $node Node to append
     * @return simple_html_dom_node The appended node
     */
    function appendChild($node)
    {
        $node->parent($this);
        return $node;
    }

}

class simple_html_dom
{
    public $root = null;
    public $nodes = array();
    public $callback = null;
    public $lowercase = false;
    public $original_size;
    public $size;

    protected $pos;
    protected $doc;
    protected $char;

    protected $cursor;
    protected $parent;
    protected $noise = array();
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/>';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';

    public $_charset = '';
    public $_target_charset = '';

    protected $default_br_text = '';

    public $default_span_text = '';

    protected $self_closing_tags = array(
        'area' => 1,
        'base' => 1,
        'br' => 1,
        'col' => 1,
        'embed' => 1,
        'hr' => 1,
        'img' => 1,
        'input' => 1,
        'link' => 1,
        'meta' => 1,
        'param' => 1,
        'source' => 1,
        'track' => 1,
        'wbr' => 1
    );
    protected $block_tags = array(
        'body' => 1,
        'div' => 1,
        'form' => 1,
        'root' => 1,
        'span' => 1,
        'table' => 1
    );
    protected $optional_closing_tags = array(
        // Not optional, see
        // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
        'b' => array('b' => 1),
        'dd' => array('dd' => 1, 'dt' => 1),
        // Not optional, see
        // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
        'dl' => array('dd' => 1, 'dt' => 1),
        'dt' => array('dd' => 1, 'dt' => 1),
        'li' => array('li' => 1),
        'optgroup' => array('optgroup' => 1, 'option' => 1),
        'option' => array('optgroup' => 1, 'option' => 1),
        'p' => array('p' => 1),
        'rp' => array('rp' => 1, 'rt' => 1),
        'rt' => array('rp' => 1, 'rt'
=> 1), 1455 'td' => array('td' => 1, 'th' => 1), 1456 'th' => array('td' => 1, 'th' => 1), 1457 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1), 1458 ); 1459 1460 function __construct( 1461 $str = null, 1462 $lowercase = true, 1463 $forceTagsClosed = true, 1464 $target_charset = DEFAULT_TARGET_CHARSET, 1465 $stripRN = true, 1466 $defaultBRText = DEFAULT_BR_TEXT, 1467 $defaultSpanText = DEFAULT_SPAN_TEXT, 1468 $options = 0) 1469 { 1470 if ($str) { 1471 if (preg_match('/^http:\/\//i', $str) || is_file($str)) { 1472 $this->load_file($str); 1473 } else { 1474 $this->load( 1475 $str, 1476 $lowercase, 1477 $stripRN, 1478 $defaultBRText, 1479 $defaultSpanText, 1480 $options 1481 ); 1482 } 1483 } 1484 // Forcing tags to be closed implies that we don't trust the html, but 1485 // it can lead to parsing errors if we SHOULD trust the html. 1486 if (!$forceTagsClosed) { 1487 $this->optional_closing_array = array(); 1488 } 1489 1490 $this->_target_charset = $target_charset; 1491 } 1492 1493 function __destruct() 1494 { 1495 $this->clear(); 1496 } 1497 1498 function load( 1499 $str, 1500 $lowercase = true, 1501 $stripRN = true, 1502 $defaultBRText = DEFAULT_BR_TEXT, 1503 $defaultSpanText = DEFAULT_SPAN_TEXT, 1504 $options = 0) 1505 { 1506 global $debug_object; 1507 1508 // prepare 1509 $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText); 1510 1511 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1512 // Script tags removal now preceeds style tag removal. 1513 // strip out <script> tags 1514 $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"); 1515 $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"); 1516 1517 // strip out the \r \n's if we are told to. 1518 if ($stripRN) { 1519 $this->doc = str_replace("\r", ' ', $this->doc); 1520 $this->doc = str_replace("\n", ' ', $this->doc); 1521 1522 // set the length of content since we have changed it. 
1523 $this->size = strlen($this->doc); 1524 } 1525 1526 // strip out cdata 1527 $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1528 // strip out comments 1529 $this->remove_noise("'<!--(.*?)-->'is"); 1530 // strip out <style> tags 1531 $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); 1532 $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); 1533 // strip out preformatted tags 1534 $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); 1535 // strip out server side scripts 1536 $this->remove_noise("'(<\?)(.*?)(\?>)'s", true); 1537 1538 if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts 1539 $this->remove_noise("'(\{\w)(.*?)(\})'s", true); 1540 } 1541 1542 // parsing 1543 $this->parse(); 1544 // end 1545 $this->root->_[HDOM_INFO_END] = $this->cursor; 1546 $this->parse_charset(); 1547 1548 // make load function chainable 1549 return $this; 1550 } 1551 1552 function load_file() 1553 { 1554 $args = func_get_args(); 1555 1556 if(($doc = call_user_func_array('file_get_contents', $args)) !== false) { 1557 $this->load($doc, true); 1558 } else { 1559 return false; 1560 } 1561 } 1562 1563 function set_callback($function_name) 1564 { 1565 $this->callback = $function_name; 1566 } 1567 1568 function remove_callback() 1569 { 1570 $this->callback = null; 1571 } 1572 1573 function save($filepath = '') 1574 { 1575 $ret = $this->root->innertext(); 1576 if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); } 1577 return $ret; 1578 } 1579 1580 function find($selector, $idx = null, $lowercase = false) 1581 { 1582 return $this->root->find($selector, $idx, $lowercase); 1583 } 1584 1585 function clear() 1586 { 1587 if (isset($this->nodes)) { 1588 foreach ($this->nodes as $n) { 1589 $n->clear(); 1590 $n = null; 1591 } 1592 } 1593 1594 // This add next line is documented in the sourceforge repository. 1595 // 2977248 as a fix for ongoing memory leaks that occur even with the 1596 // use of clear. 
1597 if (isset($this->children)) { 1598 foreach ($this->children as $n) { 1599 $n->clear(); 1600 $n = null; 1601 } 1602 } 1603 1604 if (isset($this->parent)) { 1605 $this->parent->clear(); 1606 unset($this->parent); 1607 } 1608 1609 if (isset($this->root)) { 1610 $this->root->clear(); 1611 unset($this->root); 1612 } 1613 1614 unset($this->doc); 1615 unset($this->noise); 1616 } 1617 1618 function dump($show_attr = true) 1619 { 1620 $this->root->dump($show_attr); 1621 } 1622 1623 protected function prepare( 1624 $str, $lowercase = true, 1625 $defaultBRText = DEFAULT_BR_TEXT, 1626 $defaultSpanText = DEFAULT_SPAN_TEXT) 1627 { 1628 $this->clear(); 1629 1630 $this->doc = trim($str); 1631 $this->size = strlen($this->doc); 1632 $this->original_size = $this->size; // original size of the html 1633 $this->pos = 0; 1634 $this->cursor = 1; 1635 $this->noise = array(); 1636 $this->nodes = array(); 1637 $this->lowercase = $lowercase; 1638 $this->default_br_text = $defaultBRText; 1639 $this->default_span_text = $defaultSpanText; 1640 $this->root = new simple_html_dom_node($this); 1641 $this->root->tag = 'root'; 1642 $this->root->_[HDOM_INFO_BEGIN] = -1; 1643 $this->root->nodetype = HDOM_TYPE_ROOT; 1644 $this->parent = $this->root; 1645 if ($this->size > 0) { $this->char = $this->doc[0]; } 1646 } 1647 1648 protected function parse() 1649 { 1650 while (true) { 1651 // Read next tag if there is no text between current position and the 1652 // next opening tag. 
1653 if (($s = $this->copy_until_char('<')) === '') { 1654 if($this->read_tag()) { 1655 continue; 1656 } else { 1657 return true; 1658 } 1659 } 1660 1661 // Add a text node for text between tags 1662 $node = new simple_html_dom_node($this); 1663 ++$this->cursor; 1664 $node->_[HDOM_INFO_TEXT] = $s; 1665 $this->link_nodes($node, false); 1666 } 1667 } 1668 1669 protected function parse_charset() 1670 { 1671 global $debug_object; 1672 1673 $charset = null; 1674 1675 if (function_exists('get_last_retrieve_url_contents_content_type')) { 1676 $contentTypeHeader = get_last_retrieve_url_contents_content_type(); 1677 $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); 1678 if ($success) { 1679 $charset = $matches[1]; 1680 if (is_object($debug_object)) { 1681 $debug_object->debug_log(2, 1682 'header content-type found charset of: ' 1683 . $charset 1684 ); 1685 } 1686 } 1687 } 1688 1689 if (empty($charset)) { 1690 // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type 1691 $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true); 1692 1693 if (!empty($el)) { 1694 $fullvalue = $el->content; 1695 if (is_object($debug_object)) { 1696 $debug_object->debug_log(2, 1697 'meta content-type tag found' 1698 . $fullvalue 1699 ); 1700 } 1701 1702 if (!empty($fullvalue)) { 1703 $success = preg_match( 1704 '/charset=(.+)/i', 1705 $fullvalue, 1706 $matches 1707 ); 1708 1709 if ($success) { 1710 $charset = $matches[1]; 1711 } else { 1712 // If there is a meta tag, and they don't specify the 1713 // character set, research says that it's typically 1714 // ISO-8859-1 1715 if (is_object($debug_object)) { 1716 $debug_object->debug_log(2, 1717 'meta content-type tag couldn\'t be parsed. using iso-8859 default.' 
1718 ); 1719 } 1720 1721 $charset = 'ISO-8859-1'; 1722 } 1723 } 1724 } 1725 } 1726 1727 if (empty($charset)) { 1728 // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration 1729 if ($meta = $this->root->find('meta[charset]', 0)) { 1730 $charset = $meta->charset; 1731 if (is_object($debug_object)) { 1732 $debug_object->debug_log(2, 'meta charset: ' . $charset); 1733 } 1734 } 1735 } 1736 1737 if (empty($charset)) { 1738 // Try to guess the charset based on the content 1739 // Requires Multibyte String (mbstring) support (optional) 1740 if (function_exists('mb_detect_encoding')) { 1741 /** 1742 * mb_detect_encoding() is not intended to distinguish between 1743 * charsets, especially single-byte charsets. Its primary 1744 * purpose is to detect which multibyte encoding is in use, 1745 * i.e. UTF-8, UTF-16, shift-JIS, etc. 1746 * 1747 * -- https://bugs.php.net/bug.php?id=38138 1748 * 1749 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will 1750 * always result in CP1251/ISO-8859-5 and vice versa. 1751 * 1752 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1 1753 * to stay compatible. 1754 */ 1755 $encoding = mb_detect_encoding( 1756 $this->doc, 1757 array( 'UTF-8', 'CP1252', 'ISO-8859-1' ) 1758 ); 1759 1760 if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') { 1761 // Due to a limitation of mb_detect_encoding 1762 // 'CP1251'/'ISO-8859-5' will be detected as 1763 // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in 1764 // which case we can simply assume it is the other charset. 1765 if (!@iconv('CP1252', 'UTF-8', $this->doc)) { 1766 $encoding = 'CP1251'; 1767 } 1768 } 1769 1770 if ($encoding !== false) { 1771 $charset = $encoding; 1772 if (is_object($debug_object)) { 1773 $debug_object->debug_log(2, 'mb_detect: ' . 
$charset); 1774 } 1775 } 1776 } 1777 } 1778 1779 if (empty($charset)) { 1780 // Assume it's UTF-8 as it is the most likely charset to be used 1781 $charset = 'UTF-8'; 1782 if (is_object($debug_object)) { 1783 $debug_object->debug_log(2, 'No match found, assume ' . $charset); 1784 } 1785 } 1786 1787 // Since CP1252 is a superset, if we get one of it's subsets, we want 1788 // it instead. 1789 if ((strtolower($charset) == 'iso-8859-1') 1790 || (strtolower($charset) == 'latin1') 1791 || (strtolower($charset) == 'latin-1')) { 1792 $charset = 'CP1252'; 1793 if (is_object($debug_object)) { 1794 $debug_object->debug_log(2, 1795 'replacing ' . $charset . ' with CP1252 as its a superset' 1796 ); 1797 } 1798 } 1799 1800 if (is_object($debug_object)) { 1801 $debug_object->debug_log(1, 'EXIT - ' . $charset); 1802 } 1803 1804 return $this->_charset = $charset; 1805 } 1806 1807 protected function read_tag() 1808 { 1809 // Set end position if no further tags found 1810 if ($this->char !== '<') { 1811 $this->root->_[HDOM_INFO_END] = $this->cursor; 1812 return false; 1813 } 1814 1815 $begin_tag_pos = $this->pos; 1816 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1817 1818 // end tag 1819 if ($this->char === '/') { 1820 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1821 1822 // Skip whitespace in end tags (i.e. in "</ html>") 1823 $this->skip($this->token_blank); 1824 $tag = $this->copy_until_char('>'); 1825 1826 // Skip attributes in end tags 1827 if (($pos = strpos($tag, ' ')) !== false) { 1828 $tag = substr($tag, 0, $pos); 1829 } 1830 1831 $parent_lower = strtolower($this->parent->tag); 1832 $tag_lower = strtolower($tag); 1833 1834 // The end tag is supposed to close the parent tag. 
Handle situations 1835 // when it doesn't 1836 if ($parent_lower !== $tag_lower) { 1837 // Parent tag does not have to be closed necessarily (optional closing tag) 1838 // Current tag is a block tag, so it may close an ancestor 1839 if (isset($this->optional_closing_tags[$parent_lower]) 1840 && isset($this->block_tags[$tag_lower])) { 1841 1842 $this->parent->_[HDOM_INFO_END] = 0; 1843 $org_parent = $this->parent; 1844 1845 // Traverse ancestors to find a matching opening tag 1846 // Stop at root node 1847 while (($this->parent->parent) 1848 && strtolower($this->parent->tag) !== $tag_lower 1849 ){ 1850 $this->parent = $this->parent->parent; 1851 } 1852 1853 // If we don't have a match add current tag as text node 1854 if (strtolower($this->parent->tag) !== $tag_lower) { 1855 $this->parent = $org_parent; // restore origonal parent 1856 1857 if ($this->parent->parent) { 1858 $this->parent = $this->parent->parent; 1859 } 1860 1861 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1862 return $this->as_text_node($tag); 1863 } 1864 } elseif (($this->parent->parent) 1865 && isset($this->block_tags[$tag_lower]) 1866 ) { 1867 // Grandparent exists and current tag is a block tag, so our 1868 // parent doesn't have an end tag 1869 $this->parent->_[HDOM_INFO_END] = 0; // No end tag 1870 $org_parent = $this->parent; 1871 1872 // Traverse ancestors to find a matching opening tag 1873 // Stop at root node 1874 while (($this->parent->parent) 1875 && strtolower($this->parent->tag) !== $tag_lower 1876 ) { 1877 $this->parent = $this->parent->parent; 1878 } 1879 1880 // If we don't have a match add current tag as text node 1881 if (strtolower($this->parent->tag) !== $tag_lower) { 1882 $this->parent = $org_parent; // restore origonal parent 1883 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1884 return $this->as_text_node($tag); 1885 } 1886 } elseif (($this->parent->parent) 1887 && strtolower($this->parent->parent->tag) === $tag_lower 1888 ) { // Grandparent exists and current tag 
closes it 1889 $this->parent->_[HDOM_INFO_END] = 0; 1890 $this->parent = $this->parent->parent; 1891 } else { // Random tag, add as text node 1892 return $this->as_text_node($tag); 1893 } 1894 } 1895 1896 // Set end position of parent tag to current cursor position 1897 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1898 1899 if ($this->parent->parent) { 1900 $this->parent = $this->parent->parent; 1901 } 1902 1903 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1904 return true; 1905 } 1906 1907 // start tag 1908 $node = new simple_html_dom_node($this); 1909 $node->_[HDOM_INFO_BEGIN] = $this->cursor; 1910 ++$this->cursor; 1911 $tag = $this->copy_until($this->token_slash); // Get tag name 1912 $node->tag_start = $begin_tag_pos; 1913 1914 // doctype, cdata & comments... 1915 // <!DOCTYPE html> 1916 // <![CDATA[ ... ]]> 1917 // <!-- Comment --> 1918 if (isset($tag[0]) && $tag[0] === '!') { 1919 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); 1920 1921 if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--") 1922 $node->nodetype = HDOM_TYPE_COMMENT; 1923 $node->tag = 'comment'; 1924 } else { // Could be doctype or CDATA but we don't care 1925 $node->nodetype = HDOM_TYPE_UNKNOWN; 1926 $node->tag = 'unknown'; 1927 } 1928 1929 if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1930 1931 $this->link_nodes($node, true); 1932 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1933 return true; 1934 } 1935 1936 // The start tag cannot contain another start tag, if so add as text 1937 // i.e. "<<html>" 1938 if ($pos = strpos($tag, '<') !== false) { 1939 $tag = '<' . substr($tag, 0, -1); 1940 $node->_[HDOM_INFO_TEXT] = $tag; 1941 $this->link_nodes($node, false); 1942 $this->char = $this->doc[--$this->pos]; // prev 1943 return true; 1944 } 1945 1946 // Handle invalid tag names (i.e. 
"<html#doc>") 1947 if (!preg_match('/^\w[\w:-]*$/', $tag)) { 1948 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); 1949 1950 // Next char is the beginning of a new tag, don't touch it. 1951 if ($this->char === '<') { 1952 $this->link_nodes($node, false); 1953 return true; 1954 } 1955 1956 // Next char closes current tag, add and be done with it. 1957 if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1958 $this->link_nodes($node, false); 1959 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1960 return true; 1961 } 1962 1963 // begin tag, add new node 1964 $node->nodetype = HDOM_TYPE_ELEMENT; 1965 $tag_lower = strtolower($tag); 1966 $node->tag = ($this->lowercase) ? $tag_lower : $tag; 1967 1968 // handle optional closing tags 1969 if (isset($this->optional_closing_tags[$tag_lower])) { 1970 // Traverse ancestors to close all optional closing tags 1971 while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { 1972 $this->parent->_[HDOM_INFO_END] = 0; 1973 $this->parent = $this->parent->parent; 1974 } 1975 $node->parent = $this->parent; 1976 } 1977 1978 $guard = 0; // prevent infinity loop 1979 1980 // [0] Space between tag and first attribute 1981 $space = array($this->copy_skip($this->token_blank), '', ''); 1982 1983 // attributes 1984 do { 1985 // Everything until the first equal sign should be the attribute name 1986 $name = $this->copy_until($this->token_equal); 1987 1988 if ($name === '' && $this->char !== null && $space[0] === '') { 1989 break; 1990 } 1991 1992 if ($guard === $this->pos) { // Escape infinite loop 1993 $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 1994 continue; 1995 } 1996 1997 $guard = $this->pos; 1998 1999 // handle endless '<' 2000 // Out of bounds before the tag ended 2001 if ($this->pos >= $this->size - 1 && $this->char !== '>') { 2002 $node->nodetype = HDOM_TYPE_TEXT; 2003 $node->_[HDOM_INFO_END] = 0; 2004 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name; 2005 $node->tag = 'text'; 2006 $this->link_nodes($node, false); 2007 return true; 2008 } 2009 2010 // handle mismatch '<' 2011 // Attributes cannot start after opening tag 2012 if ($this->doc[$this->pos - 1] == '<') { 2013 $node->nodetype = HDOM_TYPE_TEXT; 2014 $node->tag = 'text'; 2015 $node->attr = array(); 2016 $node->_[HDOM_INFO_END] = 0; 2017 $node->_[HDOM_INFO_TEXT] = substr( 2018 $this->doc, 2019 $begin_tag_pos, 2020 $this->pos - $begin_tag_pos - 1 2021 ); 2022 $this->pos -= 2; 2023 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2024 $this->link_nodes($node, false); 2025 return true; 2026 } 2027 2028 if ($name !== '/' && $name !== '') { // this is a attribute name 2029 // [1] Whitespace after attribute name 2030 $space[1] = $this->copy_skip($this->token_blank); 2031 2032 $name = $this->restore_noise($name); // might be a noisy name 2033 2034 if ($this->lowercase) { $name = strtolower($name); } 2035 2036 if ($this->char === '=') { // attribute with value 2037 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2038 $this->parse_attr($node, $name, $space); // get attribute value 2039 } else { 2040 //no value attr: nowrap, checked selected... 
2041 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 2042 $node->attr[$name] = true; 2043 if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev 2044 } 2045 2046 $node->_[HDOM_INFO_SPACE][] = $space; 2047 2048 // prepare for next attribute 2049 $space = array( 2050 $this->copy_skip($this->token_blank), 2051 '', 2052 '' 2053 ); 2054 } else { // no more attributes 2055 break; 2056 } 2057 } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended 2058 2059 $this->link_nodes($node, true); 2060 $node->_[HDOM_INFO_ENDSPACE] = $space[0]; 2061 2062 // handle empty tags (i.e. "<div/>") 2063 if ($this->copy_until_char('>') === '/') { 2064 $node->_[HDOM_INFO_ENDSPACE] .= '/'; 2065 $node->_[HDOM_INFO_END] = 0; 2066 } else { 2067 // reset parent 2068 if (!isset($this->self_closing_tags[strtolower($node->tag)])) { 2069 $this->parent = $node; 2070 } 2071 } 2072 2073 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2074 2075 // If it's a BR tag, we need to set it's text to the default text. 2076 // This way when we see it in plaintext, we can generate formatting that the user wants. 2077 // since a br tag never has sub nodes, this works well. 2078 if ($node->tag === 'br') { 2079 $node->_[HDOM_INFO_INNER] = $this->default_br_text; 2080 } 2081 2082 return true; 2083 } 2084 2085 protected function parse_attr($node, $name, &$space) 2086 { 2087 $is_duplicate = isset($node->attr[$name]); 2088 2089 if (!$is_duplicate) // Copy whitespace between "=" and value 2090 $space[2] = $this->copy_skip($this->token_blank); 2091 2092 switch ($this->char) { 2093 case '"': 2094 $quote_type = HDOM_QUOTE_DOUBLE; 2095 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2096 $value = $this->copy_until_char('"'); 2097 $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 2098 break; 2099 case '\'': 2100 $quote_type = HDOM_QUOTE_SINGLE; 2101 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2102 $value = $this->copy_until_char('\''); 2103 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2104 break; 2105 default: 2106 $quote_type = HDOM_QUOTE_NO; 2107 $value = $this->copy_until($this->token_attr); 2108 } 2109 2110 $value = $this->restore_noise($value); 2111 2112 // PaperG: Attributes should not have \r or \n in them, that counts as 2113 // html whitespace. 2114 $value = str_replace("\r", '', $value); 2115 $value = str_replace("\n", '', $value); 2116 2117 // PaperG: If this is a "class" selector, lets get rid of the preceeding 2118 // and trailing space since some people leave it in the multi class case. 2119 if ($name === 'class') { 2120 $value = trim($value); 2121 } 2122 2123 if (!$is_duplicate) { 2124 $node->_[HDOM_INFO_QUOTE][] = $quote_type; 2125 $node->attr[$name] = $value; 2126 } 2127 } 2128 2129 protected function link_nodes(&$node, $is_child) 2130 { 2131 $node->parent = $this->parent; 2132 $this->parent->nodes[] = $node; 2133 if ($is_child) { 2134 $this->parent->children[] = $node; 2135 } 2136 } 2137 2138 protected function as_text_node($tag) 2139 { 2140 $node = new simple_html_dom_node($this); 2141 ++$this->cursor; 2142 $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; 2143 $this->link_nodes($node, false); 2144 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2145 return true; 2146 } 2147 2148 protected function skip($chars) 2149 { 2150 $this->pos += strspn($this->doc, $chars, $this->pos); 2151 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2152 } 2153 2154 protected function copy_skip($chars) 2155 { 2156 $pos = $this->pos; 2157 $len = strspn($this->doc, $chars, $pos); 2158 $this->pos += $len; 2159 $this->char = ($this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 2160 if ($len === 0) { return ''; } 2161 return substr($this->doc, $pos, $len); 2162 } 2163 2164 protected function copy_until($chars) 2165 { 2166 $pos = $this->pos; 2167 $len = strcspn($this->doc, $chars, $pos); 2168 $this->pos += $len; 2169 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2170 return substr($this->doc, $pos, $len); 2171 } 2172 2173 protected function copy_until_char($char) 2174 { 2175 if ($this->char === null) { return ''; } 2176 2177 if (($pos = strpos($this->doc, $char, $this->pos)) === false) { 2178 $ret = substr($this->doc, $this->pos, $this->size - $this->pos); 2179 $this->char = null; 2180 $this->pos = $this->size; 2181 return $ret; 2182 } 2183 2184 if ($pos === $this->pos) { return ''; } 2185 2186 $pos_old = $this->pos; 2187 $this->char = $this->doc[$pos]; 2188 $this->pos = $pos; 2189 return substr($this->doc, $pos_old, $pos - $pos_old); 2190 } 2191 2192 protected function remove_noise($pattern, $remove_tag = false) 2193 { 2194 global $debug_object; 2195 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2196 2197 $count = preg_match_all( 2198 $pattern, 2199 $this->doc, 2200 $matches, 2201 PREG_SET_ORDER | PREG_OFFSET_CAPTURE 2202 ); 2203 2204 for ($i = $count - 1; $i > -1; --$i) { 2205 $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000); 2206 2207 if (is_object($debug_object)) { 2208 $debug_object->debug_log(2, 'key is: ' . $key); 2209 } 2210 2211 $idx = ($remove_tag) ? 
0 : 1; // 0 = entire match, 1 = submatch 2212 $this->noise[$key] = $matches[$i][$idx][0]; 2213 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 2214 } 2215 2216 // reset the length of content 2217 $this->size = strlen($this->doc); 2218 2219 if ($this->size > 0) { 2220 $this->char = $this->doc[0]; 2221 } 2222 } 2223 2224 function restore_noise($text) 2225 { 2226 global $debug_object; 2227 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2228 2229 while (($pos = strpos($text, '___noise___')) !== false) { 2230 // Sometimes there is a broken piece of markup, and we don't GET the 2231 // pos+11 etc... token which indicates a problem outside of us... 2232 2233 // todo: "___noise___1000" (or any number with four or more digits) 2234 // in the DOM causes an infinite loop which could be utilized by 2235 // malicious software 2236 if (strlen($text) > $pos + 15) { 2237 $key = '___noise___' 2238 . $text[$pos + 11] 2239 . $text[$pos + 12] 2240 . $text[$pos + 13] 2241 . $text[$pos + 14] 2242 . $text[$pos + 15]; 2243 2244 if (is_object($debug_object)) { 2245 $debug_object->debug_log(2, 'located key of: ' . $key); 2246 } 2247 2248 if (isset($this->noise[$key])) { 2249 $text = substr($text, 0, $pos) 2250 . $this->noise[$key] 2251 . substr($text, $pos + 16); 2252 } else { 2253 // do this to prevent an infinite loop. 2254 $text = substr($text, 0, $pos) 2255 . 'UNDEFINED NOISE FOR KEY: ' 2256 . $key 2257 . substr($text, $pos + 16); 2258 } 2259 } else { 2260 // There is no valid key being given back to us... We must get 2261 // rid of the ___noise___ or we will have a problem. 2262 $text = substr($text, 0, $pos) 2263 . 'NO NUMERIC NOISE KEY' 2264 . 
substr($text, $pos + 11); 2265 } 2266 } 2267 return $text; 2268 } 2269 2270 function search_noise($text) 2271 { 2272 global $debug_object; 2273 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2274 2275 foreach($this->noise as $noiseElement) { 2276 if (strpos($noiseElement, $text) !== false) { 2277 return $noiseElement; 2278 } 2279 } 2280 } 2281 2282 function __toString() 2283 { 2284 return $this->root->innertext(); 2285 } 2286 2287 function __get($name) 2288 { 2289 switch ($name) { 2290 case 'outertext': 2291 return $this->root->innertext(); 2292 case 'innertext': 2293 return $this->root->innertext(); 2294 case 'plaintext': 2295 return $this->root->text(); 2296 case 'charset': 2297 return $this->_charset; 2298 case 'target_charset': 2299 return $this->_target_charset; 2300 } 2301 } 2302 2303 function childNodes($idx = -1) 2304 { 2305 return $this->root->childNodes($idx); 2306 } 2307 2308 function firstChild() 2309 { 2310 return $this->root->first_child(); 2311 } 2312 2313 function lastChild() 2314 { 2315 return $this->root->last_child(); 2316 } 2317 2318 function createElement($name, $value = null) 2319 { 2320 return @str_get_html("<$name>$value</$name>")->firstChild(); 2321 } 2322 2323 function createTextNode($value) 2324 { 2325 return @end(str_get_html($value)->nodes); 2326 } 2327 2328 function getElementById($id) 2329 { 2330 return $this->find("#$id", 0); 2331 } 2332 2333 function getElementsById($id, $idx = null) 2334 { 2335 return $this->find("#$id", $idx); 2336 } 2337 2338 function getElementByTagName($name) 2339 { 2340 return $this->find($name, 0); 2341 } 2342 2343 function getElementsByTagName($name, $idx = -1) 2344 { 2345 return $this->find($name, $idx); 2346 } 2347 2348 function loadFile() 2349 { 2350 $args = func_get_args(); 2351 $this->load_file($args); 2352 } 2353} 2354