<?php
/**
 * Website: http://sourceforge.net/projects/simplehtmldom/
 * Additional projects: http://sourceforge.net/projects/debugobject/
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
 *
 * Licensed under The MIT License
 * See the LICENSE file in the project root for more information.
 *
 * Authors:
 *   S.C. Chen
 *   John Schlick
 *   Rus Carroll
 *   logmanoriginal
 *
 * Contributors:
 *   Yousuke Kumakura
 *   Vadim Voituk
 *   Antcs
 *
 * Version Rev. 1.9.1 (291)
 *
 * THIS LIBRARY HAS BEEN MODIFIED BY NOMADJIMBOB - james.collins@outlook.com.au
 * Lines 2116 - stripping of \r\n from attributes has been disabled
 */

// Node types (stored in simple_html_dom_node::$nodetype).
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles (stored per attribute under HDOM_INFO_QUOTE).
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node bookkeeping array simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Defaults that the including script may override by defining them first.
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
define('HDOM_SMARTY_AS_TEXT', 1);

/**
 * Read markup from a file or URL and parse it into a DOM.
 *
 * The first five parameters are handed to file_get_contents() unchanged; the
 * remaining ones are handed to the simple_html_dom constructor.
 *
 * @param string $url Location passed to file_get_contents()
 * @param bool $use_include_path Passed to file_get_contents()
 * @param resource|null $context Passed to file_get_contents()
 * @param int $offset Passed to file_get_contents()
 * @param int $maxLen Maximum number of bytes to read; values <= 0 select
 * MAX_FILE_SIZE
 * @param bool $lowercase Passed to the constructor and to load()
 * @param bool $forceTagsClosed Passed to the constructor
 * @param string $target_charset Passed to the constructor
 * @param bool $stripRN Passed to the constructor and to load()
 * @param string $defaultBRText Passed to the constructor
 * @param string $defaultSpanText Passed to the constructor
 * @return mixed false when nothing was read or the content is longer than
 * $maxLen, otherwise whatever simple_html_dom::load() returns (load() is
 * defined outside this part of the file)
 */
function file_get_html(
    $url,
    $use_include_path = false,
    $context = null,
    $offset = 0,
    $maxLen = -1,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    /**
     * For sourceforge users: uncomment the next line and comment the
     * retrieve_url_contents line 2 lines down if it is not already done.
     */
    $contents = file_get_contents(
        $url,
        $use_include_path,
        $context,
        $offset,
        $maxLen
    );
    // $contents = retrieve_url_contents($url);

    // NOTE(review): the read above is already capped at $maxLen bytes, so the
    // length test presumably never triggers and oversized input is truncated
    // rather than rejected - confirm whether that is intended.
    if (empty($contents) || strlen($contents) > $maxLen) {
        $dom->clear();
        return false;
    }

    return $dom->load($contents, $lowercase, $stripRN);
}

/**
 * Parse a string of markup into a DOM.
 *
 * @param string $str Markup to parse
 * @param bool $lowercase Passed to the constructor and to load()
 * @param bool $forceTagsClosed Passed to the constructor
 * @param string $target_charset Passed to the constructor
 * @param bool $stripRN Passed to the constructor and to load()
 * @param string $defaultBRText Passed to the constructor
 * @param string $defaultSpanText Passed to the constructor
 * @return mixed false when $str is empty() or longer than MAX_FILE_SIZE
 * bytes, otherwise whatever simple_html_dom::load() returns
 */
function str_get_html(
    $str,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
        $dom->clear();
        return false;
    }

    return $dom->load($str, $lowercase, $stripRN);
}

/**
 * Echo the tree below $node, one tag per line.
 *
 * @param simple_html_dom_node $node Node to start at
 * @param bool $show_attr Whether attributes are printed next to each tag
 * @param int $deep Indentation depth (number of tabs) of $node itself
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
    // Forward the caller's options; the node itself used to be passed here as
    // the $show_attr flag and $deep was dropped.
    $node->dump($show_attr, $deep);
}

/**
 * A single node (element, text, comment, ...) of a parsed document.
 *
 * Attributes are exposed as magic properties through __get/__set/__isset,
 * next to the virtual properties outertext, innertext, plaintext and xmltext.
 */
class simple_html_dom_node
{
    // One of the HDOM_TYPE_* constants
    public $nodetype = HDOM_TYPE_TEXT;
    public $tag = 'text';
    // Attribute name => value; true marks a value-less attribute, null/false
    // a removed one (see makeup())
    public $attr = array();
    public $children = array();
    public $nodes = array();
    public $parent = null;
    // Bookkeeping data keyed by the HDOM_INFO_* constants
    public $_ = array();
    public $tag_start = 0;
    // Owning document
    private $dom = null;

    /**
     * Attach the node to its document by appending it to $dom->nodes.
     *
     * @param object $dom Owning document
     */
    function __construct($dom)
    {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }

    function __destruct()
    {
        $this->clear();
    }

    function __toString()
    {
        return $this->outertext();
    }

    /**
     * Drop the references to document, parent and child nodes.
     *
     * @return void
     */
    function clear()
    {
        $this->dom = null;
        $this->nodes = null;
        $this->parent = null;
        $this->children = null;
    }

    /**
     * Echo this node and, recursively, everything in $this->nodes.
     *
     * @param bool $show_attr Whether attributes are printed
     * @param int $depth Number of tabs to indent this node with
     * @return void
     */
    function dump($show_attr = true, $depth = 0)
    {
        echo str_repeat("\t", $depth) . $this->tag;

        if ($show_attr && count($this->attr) > 0) {
            echo '(';
            foreach ($this->attr as $k => $v) {
                echo "[$k]=>\"$v\", ";
            }
            echo ')';
        }

        echo "\n";

        if ($this->nodes) {
            foreach ($this->nodes as $node) {
                $node->dump($show_attr, $depth + 1);
            }
        }
    }

    /**
     * Build a one-line debug description of this node.
     *
     * @param bool $echo true to echo the description, false to return it
     * @return string|null The description when $echo is false
     */
    function dump_node($echo = true)
    {
        $string = $this->tag;

        if (count($this->attr) > 0) {
            $string .= '(';
            foreach ($this->attr as $k => $v) {
                $string .= "[$k]=>\"$v\", ";
            }
            $string .= ')';
        }

        if (count($this->_) > 0) {
            $string .= ' $_ (';
            foreach ($this->_ as $k => $v) {
                if (is_array($v)) {
                    $string .= "[$k]=>(";
                    foreach ($v as $k2 => $v2) {
                        $string .= "[$k2]=>\"$v2\", ";
                    }
                    $string .= ')';
                } else {
                    $string .= "[$k]=>\"$v\", ";
                }
            }
            $string .= ')';
        }

        if (isset($this->text)) {
            $string .= " text: ({$this->text})";
        }

        $string .= ' HDOM_INNER_INFO: ';

        // Read the node's own data; an undefined $node variable used to be
        // inspected here, so the value was always reported as NULL.
        if (isset($this->_[HDOM_INFO_INNER])) {
            $string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
        } else {
            $string .= ' NULL ';
        }

        $string .= ' children: ' . count($this->children);
        $string .= ' nodes: ' . count($this->nodes);
        $string .= ' tag_start: ' . $this->tag_start;
        $string .= "\n";

        if ($echo) {
            echo $string;
            return;
        } else {
            return $string;
        }
    }

    /**
     * Get the parent node or, when $parent is given, attach this node to it.
     *
     * @param simple_html_dom_node|null $parent New parent
     * @return simple_html_dom_node|null The (possibly new) parent
     */
    function parent($parent = null)
    {
        // I am SURE that this doesn't work properly.
        // It fails to unset the current node from its current parent's nodes
        // or children list first.
        if ($parent !== null) {
            $this->parent = $parent;
            $this->parent->nodes[] = $this;
            $this->parent->children[] = $this;
        }

        return $this->parent;
    }

    /**
     * @return bool true when the children list is not empty
     */
    function has_child()
    {
        return !empty($this->children);
    }

    /**
     * Get all children or the child at a given index.
     *
     * @param int $idx Index of the child; -1 returns the whole list
     * @return array|simple_html_dom_node|null null when the index is unknown
     */
    function children($idx = -1)
    {
        if ($idx === -1) {
            return $this->children;
        }

        if (isset($this->children[$idx])) {
            return $this->children[$idx];
        }

        return null;
    }

    /**
     * @return simple_html_dom_node|null First child, null without children
     */
    function first_child()
    {
        if (count($this->children) > 0) {
            return $this->children[0];
        }
        return null;
    }

    /**
     * @return simple_html_dom_node|null Last child, null without children
     */
    function last_child()
    {
        if (count($this->children) > 0) {
            return end($this->children);
        }
        return null;
    }

    /**
     * @return simple_html_dom_node|null The entry following this node in the
     * parent's children list, null when there is none
     */
    function next_sibling()
    {
        if ($this->parent === null) {
            return null;
        }

        $idx = array_search($this, $this->parent->children, true);

        if ($idx !== false && isset($this->parent->children[$idx + 1])) {
            return $this->parent->children[$idx + 1];
        }

        return null;
    }

    /**
     * @return simple_html_dom_node|null The entry preceding this node in the
     * parent's children list, null when there is none
     */
    function prev_sibling()
    {
        if ($this->parent === null) {
            return null;
        }

        $idx = array_search($this, $this->parent->children, true);

        if ($idx !== false && $idx > 0) {
            return $this->parent->children[$idx - 1];
        }

        return null;
    }

    /**
     * Walk up the parent chain until a node with the given tag is found.
     *
     * @param string $tag Tag name, compared with ===
     * @return simple_html_dom_node|null The closest matching ancestor or null
     */
    function find_ancestor_tag($tag)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        if ($this->parent === null) {
            return null;
        }

        $ancestor = $this->parent;

        while (!is_null($ancestor)) {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
            }

            if ($ancestor->tag === $tag) {
                break;
            }

            $ancestor = $ancestor->parent;
        }

        return $ancestor;
    }

    /**
     * Get the markup inside this node.
     *
     * Order of precedence: an explicitly assigned inner text, the node's own
     * text (passed through the document's restore_noise()), and finally the
     * concatenated outertext() of all entries in $this->nodes.
     *
     * @return string
     */
    function innertext()
    {
        if (isset($this->_[HDOM_INFO_INNER])) {
            return $this->_[HDOM_INFO_INNER];
        }

        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '';

        foreach ($this->nodes as $n) {
            $ret .= $n->outertext();
        }

        return $ret;
    }

    /**
     * Get the markup of this node including its own tags.
     *
     * @return string
     */
    function outertext()
    {
        global $debug_object;

        if (is_object($debug_object)) {
            $text = '';

            if ($this->tag === 'text') {
                if (!empty($this->text)) {
                    $text = ' with text: ' . $this->text;
                }
            }

            $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
        }

        if ($this->tag === 'root') {
            return $this->innertext();
        }

        // todo: What is the use of this callback? Remove?
        if ($this->dom && $this->dom->callback !== null) {
            call_user_func_array($this->dom->callback, array($this));
        }

        // An explicitly assigned outer text wins over everything else
        if (isset($this->_[HDOM_INFO_OUTER])) {
            return $this->_[HDOM_INFO_OUTER];
        }

        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '';

        // Opening tag
        if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
            $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
        }

        // Content
        if (isset($this->_[HDOM_INFO_INNER])) {
            // todo: <br> should either never have HDOM_INFO_INNER or always
            if ($this->tag !== 'br') {
                $ret .= $this->_[HDOM_INFO_INNER];
            }
        } elseif ($this->nodes) {
            foreach ($this->nodes as $n) {
                $ret .= $this->convert_text($n->outertext());
            }
        }

        // Closing tag, only for nodes that recorded a non-zero end position
        if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
            $ret .= '</' . $this->tag . '>';
        }

        return $ret;
    }

    /**
     * Get the plain text of this node.
     *
     * Comment and unknown nodes as well as script and style elements yield
     * an empty string.
     *
     * @return string
     */
    function text()
    {
        if (isset($this->_[HDOM_INFO_INNER])) {
            return $this->_[HDOM_INFO_INNER];
        }

        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT: return '';
            case HDOM_TYPE_UNKNOWN: return '';
        }

        if (strcasecmp($this->tag, 'script') === 0) { return ''; }
        if (strcasecmp($this->tag, 'style') === 0) { return ''; }

        $ret = '';

        // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
        // for some span tags, and some p tags) $this->nodes is set to NULL.
        // NOTE: This indicates that there is a problem where it's set to NULL
        // without a clear happening.
        // WHY is this happening?
        if (!is_null($this->nodes)) {
            foreach ($this->nodes as $n) {
                // Start paragraph after a blank line
                if ($n->tag === 'p') {
                    $ret = trim($ret) . "\n\n";
                }

                $ret .= $this->convert_text($n->text());

                // If this node is a span... add a space at the end of it so
                // multiple spans don't run into each other. This is plaintext
                // after all.
                if ($n->tag === 'span') {
                    $ret .= $this->dom->default_span_text;
                }
            }
        }
        return $ret;
    }

    /**
     * Get the inner text with CDATA section markers removed.
     *
     * @return string
     */
    function xmltext()
    {
        $ret = $this->innertext();
        $ret = str_ireplace('<![CDATA[', '', $ret);
        $ret = str_replace(']]>', '', $ret);
        return $ret;
    }

    /**
     * Build the opening tag of this node from its tag name and attributes.
     *
     * @return string
     */
    function makeup()
    {
        // text, comment, unknown
        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '<' . $this->tag;
        // Position of the current attribute; the spacing and quoting data is
        // indexed by position, so removed attributes are counted as well.
        $i = -1;

        foreach ($this->attr as $key => $val) {
            ++$i;

            // skip removed attribute
            if ($val === null || $val === false) { continue; }

            $ret .= $this->_[HDOM_INFO_SPACE][$i][0];

            //no value attr: nowrap, checked selected...
            if ($val === true) {
                $ret .= $key;
            } else {
                switch ($this->_[HDOM_INFO_QUOTE][$i])
                {
                    case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                    case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                    default: $quote = '';
                }

                $ret .= $key
                    . $this->_[HDOM_INFO_SPACE][$i][1]
                    . '='
                    . $this->_[HDOM_INFO_SPACE][$i][2]
                    . $quote
                    . $val
                    . $quote;
            }
        }

        $ret = $this->dom->restore_noise($ret);
        return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
    }

    /**
     * Find nodes below this node by CSS-like selector.
     *
     * @param string $selector Selector or comma separated selector list
     * @param int|null $idx null returns all matches; otherwise the index of
     * the match to return, negative values counting from the end
     * @param bool $lowercase Passed on to seek() for case insensitive class
     * and attribute value matching
     * @return array|simple_html_dom_node|null Array of matches when $idx is
     * null, otherwise the selected match or null
     */
    function find($selector, $idx = null, $lowercase = false)
    {
        $selectors = $this->parse_selector($selector);
        if (($count = count($selectors)) === 0) { return array(); }
        $found_keys = array();

        // find each selector
        for ($c = 0; $c < $count; ++$c) {
            // The change on the below line was documented on the sourceforge
            // code tracker id 2788009
            // used to be: if (($levle=count($selectors[0]))===0) return array();
            if (($level = count($selectors[$c])) === 0) { return array(); }
            if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

            $head = array($this->_[HDOM_INFO_BEGIN] => 1);
            $cmd = ' '; // Combinator

            // handle descendant selectors, no recursive!
            for ($l = 0; $l < $level; ++$l) {
                $ret = array();

                foreach ($head as $k => $v) {
                    $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
                    //PaperG - Pass this optional parameter on to the seek function.
                    $n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
                }

                $head = $ret;
                $cmd = $selectors[$c][$l][4]; // Next Combinator
            }

            foreach ($head as $k => $v) {
                if (!isset($found_keys[$k])) {
                    $found_keys[$k] = 1;
                }
            }
        }

        // sort keys
        ksort($found_keys);

        $found = array();
        foreach ($found_keys as $k => $v) {
            $found[] = $this->dom->nodes[$k];
        }

        // return nth-element or array
        if (is_null($idx)) { return $found; }
        elseif ($idx < 0) { $idx = count($found) + $idx; }
        return (isset($found[$idx])) ? $found[$idx] : null;
    }

    /**
     * Collect the nodes that match one parsed selector step.
     *
     * @param array $selector One entry produced by parse_selector():
     * tag, id, classes, attributes, combinator
     * @param array $ret Receives the keys of matching nodes (key => 1); this
     * by-reference array is the actual result of the method
     * @param string $parent_cmd Combinator linking this step to the previous
     * one: ' ', '>', '+' or '~'
     * @param bool $lowercase Lower-case class names and attribute values
     * before comparing
     * @return void
     */
    protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        list($tag, $id, $class, $attributes, $cmb) = $selector;
        $nodes = array();

        if ($parent_cmd === ' ') { // Descendant Combinator
            // Find parent closing tag if the current element doesn't have a closing
            // tag (i.e. void element)
            $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
            if ($end == 0) {
                $parent = $this->parent;
                // Test for null first so the chain is never dereferenced
                // past the top-most node.
                while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                    $end -= 1;
                    $parent = $parent->parent;
                }
                if ($parent !== null) {
                    $end += $parent->_[HDOM_INFO_END];
                }
            }

            // Get list of target nodes
            $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
            $nodes_count = $end - $nodes_start;
            $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
        } elseif ($parent_cmd === '>') { // Child Combinator
            $nodes = $this->children;
        } elseif ($parent_cmd === '+'
            && $this->parent
            // Identity comparison; a loose comparison would compare the
            // complete object graphs recursively.
            && in_array($this, $this->parent->children, true)) { // Next-Sibling Combinator
            $index = array_search($this, $this->parent->children, true) + 1;
            if ($index < count($this->parent->children)) {
                $nodes[] = $this->parent->children[$index];
            }
        } elseif ($parent_cmd === '~'
            && $this->parent
            && in_array($this, $this->parent->children, true)) { // Subsequent Sibling Combinator
            $index = array_search($this, $this->parent->children, true);
            // NOTE(review): the slice starts at this node itself, so the
            // node is a candidate for its own sibling selector - confirm
            // whether $index + 1 was intended.
            $nodes = array_slice($this->parent->children, $index);
        }

        // Go through each element starting at this element until the end tag
        // Note: If this element is a void tag, any previous void element is
        // skipped.
        foreach($nodes as $node) {
            $pass = true;

            // Skip root nodes
            if(!$node->parent) {
                $pass = false;
            }

            // Handle 'text' selector
            if($pass && $tag === 'text' && $node->tag === 'text') {
                $ret[array_search($node, $this->dom->nodes, true)] = 1;
                unset($node);
                continue;
            }

            // Skip if node isn't a child node (i.e. text nodes)
            if($pass && !in_array($node, $node->parent->children, true)) {
                $pass = false;
            }

            // Skip if tag doesn't match
            if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
                $pass = false;
            }

            // Skip if ID doesn't exist
            if ($pass && $id !== '' && !isset($node->attr['id'])) {
                $pass = false;
            }

            // Check if ID matches
            if ($pass && $id !== '' && isset($node->attr['id'])) {
                // Note: Only consider the first ID (as browsers do)
                $node_id = explode(' ', trim($node->attr['id']))[0];

                if($id !== $node_id) { $pass = false; }
            }

            // Check if all class(es) exist
            if ($pass && $class !== '' && is_array($class) && !empty($class)) {
                if (isset($node->attr['class'])) {
                    $node_classes = explode(' ', $node->attr['class']);

                    if ($lowercase) {
                        $node_classes = array_map('strtolower', $node_classes);
                    }

                    foreach($class as $c) {
                        if(!in_array($c, $node_classes)) {
                            $pass = false;
                            break;
                        }
                    }
                } else {
                    $pass = false;
                }
            }

            // Check attributes
            if ($pass
                && $attributes !== ''
                && is_array($attributes)
                && !empty($attributes)) {
                foreach($attributes as $a) {
                    list (
                        $att_name,
                        $att_expr,
                        $att_val,
                        $att_inv,
                        $att_case_sensitivity
                    ) = $a;

                    // Handle indexing attributes (i.e. "[2]")
                    /**
                     * Note: This is not supported by the CSS Standard but adds
                     * the ability to select items compatible to XPath (i.e.
                     * the 3rd element within its parent).
                     *
                     * Note: This doesn't conflict with the CSS Standard which
                     * doesn't work on numeric attributes anyway.
                     */
                    if (is_numeric($att_name)
                        && $att_expr === ''
                        && $att_val === '') {
                        $count = 0;

                        // Find index of current element in parent
                        foreach ($node->parent->children as $c) {
                            if ($c->tag === $node->tag) { ++$count; }
                            if ($c === $node) { break; }
                        }

                        // If this is the correct node, continue with next
                        // attribute
                        if ($count === (int)$att_name) { continue; }
                    }

                    // Check attribute availability
                    if ($att_inv) { // Attribute should NOT be set
                        if (isset($node->attr[$att_name])) {
                            $pass = false;
                            break;
                        }
                    } else { // Attribute should be set
                        // todo: "plaintext" is not a valid CSS selector!
                        if ($att_name !== 'plaintext'
                            && !isset($node->attr[$att_name])) {
                            $pass = false;
                            break;
                        }
                    }

                    // Continue with next attribute if expression isn't defined
                    if ($att_expr === '') { continue; }

                    // If they have told us that this is a "plaintext"
                    // search then we want the plaintext of the node - right?
                    // todo "plaintext" is not a valid CSS selector!
                    if ($att_name === 'plaintext') {
                        $nodeKeyValue = $node->text();
                    } else {
                        $nodeKeyValue = $node->attr[$att_name];
                    }

                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2,
                            'testing node: '
                            . $node->tag
                            . ' for attribute: '
                            . $att_name
                            . $att_expr
                            . $att_val
                            . ' where nodes value is: '
                            . $nodeKeyValue
                        );
                    }

                    // If lowercase is set, do a case insensitive test of
                    // the value of the selector.
                    if ($lowercase) {
                        $check = $this->match(
                            $att_expr,
                            strtolower($att_val),
                            strtolower($nodeKeyValue),
                            $att_case_sensitivity
                        );
                    } else {
                        $check = $this->match(
                            $att_expr,
                            $att_val,
                            $nodeKeyValue,
                            $att_case_sensitivity
                        );
                    }

                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2,
                            'after match: '
                            . ($check ? 'true' : 'false')
                        );
                    }

                    if (!$check) {
                        $pass = false;
                        break;
                    }
                }
            }

            // Found a match. Add to list and clear node
            if ($pass) { $ret[$node->_[HDOM_INFO_BEGIN]] = 1; }
            unset($node);
        }
        // It's passed by reference so this is actually what this function returns.
        if (is_object($debug_object)) {
            $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
        }
    }

    /**
     * Test an attribute value against a selector expression.
     *
     * @param string $exp Operator: '=', '!=', '^=', '$=', '*=', '|=' or '~='
     * @param string $pattern Value given in the selector
     * @param string $value Value found on the node
     * @param string $case_sensitivity 'i' lower-cases both values first
     * @return bool|int Truthy on a match (the regex based operators return
     * the preg_match() result); false for unknown operators
     */
    protected function match($exp, $pattern, $value, $case_sensitivity)
    {
        global $debug_object;
        if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

        if ($case_sensitivity === 'i') {
            $pattern = strtolower($pattern);
            $value = strtolower($value);
        }

        switch ($exp) {
            case '=':
                return ($value === $pattern);
            case '!=':
                return ($value !== $pattern);
            case '^=':
                return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
            case '$=':
                return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
            case '*=':
                return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
            case '|=':
                /**
                 * [att|=val]
                 *
                 * Represents an element with the att attribute, its value
                 * either being exactly "val" or beginning with "val"
                 * immediately followed by "-" (U+002D).
                 */
                // A plain prefix test would also accept e.g. "english" for
                // "en", so the hyphen is required unless the values are equal.
                return $value === $pattern
                    || strpos($value, $pattern . '-') === 0;
            case '~=':
                /**
                 * [att~=val]
                 *
                 * Represents an element with the att attribute whose value is a
                 * whitespace-separated list of words, one of which is exactly
                 * "val". If "val" contains whitespace, it will never represent
                 * anything (since the words are separated by spaces). Also if
                 * "val" is the empty string, it will never represent anything.
                 */
                return in_array($pattern, explode(' ', trim($value)), true);
        }
        return false;
    }

    /**
     * Split a selector string into selector lists and steps.
     *
     * @param string $selector_string Selector as passed to find()
     * @return array One entry per comma separated selector; each entry is a
     * list of steps array(tag, id, classes, attributes, combinator)
     */
    protected function parse_selector($selector_string)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        /**
         * Pattern of CSS selectors, modified from mootools (https://mootools.net/)
         *
         * Paperg: Add the colon to the attribute, so that it properly finds
         * <tag attr:ibute="something" > like google does.
         *
         * Note: if you try to look at this attribute, you MUST use getAttribute
         * since $dom->x:y will fail the php syntax check.
         *
         * Notice the \[ starting the attribute? and the @? following? This
         * implies that an attribute can begin with an @ sign that is not
         * captured. This implies that an html attribute specifier may start
         * with an @ sign that is NOT captured by the expression. Further study
         * is required to determine if this should be documented or removed.
         *
         * Matches selectors in this order:
         *
         * [0] - full match
         *
         * [1] - tag name
         *     ([\w:\*-]*)
         *     Matches the tag name consisting of zero or more words, colons,
         *     asterisks and hyphens.
         *
         * [2] - id name
         *     (?:\#([\w-]+))
         *     Optionally matches an id name, consisting of a "#" followed by
         *     the id name (one or more words and hyphens).
         *
         * [3] - class names (including dots)
         *     (?:\.([\w\.-]+))?
         *     Optionally matches a list of classes, consisting of a "."
         *     followed by the class name (one or more words and hyphens)
         *     where multiple classes can be chained (i.e. ".foo.bar.baz")
         *
         * [4] - attributes
         *     ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?
         *     Optionally matches the attributes list
         *
         * [5] - separator
         *     ([\/, >+~]+)
         *     Matches the selector list separator
         */
        // phpcs:ignore Generic.Files.LineLength
        $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

        preg_match_all(
            $pattern,
            trim($selector_string) . ' ', // Add final ' ' as pseudo separator
            $matches,
            PREG_SET_ORDER
        );

        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Matches Array: ', $matches);
        }

        $selectors = array();
        $result = array();

        foreach ($matches as $m) {
            $m[0] = trim($m[0]);

            // Skip NoOps
            if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; }

            // Convert to lowercase
            if ($this->dom->lowercase) {
                $m[1] = strtolower($m[1]);
            }

            // Extract classes
            if ($m[3] !== '') { $m[3] = explode('.', $m[3]); }

            /* Extract attributes (pattern based on the pattern above!)

             * [0] - full match
             * [1] - attribute name
             * [2] - attribute expression
             * [3] - attribute value
             * [4] - case sensitivity
             *
             * Note: Attributes can be negated with a "!" prefix to their name
             */
            if($m[4] !== '') {
                preg_match_all(
                    "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
                    trim($m[4]),
                    $attributes,
                    PREG_SET_ORDER
                );

                // Replace element by array
                $m[4] = array();

                foreach($attributes as $att) {
                    // Skip empty matches
                    if(trim($att[0]) === '') { continue; }

                    $inverted = (isset($att[1][0]) && $att[1][0] === '!');
                    $m[4][] = array(
                        $inverted ? substr($att[1], 1) : $att[1], // Name
                        (isset($att[2])) ? $att[2] : '', // Expression
                        (isset($att[3])) ? $att[3] : '', // Value
                        $inverted, // Inverted Flag
                        (isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
                    );
                }
            }

            // Sanitize Separator
            if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator
                $m[5] = ' ';
            } else { // Other Separator
                $m[5] = trim($m[5]);
            }

            // Clear Separator if it's a Selector List
            if ($is_list = ($m[5] === ',')) { $m[5] = ''; }

            // Remove full match before adding to results
            array_shift($m);
            $result[] = $m;

            if ($is_list) { // Selector List
                $selectors[] = $result;
                $result = array();
            }
        }

        if (count($result) > 0) { $selectors[] = $result; }
        return $selectors;
    }

    /**
     * Read an attribute or one of the virtual text properties.
     *
     * @param string $name Attribute name, or outertext / innertext /
     * plaintext / xmltext
     * @return mixed The charset-converted attribute value when the attribute
     * isset(); the requested text for a virtual property; otherwise a bool
     * telling whether the attribute key exists
     */
    function __get($name)
    {
        if (isset($this->attr[$name])) {
            return $this->convert_text($this->attr[$name]);
        }
        switch ($name) {
            case 'outertext': return $this->outertext();
            case 'innertext': return $this->innertext();
            case 'plaintext': return $this->text();
            case 'xmltext': return $this->xmltext();
            default: return array_key_exists($name, $this->attr);
        }
    }

    /**
     * Write an attribute or override the outer/inner text of the node.
     *
     * @param string $name Attribute name, or outertext / innertext
     * @param mixed $value New value
     * @return mixed The assigned value for outertext / innertext
     */
    function __set($name, $value)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        switch ($name) {
            case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
            case 'innertext':
                if (isset($this->_[HDOM_INFO_TEXT])) {
                    return $this->_[HDOM_INFO_TEXT] = $value;
                }
                return $this->_[HDOM_INFO_INNER] = $value;
        }

        // New attributes get default spacing and double quotes for makeup()
        if (!isset($this->attr[$name])) {
            $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
            $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        }

        $this->attr[$name] = $value;
    }

    /**
     * @param string $name Attribute or virtual property name
     * @return bool Always true for outertext, innertext and plaintext;
     * otherwise whether the attribute key exists
     */
    function __isset($name)
    {
        switch ($name) {
            case 'outertext': return true;
            case 'innertext': return true;
            case 'plaintext': return true;
        }
        //no value attr: nowrap, checked selected...
        return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
    }

    function __unset($name)
    {
        if (isset($this->attr[$name])) { unset($this->attr[$name]); }
    }

    /**
     * Convert text from the document charset to the target charset.
     *
     * Nothing is converted when either charset is unknown or both are equal.
     * For a UTF-8 target a byte order mark at the start or end is removed.
     *
     * @param string $text Text to convert
     * @return string|false Converted text (false when iconv() fails)
     */
    function convert_text($text)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        $converted_text = $text;

        $sourceCharset = '';
        $targetCharset = '';

        if ($this->dom) {
            $sourceCharset = strtoupper($this->dom->_charset);
            $targetCharset = strtoupper($this->dom->_target_charset);
        }

        if (is_object($debug_object)) {
            $debug_object->debug_log(3,
                'source charset: '
                . $sourceCharset
                . ' target charaset: '
                . $targetCharset
            );
        }

        if (!empty($sourceCharset)
            && !empty($targetCharset)
            && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
            // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
            if ((strcasecmp($targetCharset, 'UTF-8') == 0)
                && ($this->is_utf8($text))) {
                $converted_text = $text;
            } else {
                $converted_text = iconv($sourceCharset, $targetCharset, $text);
            }
        }

        // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
        if ($targetCharset === 'UTF-8') {
            if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
                $converted_text = substr($converted_text, 3);
            }

            if (substr($converted_text, -3) === "\xef\xbb\xbf") {
                $converted_text = substr($converted_text, 0, -3);
            }
        }

        return $converted_text;
    }

    /**
     * Check whether a byte string is structurally valid UTF-8.
     *
     * Only the lead byte / continuation byte structure is checked.
     *
     * @param string $str Bytes to check
     * @return bool
     */
    static function is_utf8($str)
    {
        $c = 0; $b = 0;
        $bits = 0;
        $len = strlen($str);
        for($i = 0; $i < $len; $i++) {
            $c = ord($str[$i]);
            // Every byte above 0x7F belongs to a multi byte sequence. 0x80
            // itself used to be skipped here and was accepted as if it were
            // ASCII; it is now rejected by the final else below.
            if($c > 127) {
                if(($c >= 254)) { return false; }
                elseif($c >= 252) { $bits = 6; }
                elseif($c >= 248) { $bits = 5; }
                elseif($c >= 240) { $bits = 4; }
                elseif($c >= 224) { $bits = 3; }
                elseif($c >= 192) { $bits = 2; }
                else { return false; }
                if(($i + $bits) > $len) { return false; }
                while($bits > 1) {
                    $i++;
                    $b = ord($str[$i]);
                    if($b < 128 || $b > 191) { return false; }
                    $bits--;
                }
            }
        }
        return true;
    }

    /**
     * Determine the display size of an img element.
     *
     * The width and height attributes take precedence; pixel values of an
     * inline style are only used for a dimension that has no attribute.
     *
     * @return array|false array('height' => ..., 'width' => ...) with -1 for
     * an unknown dimension, or false when this node is not an img
     */
    function get_display_size()
    {
        $width = -1;
        $height = -1;

        if ($this->tag !== 'img') {
            return false;
        }

        // See if there is a height or width attribute in the tag itself.
        if (isset($this->attr['width'])) {
            $width = $this->attr['width'];
        }

        if (isset($this->attr['height'])) {
            $height = $this->attr['height'];
        }

        // Now look for an inline style.
        if (isset($this->attr['style'])) {
            // Thanks to user gnarf from stackoverflow for this regular expression.
            $attributes = array();

            preg_match_all(
                '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
                $this->attr['style'],
                $matches,
                PREG_SET_ORDER
            );

            foreach ($matches as $match) {
                $attributes[$match[1]] = $match[2];
            }

            // If there is a width in the style attributes:
            if (isset($attributes['width']) && $width == -1) {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['width'], -2)) === 'px') {
                    $proposed_width = substr($attributes['width'], 0, -2);
                    // Now make sure that it's an integer and not something stupid.
                    if (filter_var($proposed_width, FILTER_VALIDATE_INT)) {
                        $width = $proposed_width;
                    }
                }
            }

            // If there is a height in the style attributes:
            if (isset($attributes['height']) && $height == -1) {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['height'], -2)) === 'px') {
                    $proposed_height = substr($attributes['height'], 0, -2);
                    // Now make sure that it's an integer and not something stupid.
                    if (filter_var($proposed_height, FILTER_VALIDATE_INT)) {
                        $height = $proposed_height;
                    }
                }
            }

        }

        // Future enhancement:
        // Look in the tag to see if there is a class or id specified that has
        // a height or width attribute to it.

        // Far future enhancement
        // Look at all the parent tags of this image to see if they specify a
        // class or id that has an img selector that specifies a height or width
        // Note that in this case, the class or id will have the img subselector
        // for it to apply to the image.

        // ridiculously far future development
        // If the class or id is specified in a SEPARATE css file that's not on
        // the page, go get it and do what we were just doing for the ones on
        // the page.

        $result = array(
            'height' => $height,
            'width' => $width
        );

        return $result;
    }

    /**
     * Render the node and optionally write the markup to a file.
     *
     * @param string $filepath Target file; '' skips writing
     * @return string The markup of this node
     */
    function save($filepath = '')
    {
        $ret = $this->outertext();

        if ($filepath !== '') {
            file_put_contents($filepath, $ret, LOCK_EX);
        }

        return $ret;
    }

    /**
     * Add one or more classes to the class attribute.
     *
     * @param string|array $class Space separated class names or a list of
     * class names; classes that are already present are skipped
     * @return void
     */
    function addClass($class)
    {
        // The debug object lives in the global scope
        global $debug_object;

        if (is_string($class)) {
            $class = explode(' ', $class);
        }

        if (is_array($class)) {
            foreach($class as $c) {
                if (isset($this->class)) {
                    if ($this->hasClass($c)) {
                        continue;
                    } else {
                        $this->class .= ' ' . $c;
                    }
                } else {
                    $this->class = $c;
                }
            }
        } else {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
            }
        }
    }

    /**
     * Check whether the class attribute contains a class name.
     *
     * @param string $class A single class name
     * @return bool false as well when $class is not a string
     */
    function hasClass($class)
    {
        // The debug object lives in the global scope
        global $debug_object;

        if (is_string($class)) {
            if (isset($this->class)) {
                return in_array($class, explode(' ', $this->class), true);
            }
        } else {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
            }
        }

        return false;
    }

    /**
     * Remove classes from the class attribute.
     *
     * @param string|array|null $class Space separated class names or a list
     * of class names; null removes the whole class attribute
     * @return void
     */
    function removeClass($class = null)
    {
        if (!isset($this->class)) {
            return;
        }

        if (is_null($class)) {
            $this->removeAttribute('class');
            return;
        }

        if (is_string($class)) {
            $class = explode(' ', $class);
        }

        if (is_array($class)) {
            $class = array_diff(explode(' ', $this->class), $class);
            if (empty($class)) {
                $this->removeAttribute('class');
            } else {
                $this->class = implode(' ', $class);
            }
        }
    }

    function getAllAttributes()
    {
        return $this->attr;
    }

    function getAttribute($name)
    {
        return $this->__get($name);
    }

    function setAttribute($name, $value)
    {
        $this->__set($name, $value);
    }

    function hasAttribute($name)
    {
        return $this->__isset($name);
    }

    /**
     * Mark an attribute as removed by setting it to null; makeup() skips
     * null values.
     *
     * @param string $name Attribute name
     * @return void
     */
    function removeAttribute($name)
    {
        $this->__set($name, null);
    }

    /**
     * Remove this node from its parent, if it has one.
     *
     * @return void
     */
    function remove()
    {
        if ($this->parent) {
            $this->parent->removeChild($this);
        }
    }

    /**
     * Remove a child node and everything below it from this node and from
     * the document's node list.
     *
     * Nothing happens unless $node is found in this node's nodes list, in
     * its children list and in the document's node list.
     *
     * @param simple_html_dom_node $node Child to remove
     * @return void
     */
    function removeChild($node)
    {
        $nidx = array_search($node, $this->nodes, true);
        $cidx = array_search($node, $this->children, true);
        $didx = array_search($node, $this->dom->nodes, true);

        if ($nidx !== false && $cidx !== false && $didx !== false) {

            // Element children first (recursively) ...
            foreach($node->children as $child) {
                $node->removeChild($child);
            }

            // ... then whatever is left in the nodes list
            foreach($node->nodes as $entity) {
                $enidx = array_search($entity, $node->nodes, true);
                $edidx = array_search($entity, $node->dom->nodes, true);

                if ($enidx !== false && $edidx !== false) {
                    unset($node->nodes[$enidx]);
                    unset($node->dom->nodes[$edidx]);
                }
            }

            unset($this->nodes[$nidx]);
            unset($this->children[$cidx]);
            unset($this->dom->nodes[$didx]);

            $node->clear();

        }
    }

    // The methods below are DOM-style (camelCase) wrappers around find() and
    // the snake_case accessors above.

    function getElementById($id)
    {
        return $this->find("#$id", 0);
    }

    function getElementsById($id, $idx = null)
    {
        return $this->find("#$id", $idx);
    }

    function getElementByTagName($name)
    {
        return $this->find($name, 0);
    }

    function getElementsByTagName($name, $idx = null)
    {
        return $this->find($name, $idx);
    }

    function parentNode()
    {
        return $this->parent();
    }

    function childNodes($idx = -1)
    {
        return $this->children($idx);
    }

    function firstChild()
    {
        return $this->first_child();
    }

    function lastChild()
    {
        return $this->last_child();
    }

    function nextSibling()
    {
        return $this->next_sibling();
    }

    function previousSibling()
    {
        return $this->prev_sibling();
    }

    function hasChildNodes()
    {
        return $this->has_child();
    }

    function nodeName()
    {
        return $this->tag;
    }

    /**
     * Attach $node to this node via parent() and return it.
     *
     * @param simple_html_dom_node $node Node to append
     * @return simple_html_dom_node The appended node
     */
    function appendChild($node)
    {
        $node->parent($this);
        return $node;
    }

}

class simple_html_dom
{
    public $root = null;
    public $nodes = array();
    public $callback = null;
    public $lowercase = false;
    public $original_size;
    public $size;

    protected $pos;
    protected $doc;
    protected $char;

    protected $cursor;
    protected $parent;
    protected $noise = array();
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/>';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';

    public $_charset = '';
    public $_target_charset = '';

    protected $default_br_text = '';

    public $default_span_text = '';

    protected $self_closing_tags = array(
        'area' => 1,
        'base' => 1,
        'br' => 1,
        'col' => 1,
        'embed' => 1,
        'hr' => 1,
        'img' => 1,
        'input' => 1,
        'link' => 1,
        'meta' => 1,
        'param' => 1,
        'source' => 1,
        'track' => 1,
        'wbr' => 1
    );
    protected $block_tags = array(
        'body' => 1,
        'div' => 1,
        'form' => 1,
        'root' => 1,
        'span' => 1,
        'table' => 1
    );
    protected $optional_closing_tags = array(
        // Not optional, see
        // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
        'b' => array('b' => 1),
        'dd' => array('dd' => 1, 'dt' => 1),
        // Not optional, see
        // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
        'dl' => array('dd' => 1, 'dt' => 1),
        'dt' => array('dd' => 1, 'dt' => 1),
        'li' => array('li' => 1),
        'optgroup' => array('optgroup' => 1, 'option' => 1),
        'option' => array('optgroup' => 1, 'option' => 1),
        'p' => array('p' => 1),
        'rp' => array('rp' => 1, 'rt' => 1),
        'rt' => array('rp' => 1, 'rt'
=> 1), 1458 'td' => array('td' => 1, 'th' => 1), 1459 'th' => array('td' => 1, 'th' => 1), 1460 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1), 1461 ); 1462 1463 function __construct( 1464 $str = null, 1465 $lowercase = true, 1466 $forceTagsClosed = true, 1467 $target_charset = DEFAULT_TARGET_CHARSET, 1468 $stripRN = true, 1469 $defaultBRText = DEFAULT_BR_TEXT, 1470 $defaultSpanText = DEFAULT_SPAN_TEXT, 1471 $options = 0) 1472 { 1473 if ($str) { 1474 if (preg_match('/^http:\/\//i', $str) || is_file($str)) { 1475 $this->load_file($str); 1476 } else { 1477 $this->load( 1478 $str, 1479 $lowercase, 1480 $stripRN, 1481 $defaultBRText, 1482 $defaultSpanText, 1483 $options 1484 ); 1485 } 1486 } 1487 // Forcing tags to be closed implies that we don't trust the html, but 1488 // it can lead to parsing errors if we SHOULD trust the html. 1489 if (!$forceTagsClosed) { 1490 $this->optional_closing_array = array(); 1491 } 1492 1493 $this->_target_charset = $target_charset; 1494 } 1495 1496 function __destruct() 1497 { 1498 $this->clear(); 1499 } 1500 1501 function load( 1502 $str, 1503 $lowercase = true, 1504 $stripRN = true, 1505 $defaultBRText = DEFAULT_BR_TEXT, 1506 $defaultSpanText = DEFAULT_SPAN_TEXT, 1507 $options = 0) 1508 { 1509 global $debug_object; 1510 1511 // prepare 1512 $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText); 1513 1514 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1515 // Script tags removal now preceeds style tag removal. 1516 // strip out <script> tags 1517 $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"); 1518 $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"); 1519 1520 // strip out the \r \n's if we are told to. 1521 if ($stripRN) { 1522 $this->doc = str_replace("\r", ' ', $this->doc); 1523 $this->doc = str_replace("\n", ' ', $this->doc); 1524 1525 // set the length of content since we have changed it. 
1526 $this->size = strlen($this->doc); 1527 } 1528 1529 // strip out cdata 1530 $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1531 // strip out comments 1532 $this->remove_noise("'<!--(.*?)-->'is"); 1533 // strip out <style> tags 1534 $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); 1535 $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); 1536 // strip out preformatted tags 1537 $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); 1538 // strip out server side scripts 1539 $this->remove_noise("'(<\?)(.*?)(\?>)'s", true); 1540 1541 if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts 1542 $this->remove_noise("'(\{\w)(.*?)(\})'s", true); 1543 } 1544 1545 // parsing 1546 $this->parse(); 1547 // end 1548 $this->root->_[HDOM_INFO_END] = $this->cursor; 1549 $this->parse_charset(); 1550 1551 // make load function chainable 1552 return $this; 1553 } 1554 1555 function load_file() 1556 { 1557 $args = func_get_args(); 1558 1559 if(($doc = call_user_func_array('file_get_contents', $args)) !== false) { 1560 $this->load($doc, true); 1561 } else { 1562 return false; 1563 } 1564 } 1565 1566 function set_callback($function_name) 1567 { 1568 $this->callback = $function_name; 1569 } 1570 1571 function remove_callback() 1572 { 1573 $this->callback = null; 1574 } 1575 1576 function save($filepath = '') 1577 { 1578 $ret = $this->root->innertext(); 1579 if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); } 1580 return $ret; 1581 } 1582 1583 function find($selector, $idx = null, $lowercase = false) 1584 { 1585 return $this->root->find($selector, $idx, $lowercase); 1586 } 1587 1588 function clear() 1589 { 1590 if (isset($this->nodes)) { 1591 foreach ($this->nodes as $n) { 1592 $n->clear(); 1593 $n = null; 1594 } 1595 } 1596 1597 // This add next line is documented in the sourceforge repository. 1598 // 2977248 as a fix for ongoing memory leaks that occur even with the 1599 // use of clear. 
1600 if (isset($this->children)) { 1601 foreach ($this->children as $n) { 1602 $n->clear(); 1603 $n = null; 1604 } 1605 } 1606 1607 if (isset($this->parent)) { 1608 $this->parent->clear(); 1609 unset($this->parent); 1610 } 1611 1612 if (isset($this->root)) { 1613 $this->root->clear(); 1614 unset($this->root); 1615 } 1616 1617 unset($this->doc); 1618 unset($this->noise); 1619 } 1620 1621 function dump($show_attr = true) 1622 { 1623 $this->root->dump($show_attr); 1624 } 1625 1626 protected function prepare( 1627 $str, $lowercase = true, 1628 $defaultBRText = DEFAULT_BR_TEXT, 1629 $defaultSpanText = DEFAULT_SPAN_TEXT) 1630 { 1631 $this->clear(); 1632 1633 $this->doc = trim($str); 1634 $this->size = strlen($this->doc); 1635 $this->original_size = $this->size; // original size of the html 1636 $this->pos = 0; 1637 $this->cursor = 1; 1638 $this->noise = array(); 1639 $this->nodes = array(); 1640 $this->lowercase = $lowercase; 1641 $this->default_br_text = $defaultBRText; 1642 $this->default_span_text = $defaultSpanText; 1643 $this->root = new simple_html_dom_node($this); 1644 $this->root->tag = 'root'; 1645 $this->root->_[HDOM_INFO_BEGIN] = -1; 1646 $this->root->nodetype = HDOM_TYPE_ROOT; 1647 $this->parent = $this->root; 1648 if ($this->size > 0) { $this->char = $this->doc[0]; } 1649 } 1650 1651 protected function parse() 1652 { 1653 while (true) { 1654 // Read next tag if there is no text between current position and the 1655 // next opening tag. 
1656 if (($s = $this->copy_until_char('<')) === '') { 1657 if($this->read_tag()) { 1658 continue; 1659 } else { 1660 return true; 1661 } 1662 } 1663 1664 // Add a text node for text between tags 1665 $node = new simple_html_dom_node($this); 1666 ++$this->cursor; 1667 $node->_[HDOM_INFO_TEXT] = $s; 1668 $this->link_nodes($node, false); 1669 } 1670 } 1671 1672 protected function parse_charset() 1673 { 1674 global $debug_object; 1675 1676 $charset = null; 1677 1678 if (function_exists('get_last_retrieve_url_contents_content_type')) { 1679 $contentTypeHeader = get_last_retrieve_url_contents_content_type(); 1680 $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); 1681 if ($success) { 1682 $charset = $matches[1]; 1683 if (is_object($debug_object)) { 1684 $debug_object->debug_log(2, 1685 'header content-type found charset of: ' 1686 . $charset 1687 ); 1688 } 1689 } 1690 } 1691 1692 if (empty($charset)) { 1693 // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type 1694 $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true); 1695 1696 if (!empty($el)) { 1697 $fullvalue = $el->content; 1698 if (is_object($debug_object)) { 1699 $debug_object->debug_log(2, 1700 'meta content-type tag found' 1701 . $fullvalue 1702 ); 1703 } 1704 1705 if (!empty($fullvalue)) { 1706 $success = preg_match( 1707 '/charset=(.+)/i', 1708 $fullvalue, 1709 $matches 1710 ); 1711 1712 if ($success) { 1713 $charset = $matches[1]; 1714 } else { 1715 // If there is a meta tag, and they don't specify the 1716 // character set, research says that it's typically 1717 // ISO-8859-1 1718 if (is_object($debug_object)) { 1719 $debug_object->debug_log(2, 1720 'meta content-type tag couldn\'t be parsed. using iso-8859 default.' 
1721 ); 1722 } 1723 1724 $charset = 'ISO-8859-1'; 1725 } 1726 } 1727 } 1728 } 1729 1730 if (empty($charset)) { 1731 // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration 1732 if ($meta = $this->root->find('meta[charset]', 0)) { 1733 $charset = $meta->charset; 1734 if (is_object($debug_object)) { 1735 $debug_object->debug_log(2, 'meta charset: ' . $charset); 1736 } 1737 } 1738 } 1739 1740 if (empty($charset)) { 1741 // Try to guess the charset based on the content 1742 // Requires Multibyte String (mbstring) support (optional) 1743 if (function_exists('mb_detect_encoding')) { 1744 /** 1745 * mb_detect_encoding() is not intended to distinguish between 1746 * charsets, especially single-byte charsets. Its primary 1747 * purpose is to detect which multibyte encoding is in use, 1748 * i.e. UTF-8, UTF-16, shift-JIS, etc. 1749 * 1750 * -- https://bugs.php.net/bug.php?id=38138 1751 * 1752 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will 1753 * always result in CP1251/ISO-8859-5 and vice versa. 1754 * 1755 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1 1756 * to stay compatible. 1757 */ 1758 $encoding = mb_detect_encoding( 1759 $this->doc, 1760 array( 'UTF-8', 'CP1252', 'ISO-8859-1' ) 1761 ); 1762 1763 if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') { 1764 // Due to a limitation of mb_detect_encoding 1765 // 'CP1251'/'ISO-8859-5' will be detected as 1766 // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in 1767 // which case we can simply assume it is the other charset. 1768 if (!@iconv('CP1252', 'UTF-8', $this->doc)) { 1769 $encoding = 'CP1251'; 1770 } 1771 } 1772 1773 if ($encoding !== false) { 1774 $charset = $encoding; 1775 if (is_object($debug_object)) { 1776 $debug_object->debug_log(2, 'mb_detect: ' . 
$charset); 1777 } 1778 } 1779 } 1780 } 1781 1782 if (empty($charset)) { 1783 // Assume it's UTF-8 as it is the most likely charset to be used 1784 $charset = 'UTF-8'; 1785 if (is_object($debug_object)) { 1786 $debug_object->debug_log(2, 'No match found, assume ' . $charset); 1787 } 1788 } 1789 1790 // Since CP1252 is a superset, if we get one of it's subsets, we want 1791 // it instead. 1792 if ((strtolower($charset) == 'iso-8859-1') 1793 || (strtolower($charset) == 'latin1') 1794 || (strtolower($charset) == 'latin-1')) { 1795 $charset = 'CP1252'; 1796 if (is_object($debug_object)) { 1797 $debug_object->debug_log(2, 1798 'replacing ' . $charset . ' with CP1252 as its a superset' 1799 ); 1800 } 1801 } 1802 1803 if (is_object($debug_object)) { 1804 $debug_object->debug_log(1, 'EXIT - ' . $charset); 1805 } 1806 1807 return $this->_charset = $charset; 1808 } 1809 1810 protected function read_tag() 1811 { 1812 // Set end position if no further tags found 1813 if ($this->char !== '<') { 1814 $this->root->_[HDOM_INFO_END] = $this->cursor; 1815 return false; 1816 } 1817 1818 $begin_tag_pos = $this->pos; 1819 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1820 1821 // end tag 1822 if ($this->char === '/') { 1823 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1824 1825 // Skip whitespace in end tags (i.e. in "</ html>") 1826 $this->skip($this->token_blank); 1827 $tag = $this->copy_until_char('>'); 1828 1829 // Skip attributes in end tags 1830 if (($pos = strpos($tag, ' ')) !== false) { 1831 $tag = substr($tag, 0, $pos); 1832 } 1833 1834 $parent_lower = strtolower($this->parent->tag); 1835 $tag_lower = strtolower($tag); 1836 1837 // The end tag is supposed to close the parent tag. 
Handle situations 1838 // when it doesn't 1839 if ($parent_lower !== $tag_lower) { 1840 // Parent tag does not have to be closed necessarily (optional closing tag) 1841 // Current tag is a block tag, so it may close an ancestor 1842 if (isset($this->optional_closing_tags[$parent_lower]) 1843 && isset($this->block_tags[$tag_lower])) { 1844 1845 $this->parent->_[HDOM_INFO_END] = 0; 1846 $org_parent = $this->parent; 1847 1848 // Traverse ancestors to find a matching opening tag 1849 // Stop at root node 1850 while (($this->parent->parent) 1851 && strtolower($this->parent->tag) !== $tag_lower 1852 ){ 1853 $this->parent = $this->parent->parent; 1854 } 1855 1856 // If we don't have a match add current tag as text node 1857 if (strtolower($this->parent->tag) !== $tag_lower) { 1858 $this->parent = $org_parent; // restore origonal parent 1859 1860 if ($this->parent->parent) { 1861 $this->parent = $this->parent->parent; 1862 } 1863 1864 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1865 return $this->as_text_node($tag); 1866 } 1867 } elseif (($this->parent->parent) 1868 && isset($this->block_tags[$tag_lower]) 1869 ) { 1870 // Grandparent exists and current tag is a block tag, so our 1871 // parent doesn't have an end tag 1872 $this->parent->_[HDOM_INFO_END] = 0; // No end tag 1873 $org_parent = $this->parent; 1874 1875 // Traverse ancestors to find a matching opening tag 1876 // Stop at root node 1877 while (($this->parent->parent) 1878 && strtolower($this->parent->tag) !== $tag_lower 1879 ) { 1880 $this->parent = $this->parent->parent; 1881 } 1882 1883 // If we don't have a match add current tag as text node 1884 if (strtolower($this->parent->tag) !== $tag_lower) { 1885 $this->parent = $org_parent; // restore origonal parent 1886 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1887 return $this->as_text_node($tag); 1888 } 1889 } elseif (($this->parent->parent) 1890 && strtolower($this->parent->parent->tag) === $tag_lower 1891 ) { // Grandparent exists and current tag 
closes it 1892 $this->parent->_[HDOM_INFO_END] = 0; 1893 $this->parent = $this->parent->parent; 1894 } else { // Random tag, add as text node 1895 return $this->as_text_node($tag); 1896 } 1897 } 1898 1899 // Set end position of parent tag to current cursor position 1900 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1901 1902 if ($this->parent->parent) { 1903 $this->parent = $this->parent->parent; 1904 } 1905 1906 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1907 return true; 1908 } 1909 1910 // start tag 1911 $node = new simple_html_dom_node($this); 1912 $node->_[HDOM_INFO_BEGIN] = $this->cursor; 1913 ++$this->cursor; 1914 $tag = $this->copy_until($this->token_slash); // Get tag name 1915 $node->tag_start = $begin_tag_pos; 1916 1917 // doctype, cdata & comments... 1918 // <!DOCTYPE html> 1919 // <![CDATA[ ... ]]> 1920 // <!-- Comment --> 1921 if (isset($tag[0]) && $tag[0] === '!') { 1922 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); 1923 1924 if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--") 1925 $node->nodetype = HDOM_TYPE_COMMENT; 1926 $node->tag = 'comment'; 1927 } else { // Could be doctype or CDATA but we don't care 1928 $node->nodetype = HDOM_TYPE_UNKNOWN; 1929 $node->tag = 'unknown'; 1930 } 1931 1932 if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1933 1934 $this->link_nodes($node, true); 1935 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1936 return true; 1937 } 1938 1939 // The start tag cannot contain another start tag, if so add as text 1940 // i.e. "<<html>" 1941 if ($pos = strpos($tag, '<') !== false) { 1942 $tag = '<' . substr($tag, 0, -1); 1943 $node->_[HDOM_INFO_TEXT] = $tag; 1944 $this->link_nodes($node, false); 1945 $this->char = $this->doc[--$this->pos]; // prev 1946 return true; 1947 } 1948 1949 // Handle invalid tag names (i.e. 
"<html#doc>") 1950 if (!preg_match('/^\w[\w:-]*$/', $tag)) { 1951 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); 1952 1953 // Next char is the beginning of a new tag, don't touch it. 1954 if ($this->char === '<') { 1955 $this->link_nodes($node, false); 1956 return true; 1957 } 1958 1959 // Next char closes current tag, add and be done with it. 1960 if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1961 $this->link_nodes($node, false); 1962 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1963 return true; 1964 } 1965 1966 // begin tag, add new node 1967 $node->nodetype = HDOM_TYPE_ELEMENT; 1968 $tag_lower = strtolower($tag); 1969 $node->tag = ($this->lowercase) ? $tag_lower : $tag; 1970 1971 // handle optional closing tags 1972 if (isset($this->optional_closing_tags[$tag_lower])) { 1973 // Traverse ancestors to close all optional closing tags 1974 while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { 1975 $this->parent->_[HDOM_INFO_END] = 0; 1976 $this->parent = $this->parent->parent; 1977 } 1978 $node->parent = $this->parent; 1979 } 1980 1981 $guard = 0; // prevent infinity loop 1982 1983 // [0] Space between tag and first attribute 1984 $space = array($this->copy_skip($this->token_blank), '', ''); 1985 1986 // attributes 1987 do { 1988 // Everything until the first equal sign should be the attribute name 1989 $name = $this->copy_until($this->token_equal); 1990 1991 if ($name === '' && $this->char !== null && $space[0] === '') { 1992 break; 1993 } 1994 1995 if ($guard === $this->pos) { // Escape infinite loop 1996 $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 1997 continue; 1998 } 1999 2000 $guard = $this->pos; 2001 2002 // handle endless '<' 2003 // Out of bounds before the tag ended 2004 if ($this->pos >= $this->size - 1 && $this->char !== '>') { 2005 $node->nodetype = HDOM_TYPE_TEXT; 2006 $node->_[HDOM_INFO_END] = 0; 2007 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name; 2008 $node->tag = 'text'; 2009 $this->link_nodes($node, false); 2010 return true; 2011 } 2012 2013 // handle mismatch '<' 2014 // Attributes cannot start after opening tag 2015 if ($this->doc[$this->pos - 1] == '<') { 2016 $node->nodetype = HDOM_TYPE_TEXT; 2017 $node->tag = 'text'; 2018 $node->attr = array(); 2019 $node->_[HDOM_INFO_END] = 0; 2020 $node->_[HDOM_INFO_TEXT] = substr( 2021 $this->doc, 2022 $begin_tag_pos, 2023 $this->pos - $begin_tag_pos - 1 2024 ); 2025 $this->pos -= 2; 2026 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2027 $this->link_nodes($node, false); 2028 return true; 2029 } 2030 2031 if ($name !== '/' && $name !== '') { // this is a attribute name 2032 // [1] Whitespace after attribute name 2033 $space[1] = $this->copy_skip($this->token_blank); 2034 2035 $name = $this->restore_noise($name); // might be a noisy name 2036 2037 if ($this->lowercase) { $name = strtolower($name); } 2038 2039 if ($this->char === '=') { // attribute with value 2040 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2041 $this->parse_attr($node, $name, $space); // get attribute value 2042 } else { 2043 //no value attr: nowrap, checked selected... 
2044 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 2045 $node->attr[$name] = true; 2046 if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev 2047 } 2048 2049 $node->_[HDOM_INFO_SPACE][] = $space; 2050 2051 // prepare for next attribute 2052 $space = array( 2053 $this->copy_skip($this->token_blank), 2054 '', 2055 '' 2056 ); 2057 } else { // no more attributes 2058 break; 2059 } 2060 } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended 2061 2062 $this->link_nodes($node, true); 2063 $node->_[HDOM_INFO_ENDSPACE] = $space[0]; 2064 2065 // handle empty tags (i.e. "<div/>") 2066 if ($this->copy_until_char('>') === '/') { 2067 $node->_[HDOM_INFO_ENDSPACE] .= '/'; 2068 $node->_[HDOM_INFO_END] = 0; 2069 } else { 2070 // reset parent 2071 if (!isset($this->self_closing_tags[strtolower($node->tag)])) { 2072 $this->parent = $node; 2073 } 2074 } 2075 2076 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2077 2078 // If it's a BR tag, we need to set it's text to the default text. 2079 // This way when we see it in plaintext, we can generate formatting that the user wants. 2080 // since a br tag never has sub nodes, this works well. 2081 if ($node->tag === 'br') { 2082 $node->_[HDOM_INFO_INNER] = $this->default_br_text; 2083 } 2084 2085 return true; 2086 } 2087 2088 protected function parse_attr($node, $name, &$space) 2089 { 2090 $is_duplicate = isset($node->attr[$name]); 2091 2092 if (!$is_duplicate) // Copy whitespace between "=" and value 2093 $space[2] = $this->copy_skip($this->token_blank); 2094 2095 switch ($this->char) { 2096 case '"': 2097 $quote_type = HDOM_QUOTE_DOUBLE; 2098 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2099 $value = $this->copy_until_char('"'); 2100 $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 2101 break; 2102 case '\'': 2103 $quote_type = HDOM_QUOTE_SINGLE; 2104 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2105 $value = $this->copy_until_char('\''); 2106 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2107 break; 2108 default: 2109 $quote_type = HDOM_QUOTE_NO; 2110 $value = $this->copy_until($this->token_attr); 2111 } 2112 2113 $value = $this->restore_noise($value); 2114 2115 // PaperG: Attributes should not have \r or \n in them, that counts as 2116 // html whitespace. 2117 2118// The following was commented out as it interferes with DokuWiki edit mode - nomadjimbob 2119// 2120// $value = str_replace("\r", '', $value); 2121// $value = str_replace("\n", '', $value); 2122 2123 // PaperG: If this is a "class" selector, lets get rid of the preceeding 2124 // and trailing space since some people leave it in the multi class case. 2125 if ($name === 'class') { 2126 $value = trim($value); 2127 } 2128 2129 if (!$is_duplicate) { 2130 $node->_[HDOM_INFO_QUOTE][] = $quote_type; 2131 $node->attr[$name] = $value; 2132 } 2133 } 2134 2135 protected function link_nodes(&$node, $is_child) 2136 { 2137 $node->parent = $this->parent; 2138 $this->parent->nodes[] = $node; 2139 if ($is_child) { 2140 $this->parent->children[] = $node; 2141 } 2142 } 2143 2144 protected function as_text_node($tag) 2145 { 2146 $node = new simple_html_dom_node($this); 2147 ++$this->cursor; 2148 $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; 2149 $this->link_nodes($node, false); 2150 $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2151 return true; 2152 } 2153 2154 protected function skip($chars) 2155 { 2156 $this->pos += strspn($this->doc, $chars, $this->pos); 2157 $this->char = ($this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 2158 } 2159 2160 protected function copy_skip($chars) 2161 { 2162 $pos = $this->pos; 2163 $len = strspn($this->doc, $chars, $pos); 2164 $this->pos += $len; 2165 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2166 if ($len === 0) { return ''; } 2167 return substr($this->doc, $pos, $len); 2168 } 2169 2170 protected function copy_until($chars) 2171 { 2172 $pos = $this->pos; 2173 $len = strcspn($this->doc, $chars, $pos); 2174 $this->pos += $len; 2175 $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2176 return substr($this->doc, $pos, $len); 2177 } 2178 2179 protected function copy_until_char($char) 2180 { 2181 if ($this->char === null) { return ''; } 2182 2183 if (($pos = strpos($this->doc, $char, $this->pos)) === false) { 2184 $ret = substr($this->doc, $this->pos, $this->size - $this->pos); 2185 $this->char = null; 2186 $this->pos = $this->size; 2187 return $ret; 2188 } 2189 2190 if ($pos === $this->pos) { return ''; } 2191 2192 $pos_old = $this->pos; 2193 $this->char = $this->doc[$pos]; 2194 $this->pos = $pos; 2195 return substr($this->doc, $pos_old, $pos - $pos_old); 2196 } 2197 2198 protected function remove_noise($pattern, $remove_tag = false) 2199 { 2200 global $debug_object; 2201 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2202 2203 $count = preg_match_all( 2204 $pattern, 2205 $this->doc, 2206 $matches, 2207 PREG_SET_ORDER | PREG_OFFSET_CAPTURE 2208 ); 2209 2210 for ($i = $count - 1; $i > -1; --$i) { 2211 $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000); 2212 2213 if (is_object($debug_object)) { 2214 $debug_object->debug_log(2, 'key is: ' . $key); 2215 } 2216 2217 $idx = ($remove_tag) ? 
0 : 1; // 0 = entire match, 1 = submatch 2218 $this->noise[$key] = $matches[$i][$idx][0]; 2219 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 2220 } 2221 2222 // reset the length of content 2223 $this->size = strlen($this->doc); 2224 2225 if ($this->size > 0) { 2226 $this->char = $this->doc[0]; 2227 } 2228 } 2229 2230 function restore_noise($text) 2231 { 2232 global $debug_object; 2233 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2234 2235 while (($pos = strpos($text, '___noise___')) !== false) { 2236 // Sometimes there is a broken piece of markup, and we don't GET the 2237 // pos+11 etc... token which indicates a problem outside of us... 2238 2239 // todo: "___noise___1000" (or any number with four or more digits) 2240 // in the DOM causes an infinite loop which could be utilized by 2241 // malicious software 2242 if (strlen($text) > $pos + 15) { 2243 $key = '___noise___' 2244 . $text[$pos + 11] 2245 . $text[$pos + 12] 2246 . $text[$pos + 13] 2247 . $text[$pos + 14] 2248 . $text[$pos + 15]; 2249 2250 if (is_object($debug_object)) { 2251 $debug_object->debug_log(2, 'located key of: ' . $key); 2252 } 2253 2254 if (isset($this->noise[$key])) { 2255 $text = substr($text, 0, $pos) 2256 . $this->noise[$key] 2257 . substr($text, $pos + 16); 2258 } else { 2259 // do this to prevent an infinite loop. 2260 $text = substr($text, 0, $pos) 2261 . 'UNDEFINED NOISE FOR KEY: ' 2262 . $key 2263 . substr($text, $pos + 16); 2264 } 2265 } else { 2266 // There is no valid key being given back to us... We must get 2267 // rid of the ___noise___ or we will have a problem. 2268 $text = substr($text, 0, $pos) 2269 . 'NO NUMERIC NOISE KEY' 2270 . 
substr($text, $pos + 11); 2271 } 2272 } 2273 return $text; 2274 } 2275 2276 function search_noise($text) 2277 { 2278 global $debug_object; 2279 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2280 2281 foreach($this->noise as $noiseElement) { 2282 if (strpos($noiseElement, $text) !== false) { 2283 return $noiseElement; 2284 } 2285 } 2286 } 2287 2288 function __toString() 2289 { 2290 return $this->root->innertext(); 2291 } 2292 2293 function __get($name) 2294 { 2295 switch ($name) { 2296 case 'outertext': 2297 return $this->root->innertext(); 2298 case 'innertext': 2299 return $this->root->innertext(); 2300 case 'plaintext': 2301 return $this->root->text(); 2302 case 'charset': 2303 return $this->_charset; 2304 case 'target_charset': 2305 return $this->_target_charset; 2306 } 2307 } 2308 2309 function childNodes($idx = -1) 2310 { 2311 return $this->root->childNodes($idx); 2312 } 2313 2314 function firstChild() 2315 { 2316 return $this->root->first_child(); 2317 } 2318 2319 function lastChild() 2320 { 2321 return $this->root->last_child(); 2322 } 2323 2324 function createElement($name, $value = null) 2325 { 2326 return @str_get_html("<$name>$value</$name>")->firstChild(); 2327 } 2328 2329 function createTextNode($value) 2330 { 2331 return @end(str_get_html($value)->nodes); 2332 } 2333 2334 function getElementById($id) 2335 { 2336 return $this->find("#$id", 0); 2337 } 2338 2339 function getElementsById($id, $idx = null) 2340 { 2341 return $this->find("#$id", $idx); 2342 } 2343 2344 function getElementByTagName($name) 2345 { 2346 return $this->find($name, 0); 2347 } 2348 2349 function getElementsByTagName($name, $idx = -1) 2350 { 2351 return $this->find($name, $idx); 2352 } 2353 2354 function loadFile() 2355 { 2356 $args = func_get_args(); 2357 $this->load_file($args); 2358 } 2359} 2360