<?php
/**
 * Website: http://sourceforge.net/projects/simplehtmldom/
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
 * Contributions by:
 *     Yousuke Kumakura (Attribute filters)
 *     Vadim Voituk (Negative indexes supports of "find" method)
 *     Antcs (Constructor with automatically load contents either text or file/url)
 *
 * all affected sections have comments starting with "PaperG"
 *
 * Paperg - Added case insensitive testing of the value of the selector.
 * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
 *  This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noise calls, so it will not reflect the REAL position of the tag in the source;
 *  it will almost always be smaller by some amount.
 *  We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate, as $dom->size is the "real" number of bytes the dom was created from,
 *  but for most purposes it's a really good estimation.
 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
 *  Allow the user to tell us how much they trust the html.
 * Paperg - Added text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
 *  This allows us to find tags based on the text they contain.
 *  Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
 * Paperg - Added parse_charset so that we know about the character set of the source document.
 *  NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
 *  last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
 *
 * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
 * PaperG (John Schlick) Added get_display_size for "IMG" tags.
 *
 * Licensed under The MIT License
 * Redistributions of files must retain the above copyright notice.
 *
 * @author S.C. Chen <me578022@gmail.com>
 * @author John Schlick
 * @author Rus Carroll
 * @version 1.5 ($Rev: 196 $)
 * @package PlaceLocalInclude
 * @subpackage simple_html_dom
 */

/**
 * All of the Defines for the classes below.
 * @author S.C. Chen <me578022@gmail.com>
 */
// Node types; simple_html_dom_node::$nodetype holds one of these.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
// Attribute quoting styles; makeup() reads them from the HDOM_INFO_QUOTE entry to re-quote attribute values.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
// Keys into a node's "info" array (simple_html_dom_node::$_).
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE',7);
// Default values for the optional arguments of file_get_html(), str_get_html() and the dom constructor.
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
// file_get_html() and str_get_html() return false for content longer than this many bytes.
define('MAX_FILE_SIZE', 600000);
// helper functions
// -----------------------------------------------------------------------------
// get html dom from file
// $maxLen defaults to -1, the value of PHP_STREAM_COPY_ALL, meaning "no length limit".
/**
 * Read a file or URL with file_get_contents() and parse it into a dom.
 *
 * @param string   $url              Path or URL handed to file_get_contents().
 * @param bool     $use_include_path Forwarded to file_get_contents().
 * @param resource $context          Optional stream context, forwarded to file_get_contents().
 * @param int      $offset           Byte offset to start reading at; any negative value (including
 *                                   the legacy default of -1) means "from the beginning".
 * @param int      $maxLen           Maximum number of bytes to read; values <= 0 mean "no limit".
 * @param bool     $lowercase        Forwarded to the dom constructor and to load().
 * @param bool     $forceTagsClosed  Forwarded to the dom constructor.
 * @param string   $target_charset   Forwarded to the dom constructor.
 * @param bool     $stripRN          Forwarded to the dom constructor and to load().
 * @param string   $defaultBRText    Forwarded to the dom constructor and to load().
 * @param string   $defaultSpanText  Forwarded to the dom constructor and to load().
 * @return simple_html_dom|false false when nothing could be read or the content is longer than MAX_FILE_SIZE.
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // -1 used to mean "no seek". Since PHP 7.1 a negative offset is counted from the END of the
    // stream (and seeking fails outright on remote streams), so normalise it to 0.
    $start = ($offset < 0) ? 0 : $offset;

    // file_get_contents() rejects an explicit negative length, so only pass $maxLen when it is a real limit.
    if ($maxLen > 0)
    {
        $contents = file_get_contents($url, $use_include_path, $context, $start, $maxLen);
    }
    else
    {
        $contents = file_get_contents($url, $use_include_path, $context, $start);
    }
    // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
    //$contents = retrieve_url_contents($url);
    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        // Release the unused dom, as str_get_html() does.
        $dom->clear();
        return false;
    }
    // Pass the BR/span defaults along; otherwise load() falls back to its own defaults
    // and the caller's values are silently ignored.
    $dom->load($contents, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
    return $dom;
}

/**
 * Parse an html string into a dom.
 *
 * @param string $str The markup to parse; the remaining parameters are forwarded as in file_get_html().
 * @return simple_html_dom|false false when $str is empty or longer than MAX_FILE_SIZE.
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    if (empty($str) || strlen($str) > MAX_FILE_SIZE)
    {
        $dom->clear();
        return false;
    }
    // Pass the BR/span defaults along so the caller's values are not replaced by load()'s defaults.
    $dom->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
    return $dom;
}

/**
 * Echo the tree below $node by delegating to its dump() method.
 *
 * @param object $node      Any object exposing dump($show_attr, $deep).
 * @param bool   $show_attr Whether attributes are printed.
 * @param int    $deep      Starting indentation depth.
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // Previously the node itself was passed as $show_attr and both options were dropped.
    $node->dump($show_attr, $deep);
}


/**
 * simple html dom node
 * PaperG - added ability for "find" routine to lowercase the value of the selector.
 * PaperG - added $tag_start to track the start position of the tag in the total byte index
 *
 * @package PlaceLocalInclude
 */
class simple_html_dom_node
{
    public $nodetype = HDOM_TYPE_TEXT;
    public $tag = 'text';
    public $attr = array();
    public $children = array();
    public $nodes = array();
    public $parent = null;
    // The "info" array - see HDOM_INFO_... for what each element contains.
    public $_ = array();
    public $tag_start = 0;
    private $dom = null;

    // Remember the owning dom and register this node in the dom's flat node list.
    function __construct($dom)
    {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }

    function __destruct()
    {
        $this->clear();
    }

    // Casting a node to string yields its outer html.
    function __toString()
    {
        return $this->outertext();
    }

    // clean up memory due to php5 circular references memory leak...
    function clear()
    {
        $this->dom = null;
        $this->nodes = null;
        $this->parent = null;
        $this->children = null;
    }

    // dump node's tree: echo this tag (optionally with attributes), then recurse into $this->nodes one level deeper.
    function dump($show_attr=true, $deep=0)
    {
        $lead = str_repeat(' ', $deep);

        echo $lead.$this->tag;
        if ($show_attr && count($this->attr)>0)
        {
            echo '(';
            foreach ($this->attr as $k=>$v)
                echo "[$k]=>\"".$this->$k.'", ';
            echo ')';
        }
        echo "\n";

        if ($this->nodes)
        {
            foreach ($this->nodes as $c)
            {
                $c->dump($show_attr, $deep+1);
            }
        }
    }


    // Debugging function to dump a single dom node with a bunch of information about it.
// Builds one line describing this node: tag, attributes, the $_ info array, inner info,
// child/node counts and tag_start. Echoes it when $echo is true, otherwise returns it.
function dump_node($echo=true)
{

    $string = $this->tag;
    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }

    $string .= " HDOM_INNER_INFO: '";
    // This used to test an undefined $node variable, so the inner info was never printed.
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }

    $string .= " children: " . count($this->children);
    $string .= " nodes: " . count($this->nodes);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo)
    {
        echo $string;
        return;
    }
    else
    {
        return $string;
    }
}

// returns the parent of node
// If a node is passed in, it will reset the parent of the current node to that one.
function parent($parent=null)
{
    // I am SURE that this doesn't work properly.
    // It fails to unset the current node from its current parent's nodes or children list first.
    if ($parent !== null)
    {
        $this->parent = $parent;
        $this->parent->nodes[] = $this;
        $this->parent->children[] = $this;
    }

    return $this->parent;
}

// verify that node has children
function has_child()
{
    return !empty($this->children);
}

// returns children of node: the whole list when $idx is -1, otherwise the child at $idx or null
function children($idx=-1)
{
    if ($idx===-1)
    {
        return $this->children;
    }
    if (isset($this->children[$idx])) return $this->children[$idx];
    return null;
}

// returns the first child of node, or null when there is none
function first_child()
{
    if (count($this->children)>0)
    {
        return $this->children[0];
    }
    return null;
}

// returns the last child of node, or null when there is none
function last_child()
{
    if (($count=count($this->children))>0)
    {
        return $this->children[$count-1];
    }
    return null;
}

// returns the next sibling of node, or null for a parentless node or the last child
function next_sibling()
{
    if ($this->parent===null)
    {
        return null;
    }

    // Locate ourselves in the parent's child list, then step one forward.
    $idx = 0;
    $count = count($this->parent->children);
    while ($idx<$count && $this!==$this->parent->children[$idx])
    {
        ++$idx;
    }
    if (++$idx>=$count)
    {
        return null;
    }
    return $this->parent->children[$idx];
}

// returns the previous sibling of node, or null for a parentless node or the first child
function prev_sibling()
{
    if ($this->parent===null) return null;
    // Locate ourselves in the parent's child list, then step one back.
    $idx = 0;
    $count = count($this->parent->children);
    while ($idx<$count && $this!==$this->parent->children[$idx])
        ++$idx;
    if (--$idx<0) return null;
    return $this->parent->children[$idx];
}

// function to locate a specific ancestor tag in the path to the root.
// Returns the first node (starting with this one) whose tag equals $tag, or null if none is found.
function find_ancestor_tag($tag)
{
    global $debugObject;
    if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }

    // Start by including ourselves in the comparison.
    $returnDom = $this;

    while (!is_null($returnDom))
    {
        if (is_object($debugObject)) { $debugObject->debugLog(2, "Current tag is: " . $returnDom->tag); }

        if ($returnDom->tag == $tag)
        {
            break;
        }
        $returnDom = $returnDom->parent;
    }
    return $returnDom;
}

// get dom node's inner html
// An explicitly assigned inner text wins, then stored text (noise restored), else the children's outer text.
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    $ret = '';
    foreach ($this->nodes as $n)
        $ret .= $n->outertext();
    return $ret;
}

// get dom node's outer text (with tag)
function outertext()
{
    global $debugObject;
    if (is_object($debugObject))
    {
        $text = '';
        if ($this->tag == 'text')
        {
            if (!empty($this->text))
            {
                $text = " with text: " . $this->text;
            }
        }
        $debugObject->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback!==null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    // An explicitly assigned outer text, or stored text, short-circuits the rendering below.
    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
    {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    } else {
        $ret = "";
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
        if ($this->tag != "br")
        {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes)
        {
            foreach ($this->nodes as $n)
            {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
        $ret .= '</'.$this->tag.'>';
    return $ret;
}

// get dom node's plain text
// Comments, unknown nodes, script and style contribute nothing; other elements concatenate their nodes' text.
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
    switch ($this->nodetype)
    {
        case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        case HDOM_TYPE_COMMENT: return '';
        case HDOM_TYPE_UNKNOWN: return '';
    }
    if (strcasecmp($this->tag, 'script')===0) return '';
    if (strcasecmp($this->tag, 'style')===0) return '';

    $ret = '';
    // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
    // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
    // WHY is this happening?
    if (!is_null($this->nodes))
    {
        foreach ($this->nodes as $n)
        {
            $ret .= $this->convert_text($n->text());
        }

        // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
        if ($this->tag == "span")
        {
            $ret .= $this->dom->default_span_text;
        }


    }
    return $ret;
}

// inner text with the CDATA wrapper markers removed
function xmltext()
{
    $ret = $this->innertext();
    $ret = str_ireplace('<![CDATA[', '', $ret);
    $ret = str_replace(']]>', '', $ret);
    return $ret;
}

// build node's text with tag
// Rebuilds the opening tag from $attr, using the spacing and quote style recorded in the info array.
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    $ret = '<'.$this->tag;
    // $i tracks the attribute's position, which indexes the HDOM_INFO_SPACE / HDOM_INFO_QUOTE entries.
    $i = -1;

    foreach ($this->attr as $key=>$val)
    {
        ++$i;

        // skip removed attribute
        if ($val===null || $val===false)
            continue;

        $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
        //no value attr: nowrap, checked selected...
        if ($val===true)
            $ret .= $key;
        else {
            switch ($this->_[HDOM_INFO_QUOTE][$i])
            {
                case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                default: $quote = '';
            }
            $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
        }
    }
    $ret = $this->dom->restore_noise($ret);
    return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
}

// find elements by css selector
//PaperG - added ability for find to lowercase the value of the selector.
// Resolves a css selector against this node.
// With $idx null the full list of matches is returned (document order); otherwise the match
// at that index (negative indexes count from the end) or null when it does not exist.
// $lowercase is handed to seek() for case-insensitive value matching.
function find($selector, $idx=null, $lowercase=false)
{
    $groups = $this->parse_selector($selector);
    if (count($groups)===0) return array();

    $matched = array();

    // Each comma-separated selector group is resolved on its own.
    foreach ($groups as $chain)
    {
        // The change on the below line was documented on the sourceforge code tracker id 2788009
        // (the count used to be taken from the first group only).
        if (count($chain)===0) return array();
        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

        // Start from this node and narrow the candidate set one descendant level at a time - no recursion.
        $frontier = array($this->_[HDOM_INFO_BEGIN]=>1);

        foreach ($chain as $step)
        {
            $next = array();
            foreach (array_keys($frontier) as $nodeKey)
            {
                $start = ($nodeKey===-1) ? $this->dom->root : $this->dom->nodes[$nodeKey];
                //PaperG - Pass the optional lowercase parameter on to the seek function.
                $start->seek($step, $next, $lowercase);
            }
            $frontier = $next;
        }

        // Collect the surviving node indexes; array keys make duplicates collapse.
        foreach (array_keys($frontier) as $nodeKey)
        {
            $matched[$nodeKey] = 1;
        }
    }

    // sort keys
    ksort($matched);

    $found = array();
    foreach (array_keys($matched) as $nodeKey)
    {
        $found[] = $this->dom->nodes[$nodeKey];
    }

    // return nth-element or array
    if (is_null($idx)) return $found;
    if ($idx<0) $idx += count($found);
    return isset($found[$idx]) ? $found[$idx] : null;
}

// seek for given conditions
// PaperG - added parameter to allow for case insensitive testing of the value of a selector.
// Tests the nodes below this one against a single parsed selector step and records the
// index of every match as a key of $ret (passed by reference; that is the real return value).
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debugObject;
    if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index
    if ($tag && $key && is_numeric($key))
    {
        $count = 0;
        foreach ($this->children as $c)
        {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        // No end recorded for this node: walk up until an ancestor has one.
        $parent = $this->parent;
        // Check for null first, and do not dereference a null parent after the loop.
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        if ($parent!==null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // A bare "*" selects direct children only.
        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
            }
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
            if ($key == "plaintext") {
                // $node->plaintext actually returns $node->text();
                $nodeKeyValue = $node->text();
            } else {
                // this is a normal search, we want the value of that attribute of the tag.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debugObject)) {$debugObject->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debugObject)) {$debugObject->debugLog(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ',$node->attr[$key]) as $k) {
                    // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) break;
                    }
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debugObject)) {$debugObject->debugLog(1, "EXIT - ret: ", $ret);}
}

// Applies one attribute operator (=, !=, ^=, $=, *=) to $value; unknown operators never match.
// For *= a pattern starting with "/" is used as a complete regular expression.
protected function match($exp, $pattern, $value) {
    global $debugObject;
    if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}

    switch ($exp) {
        case '=':
            return ($value===$pattern);
        case '!=':
            return ($value!==$pattern);
        case '^=':
            return preg_match("/^".preg_quote($pattern,'/')."/", $value);
        case '$=':
            return preg_match("/".preg_quote($pattern,'/')."$/", $value);
        case '*=':
            if ($pattern[0]=='/') {
                return preg_match($pattern, $value);
            }
            return preg_match("/".$pattern."/i", $value);
    }
    return false;
}

// Splits a selector string into comma-separated groups, each a list of
// array($tag, $key, $val, $exp, $no_key) steps as consumed by seek().
protected function parse_selector($selector_string) {
    global $debugObject;
    if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}

    // pattern of CSS selectors, modified from mootools
    // Paperg: Add the colon to the attribute, so that it properly finds <tag attr:ibute="something" > like google does.
    // Note: if you try to look at this attribute, you MUST use getAttribute since $dom->x:y will fail the php syntax check.
    // Notice the \[ starting the attribute? and the @? following? This implies that an html attribute specifier
    // may start with an @ sign that is NOT captured by the expression.
    // Further study is required to determine if this should be documented or removed.
    // The hyphen is kept LAST in each character class: PCRE2 (PHP >= 7.3) rejects "\w-:" as an invalid range.
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debugObject)) {$debugObject->debugLog(2, "Matches Array: ", $matches);}

    $selectors = array();
    $result = array();
    //print_r($matches);

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase
        // The cast keeps the old result ('' for a missing key) without passing null to strtolower(), deprecated in PHP 8.1.
        if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower((string) $key);}
        //elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

        $result[] = array($tag, $key, $val, $exp, $no_key);
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = array();
        }
    }
    if (count($result)>0)
        $selectors[] = $result;
    return $selectors;
}

// Attribute values take priority (charset-converted); then the virtual outertext/innertext/plaintext/xmltext
// properties; anything else yields whether the attribute key exists.
function __get($name) {
    if (isset($this->attr[$name]))
    {
        return $this->convert_text($this->attr[$name]);
    }
    switch ($name) {
        case 'outertext': return $this->outertext();
        case 'innertext': return $this->innertext();
        case 'plaintext': return $this->text();
        case 'xmltext': return $this->xmltext();
        default: return array_key_exists($name, $this->attr);
    }
}

// Assigning outertext/innertext overrides the rendered text; any other name sets an attribute.
function __set($name, $value) {
    switch ($name) {
        case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
        case 'innertext':
            if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
            return $this->_[HDOM_INFO_INNER] = $value;
    }
    // A new attribute needs spacing and quote info so makeup() can render it.
    if (!isset($this->attr[$name])) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}

function __isset($name) {
    switch ($name) {
        case 'outertext': return true;
        case 'innertext': return true;
        case 'plaintext': return true;
    }
    //no value attr: nowrap, checked selected...
    return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
}

function __unset($name) {
    if (isset($this->attr[$name]))
        unset($this->attr[$name]);
}

// PaperG - Function to convert the text from one character set to another if the two sets are not the same.
function convert_text($text)
{
    global $debugObject;
    if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}

    $converted_text = $text;

    $sourceCharset = "";
    $targetCharset = "";

    if ($this->dom)
    {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debugObject)) {$debugObject->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

    if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
    {
        // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
        if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text)))
        {
            $converted_text = $text;
        }
        else
        {
            $converted_text = iconv($sourceCharset, $targetCharset, $text);
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    if ($targetCharset == 'UTF-8')
    {
        if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == "\xef\xbb\xbf")
        {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}

/**
 * Returns true if $string is valid UTF-8 and false otherwise.
 *
 * Checks lead bytes and the number/range of continuation bytes only.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    $c=0; $b=0;
    $bits=0;
    $len=strlen($str);
    for($i=0; $i<$len; $i++)
    {
        $c=ord($str[$i]);
        // Every byte with the high bit set starts (or illegally continues) a multi-byte sequence.
        // The old "> 128" test let a lone 0x80 through as if it were ASCII.
        if($c >= 128)
        {
            if(($c >= 254)) return false;
            elseif($c >= 252) $bits=6;
            elseif($c >= 248) $bits=5;
            elseif($c >= 240) $bits=4;
            elseif($c >= 224) $bits=3;
            elseif($c >= 192) $bits=2;
            else return false;
            if(($i+$bits) > $len) return false;
            while($bits > 1)
            {
                $i++;
                $b=ord($str[$i]);
                if($b < 128 || $b > 191) return false;
                $bits--;
            }
        }
    }
    return true;
}
/*
function is_utf8($string)
{
    //this is buggy
    return (utf8_encode(utf8_decode($string)) == $string);
}
*/

/**
 * Function to try a few tricks to determine the displayed size of an img on the page.
 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
 */
function get_display_size()
{
    $width = -1;
    $height = -1;

    if ($this->tag !== 'img')
    {
        return false;
    }

    // See if there is a height or width attribute in the tag itself.
    if (isset($this->attr['width']))
    {
        $width = $this->attr['width'];
    }

    if (isset($this->attr['height']))
    {
        $height = $this->attr['height'];
    }

    // Now look for an inline style.
    if (isset($this->attr['style']))
    {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $attributes = array();
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $attributes[$match[1]] = $match[2];
        }

        // If there is a width in the style attributes:
        if (isset($attributes['width']) && $width == -1)
        {
            // check that the last two characters are px (pixels)
            if (strtolower(substr($attributes['width'], -2)) == 'px')
            {
                $proposed_width = substr($attributes['width'], 0, -2);
                // Now make sure that it's an integer and not something stupid.
                // Compare against false explicitly: a valid "0" is returned as int 0, which is falsy.
                if (filter_var($proposed_width, FILTER_VALIDATE_INT) !== false)
                {
                    $width = $proposed_width;
                }
            }
        }

        // If there is a height in the style attributes:
        if (isset($attributes['height']) && $height == -1)
        {
            // check that the last two characters are px (pixels)
            if (strtolower(substr($attributes['height'], -2)) == 'px')
            {
                $proposed_height = substr($attributes['height'], 0, -2);
                // Now make sure that it's an integer and not something stupid.
                if (filter_var($proposed_height, FILTER_VALIDATE_INT) !== false)
                {
                    $height = $proposed_height;
                }
            }
        }

    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

    // Far future enhancement
    // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
    // Note that in this case, the class or id will have the img subselector for it to apply to the image.

    // ridiculously far future development
    // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

    $result = array('height' => $height,
                    'width' => $width);
    return $result;
}

// camel naming conventions
function getAllAttributes() {return $this->attr;}
function getAttribute($name) {return $this->__get($name);}
function setAttribute($name, $value) {$this->__set($name, $value);}
function hasAttribute($name) {return $this->__isset($name);}
function removeAttribute($name) {$this->__set($name, null);}
function getElementById($id) {return $this->find("#$id", 0);}
function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
function getElementByTagName($name) {return $this->find($name, 0);}
function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
function parentNode() {return $this->parent();}
function childNodes($idx=-1) {return $this->children($idx);}
function firstChild() {return $this->first_child();}
function lastChild() {return $this->last_child();}
function nextSibling() {return $this->next_sibling();}
function previousSibling() {return $this->prev_sibling();}
function hasChildNodes() {return $this->has_child();}
function nodeName() {return $this->tag;}
function appendChild($node) {$node->parent($this); return $node;}

}

/**
 * simple html dom parser
 * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
 * Paperg - change $size from protected to public so we can easily access it
 * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
968 * 969 * @package PlaceLocalInclude 970 */ 971class simple_html_dom 972{ 973 public $root = null; 974 public $nodes = array(); 975 public $callback = null; 976 public $lowercase = false; 977 // Used to keep track of how large the text was when we started. 978 public $original_size; 979 public $size; 980 protected $pos; 981 protected $doc; 982 protected $char; 983 protected $cursor; 984 protected $parent; 985 protected $noise = array(); 986 protected $token_blank = " \t\r\n"; 987 protected $token_equal = ' =/>'; 988 protected $token_slash = " />\r\n\t"; 989 protected $token_attr = ' >'; 990 // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. 991 public $_charset = ''; 992 public $_target_charset = ''; 993 protected $default_br_text = ""; 994 public $default_span_text = ""; 995 996 // use isset instead of in_array, performance boost about 30%... 997 protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); 998 protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); 999 // Known sourceforge issue #2977341 1000 // B tags that are not closed cause us to return everything to the end of the document. 
// Maps a tag name to the set of tag names listed for it; emptied by the constructor
// when $forceTagsClosed is false.
protected $optional_closing_tags = array(
    'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
    'th'=>array('th'=>1),
    'td'=>array('td'=>1),
    'li'=>array('li'=>1),
    'dt'=>array('dt'=>1, 'dd'=>1),
    'dd'=>array('dd'=>1, 'dt'=>1),
    'dl'=>array('dd'=>1, 'dt'=>1),
    'p'=>array('p'=>1),
    'nobr'=>array('nobr'=>1),
    'b'=>array('b'=>1),
    'option'=>array('option'=>1),
);

// Optionally loads $str right away: an http:// url or an existing file path goes through
// load_file(), anything else is parsed as markup.
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    // This has to happen before anything is loaded, because load() parses immediately.
    // It used to assign to a non-existent "optional_closing_array" property, so the flag had no effect.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags=array();
    }
    if ($str)
    {
        if (preg_match("/^http:\/\//i",$str) || is_file($str))
        {
            $this->load_file($str);
        }
        else
        {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }
    $this->_target_charset = $target_charset;
}

function __destruct()
{
    $this->clear();
}

// load html from string
// Hides comments, CDATA, script/style/code blocks and server-side or smarty snippets as "noise",
// parses what is left, then detects the charset. Returns $this so calls can be chained.
function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    global $debugObject;

    // prepare
    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
    // strip out comments
    $this->remove_noise("'<!--(.*?)-->'is");
    // strip out cdata
    $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // Script tags removal now precedes style tag removal.
    // strip out <script> tags
    $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
    $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
    // strip out <style> tags
    $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
    $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
    // strip out preformatted tags
    $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
    // strip out server side scripts
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
    // strip smarty scripts
    $this->remove_noise("'(\{\w)(.*?)(\})'s", true);

    // parsing
    while ($this->parse());
    // end
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    // make load function chainable
    return $this;

}

// load html from file
// All arguments are forwarded to file_get_contents(). Returns false (after clearing the dom)
// when the content cannot be read; returns nothing on success.
function load_file()
{
    $args = func_get_args();
    $doc = call_user_func_array('file_get_contents', $args);
    // Test the read result itself. error_get_last() may still hold an older, unrelated error,
    // and a failed read must not be handed to load() as if it were markup.
    if ($doc === false) {
        $this->clear();
        return false;
    }
    $this->load($doc, true);
}

// set callback function
function set_callback($function_name)
{
    $this->callback = $function_name;
}

// remove callback function
function remove_callback()
{
    $this->callback = null;
}

// save dom as string; also written to $filepath (with an exclusive lock) when one is given
function save($filepath='')
{
    $ret = $this->root->innertext();
    if ($filepath!=='') file_put_contents($filepath, $ret, LOCK_EX);
    return $ret;
}

// find dom node by css selector
// Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
function find($selector, $idx=null, $lowercase=false)
{
    return $this->root->find($selector, $idx, $lowercase);
}

// clean up memory due to php5 circular references memory leak...
1117 function clear() 1118 { 1119 foreach ($this->nodes as $n) {$n->clear(); $n = null;} 1120 // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear. 1121 if (isset($this->children)) foreach ($this->children as $n) {$n->clear(); $n = null;} 1122 if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);} 1123 if (isset($this->root)) {$this->root->clear(); unset($this->root);} 1124 unset($this->doc); 1125 unset($this->noise); 1126 } 1127 1128 function dump($show_attr=true) 1129 { 1130 $this->root->dump($show_attr); 1131 } 1132 1133 // prepare HTML data and init everything 1134 protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 1135 { 1136 $this->clear(); 1137 1138 // set the length of content before we do anything to it. 1139 $this->size = strlen($str); 1140 // Save the original size of the html that we got in. It might be useful to someone. 1141 $this->original_size = $this->size; 1142 1143 //before we save the string as the doc... strip out the \r \n's if we are told to. 1144 if ($stripRN) { 1145 $str = str_replace("\r", " ", $str); 1146 $str = str_replace("\n", " ", $str); 1147 1148 // set the length of content since we have changed it. 
1149 $this->size = strlen($str); 1150 } 1151 1152 $this->doc = $str; 1153 $this->pos = 0; 1154 $this->cursor = 1; 1155 $this->noise = array(); 1156 $this->nodes = array(); 1157 $this->lowercase = $lowercase; 1158 $this->default_br_text = $defaultBRText; 1159 $this->default_span_text = $defaultSpanText; 1160 $this->root = new simple_html_dom_node($this); 1161 $this->root->tag = 'root'; 1162 $this->root->_[HDOM_INFO_BEGIN] = -1; 1163 $this->root->nodetype = HDOM_TYPE_ROOT; 1164 $this->parent = $this->root; 1165 if ($this->size>0) $this->char = $this->doc[0]; 1166 } 1167 1168 // parse html content 1169 protected function parse() 1170 { 1171 if (($s = $this->copy_until_char('<'))==='') 1172 { 1173 return $this->read_tag(); 1174 } 1175 1176 // text 1177 $node = new simple_html_dom_node($this); 1178 ++$this->cursor; 1179 $node->_[HDOM_INFO_TEXT] = $s; 1180 $this->link_nodes($node, false); 1181 return true; 1182 } 1183 1184 // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later. 1185 // NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec 1186 // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism. 1187 protected function parse_charset() 1188 { 1189 global $debugObject; 1190 1191 $charset = null; 1192 1193 if (function_exists('get_last_retrieve_url_contents_content_type')) 1194 { 1195 $contentTypeHeader = get_last_retrieve_url_contents_content_type(); 1196 $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); 1197 if ($success) 1198 { 1199 $charset = $matches[1]; 1200 if (is_object($debugObject)) {$debugObject->debugLog(2, 'header content-type found charset of: ' . 
$charset);} 1201 } 1202 1203 } 1204 1205 if (empty($charset)) 1206 { 1207 $el = $this->root->find('meta[http-equiv=Content-Type]',0); 1208 if (!empty($el)) 1209 { 1210 $fullvalue = $el->content; 1211 if (is_object($debugObject)) {$debugObject->debugLog(2, 'meta content-type tag found' . $fullvalue);} 1212 1213 if (!empty($fullvalue)) 1214 { 1215 $success = preg_match('/charset=(.+)/', $fullvalue, $matches); 1216 if ($success) 1217 { 1218 $charset = $matches[1]; 1219 } 1220 else 1221 { 1222 // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1 1223 if (is_object($debugObject)) {$debugObject->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');} 1224 $charset = 'ISO-8859-1'; 1225 } 1226 } 1227 } 1228 } 1229 1230 // If we couldn't find a charset above, then lets try to detect one based on the text we got... 1231 if (empty($charset)) 1232 { 1233 // Have php try to detect the encoding from the text given to us. 1234 $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) ); 1235 if (is_object($debugObject)) {$debugObject->debugLog(2, 'mb_detect found: ' . $charset);} 1236 1237 // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need... 1238 if ($charset === false) 1239 { 1240 if (is_object($debugObject)) {$debugObject->debugLog(2, 'since mb_detect failed - using default of utf-8');} 1241 $charset = 'UTF-8'; 1242 } 1243 } 1244 1245 // Since CP1252 is a superset, if we get one of it's subsets, we want it instead. 1246 if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1'))) 1247 { 1248 if (is_object($debugObject)) {$debugObject->debugLog(2, 'replacing ' . $charset . 
' with CP1252 as its a superset');} 1249 $charset = 'CP1252'; 1250 } 1251 1252 if (is_object($debugObject)) {$debugObject->debugLog(1, 'EXIT - ' . $charset);} 1253 1254 return $this->_charset = $charset; 1255 } 1256 1257 // read tag info 1258 protected function read_tag() 1259 { 1260 if ($this->char!=='<') 1261 { 1262 $this->root->_[HDOM_INFO_END] = $this->cursor; 1263 return false; 1264 } 1265 $begin_tag_pos = $this->pos; 1266 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1267 1268 // end tag 1269 if ($this->char==='/') 1270 { 1271 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1272 // This represents the change in the simple_html_dom trunk from revision 180 to 181. 1273 // $this->skip($this->token_blank_t); 1274 $this->skip($this->token_blank); 1275 $tag = $this->copy_until_char('>'); 1276 1277 // skip attributes in end tag 1278 if (($pos = strpos($tag, ' '))!==false) 1279 $tag = substr($tag, 0, $pos); 1280 1281 $parent_lower = strtolower($this->parent->tag); 1282 $tag_lower = strtolower($tag); 1283 1284 if ($parent_lower!==$tag_lower) 1285 { 1286 if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) 1287 { 1288 $this->parent->_[HDOM_INFO_END] = 0; 1289 $org_parent = $this->parent; 1290 1291 while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) 1292 $this->parent = $this->parent->parent; 1293 1294 if (strtolower($this->parent->tag)!==$tag_lower) { 1295 $this->parent = $org_parent; // restore origonal parent 1296 if ($this->parent->parent) $this->parent = $this->parent->parent; 1297 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1298 return $this->as_text_node($tag); 1299 } 1300 } 1301 else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) 1302 { 1303 $this->parent->_[HDOM_INFO_END] = 0; 1304 $org_parent = $this->parent; 1305 1306 while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) 
1307 $this->parent = $this->parent->parent; 1308 1309 if (strtolower($this->parent->tag)!==$tag_lower) 1310 { 1311 $this->parent = $org_parent; // restore origonal parent 1312 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1313 return $this->as_text_node($tag); 1314 } 1315 } 1316 else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) 1317 { 1318 $this->parent->_[HDOM_INFO_END] = 0; 1319 $this->parent = $this->parent->parent; 1320 } 1321 else 1322 return $this->as_text_node($tag); 1323 } 1324 1325 $this->parent->_[HDOM_INFO_END] = $this->cursor; 1326 if ($this->parent->parent) $this->parent = $this->parent->parent; 1327 1328 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1329 return true; 1330 } 1331 1332 $node = new simple_html_dom_node($this); 1333 $node->_[HDOM_INFO_BEGIN] = $this->cursor; 1334 ++$this->cursor; 1335 $tag = $this->copy_until($this->token_slash); 1336 $node->tag_start = $begin_tag_pos; 1337 1338 // doctype, cdata & comments... 1339 if (isset($tag[0]) && $tag[0]==='!') { 1340 $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); 1341 1342 if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') { 1343 $node->nodetype = HDOM_TYPE_COMMENT; 1344 $node->tag = 'comment'; 1345 } else { 1346 $node->nodetype = HDOM_TYPE_UNKNOWN; 1347 $node->tag = 'unknown'; 1348 } 1349 if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>'; 1350 $this->link_nodes($node, true); 1351 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1352 return true; 1353 } 1354 1355 // text 1356 if ($pos=strpos($tag, '<')!==false) { 1357 $tag = '<' . substr($tag, 0, -1); 1358 $node->_[HDOM_INFO_TEXT] = $tag; 1359 $this->link_nodes($node, false); 1360 $this->char = $this->doc[--$this->pos]; // prev 1361 return true; 1362 } 1363 1364 if (!preg_match("/^[\w-:]+$/", $tag)) { 1365 $node->_[HDOM_INFO_TEXT] = '<' . $tag . 
$this->copy_until('<>'); 1366 if ($this->char==='<') { 1367 $this->link_nodes($node, false); 1368 return true; 1369 } 1370 1371 if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>'; 1372 $this->link_nodes($node, false); 1373 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1374 return true; 1375 } 1376 1377 // begin tag 1378 $node->nodetype = HDOM_TYPE_ELEMENT; 1379 $tag_lower = strtolower($tag); 1380 $node->tag = ($this->lowercase) ? $tag_lower : $tag; 1381 1382 // handle optional closing tags 1383 if (isset($this->optional_closing_tags[$tag_lower]) ) 1384 { 1385 while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) 1386 { 1387 $this->parent->_[HDOM_INFO_END] = 0; 1388 $this->parent = $this->parent->parent; 1389 } 1390 $node->parent = $this->parent; 1391 } 1392 1393 $guard = 0; // prevent infinity loop 1394 $space = array($this->copy_skip($this->token_blank), '', ''); 1395 1396 // attributes 1397 do 1398 { 1399 if ($this->char!==null && $space[0]==='') 1400 { 1401 break; 1402 } 1403 $name = $this->copy_until($this->token_equal); 1404 if ($guard===$this->pos) 1405 { 1406 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1407 continue; 1408 } 1409 $guard = $this->pos; 1410 1411 // handle endless '<' 1412 if ($this->pos>=$this->size-1 && $this->char!=='>') { 1413 $node->nodetype = HDOM_TYPE_TEXT; 1414 $node->_[HDOM_INFO_END] = 0; 1415 $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name; 1416 $node->tag = 'text'; 1417 $this->link_nodes($node, false); 1418 return true; 1419 } 1420 1421 // handle mismatch '<' 1422 if ($this->doc[$this->pos-1]=='<') { 1423 $node->nodetype = HDOM_TYPE_TEXT; 1424 $node->tag = 'text'; 1425 $node->attr = array(); 1426 $node->_[HDOM_INFO_END] = 0; 1427 $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1); 1428 $this->pos -= 2; 1429 $this->char = (++$this->pos<$this->size) ? 
$this->doc[$this->pos] : null; // next 1430 $this->link_nodes($node, false); 1431 return true; 1432 } 1433 1434 if ($name!=='/' && $name!=='') { 1435 $space[1] = $this->copy_skip($this->token_blank); 1436 $name = $this->restore_noise($name); 1437 if ($this->lowercase) $name = strtolower($name); 1438 if ($this->char==='=') { 1439 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1440 $this->parse_attr($node, $name, $space); 1441 } 1442 else { 1443 //no value attr: nowrap, checked selected... 1444 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 1445 $node->attr[$name] = true; 1446 if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev 1447 } 1448 $node->_[HDOM_INFO_SPACE][] = $space; 1449 $space = array($this->copy_skip($this->token_blank), '', ''); 1450 } 1451 else 1452 break; 1453 } while ($this->char!=='>' && $this->char!=='/'); 1454 1455 $this->link_nodes($node, true); 1456 $node->_[HDOM_INFO_ENDSPACE] = $space[0]; 1457 1458 // check self closing 1459 if ($this->copy_until_char_escape('>')==='/') 1460 { 1461 $node->_[HDOM_INFO_ENDSPACE] .= '/'; 1462 $node->_[HDOM_INFO_END] = 0; 1463 } 1464 else 1465 { 1466 // reset parent 1467 if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node; 1468 } 1469 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1470 1471 // If it's a BR tag, we need to set it's text to the default text. 1472 // This way when we see it in plaintext, we can generate formatting that the user wants. 1473 // since a br tag never has sub nodes, this works well. 
1474 if ($node->tag == "br") 1475 { 1476 $node->_[HDOM_INFO_INNER] = $this->default_br_text; 1477 } 1478 1479 return true; 1480 } 1481 1482 // parse attributes 1483 protected function parse_attr($node, $name, &$space) 1484 { 1485 // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037 1486 // If the attribute is already defined inside a tag, only pay atetntion to the first one as opposed to the last one. 1487 if (isset($node->attr[$name])) 1488 { 1489 return; 1490 } 1491 1492 $space[2] = $this->copy_skip($this->token_blank); 1493 switch ($this->char) { 1494 case '"': 1495 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 1496 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1497 $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"')); 1498 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1499 break; 1500 case '\'': 1501 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE; 1502 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1503 $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\'')); 1504 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1505 break; 1506 default: 1507 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 1508 $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr)); 1509 } 1510 // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace. 1511 $node->attr[$name] = str_replace("\r", "", $node->attr[$name]); 1512 $node->attr[$name] = str_replace("\n", "", $node->attr[$name]); 1513 // PaperG: If this is a "class" selector, lets get rid of the preceeding and trailing space since some people leave it in the multi class case. 
1514 if ($name == "class") { 1515 $node->attr[$name] = trim($node->attr[$name]); 1516 } 1517 } 1518 1519 // link node's parent 1520 protected function link_nodes(&$node, $is_child) 1521 { 1522 $node->parent = $this->parent; 1523 $this->parent->nodes[] = $node; 1524 if ($is_child) 1525 { 1526 $this->parent->children[] = $node; 1527 } 1528 } 1529 1530 // as a text node 1531 protected function as_text_node($tag) 1532 { 1533 $node = new simple_html_dom_node($this); 1534 ++$this->cursor; 1535 $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; 1536 $this->link_nodes($node, false); 1537 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1538 return true; 1539 } 1540 1541 protected function skip($chars) 1542 { 1543 $this->pos += strspn($this->doc, $chars, $this->pos); 1544 $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1545 } 1546 1547 protected function copy_skip($chars) 1548 { 1549 $pos = $this->pos; 1550 $len = strspn($this->doc, $chars, $pos); 1551 $this->pos += $len; 1552 $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1553 if ($len===0) return ''; 1554 return substr($this->doc, $pos, $len); 1555 } 1556 1557 protected function copy_until($chars) 1558 { 1559 $pos = $this->pos; 1560 $len = strcspn($this->doc, $chars, $pos); 1561 $this->pos += $len; 1562 $this->char = ($this->pos<$this->size) ? 
$this->doc[$this->pos] : null; // next 1563 return substr($this->doc, $pos, $len); 1564 } 1565 1566 protected function copy_until_char($char) 1567 { 1568 if ($this->char===null) return ''; 1569 1570 if (($pos = strpos($this->doc, $char, $this->pos))===false) { 1571 $ret = substr($this->doc, $this->pos, $this->size-$this->pos); 1572 $this->char = null; 1573 $this->pos = $this->size; 1574 return $ret; 1575 } 1576 1577 if ($pos===$this->pos) return ''; 1578 $pos_old = $this->pos; 1579 $this->char = $this->doc[$pos]; 1580 $this->pos = $pos; 1581 return substr($this->doc, $pos_old, $pos-$pos_old); 1582 } 1583 1584 protected function copy_until_char_escape($char) 1585 { 1586 if ($this->char===null) return ''; 1587 1588 $start = $this->pos; 1589 while (1) 1590 { 1591 if (($pos = strpos($this->doc, $char, $start))===false) 1592 { 1593 $ret = substr($this->doc, $this->pos, $this->size-$this->pos); 1594 $this->char = null; 1595 $this->pos = $this->size; 1596 return $ret; 1597 } 1598 1599 if ($pos===$this->pos) return ''; 1600 1601 if ($this->doc[$pos-1]==='\\') { 1602 $start = $pos+1; 1603 continue; 1604 } 1605 1606 $pos_old = $this->pos; 1607 $this->char = $this->doc[$pos]; 1608 $this->pos = $pos; 1609 return substr($this->doc, $pos_old, $pos-$pos_old); 1610 } 1611 } 1612 1613 // remove noise from html content 1614 // save the noise in the $this->noise array. 1615 protected function remove_noise($pattern, $remove_tag=false) 1616 { 1617 global $debugObject; 1618 if (is_object($debugObject)) { $debugObject->debugLogEntry(1); } 1619 1620 $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE); 1621 1622 for ($i=$count-1; $i>-1; --$i) 1623 { 1624 $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000); 1625 if (is_object($debugObject)) { $debugObject->debugLog(2, 'key is: ' . $key); } 1626 $idx = ($remove_tag) ? 
0 : 1; 1627 $this->noise[$key] = $matches[$i][$idx][0]; 1628 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 1629 } 1630 1631 // reset the length of content 1632 $this->size = strlen($this->doc); 1633 if ($this->size>0) 1634 { 1635 $this->char = $this->doc[0]; 1636 } 1637 } 1638 1639 // restore noise to html content 1640 function restore_noise($text) 1641 { 1642 global $debugObject; 1643 if (is_object($debugObject)) { $debugObject->debugLogEntry(1); } 1644 1645 while (($pos=strpos($text, '___noise___'))!==false) 1646 { 1647 // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us... 1648 if (strlen($text) > $pos+15) 1649 { 1650 $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15]; 1651 if (is_object($debugObject)) { $debugObject->debugLog(2, 'located key of: ' . $key); } 1652 1653 if (isset($this->noise[$key])) 1654 { 1655 $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16); 1656 } 1657 else 1658 { 1659 // do this to prevent an infinite loop. 1660 $text = substr($text, 0, $pos).'UNDEFINED NOISE FOR KEY: '.$key . substr($text, $pos+16); 1661 } 1662 } 1663 else 1664 { 1665 // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem. 1666 $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11); 1667 } 1668 } 1669 return $text; 1670 } 1671 1672 // Sometimes we NEED one of the noise elements. 
1673 function search_noise($text) 1674 { 1675 global $debugObject; 1676 if (is_object($debugObject)) { $debugObject->debugLogEntry(1); } 1677 1678 foreach($this->noise as $noiseElement) 1679 { 1680 if (strpos($noiseElement, $text)!==false) 1681 { 1682 return $noiseElement; 1683 } 1684 } 1685 } 1686 function __toString() 1687 { 1688 return $this->root->innertext(); 1689 } 1690 1691 function __get($name) 1692 { 1693 switch ($name) 1694 { 1695 case 'outertext': 1696 return $this->root->innertext(); 1697 case 'innertext': 1698 return $this->root->innertext(); 1699 case 'plaintext': 1700 return $this->root->text(); 1701 case 'charset': 1702 return $this->_charset; 1703 case 'target_charset': 1704 return $this->_target_charset; 1705 } 1706 } 1707 1708 // camel naming conventions 1709 function childNodes($idx=-1) {return $this->root->childNodes($idx);} 1710 function firstChild() {return $this->root->first_child();} 1711 function lastChild() {return $this->root->last_child();} 1712 function createElement($name, $value=null) {return @str_get_html("<$name>$value</$name>")->first_child();} 1713 function createTextNode($value) {return @end(str_get_html($value)->nodes);} 1714 function getElementById($id) {return $this->find("#$id", 0);} 1715 function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} 1716 function getElementByTagName($name) {return $this->find($name, 0);} 1717 function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);} 1718 function loadFile() {$args = func_get_args();$this->load_file($args);} 1719} 1720 1721?>