<?php
/**
 * Website: http://sourceforge.net/projects/simplehtmldom/
 * Additional projects: http://sourceforge.net/projects/debugobject/
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
 *
 * Licensed under The MIT License
 * See the LICENSE file in the project root for more information.
 *
 * Authors:
 *   S.C. Chen
 *   John Schlick
 *   Rus Carroll
 *   logmanoriginal
 *
 * Contributors:
 *   Yousuke Kumakura
 *   Vadim Voituk
 *   Antcs
 *
 * Version Rev. 1.9.1 (291)
 *
 * THIS LIBRARY HAS BEEN MODIFIED BY NOMADJIMBOB - james.collins@outlook.com.au
 * Lines 2116 - stripping of \r\n from attributes has been disabled
 */

// Node kinds; simple_html_dom_node::$nodetype holds one of these.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Quote style recorded per attribute and used when a tag is rebuilt.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node bookkeeping array simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Overridable defaults: a value defined before this point wins over the
// fallback given here.
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
define('HDOM_SMARTY_AS_TEXT', 1);

/**
 * Read a file or URL with file_get_contents() and load it into a new
 * simple_html_dom.
 *
 * The first five parameters are forwarded to file_get_contents(); the
 * remaining ones are forwarded to the simple_html_dom constructor
 * ($lowercase and $stripRN are additionally passed to load()).
 *
 * @param string        $url              File name or URL to read.
 * @param bool          $use_include_path See file_get_contents().
 * @param resource|null $context          See file_get_contents().
 * @param int           $offset           See file_get_contents().
 * @param int           $maxLen           Largest accepted content size in
 *                                        bytes; values <= 0 select MAX_FILE_SIZE.
 * @param bool          $lowercase
 * @param bool          $forceTagsClosed
 * @param string        $target_charset
 * @param bool          $stripRN
 * @param string        $defaultBRText
 * @param string        $defaultSpanText
 *
 * @return mixed Whatever simple_html_dom::load() returns, or false when the
 *               source could not be read, is empty, or exceeds $maxLen bytes.
 */
function file_get_html(
    $url,
    $use_include_path = false,
    $context = null,
    $offset = 0,
    $maxLen = -1,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    if ($maxLen <= 0) {
        $maxLen = MAX_FILE_SIZE;
    }

    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    // Request one byte more than the limit. Reading exactly $maxLen bytes
    // would make the size check below unreachable and silently parse a
    // truncated document instead of rejecting an oversized one.
    $contents = file_get_contents(
        $url,
        $use_include_path,
        $context,
        $offset,
        $maxLen + 1
    );

    // file_get_contents() yields false on failure, which empty() also covers.
    if (empty($contents) || strlen($contents) > $maxLen) {
        $dom->clear();
        return false;
    }

    return $dom->load($contents, $lowercase, $stripRN);
}

/**
 * Load an HTML string into a new simple_html_dom.
 *
 * All parameters after $str are forwarded to the simple_html_dom
 * constructor ($lowercase and $stripRN are additionally passed to load()).
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase
 * @param bool   $forceTagsClosed
 * @param string $target_charset
 * @param bool   $stripRN
 * @param string $defaultBRText
 * @param string $defaultSpanText
 *
 * @return mixed Whatever simple_html_dom::load() returns, or false when $str
 *               is empty (per empty()) or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
    $str,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
        $dom->clear();
        return false;
    }

    return $dom->load($str, $lowercase, $stripRN);
}

/**
 * Debug helper: print a node tree by delegating to the node's dump() method.
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
// Forward the caller's display options to the node's own dump() method.
    $node->dump($show_attr, $deep);
}

/**
 * One node of a parsed document: an element, text, comment, ... as
 * identified by $nodetype.
 */
class simple_html_dom_node
{
    // One of the HDOM_TYPE_* constants.
    public $nodetype = HDOM_TYPE_TEXT;
    // Tag name; 'text' for the default text node.
    public $tag = 'text';
    // Attributes as name => value.
    public $attr = array();
    // Child nodes used for child/sibling navigation.
    public $children = array();
    // Nodes nested in this one; walked when dumping and serialising.
    public $nodes = array();
    // Parent node, or null.
    public $parent = null;
    // Internal bookkeeping, keyed by the HDOM_INFO_* constants.
    public $_ = array();
    // NOTE(review): presumably the position of the tag in the source - it
    // is only read (never written) in the code visible here; confirm.
    public $tag_start = 0;
    // Owning document; null once clear() has run.
    private $dom = null;

    /**
     * Bind the node to its document and register it in $dom->nodes.
     *
     * @param object $dom Owning document.
     */
    function __construct($dom)
    {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }

    function __destruct()
    {
        $this->clear();
    }

    /**
     * @return string Same as outertext().
     */
    function __toString()
    {
        return $this->outertext();
    }

    /**
     * Drop the references to the document, parent and nested nodes.
     *
     * Note that $nodes and $children become null (not empty arrays).
     */
    function clear()
    {
        $this->dom = null;
        $this->nodes = null;
        $this->parent = null;
        $this->children = null;
    }

    /**
     * Echo this node and, recursively, its nested nodes - one per line,
     * indented with one tab per level.
     *
     * @param bool $show_attr Also print the attributes of each node.
     * @param int  $depth     Indentation level of this node.
     */
    function dump($show_attr = true, $depth = 0)
    {
        echo str_repeat("\t", $depth) . $this->tag;

        if ($show_attr && count($this->attr) > 0) {
            echo '(';
            foreach ($this->attr as $k => $v) {
                echo "[$k]=>\"$v\", ";
            }
            echo ')';
        }

        echo "\n";

        if ($this->nodes) {
            foreach ($this->nodes as $node) {
                $node->dump($show_attr, $depth + 1);
            }
        }
    }

    /**
     * Build a one-line debug description of this node (tag, attributes,
     * bookkeeping array, text, inner-text override and counters).
     *
     * @param bool $echo True to echo the description, false to return it.
     *
     * @return string|null The description when $echo is false, else null.
     */
    function dump_node($echo = true)
    {
        $string = $this->tag;

        if (count($this->attr) > 0) {
            $string .= '(';
            foreach ($this->attr as $k => $v) {
                $string .= "[$k]=>\"$v\", ";
            }
            $string .= ')';
        }

        if (count($this->_) > 0) {
            $string .= ' $_ (';
            foreach ($this->_ as $k => $v) {
                if (is_array($v)) {
                    $string .= "[$k]=>(";
                    foreach ($v as $k2 => $v2) {
                        $string .= "[$k2]=>\"$v2\", ";
                    }
                    $string .= ')';
                } else {
                    $string .= "[$k]=>\"$v\", ";
                }
            }
            $string .= ')';
        }

        if (isset($this->text)) {
            $string .= " text: ({$this->text})";
        }

        $string .= ' HDOM_INNER_INFO: ';

        // Inspect this node's own bookkeeping array. The previous code
        // tested an undefined $node variable and therefore always
        // reported NULL.
        if (isset($this->_[HDOM_INFO_INNER])) {
            $string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
        } else {
            $string .= ' NULL ';
        }

        // clear() sets both lists to null; count(null) is an error on
        // PHP 8, so report 0 for a cleared node instead.
        $string .= ' children: ' . (is_array($this->children) ? count($this->children) : 0);
        $string .= ' nodes: ' . (is_array($this->nodes) ? count($this->nodes) : 0);
        $string .= ' tag_start: ' . $this->tag_start;
        $string .= "\n";

        if (!$echo) {
            return $string;
        }

        echo $string;
    }

    /**
     * Return the parent node; when $parent is given, first make it the
     * parent and append this node to its $nodes and $children lists.
     *
     * @param object|null $parent New parent, or null to only read.
     *
     * @return object|null The (possibly new) parent node.
     */
    function parent($parent = null)
    {
        // I am SURE that this doesn't work properly.
        // It fails to remove the current node from its current parent's
        // nodes or children list first.
// Attach this node to the new parent when one is supplied.
        if ($parent !== null) {
            $this->parent = $parent;
            $this->parent->nodes[] = $this;
            $this->parent->children[] = $this;
        }

        return $this->parent;
    }

    /**
     * Tell whether this node has at least one child.
     *
     * @return bool
     */
    function has_child()
    {
        return empty($this->children) ? false : true;
    }

    /**
     * Return all children, or the child at a given position.
     *
     * @param int $idx Position of the wanted child; -1 selects the whole list.
     *
     * @return mixed The list of children for -1, otherwise the child at
     *               $idx or null when there is none.
     */
    function children($idx = -1)
    {
        if ($idx !== -1) {
            return isset($this->children[$idx]) ? $this->children[$idx] : null;
        }

        return $this->children;
    }

    /**
     * @return object|null The first child, or null when there are no children.
     */
    function first_child()
    {
        return count($this->children) > 0 ? $this->children[0] : null;
    }

    /**
     * @return object|null The last child, or null when there are no children.
     */
    function last_child()
    {
        return count($this->children) > 0 ? end($this->children) : null;
    }

    /**
     * Return the child that follows this node in its parent's $children
     * list, or null when there is no parent or no such child.
     */
    function next_sibling()
    {
        if ($this->parent === null) {
            return null;
        }

        $idx = array_search($this, $this->parent->children, true);
// Hand back the child that follows this node, if one exists.
        if ($idx !== false && isset($this->parent->children[$idx + 1])) {
            return $this->parent->children[$idx + 1];
        }

        return null;
    }

    /**
     * Return the child that precedes this node in its parent's $children
     * list, or null when there is no parent or no such child.
     */
    function prev_sibling()
    {
        if ($this->parent === null) {
            return null;
        }

        $idx = array_search($this, $this->parent->children, true);

        if ($idx !== false && $idx > 0) {
            return $this->parent->children[$idx - 1];
        }

        return null;
    }

    /**
     * Walk up the parent chain and return the nearest ancestor whose tag is
     * identical (===) to $tag.
     *
     * @param string $tag Tag name to look for.
     *
     * @return object|null The matching ancestor, or null when none matches.
     */
    function find_ancestor_tag($tag)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        // A missing parent leaves $ancestor null, so the loop is skipped
        // and null is returned.
        for ($ancestor = $this->parent; $ancestor !== null; $ancestor = $ancestor->parent) {
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
            }

            if ($ancestor->tag === $tag) {
                break;
            }
        }

        return $ancestor;
    }

    /**
     * Return the markup inside this node.
     *
     * Precedence: an explicit HDOM_INFO_INNER value, then the node's own
     * HDOM_INFO_TEXT (passed through the document's restore_noise()), then
     * the concatenated outertext() of the nested nodes.
     *
     * @return string
     */
    function innertext()
    {
        if (isset($this->_[HDOM_INFO_INNER])) {
            return $this->_[HDOM_INFO_INNER];
        }

        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '';

        // $nodes is null after clear(); skip the loop then, exactly as
        // text() and outertext() do, instead of iterating over null.
        if ($this->nodes) {
            foreach ($this->nodes as $n) {
                $ret .= $n->outertext();
            }
        }

        return $ret;
    }

    /**
     * Return the markup of this node including its own tag.
     *
     * The root node yields its innertext(). Otherwise the document callback
     * (if any) is invoked with this node, then an explicit HDOM_INFO_OUTER
     * or HDOM_INFO_TEXT value is returned when present; failing that the
     * opening tag, the content and - when HDOM_INFO_END is set and non-zero -
     * the closing tag are assembled.
     *
     * @return string
     */
    function outertext()
    {
        global $debug_object;

        if (is_object($debug_object)) {
            $text = '';

            if ($this->tag === 'text' && !empty($this->text)) {
                $text = ' with text: ' . $this->text;
            }

            $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
        }

        if ($this->tag === 'root') {
            return $this->innertext();
        }

        // todo: What is the use of this callback? Remove?
        if ($this->dom && $this->dom->callback !== null) {
            call_user_func_array($this->dom->callback, array($this));
        }

        if (isset($this->_[HDOM_INFO_OUTER])) {
            return $this->_[HDOM_INFO_OUTER];
        }

        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '';

        // Look the opening-tag node up defensively: a node without
        // HDOM_INFO_BEGIN, or one whose index is no longer present in the
        // document, simply contributes no opening tag (previously this
        // raised undefined-index notices before reaching the same result).
        if ($this->dom && isset($this->_[HDOM_INFO_BEGIN])) {
            $begin = $this->_[HDOM_INFO_BEGIN];

            if (!empty($this->dom->nodes[$begin])) {
                $ret = $this->dom->nodes[$begin]->makeup();
            }
        }

        if (isset($this->_[HDOM_INFO_INNER])) {
            // todo: <br> should either never have HDOM_INFO_INNER or always
            if ($this->tag !== 'br') {
                $ret .= $this->_[HDOM_INFO_INNER];
            }
        } elseif ($this->nodes) {
            foreach ($this->nodes as $n) {
                $ret .= $this->convert_text($n->outertext());
            }
        }

        if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
            $ret .= '</' . $this->tag . '>';
        }

        return $ret;
    }

    /**
     * Return the plain text of this node.
     *
     * An explicit HDOM_INFO_INNER value wins. Text nodes return their
     * restored text; comment and unknown nodes as well as script and style
     * elements return ''. For everything else the text() of the nested
     * nodes is concatenated, with a blank line started before each <p> and
     * the document's default_span_text appended after each <span>.
     *
     * @return string
     */
    function text()
    {
        if (isset($this->_[HDOM_INFO_INNER])) {
            return $this->_[HDOM_INFO_INNER];
        }

        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT: return '';
            case HDOM_TYPE_UNKNOWN: return '';
        }

        if (strcasecmp($this->tag, 'script') === 0) { return ''; }
        if (strcasecmp($this->tag, 'style') === 0) { return ''; }

        $ret = '';

        // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
        // for some span tags, and some p tags) $this->nodes is set to NULL.
        // NOTE: This indicates that there is a problem where it's set to NULL
        // without a clear happening.
        // WHY is this happening?
        if (!is_null($this->nodes)) {
            foreach ($this->nodes as $n) {
                // Start paragraph after a blank line
                if ($n->tag === 'p') {
                    $ret = trim($ret) . "\n\n";
                }

                $ret .= $this->convert_text($n->text());

                // If this node is a span... add a space at the end of it so
                // multiple spans don't run into each other. This is plaintext
                // after all.
                if ($n->tag === 'span') {
                    $ret .= $this->dom->default_span_text;
                }
            }
        }

        return $ret;
    }

    /**
     * Return innertext() with the CDATA markers removed ('<![CDATA[' is
     * matched case-insensitively, ']]>' literally).
     *
     * @return string
     */
    function xmltext()
    {
        $ret = $this->innertext();
        $ret = str_ireplace('<![CDATA[', '', $ret);
        $ret = str_replace(']]>', '', $ret);
        return $ret;
    }

    /**
     * Rebuild the opening tag of this node from its tag name, attributes
     * and the recorded spacing/quoting; text-like nodes return their
     * restored text instead.
     *
     * @return string
     */
    function makeup()
    {
        // text, comment, unknown
        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
        }

        $ret = '<' . $this->tag;
        $i = -1;

        foreach ($this->attr as $key => $val) {
            ++$i;

            // skip removed attribute
            if ($val === null || $val === false) { continue; }

            $ret .= $this->_[HDOM_INFO_SPACE][$i][0];

            //no value attr: nowrap, checked selected...
// A value of true marks a valueless attribute: emit only its name.
            if ($val === true) {
                $ret .= $key;
            } else {
                switch ($this->_[HDOM_INFO_QUOTE][$i])
                {
                    case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                    case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                    default: $quote = '';
                }

                $ret .= $key
                    . $this->_[HDOM_INFO_SPACE][$i][1]
                    . '='
                    . $this->_[HDOM_INFO_SPACE][$i][2]
                    . $quote
                    . $val
                    . $quote;
            }
        }

        $ret = $this->dom->restore_noise($ret);
        return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
    }

    /**
     * Find the nodes below this one that match a selector string.
     *
     * The string is split by parse_selector(); every resulting selector is
     * resolved level by level through seek(), and the matches of all
     * selectors are merged and ordered by their index in the document's
     * node list.
     *
     * @param string   $selector  Selector expression.
     * @param int|null $idx       null returns every match; otherwise the
     *                            position of the single match to return,
     *                            where negative values count from the end.
     * @param bool     $lowercase Passed on to seek().
     *
     * @return mixed Array of nodes when $idx is null, otherwise one node or
     *               null when that position does not exist.
     */
    function find($selector, $idx = null, $lowercase = false)
    {
        $selectors = $this->parse_selector($selector);
        $total = count($selectors);

        if ($total === 0) {
            return array();
        }

        $found_keys = array();

        for ($s = 0; $s < $total; ++$s) {
            // The change on the below line was documented on the sourceforge
            // code tracker id 2788009 (it used to look at $selectors[0] only).
            $depth = count($selectors[$s]);

            if ($depth === 0 || !isset($this->_[HDOM_INFO_BEGIN])) {
                return array();
            }

            // The search starts at this node with the descendant combinator.
            $head = array($this->_[HDOM_INFO_BEGIN] => 1);
            $combinator = ' ';

            // Resolve the selector one level at a time, without recursion:
            // the matches of a level are the starting points of the next.
            for ($level = 0; $level < $depth; ++$level) {
                $matched = array();

                foreach (array_keys($head) as $key) {
                    $node = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                    // PaperG - pass the optional $lowercase flag on to seek().
                    $node->seek($selectors[$s][$level], $matched, $combinator, $lowercase);
                }

                $head = $matched;
                $combinator = $selectors[$s][$level][4]; // combinator for the next level
            }

            // Merge in keys that were not found by an earlier selector.
            $found_keys += array_fill_keys(array_keys($head), 1);
        }

        // Order the matches by their index in the document's node list.
        ksort($found_keys);

        $found = array();
        foreach (array_keys($found_keys) as $key) {
            $found[] = $this->dom->nodes[$key];
        }

        // Return every match, or only the requested one.
        if ($idx === null) {
            return $found;
        }

        if ($idx < 0) {
            $idx = count($found) + $idx;
        }

        return (isset($found[$idx])) ?
$found[$idx] : null; 562c165b184SJames Collins } 563c165b184SJames Collins 564c165b184SJames Collins protected function seek($selector, &$ret, $parent_cmd, $lowercase = false) 565c165b184SJames Collins { 566c165b184SJames Collins global $debug_object; 567c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 568c165b184SJames Collins 569c165b184SJames Collins list($tag, $id, $class, $attributes, $cmb) = $selector; 570c165b184SJames Collins $nodes = array(); 571c165b184SJames Collins 572c165b184SJames Collins if ($parent_cmd === ' ') { // Descendant Combinator 573c165b184SJames Collins // Find parent closing tag if the current element doesn't have a closing 574c165b184SJames Collins // tag (i.e. void element) 575c165b184SJames Collins $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; 576c165b184SJames Collins if ($end == 0) { 577c165b184SJames Collins $parent = $this->parent; 578c165b184SJames Collins while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) { 579c165b184SJames Collins $end -= 1; 580c165b184SJames Collins $parent = $parent->parent; 581c165b184SJames Collins } 582c165b184SJames Collins $end += $parent->_[HDOM_INFO_END]; 583c165b184SJames Collins } 584c165b184SJames Collins 585c165b184SJames Collins // Get list of target nodes 586c165b184SJames Collins $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1; 587c165b184SJames Collins $nodes_count = $end - $nodes_start; 588c165b184SJames Collins $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true); 589c165b184SJames Collins } elseif ($parent_cmd === '>') { // Child Combinator 590c165b184SJames Collins $nodes = $this->children; 591c165b184SJames Collins } elseif ($parent_cmd === '+' 592c165b184SJames Collins && $this->parent 593c165b184SJames Collins && in_array($this, $this->parent->children)) { // Next-Sibling Combinator 594c165b184SJames Collins $index = array_search($this, $this->parent->children, true) + 1; 595c165b184SJames 
Collins if ($index < count($this->parent->children)) 596c165b184SJames Collins $nodes[] = $this->parent->children[$index]; 597c165b184SJames Collins } elseif ($parent_cmd === '~' 598c165b184SJames Collins && $this->parent 599c165b184SJames Collins && in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator 600c165b184SJames Collins $index = array_search($this, $this->parent->children, true); 601c165b184SJames Collins $nodes = array_slice($this->parent->children, $index); 602c165b184SJames Collins } 603c165b184SJames Collins 604c165b184SJames Collins // Go throgh each element starting at this element until the end tag 605c165b184SJames Collins // Note: If this element is a void tag, any previous void element is 606c165b184SJames Collins // skipped. 607c165b184SJames Collins foreach($nodes as $node) { 608c165b184SJames Collins $pass = true; 609c165b184SJames Collins 610c165b184SJames Collins // Skip root nodes 611c165b184SJames Collins if(!$node->parent) { 612c165b184SJames Collins $pass = false; 613c165b184SJames Collins } 614c165b184SJames Collins 615c165b184SJames Collins // Handle 'text' selector 616c165b184SJames Collins if($pass && $tag === 'text' && $node->tag === 'text') { 617c165b184SJames Collins $ret[array_search($node, $this->dom->nodes, true)] = 1; 618c165b184SJames Collins unset($node); 619c165b184SJames Collins continue; 620c165b184SJames Collins } 621c165b184SJames Collins 622c165b184SJames Collins // Skip if node isn't a child node (i.e. 
text nodes) 623c165b184SJames Collins if($pass && !in_array($node, $node->parent->children, true)) { 624c165b184SJames Collins $pass = false; 625c165b184SJames Collins } 626c165b184SJames Collins 627c165b184SJames Collins // Skip if tag doesn't match 628c165b184SJames Collins if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') { 629c165b184SJames Collins $pass = false; 630c165b184SJames Collins } 631c165b184SJames Collins 632c165b184SJames Collins // Skip if ID doesn't exist 633c165b184SJames Collins if ($pass && $id !== '' && !isset($node->attr['id'])) { 634c165b184SJames Collins $pass = false; 635c165b184SJames Collins } 636c165b184SJames Collins 637c165b184SJames Collins // Check if ID matches 638c165b184SJames Collins if ($pass && $id !== '' && isset($node->attr['id'])) { 639c165b184SJames Collins // Note: Only consider the first ID (as browsers do) 640c165b184SJames Collins $node_id = explode(' ', trim($node->attr['id']))[0]; 641c165b184SJames Collins 642c165b184SJames Collins if($id !== $node_id) { $pass = false; } 643c165b184SJames Collins } 644c165b184SJames Collins 645c165b184SJames Collins // Check if all class(es) exist 646c165b184SJames Collins if ($pass && $class !== '' && is_array($class) && !empty($class)) { 647c165b184SJames Collins if (isset($node->attr['class'])) { 648c165b184SJames Collins $node_classes = explode(' ', $node->attr['class']); 649c165b184SJames Collins 650c165b184SJames Collins if ($lowercase) { 651c165b184SJames Collins $node_classes = array_map('strtolower', $node_classes); 652c165b184SJames Collins } 653c165b184SJames Collins 654c165b184SJames Collins foreach($class as $c) { 655c165b184SJames Collins if(!in_array($c, $node_classes)) { 656c165b184SJames Collins $pass = false; 657c165b184SJames Collins break; 658c165b184SJames Collins } 659c165b184SJames Collins } 660c165b184SJames Collins } else { 661c165b184SJames Collins $pass = false; 662c165b184SJames Collins } 663c165b184SJames Collins } 664c165b184SJames Collins 
665c165b184SJames Collins // Check attributes 666c165b184SJames Collins if ($pass 667c165b184SJames Collins && $attributes !== '' 668c165b184SJames Collins && is_array($attributes) 669c165b184SJames Collins && !empty($attributes)) { 670c165b184SJames Collins foreach($attributes as $a) { 671c165b184SJames Collins list ( 672c165b184SJames Collins $att_name, 673c165b184SJames Collins $att_expr, 674c165b184SJames Collins $att_val, 675c165b184SJames Collins $att_inv, 676c165b184SJames Collins $att_case_sensitivity 677c165b184SJames Collins ) = $a; 678c165b184SJames Collins 679c165b184SJames Collins // Handle indexing attributes (i.e. "[2]") 680c165b184SJames Collins /** 681c165b184SJames Collins * Note: This is not supported by the CSS Standard but adds 682c165b184SJames Collins * the ability to select items compatible to XPath (i.e. 683c165b184SJames Collins * the 3rd element within it's parent). 684c165b184SJames Collins * 685c165b184SJames Collins * Note: This doesn't conflict with the CSS Standard which 686c165b184SJames Collins * doesn't work on numeric attributes anyway. 
687c165b184SJames Collins */ 688c165b184SJames Collins if (is_numeric($att_name) 689c165b184SJames Collins && $att_expr === '' 690c165b184SJames Collins && $att_val === '') { 691c165b184SJames Collins $count = 0; 692c165b184SJames Collins 693c165b184SJames Collins // Find index of current element in parent 694c165b184SJames Collins foreach ($node->parent->children as $c) { 695c165b184SJames Collins if ($c->tag === $node->tag) ++$count; 696c165b184SJames Collins if ($c === $node) break; 697c165b184SJames Collins } 698c165b184SJames Collins 699c165b184SJames Collins // If this is the correct node, continue with next 700c165b184SJames Collins // attribute 701c165b184SJames Collins if ($count === (int)$att_name) continue; 702c165b184SJames Collins } 703c165b184SJames Collins 704c165b184SJames Collins // Check attribute availability 705c165b184SJames Collins if ($att_inv) { // Attribute should NOT be set 706c165b184SJames Collins if (isset($node->attr[$att_name])) { 707c165b184SJames Collins $pass = false; 708c165b184SJames Collins break; 709c165b184SJames Collins } 710c165b184SJames Collins } else { // Attribute should be set 711c165b184SJames Collins // todo: "plaintext" is not a valid CSS selector! 712c165b184SJames Collins if ($att_name !== 'plaintext' 713c165b184SJames Collins && !isset($node->attr[$att_name])) { 714c165b184SJames Collins $pass = false; 715c165b184SJames Collins break; 716c165b184SJames Collins } 717c165b184SJames Collins } 718c165b184SJames Collins 719c165b184SJames Collins // Continue with next attribute if expression isn't defined 720c165b184SJames Collins if ($att_expr === '') continue; 721c165b184SJames Collins 722c165b184SJames Collins // If they have told us that this is a "plaintext" 723c165b184SJames Collins // search then we want the plaintext of the node - right? 724c165b184SJames Collins // todo "plaintext" is not a valid CSS selector! 
725c165b184SJames Collins if ($att_name === 'plaintext') { 726c165b184SJames Collins $nodeKeyValue = $node->text(); 727c165b184SJames Collins } else { 728c165b184SJames Collins $nodeKeyValue = $node->attr[$att_name]; 729c165b184SJames Collins } 730c165b184SJames Collins 731c165b184SJames Collins if (is_object($debug_object)) { 732c165b184SJames Collins $debug_object->debug_log(2, 733c165b184SJames Collins 'testing node: ' 734c165b184SJames Collins . $node->tag 735c165b184SJames Collins . ' for attribute: ' 736c165b184SJames Collins . $att_name 737c165b184SJames Collins . $att_expr 738c165b184SJames Collins . $att_val 739c165b184SJames Collins . ' where nodes value is: ' 740c165b184SJames Collins . $nodeKeyValue 741c165b184SJames Collins ); 742c165b184SJames Collins } 743c165b184SJames Collins 744c165b184SJames Collins // If lowercase is set, do a case insensitive test of 745c165b184SJames Collins // the value of the selector. 746c165b184SJames Collins if ($lowercase) { 747c165b184SJames Collins $check = $this->match( 748c165b184SJames Collins $att_expr, 749c165b184SJames Collins strtolower($att_val), 750c165b184SJames Collins strtolower($nodeKeyValue), 751c165b184SJames Collins $att_case_sensitivity 752c165b184SJames Collins ); 753c165b184SJames Collins } else { 754c165b184SJames Collins $check = $this->match( 755c165b184SJames Collins $att_expr, 756c165b184SJames Collins $att_val, 757c165b184SJames Collins $nodeKeyValue, 758c165b184SJames Collins $att_case_sensitivity 759c165b184SJames Collins ); 760c165b184SJames Collins } 761c165b184SJames Collins 762c165b184SJames Collins if (is_object($debug_object)) { 763c165b184SJames Collins $debug_object->debug_log(2, 764c165b184SJames Collins 'after match: ' 765c165b184SJames Collins . ($check ? 
'true' : 'false') 766c165b184SJames Collins ); 767c165b184SJames Collins } 768c165b184SJames Collins 769c165b184SJames Collins if (!$check) { 770c165b184SJames Collins $pass = false; 771c165b184SJames Collins break; 772c165b184SJames Collins } 773c165b184SJames Collins } 774c165b184SJames Collins } 775c165b184SJames Collins 776c165b184SJames Collins // Found a match. Add to list and clear node 777c165b184SJames Collins if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1; 778c165b184SJames Collins unset($node); 779c165b184SJames Collins } 780c165b184SJames Collins // It's passed by reference so this is actually what this function returns. 781c165b184SJames Collins if (is_object($debug_object)) { 782c165b184SJames Collins $debug_object->debug_log(1, 'EXIT - ret: ', $ret); 783c165b184SJames Collins } 784c165b184SJames Collins } 785c165b184SJames Collins 786c165b184SJames Collins protected function match($exp, $pattern, $value, $case_sensitivity) 787c165b184SJames Collins { 788c165b184SJames Collins global $debug_object; 789c165b184SJames Collins if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 790c165b184SJames Collins 791c165b184SJames Collins if ($case_sensitivity === 'i') { 792c165b184SJames Collins $pattern = strtolower($pattern); 793c165b184SJames Collins $value = strtolower($value); 794c165b184SJames Collins } 795c165b184SJames Collins 796c165b184SJames Collins switch ($exp) { 797c165b184SJames Collins case '=': 798c165b184SJames Collins return ($value === $pattern); 799c165b184SJames Collins case '!=': 800c165b184SJames Collins return ($value !== $pattern); 801c165b184SJames Collins case '^=': 802c165b184SJames Collins return preg_match('/^' . preg_quote($pattern, '/') . '/', $value); 803c165b184SJames Collins case '$=': 804c165b184SJames Collins return preg_match('/' . preg_quote($pattern, '/') . '$/', $value); 805c165b184SJames Collins case '*=': 806c165b184SJames Collins return preg_match('/' . preg_quote($pattern, '/') . 
'/', $value); 807c165b184SJames Collins case '|=': 808c165b184SJames Collins /** 809c165b184SJames Collins * [att|=val] 810c165b184SJames Collins * 811c165b184SJames Collins * Represents an element with the att attribute, its value 812c165b184SJames Collins * either being exactly "val" or beginning with "val" 813c165b184SJames Collins * immediately followed by "-" (U+002D). 814c165b184SJames Collins */ 815c165b184SJames Collins return strpos($value, $pattern) === 0; 816c165b184SJames Collins case '~=': 817c165b184SJames Collins /** 818c165b184SJames Collins * [att~=val] 819c165b184SJames Collins * 820c165b184SJames Collins * Represents an element with the att attribute whose value is a 821c165b184SJames Collins * whitespace-separated list of words, one of which is exactly 822c165b184SJames Collins * "val". If "val" contains whitespace, it will never represent 823c165b184SJames Collins * anything (since the words are separated by spaces). Also if 824c165b184SJames Collins * "val" is the empty string, it will never represent anything. 825c165b184SJames Collins */ 826c165b184SJames Collins return in_array($pattern, explode(' ', trim($value)), true); 827c165b184SJames Collins } 828c165b184SJames Collins return false; 829c165b184SJames Collins } 830c165b184SJames Collins 831c165b184SJames Collins protected function parse_selector($selector_string) 832c165b184SJames Collins { 833c165b184SJames Collins global $debug_object; 834c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 835c165b184SJames Collins 836c165b184SJames Collins /** 837c165b184SJames Collins * Pattern of CSS selectors, modified from mootools (https://mootools.net/) 838c165b184SJames Collins * 839c165b184SJames Collins * Paperg: Add the colon to the attribute, so that it properly finds 840c165b184SJames Collins * <tag attr:ibute="something" > like google does. 
841c165b184SJames Collins * 842c165b184SJames Collins * Note: if you try to look at this attribute, you MUST use getAttribute 843c165b184SJames Collins * since $dom->x:y will fail the php syntax check. 844c165b184SJames Collins * 845c165b184SJames Collins * Notice the \[ starting the attribute? and the @? following? This 846c165b184SJames Collins * implies that an attribute can begin with an @ sign that is not 847c165b184SJames Collins * captured. This implies that an html attribute specifier may start 848c165b184SJames Collins * with an @ sign that is NOT captured by the expression. Farther study 849c165b184SJames Collins * is required to determine of this should be documented or removed. 850c165b184SJames Collins * 851c165b184SJames Collins * Matches selectors in this order: 852c165b184SJames Collins * 853c165b184SJames Collins * [0] - full match 854c165b184SJames Collins * 855c165b184SJames Collins * [1] - tag name 856c165b184SJames Collins * ([\w:\*-]*) 857c165b184SJames Collins * Matches the tag name consisting of zero or more words, colons, 858c165b184SJames Collins * asterisks and hyphens. 859c165b184SJames Collins * 860c165b184SJames Collins * [2] - id name 861c165b184SJames Collins * (?:\#([\w-]+)) 862c165b184SJames Collins * Optionally matches a id name, consisting of an "#" followed by 863c165b184SJames Collins * the id name (one or more words and hyphens). 864c165b184SJames Collins * 865c165b184SJames Collins * [3] - class names (including dots) 866c165b184SJames Collins * (?:\.([\w\.-]+))? 867c165b184SJames Collins * Optionally matches a list of classs, consisting of an "." 868c165b184SJames Collins * followed by the class name (one or more words and hyphens) 869c165b184SJames Collins * where multiple classes can be chained (i.e. ".foo.bar.baz") 870c165b184SJames Collins * 871c165b184SJames Collins * [4] - attributes 872c165b184SJames Collins * ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)? 
873c165b184SJames Collins * Optionally matches the attributes list 874c165b184SJames Collins * 875c165b184SJames Collins * [5] - separator 876c165b184SJames Collins * ([\/, >+~]+) 877c165b184SJames Collins * Matches the selector list separator 878c165b184SJames Collins */ 879c165b184SJames Collins // phpcs:ignore Generic.Files.LineLength 880c165b184SJames Collins $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is"; 881c165b184SJames Collins 882c165b184SJames Collins preg_match_all( 883c165b184SJames Collins $pattern, 884c165b184SJames Collins trim($selector_string) . ' ', // Add final ' ' as pseudo separator 885c165b184SJames Collins $matches, 886c165b184SJames Collins PREG_SET_ORDER 887c165b184SJames Collins ); 888c165b184SJames Collins 889c165b184SJames Collins if (is_object($debug_object)) { 890c165b184SJames Collins $debug_object->debug_log(2, 'Matches Array: ', $matches); 891c165b184SJames Collins } 892c165b184SJames Collins 893c165b184SJames Collins $selectors = array(); 894c165b184SJames Collins $result = array(); 895c165b184SJames Collins 896c165b184SJames Collins foreach ($matches as $m) { 897c165b184SJames Collins $m[0] = trim($m[0]); 898c165b184SJames Collins 899c165b184SJames Collins // Skip NoOps 900c165b184SJames Collins if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; } 901c165b184SJames Collins 902c165b184SJames Collins // Convert to lowercase 903c165b184SJames Collins if ($this->dom->lowercase) { 904c165b184SJames Collins $m[1] = strtolower($m[1]); 905c165b184SJames Collins } 906c165b184SJames Collins 907c165b184SJames Collins // Extract classes 908c165b184SJames Collins if ($m[3] !== '') { $m[3] = explode('.', $m[3]); } 909c165b184SJames Collins 910c165b184SJames Collins /* Extract attributes (pattern based on the pattern above!) 
911c165b184SJames Collins 912c165b184SJames Collins * [0] - full match 913c165b184SJames Collins * [1] - attribute name 914c165b184SJames Collins * [2] - attribute expression 915c165b184SJames Collins * [3] - attribute value 916c165b184SJames Collins * [4] - case sensitivity 917c165b184SJames Collins * 918c165b184SJames Collins * Note: Attributes can be negated with a "!" prefix to their name 919c165b184SJames Collins */ 920c165b184SJames Collins if($m[4] !== '') { 921c165b184SJames Collins preg_match_all( 922c165b184SJames Collins "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is", 923c165b184SJames Collins trim($m[4]), 924c165b184SJames Collins $attributes, 925c165b184SJames Collins PREG_SET_ORDER 926c165b184SJames Collins ); 927c165b184SJames Collins 928c165b184SJames Collins // Replace element by array 929c165b184SJames Collins $m[4] = array(); 930c165b184SJames Collins 931c165b184SJames Collins foreach($attributes as $att) { 932c165b184SJames Collins // Skip empty matches 933c165b184SJames Collins if(trim($att[0]) === '') { continue; } 934c165b184SJames Collins 935c165b184SJames Collins $inverted = (isset($att[1][0]) && $att[1][0] === '!'); 936c165b184SJames Collins $m[4][] = array( 937c165b184SJames Collins $inverted ? substr($att[1], 1) : $att[1], // Name 938c165b184SJames Collins (isset($att[2])) ? $att[2] : '', // Expression 939c165b184SJames Collins (isset($att[3])) ? $att[3] : '', // Value 940c165b184SJames Collins $inverted, // Inverted Flag 941c165b184SJames Collins (isset($att[4])) ? 
strtolower($att[4]) : '', // Case-Sensitivity 942c165b184SJames Collins ); 943c165b184SJames Collins } 944c165b184SJames Collins } 945c165b184SJames Collins 946c165b184SJames Collins // Sanitize Separator 947c165b184SJames Collins if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator 948c165b184SJames Collins $m[5] = ' '; 949c165b184SJames Collins } else { // Other Separator 950c165b184SJames Collins $m[5] = trim($m[5]); 951c165b184SJames Collins } 952c165b184SJames Collins 953c165b184SJames Collins // Clear Separator if it's a Selector List 954c165b184SJames Collins if ($is_list = ($m[5] === ',')) { $m[5] = ''; } 955c165b184SJames Collins 956c165b184SJames Collins // Remove full match before adding to results 957c165b184SJames Collins array_shift($m); 958c165b184SJames Collins $result[] = $m; 959c165b184SJames Collins 960c165b184SJames Collins if ($is_list) { // Selector List 961c165b184SJames Collins $selectors[] = $result; 962c165b184SJames Collins $result = array(); 963c165b184SJames Collins } 964c165b184SJames Collins } 965c165b184SJames Collins 966c165b184SJames Collins if (count($result) > 0) { $selectors[] = $result; } 967c165b184SJames Collins return $selectors; 968c165b184SJames Collins } 969c165b184SJames Collins 970c165b184SJames Collins function __get($name) 971c165b184SJames Collins { 972c165b184SJames Collins if (isset($this->attr[$name])) { 973c165b184SJames Collins return $this->convert_text($this->attr[$name]); 974c165b184SJames Collins } 975c165b184SJames Collins switch ($name) { 976c165b184SJames Collins case 'outertext': return $this->outertext(); 977c165b184SJames Collins case 'innertext': return $this->innertext(); 978c165b184SJames Collins case 'plaintext': return $this->text(); 979c165b184SJames Collins case 'xmltext': return $this->xmltext(); 980c165b184SJames Collins default: return array_key_exists($name, $this->attr); 981c165b184SJames Collins } 982c165b184SJames Collins } 983c165b184SJames Collins 984c165b184SJames Collins 
/**
 * Magic setter.
 *
 * 'outertext' and 'innertext' overwrite the cached markup stored in the
 * node info array; any other name is treated as an HTML attribute.
 *
 * @param string $name  Property or attribute name
 * @param mixed  $value New value
 * @return mixed The assigned value for 'outertext'/'innertext', otherwise
 * nothing
 */
function __set($name, $value)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	switch ($name) {
		case 'outertext':
			return $this->_[HDOM_INFO_OUTER] = $value;
		case 'innertext':
			if (isset($this->_[HDOM_INFO_TEXT])) {
				return $this->_[HDOM_INFO_TEXT] = $value;
			}
			return $this->_[HDOM_INFO_INNER] = $value;
	}

	// Register spacing/quoting info only for attributes that are really
	// new. array_key_exists() (instead of isset()) also recognizes
	// attributes currently holding null, e.g. after removeAttribute(), so
	// re-setting them doesn't append duplicate bookkeeping entries.
	if (!array_key_exists($name, $this->attr)) {
		$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
		$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
	}

	$this->attr[$name] = $value;
}

/**
 * Magic isset.
 *
 * The virtual text properties always exist; everything else is looked up
 * in the attribute list.
 *
 * @param string $name Property or attribute name
 * @return bool
 */
function __isset($name)
{
	switch ($name) {
		case 'outertext':
		case 'innertext':
		case 'plaintext':
			return true;
	}

	// array_key_exists() rather than isset() so that attributes without a
	// value (nowrap, checked, selected, ...) are reported as present.
	return array_key_exists($name, $this->attr);
}

/**
 * Magic unset: drops the attribute if it is set to a non-null value.
 *
 * @param string $name Attribute name
 * @return void
 */
function __unset($name)
{
	if (isset($this->attr[$name])) { unset($this->attr[$name]); }
}

/**
 * Convert text from the document charset to the target charset.
 *
 * Both charsets are read from the owning DOM. No conversion happens when
 * either charset is unknown or both are equal. A leading or trailing UTF-8
 * BOM is stripped when the target charset is UTF-8.
 *
 * @param string $text Text to convert
 * @return string|false Converted text; the return value of iconv() is passed
 * through unchanged, so a failed conversion yields false
 */
function convert_text($text)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	$converted_text = $text;

	$sourceCharset = '';
	$targetCharset = '';

	if ($this->dom) {
		$sourceCharset = strtoupper($this->dom->_charset);
		$targetCharset = strtoupper($this->dom->_target_charset);
	}

	if (is_object($debug_object)) {
		$debug_object->debug_log(3,
			'source charset: '
			. $sourceCharset
			. ' target charaset: '
			. $targetCharset
		);
	}

	if (!empty($sourceCharset)
	&& !empty($targetCharset)
	&& (strcasecmp($sourceCharset, $targetCharset) != 0)) {
		// The reported encoding could be wrong: if the target is UTF-8 and
		// the text already is valid UTF-8, leave it untouched.
		$alreadyUtf8 = (strcasecmp($targetCharset, 'UTF-8') == 0)
			&& $this->is_utf8($text);

		if (!$alreadyUtf8) {
			$converted_text = iconv($sourceCharset, $targetCharset, $text);
		}
	}

	// Make sure no BOM leaks into any of the UTF-8 text we output.
	if ($targetCharset === 'UTF-8') {
		if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
			$converted_text = substr($converted_text, 3);
		}

		if (substr($converted_text, -3) === "\xef\xbb\xbf") {
			$converted_text = substr($converted_text, 0, -3);
		}
	}

	return $converted_text;
}

/**
 * Check whether a byte string is structurally valid UTF-8.
 *
 * Each lead byte must be followed by the right number of continuation
 * bytes (0x80-0xBF). The check is lenient: it still accepts the obsolete
 * 5 and 6 byte forms and does not reject overlong encodings.
 *
 * @param string $str Byte string to test
 * @return bool
 */
static function is_utf8($str)
{
	$len = strlen($str);

	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);

		// Plain ASCII. Note that 0x80 (128) is NOT ASCII: it is a
		// continuation byte and invalid as the start of a character.
		if ($c < 128) { continue; }

		// Determine the sequence length from the lead byte
		if ($c >= 254) { return false; }
		elseif ($c >= 252) { $bits = 6; }
		elseif ($c >= 248) { $bits = 5; }
		elseif ($c >= 240) { $bits = 4; }
		elseif ($c >= 224) { $bits = 3; }
		elseif ($c >= 192) { $bits = 2; }
		else { return false; } // Continuation byte without a lead byte

		// Sequence must not run past the end of the string
		if (($i + $bits) > $len) { return false; }

		while ($bits > 1) {
			$i++;
			$b = ord($str[$i]);
			if ($b < 128 || $b > 191) { return false; }
			$bits--;
		}
	}

	return true;
}

/**
 * Determine the display size of an image element.
 *
 * The 'width' and 'height' attributes take precedence. If one of them is
 * missing, the inline style is consulted; only integer pixel values (i.e.
 * "120px") are accepted from there.
 *
 * Possible future enhancements: resolve sizes from classes/ids of the
 * element, from img rules of classes/ids on parent elements, or from
 * external CSS files.
 *
 * @return array|false array('height' => ..., 'width' => ...) where -1 means
 * "unknown", or false if this element is not an img
 */
function get_display_size()
{
	if ($this->tag !== 'img') {
		return false;
	}

	$result = array(
		'height' => -1,
		'width' => -1
	);

	// See if there is a height or width attribute in the tag itself.
	foreach (array('width', 'height') as $dimension) {
		if (isset($this->attr[$dimension])) {
			$result[$dimension] = $this->attr[$dimension];
		}
	}

	// Now look for an inline style.
	if (isset($this->attr['style'])) {
		// Thanks to user gnarf from stackoverflow for this regular expression.
		$attributes = array();

		preg_match_all(
			'/([\w-]+)\s*:\s*([^;]+)\s*;?/',
			$this->attr['style'],
			$matches,
			PREG_SET_ORDER
		);

		foreach ($matches as $match) {
			// The value group is greedy and may include trailing blanks
			$attributes[$match[1]] = trim($match[2]);
		}

		foreach (array('width', 'height') as $dimension) {
			// Only fall back to the style if the attribute was missing
			if (!isset($attributes[$dimension]) || $result[$dimension] != -1) {
				continue;
			}

			// Check that the last two characters are px (pixels)
			if (strtolower(substr($attributes[$dimension], -2)) !== 'px') {
				continue;
			}

			$proposed = substr($attributes[$dimension], 0, -2);

			// Make sure that it's an integer. filter_var() returns the
			// integer itself, so compare against false to accept "0".
			if (filter_var($proposed, FILTER_VALIDATE_INT) !== false) {
				$result[$dimension] = $proposed;
			}
		}
	}

	return $result;
}

/**
 * Return the outer text of this node and optionally write it to a file.
 *
 * @param string $filepath Target file; nothing is written if empty
 * @return string The outer text
 */
function save($filepath = '')
{
	$ret = $this->outertext();

	if ($filepath !== '') {
		file_put_contents($filepath, $ret, LOCK_EX);
	}

	return $ret;
}

/**
 * Add one or more classes to the class attribute.
 *
 * Classes that are already present are not added again.
 *
 * @param string|array $class Space separated class names or a list of
 * class names
 * @return void
 */
function addClass($class)
{
	global $debug_object;

	if (is_string($class)) {
		$class = explode(' ', $class);
	}

	if (!is_array($class)) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
		}
		return;
	}

	foreach($class as $c) {
		// Consecutive spaces in the input produce empty tokens
		if ($c === '') { continue; }

		if (!isset($this->class)) {
			$this->class = $c;
		} elseif (!$this->hasClass($c)) {
			$this->class .= ' ' . $c;
		}
	}
}

/**
 * Check whether the class attribute contains the given class.
 *
 * @param string $class Class name
 * @return bool
 */
function hasClass($class)
{
	global $debug_object;

	if (!is_string($class)) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
		}
		return false;
	}

	if (isset($this->class)) {
		return in_array($class, explode(' ', $this->class), true);
	}

	return false;
}

/**
 * Remove classes from the class attribute.
 *
 * The class attribute itself is removed if no class is given or if no
 * class remains afterwards.
 *
 * @param string|array|null $class Space separated class names, a list of
 * class names or null to remove all classes
 * @return void
 */
function removeClass($class = null)
{
	if (!isset($this->class)) {
		return;
	}

	if (is_null($class)) {
		$this->removeAttribute('class');
		return;
	}

	if (is_string($class)) {
		$class = explode(' ', $class);
	}

	if (is_array($class)) {
		$class = array_diff(explode(' ', $this->class), $class);
		if (empty($class)) {
			$this->removeAttribute('class');
		} else {
			$this->class = implode(' ', $class);
		}
	}
}

/**
 * @return array All attributes of this node (name => value)
 */
function getAllAttributes()
{
	return $this->attr;
}

/**
 * Alias of __get(). Required for names that are not valid PHP property
 * names (i.e. "x:y").
 *
 * @param string $name Attribute name
 * @return mixed
 */
function getAttribute($name)
{
	return $this->__get($name);
}

/**
 * Alias of __set().
 *
 * @param string $name  Attribute name
 * @param mixed  $value Attribute value
 * @return void
 */
function setAttribute($name, $value)
{
	$this->__set($name, $value);
}

/**
 * Alias of __isset().
 *
 * @param string $name Attribute name
 * @return bool
 */
function hasAttribute($name)
{
	return $this->__isset($name);
}

/**
 * Remove an attribute by setting its value to null.
 *
 * Note: the attribute key itself stays in the attribute list.
 *
 * @param string $name Attribute name
 * @return void
 */
function removeAttribute($name)
{
	$this->__set($name, null);
}

/**
 * Remove this node from its parent (if it has one).
 *
 * @return void
 */
function remove()
{
	if ($this->parent) {
		$this->parent->removeChild($this);
	}
}

/**
 * Remove a child node, including everything below it.
 *
 * Nothing happens unless $node is registered in this node's node list,
 * this node's children list and the node list of the DOM.
 *
 * @param object $node Child node to remove
 * @return void
 */
function removeChild($node)
{
	$nidx = array_search($node, $this->nodes, true);
	$cidx = array_search($node, $this->children, true);
	$didx = array_search($node, $this->dom->nodes, true);

	if ($nidx === false || $cidx === false || $didx === false) {
		return;
	}

	// Remove child elements recursively
	foreach($node->children as $child) {
		$node->removeChild($child);
	}

	// Remove remaining entities of the node from the node and the DOM
	foreach($node->nodes as $entity) {
		$enidx = array_search($entity, $node->nodes, true);
		$edidx = array_search($entity, $node->dom->nodes, true);

		if ($enidx !== false && $edidx !== false) {
			unset($node->nodes[$enidx]);
			unset($node->dom->nodes[$edidx]);
		}
	}

	unset($this->nodes[$nidx]);
	unset($this->children[$cidx]);
	unset($this->dom->nodes[$didx]);

	$node->clear();
}

/**
 * @param string $id Element id
 * @return mixed First result of find() for the selector "#$id"
 */
function getElementById($id)
{
	return $this->find("#$id", 0);
}

/**
 * @param string   $id  Element id
 * @param int|null $idx Index of the result to return, null for all
 * @return mixed Result of find() for the selector "#$id"
 */
function getElementsById($id, $idx = null)
{
	return $this->find("#$id", $idx);
}

/**
 * @param string $name Tag name
 * @return mixed First result of find() for the given tag name
 */
function getElementByTagName($name)
{
	return $this->find($name, 0);
}

/**
 * @param string   $name Tag name
 * @param int|null $idx  Index of the result to return, null for all
 * @return mixed Result of find() for the given tag name
 */
function getElementsByTagName($name, $idx = null)
{
	return $this->find($name, $idx);
}

/**
 * Alias of parent().
 *
 * @return mixed
 */
function parentNode()
{
	return $this->parent();
}
1347c165b184SJames Collins 1348c165b184SJames Collins function childNodes($idx = -1) 1349c165b184SJames Collins { 1350c165b184SJames Collins return $this->children($idx); 1351c165b184SJames Collins } 1352c165b184SJames Collins 1353c165b184SJames Collins function firstChild() 1354c165b184SJames Collins { 1355c165b184SJames Collins return $this->first_child(); 1356c165b184SJames Collins } 1357c165b184SJames Collins 1358c165b184SJames Collins function lastChild() 1359c165b184SJames Collins { 1360c165b184SJames Collins return $this->last_child(); 1361c165b184SJames Collins } 1362c165b184SJames Collins 1363c165b184SJames Collins function nextSibling() 1364c165b184SJames Collins { 1365c165b184SJames Collins return $this->next_sibling(); 1366c165b184SJames Collins } 1367c165b184SJames Collins 1368c165b184SJames Collins function previousSibling() 1369c165b184SJames Collins { 1370c165b184SJames Collins return $this->prev_sibling(); 1371c165b184SJames Collins } 1372c165b184SJames Collins 1373c165b184SJames Collins function hasChildNodes() 1374c165b184SJames Collins { 1375c165b184SJames Collins return $this->has_child(); 1376c165b184SJames Collins } 1377c165b184SJames Collins 1378c165b184SJames Collins function nodeName() 1379c165b184SJames Collins { 1380c165b184SJames Collins return $this->tag; 1381c165b184SJames Collins } 1382c165b184SJames Collins 1383c165b184SJames Collins function appendChild($node) 1384c165b184SJames Collins { 1385c165b184SJames Collins $node->parent($this); 1386c165b184SJames Collins return $node; 1387c165b184SJames Collins } 1388c165b184SJames Collins 1389c165b184SJames Collins} 1390c165b184SJames Collins 1391c165b184SJames Collinsclass simple_html_dom 1392c165b184SJames Collins{ 1393c165b184SJames Collins public $root = null; 1394c165b184SJames Collins public $nodes = array(); 1395c165b184SJames Collins public $callback = null; 1396c165b184SJames Collins public $lowercase = false; 1397c165b184SJames Collins public $original_size; 1398c165b184SJames 
Collins public $size; 1399c165b184SJames Collins 1400c165b184SJames Collins protected $pos; 1401c165b184SJames Collins protected $doc; 1402c165b184SJames Collins protected $char; 1403c165b184SJames Collins 1404c165b184SJames Collins protected $cursor; 1405c165b184SJames Collins protected $parent; 1406c165b184SJames Collins protected $noise = array(); 1407c165b184SJames Collins protected $token_blank = " \t\r\n"; 1408c165b184SJames Collins protected $token_equal = ' =/>'; 1409c165b184SJames Collins protected $token_slash = " />\r\n\t"; 1410c165b184SJames Collins protected $token_attr = ' >'; 1411c165b184SJames Collins 1412c165b184SJames Collins public $_charset = ''; 1413c165b184SJames Collins public $_target_charset = ''; 1414c165b184SJames Collins 1415c165b184SJames Collins protected $default_br_text = ''; 1416c165b184SJames Collins 1417c165b184SJames Collins public $default_span_text = ''; 1418c165b184SJames Collins 1419c165b184SJames Collins protected $self_closing_tags = array( 1420c165b184SJames Collins 'area' => 1, 1421c165b184SJames Collins 'base' => 1, 1422c165b184SJames Collins 'br' => 1, 1423c165b184SJames Collins 'col' => 1, 1424c165b184SJames Collins 'embed' => 1, 1425c165b184SJames Collins 'hr' => 1, 1426c165b184SJames Collins 'img' => 1, 1427c165b184SJames Collins 'input' => 1, 1428c165b184SJames Collins 'link' => 1, 1429c165b184SJames Collins 'meta' => 1, 1430c165b184SJames Collins 'param' => 1, 1431c165b184SJames Collins 'source' => 1, 1432c165b184SJames Collins 'track' => 1, 1433c165b184SJames Collins 'wbr' => 1 1434c165b184SJames Collins ); 1435c165b184SJames Collins protected $block_tags = array( 1436c165b184SJames Collins 'body' => 1, 1437c165b184SJames Collins 'div' => 1, 1438c165b184SJames Collins 'form' => 1, 1439c165b184SJames Collins 'root' => 1, 1440c165b184SJames Collins 'span' => 1, 1441c165b184SJames Collins 'table' => 1 1442c165b184SJames Collins ); 1443c165b184SJames Collins protected $optional_closing_tags = array( 1444c165b184SJames 
Collins // Not optional, see 1445c165b184SJames Collins // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element 1446c165b184SJames Collins 'b' => array('b' => 1), 1447c165b184SJames Collins 'dd' => array('dd' => 1, 'dt' => 1), 1448c165b184SJames Collins // Not optional, see 1449c165b184SJames Collins // https://www.w3.org/TR/html/grouping-content.html#the-dl-element 1450c165b184SJames Collins 'dl' => array('dd' => 1, 'dt' => 1), 1451c165b184SJames Collins 'dt' => array('dd' => 1, 'dt' => 1), 1452c165b184SJames Collins 'li' => array('li' => 1), 1453c165b184SJames Collins 'optgroup' => array('optgroup' => 1, 'option' => 1), 1454c165b184SJames Collins 'option' => array('optgroup' => 1, 'option' => 1), 1455c165b184SJames Collins 'p' => array('p' => 1), 1456c165b184SJames Collins 'rp' => array('rp' => 1, 'rt' => 1), 1457c165b184SJames Collins 'rt' => array('rp' => 1, 'rt' => 1), 1458c165b184SJames Collins 'td' => array('td' => 1, 'th' => 1), 1459c165b184SJames Collins 'th' => array('td' => 1, 'th' => 1), 1460c165b184SJames Collins 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1), 1461c165b184SJames Collins ); 1462c165b184SJames Collins 1463c165b184SJames Collins function __construct( 1464c165b184SJames Collins $str = null, 1465c165b184SJames Collins $lowercase = true, 1466c165b184SJames Collins $forceTagsClosed = true, 1467c165b184SJames Collins $target_charset = DEFAULT_TARGET_CHARSET, 1468c165b184SJames Collins $stripRN = true, 1469c165b184SJames Collins $defaultBRText = DEFAULT_BR_TEXT, 1470c165b184SJames Collins $defaultSpanText = DEFAULT_SPAN_TEXT, 1471c165b184SJames Collins $options = 0) 1472c165b184SJames Collins { 1473c165b184SJames Collins if ($str) { 1474c165b184SJames Collins if (preg_match('/^http:\/\//i', $str) || is_file($str)) { 1475c165b184SJames Collins $this->load_file($str); 1476c165b184SJames Collins } else { 1477c165b184SJames Collins $this->load( 1478c165b184SJames Collins $str, 1479c165b184SJames Collins $lowercase, 1480c165b184SJames 
Collins $stripRN, 1481c165b184SJames Collins $defaultBRText, 1482c165b184SJames Collins $defaultSpanText, 1483c165b184SJames Collins $options 1484c165b184SJames Collins ); 1485c165b184SJames Collins } 1486c165b184SJames Collins } 1487c165b184SJames Collins // Forcing tags to be closed implies that we don't trust the html, but 1488c165b184SJames Collins // it can lead to parsing errors if we SHOULD trust the html. 1489c165b184SJames Collins if (!$forceTagsClosed) { 1490c165b184SJames Collins $this->optional_closing_array = array(); 1491c165b184SJames Collins } 1492c165b184SJames Collins 1493c165b184SJames Collins $this->_target_charset = $target_charset; 1494c165b184SJames Collins } 1495c165b184SJames Collins 1496c165b184SJames Collins function __destruct() 1497c165b184SJames Collins { 1498c165b184SJames Collins $this->clear(); 1499c165b184SJames Collins } 1500c165b184SJames Collins 1501c165b184SJames Collins function load( 1502c165b184SJames Collins $str, 1503c165b184SJames Collins $lowercase = true, 1504c165b184SJames Collins $stripRN = true, 1505c165b184SJames Collins $defaultBRText = DEFAULT_BR_TEXT, 1506c165b184SJames Collins $defaultSpanText = DEFAULT_SPAN_TEXT, 1507c165b184SJames Collins $options = 0) 1508c165b184SJames Collins { 1509c165b184SJames Collins global $debug_object; 1510c165b184SJames Collins 1511c165b184SJames Collins // prepare 1512c165b184SJames Collins $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText); 1513c165b184SJames Collins 1514c165b184SJames Collins // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1515c165b184SJames Collins // Script tags removal now preceeds style tag removal. 
1516c165b184SJames Collins // strip out <script> tags 1517c165b184SJames Collins $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"); 1518c165b184SJames Collins $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"); 1519c165b184SJames Collins 1520c165b184SJames Collins // strip out the \r \n's if we are told to. 1521c165b184SJames Collins if ($stripRN) { 1522c165b184SJames Collins $this->doc = str_replace("\r", ' ', $this->doc); 1523c165b184SJames Collins $this->doc = str_replace("\n", ' ', $this->doc); 1524c165b184SJames Collins 1525c165b184SJames Collins // set the length of content since we have changed it. 1526c165b184SJames Collins $this->size = strlen($this->doc); 1527c165b184SJames Collins } 1528c165b184SJames Collins 1529c165b184SJames Collins // strip out cdata 1530c165b184SJames Collins $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1531c165b184SJames Collins // strip out comments 1532c165b184SJames Collins $this->remove_noise("'<!--(.*?)-->'is"); 1533c165b184SJames Collins // strip out <style> tags 1534c165b184SJames Collins $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); 1535c165b184SJames Collins $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); 1536c165b184SJames Collins // strip out preformatted tags 1537c165b184SJames Collins $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); 1538c165b184SJames Collins // strip out server side scripts 1539c165b184SJames Collins $this->remove_noise("'(<\?)(.*?)(\?>)'s", true); 1540c165b184SJames Collins 1541c165b184SJames Collins if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts 1542c165b184SJames Collins $this->remove_noise("'(\{\w)(.*?)(\})'s", true); 1543c165b184SJames Collins } 1544c165b184SJames Collins 1545c165b184SJames Collins // parsing 1546c165b184SJames Collins $this->parse(); 1547c165b184SJames Collins // end 1548c165b184SJames Collins $this->root->_[HDOM_INFO_END] = $this->cursor; 1549c165b184SJames 
Collins $this->parse_charset(); 1550c165b184SJames Collins 1551c165b184SJames Collins // make load function chainable 1552c165b184SJames Collins return $this; 1553c165b184SJames Collins } 1554c165b184SJames Collins 1555c165b184SJames Collins function load_file() 1556c165b184SJames Collins { 1557c165b184SJames Collins $args = func_get_args(); 1558c165b184SJames Collins 1559c165b184SJames Collins if(($doc = call_user_func_array('file_get_contents', $args)) !== false) { 1560c165b184SJames Collins $this->load($doc, true); 1561c165b184SJames Collins } else { 1562c165b184SJames Collins return false; 1563c165b184SJames Collins } 1564c165b184SJames Collins } 1565c165b184SJames Collins 1566c165b184SJames Collins function set_callback($function_name) 1567c165b184SJames Collins { 1568c165b184SJames Collins $this->callback = $function_name; 1569c165b184SJames Collins } 1570c165b184SJames Collins 1571c165b184SJames Collins function remove_callback() 1572c165b184SJames Collins { 1573c165b184SJames Collins $this->callback = null; 1574c165b184SJames Collins } 1575c165b184SJames Collins 1576c165b184SJames Collins function save($filepath = '') 1577c165b184SJames Collins { 1578c165b184SJames Collins $ret = $this->root->innertext(); 1579c165b184SJames Collins if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); } 1580c165b184SJames Collins return $ret; 1581c165b184SJames Collins } 1582c165b184SJames Collins 1583c165b184SJames Collins function find($selector, $idx = null, $lowercase = false) 1584c165b184SJames Collins { 1585c165b184SJames Collins return $this->root->find($selector, $idx, $lowercase); 1586c165b184SJames Collins } 1587c165b184SJames Collins 1588c165b184SJames Collins function clear() 1589c165b184SJames Collins { 1590c165b184SJames Collins if (isset($this->nodes)) { 1591c165b184SJames Collins foreach ($this->nodes as $n) { 1592c165b184SJames Collins $n->clear(); 1593c165b184SJames Collins $n = null; 1594c165b184SJames Collins } 1595c165b184SJames Collins } 
1596c165b184SJames Collins 1597c165b184SJames Collins // This add next line is documented in the sourceforge repository. 1598c165b184SJames Collins // 2977248 as a fix for ongoing memory leaks that occur even with the 1599c165b184SJames Collins // use of clear. 1600c165b184SJames Collins if (isset($this->children)) { 1601c165b184SJames Collins foreach ($this->children as $n) { 1602c165b184SJames Collins $n->clear(); 1603c165b184SJames Collins $n = null; 1604c165b184SJames Collins } 1605c165b184SJames Collins } 1606c165b184SJames Collins 1607c165b184SJames Collins if (isset($this->parent)) { 1608c165b184SJames Collins $this->parent->clear(); 1609c165b184SJames Collins unset($this->parent); 1610c165b184SJames Collins } 1611c165b184SJames Collins 1612c165b184SJames Collins if (isset($this->root)) { 1613c165b184SJames Collins $this->root->clear(); 1614c165b184SJames Collins unset($this->root); 1615c165b184SJames Collins } 1616c165b184SJames Collins 1617c165b184SJames Collins unset($this->doc); 1618c165b184SJames Collins unset($this->noise); 1619c165b184SJames Collins } 1620c165b184SJames Collins 1621c165b184SJames Collins function dump($show_attr = true) 1622c165b184SJames Collins { 1623c165b184SJames Collins $this->root->dump($show_attr); 1624c165b184SJames Collins } 1625c165b184SJames Collins 1626c165b184SJames Collins protected function prepare( 1627c165b184SJames Collins $str, $lowercase = true, 1628c165b184SJames Collins $defaultBRText = DEFAULT_BR_TEXT, 1629c165b184SJames Collins $defaultSpanText = DEFAULT_SPAN_TEXT) 1630c165b184SJames Collins { 1631c165b184SJames Collins $this->clear(); 1632c165b184SJames Collins 1633c165b184SJames Collins $this->doc = trim($str); 1634c165b184SJames Collins $this->size = strlen($this->doc); 1635c165b184SJames Collins $this->original_size = $this->size; // original size of the html 1636c165b184SJames Collins $this->pos = 0; 1637c165b184SJames Collins $this->cursor = 1; 1638c165b184SJames Collins $this->noise = array(); 
1639c165b184SJames Collins $this->nodes = array(); 1640c165b184SJames Collins $this->lowercase = $lowercase; 1641c165b184SJames Collins $this->default_br_text = $defaultBRText; 1642c165b184SJames Collins $this->default_span_text = $defaultSpanText; 1643c165b184SJames Collins $this->root = new simple_html_dom_node($this); 1644c165b184SJames Collins $this->root->tag = 'root'; 1645c165b184SJames Collins $this->root->_[HDOM_INFO_BEGIN] = -1; 1646c165b184SJames Collins $this->root->nodetype = HDOM_TYPE_ROOT; 1647c165b184SJames Collins $this->parent = $this->root; 1648c165b184SJames Collins if ($this->size > 0) { $this->char = $this->doc[0]; } 1649c165b184SJames Collins } 1650c165b184SJames Collins 1651c165b184SJames Collins protected function parse() 1652c165b184SJames Collins { 1653c165b184SJames Collins while (true) { 1654c165b184SJames Collins // Read next tag if there is no text between current position and the 1655c165b184SJames Collins // next opening tag. 1656c165b184SJames Collins if (($s = $this->copy_until_char('<')) === '') { 1657c165b184SJames Collins if($this->read_tag()) { 1658c165b184SJames Collins continue; 1659c165b184SJames Collins } else { 1660c165b184SJames Collins return true; 1661c165b184SJames Collins } 1662c165b184SJames Collins } 1663c165b184SJames Collins 1664c165b184SJames Collins // Add a text node for text between tags 1665c165b184SJames Collins $node = new simple_html_dom_node($this); 1666c165b184SJames Collins ++$this->cursor; 1667c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = $s; 1668c165b184SJames Collins $this->link_nodes($node, false); 1669c165b184SJames Collins } 1670c165b184SJames Collins } 1671c165b184SJames Collins 1672c165b184SJames Collins protected function parse_charset() 1673c165b184SJames Collins { 1674c165b184SJames Collins global $debug_object; 1675c165b184SJames Collins 1676c165b184SJames Collins $charset = null; 1677c165b184SJames Collins 1678c165b184SJames Collins if 
(function_exists('get_last_retrieve_url_contents_content_type')) { 1679c165b184SJames Collins $contentTypeHeader = get_last_retrieve_url_contents_content_type(); 1680c165b184SJames Collins $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); 1681c165b184SJames Collins if ($success) { 1682c165b184SJames Collins $charset = $matches[1]; 1683c165b184SJames Collins if (is_object($debug_object)) { 1684c165b184SJames Collins $debug_object->debug_log(2, 1685c165b184SJames Collins 'header content-type found charset of: ' 1686c165b184SJames Collins . $charset 1687c165b184SJames Collins ); 1688c165b184SJames Collins } 1689c165b184SJames Collins } 1690c165b184SJames Collins } 1691c165b184SJames Collins 1692c165b184SJames Collins if (empty($charset)) { 1693c165b184SJames Collins // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type 1694c165b184SJames Collins $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true); 1695c165b184SJames Collins 1696c165b184SJames Collins if (!empty($el)) { 1697c165b184SJames Collins $fullvalue = $el->content; 1698c165b184SJames Collins if (is_object($debug_object)) { 1699c165b184SJames Collins $debug_object->debug_log(2, 1700c165b184SJames Collins 'meta content-type tag found' 1701c165b184SJames Collins . 
$fullvalue 1702c165b184SJames Collins ); 1703c165b184SJames Collins } 1704c165b184SJames Collins 1705c165b184SJames Collins if (!empty($fullvalue)) { 1706c165b184SJames Collins $success = preg_match( 1707c165b184SJames Collins '/charset=(.+)/i', 1708c165b184SJames Collins $fullvalue, 1709c165b184SJames Collins $matches 1710c165b184SJames Collins ); 1711c165b184SJames Collins 1712c165b184SJames Collins if ($success) { 1713c165b184SJames Collins $charset = $matches[1]; 1714c165b184SJames Collins } else { 1715c165b184SJames Collins // If there is a meta tag, and they don't specify the 1716c165b184SJames Collins // character set, research says that it's typically 1717c165b184SJames Collins // ISO-8859-1 1718c165b184SJames Collins if (is_object($debug_object)) { 1719c165b184SJames Collins $debug_object->debug_log(2, 1720c165b184SJames Collins 'meta content-type tag couldn\'t be parsed. using iso-8859 default.' 1721c165b184SJames Collins ); 1722c165b184SJames Collins } 1723c165b184SJames Collins 1724c165b184SJames Collins $charset = 'ISO-8859-1'; 1725c165b184SJames Collins } 1726c165b184SJames Collins } 1727c165b184SJames Collins } 1728c165b184SJames Collins } 1729c165b184SJames Collins 1730c165b184SJames Collins if (empty($charset)) { 1731c165b184SJames Collins // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration 1732c165b184SJames Collins if ($meta = $this->root->find('meta[charset]', 0)) { 1733c165b184SJames Collins $charset = $meta->charset; 1734c165b184SJames Collins if (is_object($debug_object)) { 1735c165b184SJames Collins $debug_object->debug_log(2, 'meta charset: ' . 
$charset); 1736c165b184SJames Collins } 1737c165b184SJames Collins } 1738c165b184SJames Collins } 1739c165b184SJames Collins 1740c165b184SJames Collins if (empty($charset)) { 1741c165b184SJames Collins // Try to guess the charset based on the content 1742c165b184SJames Collins // Requires Multibyte String (mbstring) support (optional) 1743c165b184SJames Collins if (function_exists('mb_detect_encoding')) { 1744c165b184SJames Collins /** 1745c165b184SJames Collins * mb_detect_encoding() is not intended to distinguish between 1746c165b184SJames Collins * charsets, especially single-byte charsets. Its primary 1747c165b184SJames Collins * purpose is to detect which multibyte encoding is in use, 1748c165b184SJames Collins * i.e. UTF-8, UTF-16, shift-JIS, etc. 1749c165b184SJames Collins * 1750c165b184SJames Collins * -- https://bugs.php.net/bug.php?id=38138 1751c165b184SJames Collins * 1752c165b184SJames Collins * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will 1753c165b184SJames Collins * always result in CP1251/ISO-8859-5 and vice versa. 1754c165b184SJames Collins * 1755c165b184SJames Collins * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1 1756c165b184SJames Collins * to stay compatible. 1757c165b184SJames Collins */ 1758c165b184SJames Collins $encoding = mb_detect_encoding( 1759c165b184SJames Collins $this->doc, 1760c165b184SJames Collins array( 'UTF-8', 'CP1252', 'ISO-8859-1' ) 1761c165b184SJames Collins ); 1762c165b184SJames Collins 1763c165b184SJames Collins if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') { 1764c165b184SJames Collins // Due to a limitation of mb_detect_encoding 1765c165b184SJames Collins // 'CP1251'/'ISO-8859-5' will be detected as 1766c165b184SJames Collins // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in 1767c165b184SJames Collins // which case we can simply assume it is the other charset. 
1768c165b184SJames Collins if (!@iconv('CP1252', 'UTF-8', $this->doc)) { 1769c165b184SJames Collins $encoding = 'CP1251'; 1770c165b184SJames Collins } 1771c165b184SJames Collins } 1772c165b184SJames Collins 1773c165b184SJames Collins if ($encoding !== false) { 1774c165b184SJames Collins $charset = $encoding; 1775c165b184SJames Collins if (is_object($debug_object)) { 1776c165b184SJames Collins $debug_object->debug_log(2, 'mb_detect: ' . $charset); 1777c165b184SJames Collins } 1778c165b184SJames Collins } 1779c165b184SJames Collins } 1780c165b184SJames Collins } 1781c165b184SJames Collins 1782c165b184SJames Collins if (empty($charset)) { 1783c165b184SJames Collins // Assume it's UTF-8 as it is the most likely charset to be used 1784c165b184SJames Collins $charset = 'UTF-8'; 1785c165b184SJames Collins if (is_object($debug_object)) { 1786c165b184SJames Collins $debug_object->debug_log(2, 'No match found, assume ' . $charset); 1787c165b184SJames Collins } 1788c165b184SJames Collins } 1789c165b184SJames Collins 1790c165b184SJames Collins // Since CP1252 is a superset, if we get one of it's subsets, we want 1791c165b184SJames Collins // it instead. 1792c165b184SJames Collins if ((strtolower($charset) == 'iso-8859-1') 1793c165b184SJames Collins || (strtolower($charset) == 'latin1') 1794c165b184SJames Collins || (strtolower($charset) == 'latin-1')) { 1795c165b184SJames Collins $charset = 'CP1252'; 1796c165b184SJames Collins if (is_object($debug_object)) { 1797c165b184SJames Collins $debug_object->debug_log(2, 1798c165b184SJames Collins 'replacing ' . $charset . ' with CP1252 as its a superset' 1799c165b184SJames Collins ); 1800c165b184SJames Collins } 1801c165b184SJames Collins } 1802c165b184SJames Collins 1803c165b184SJames Collins if (is_object($debug_object)) { 1804c165b184SJames Collins $debug_object->debug_log(1, 'EXIT - ' . 
$charset); 1805c165b184SJames Collins } 1806c165b184SJames Collins 1807c165b184SJames Collins return $this->_charset = $charset; 1808c165b184SJames Collins } 1809c165b184SJames Collins 1810c165b184SJames Collins protected function read_tag() 1811c165b184SJames Collins { 1812c165b184SJames Collins // Set end position if no further tags found 1813c165b184SJames Collins if ($this->char !== '<') { 1814c165b184SJames Collins $this->root->_[HDOM_INFO_END] = $this->cursor; 1815c165b184SJames Collins return false; 1816c165b184SJames Collins } 1817c165b184SJames Collins 1818c165b184SJames Collins $begin_tag_pos = $this->pos; 1819c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1820c165b184SJames Collins 1821c165b184SJames Collins // end tag 1822c165b184SJames Collins if ($this->char === '/') { 1823c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1824c165b184SJames Collins 1825c165b184SJames Collins // Skip whitespace in end tags (i.e. in "</ html>") 1826c165b184SJames Collins $this->skip($this->token_blank); 1827c165b184SJames Collins $tag = $this->copy_until_char('>'); 1828c165b184SJames Collins 1829c165b184SJames Collins // Skip attributes in end tags 1830c165b184SJames Collins if (($pos = strpos($tag, ' ')) !== false) { 1831c165b184SJames Collins $tag = substr($tag, 0, $pos); 1832c165b184SJames Collins } 1833c165b184SJames Collins 1834c165b184SJames Collins $parent_lower = strtolower($this->parent->tag); 1835c165b184SJames Collins $tag_lower = strtolower($tag); 1836c165b184SJames Collins 1837c165b184SJames Collins // The end tag is supposed to close the parent tag. 
Handle situations 1838c165b184SJames Collins // when it doesn't 1839c165b184SJames Collins if ($parent_lower !== $tag_lower) { 1840c165b184SJames Collins // Parent tag does not have to be closed necessarily (optional closing tag) 1841c165b184SJames Collins // Current tag is a block tag, so it may close an ancestor 1842c165b184SJames Collins if (isset($this->optional_closing_tags[$parent_lower]) 1843c165b184SJames Collins && isset($this->block_tags[$tag_lower])) { 1844c165b184SJames Collins 1845c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; 1846c165b184SJames Collins $org_parent = $this->parent; 1847c165b184SJames Collins 1848c165b184SJames Collins // Traverse ancestors to find a matching opening tag 1849c165b184SJames Collins // Stop at root node 1850c165b184SJames Collins while (($this->parent->parent) 1851c165b184SJames Collins && strtolower($this->parent->tag) !== $tag_lower 1852c165b184SJames Collins ){ 1853c165b184SJames Collins $this->parent = $this->parent->parent; 1854c165b184SJames Collins } 1855c165b184SJames Collins 1856c165b184SJames Collins // If we don't have a match add current tag as text node 1857c165b184SJames Collins if (strtolower($this->parent->tag) !== $tag_lower) { 1858c165b184SJames Collins $this->parent = $org_parent; // restore origonal parent 1859c165b184SJames Collins 1860c165b184SJames Collins if ($this->parent->parent) { 1861c165b184SJames Collins $this->parent = $this->parent->parent; 1862c165b184SJames Collins } 1863c165b184SJames Collins 1864c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = $this->cursor; 1865c165b184SJames Collins return $this->as_text_node($tag); 1866c165b184SJames Collins } 1867c165b184SJames Collins } elseif (($this->parent->parent) 1868c165b184SJames Collins && isset($this->block_tags[$tag_lower]) 1869c165b184SJames Collins ) { 1870c165b184SJames Collins // Grandparent exists and current tag is a block tag, so our 1871c165b184SJames Collins // parent doesn't have an end tag 
1872c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; // No end tag 1873c165b184SJames Collins $org_parent = $this->parent; 1874c165b184SJames Collins 1875c165b184SJames Collins // Traverse ancestors to find a matching opening tag 1876c165b184SJames Collins // Stop at root node 1877c165b184SJames Collins while (($this->parent->parent) 1878c165b184SJames Collins && strtolower($this->parent->tag) !== $tag_lower 1879c165b184SJames Collins ) { 1880c165b184SJames Collins $this->parent = $this->parent->parent; 1881c165b184SJames Collins } 1882c165b184SJames Collins 1883c165b184SJames Collins // If we don't have a match add current tag as text node 1884c165b184SJames Collins if (strtolower($this->parent->tag) !== $tag_lower) { 1885c165b184SJames Collins $this->parent = $org_parent; // restore origonal parent 1886c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = $this->cursor; 1887c165b184SJames Collins return $this->as_text_node($tag); 1888c165b184SJames Collins } 1889c165b184SJames Collins } elseif (($this->parent->parent) 1890c165b184SJames Collins && strtolower($this->parent->parent->tag) === $tag_lower 1891c165b184SJames Collins ) { // Grandparent exists and current tag closes it 1892c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; 1893c165b184SJames Collins $this->parent = $this->parent->parent; 1894c165b184SJames Collins } else { // Random tag, add as text node 1895c165b184SJames Collins return $this->as_text_node($tag); 1896c165b184SJames Collins } 1897c165b184SJames Collins } 1898c165b184SJames Collins 1899c165b184SJames Collins // Set end position of parent tag to current cursor position 1900c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = $this->cursor; 1901c165b184SJames Collins 1902c165b184SJames Collins if ($this->parent->parent) { 1903c165b184SJames Collins $this->parent = $this->parent->parent; 1904c165b184SJames Collins } 1905c165b184SJames Collins 1906c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 1907c165b184SJames Collins return true; 1908c165b184SJames Collins } 1909c165b184SJames Collins 1910c165b184SJames Collins // start tag 1911c165b184SJames Collins $node = new simple_html_dom_node($this); 1912c165b184SJames Collins $node->_[HDOM_INFO_BEGIN] = $this->cursor; 1913c165b184SJames Collins ++$this->cursor; 1914c165b184SJames Collins $tag = $this->copy_until($this->token_slash); // Get tag name 1915c165b184SJames Collins $node->tag_start = $begin_tag_pos; 1916c165b184SJames Collins 1917c165b184SJames Collins // doctype, cdata & comments... 1918c165b184SJames Collins // <!DOCTYPE html> 1919c165b184SJames Collins // <![CDATA[ ... ]]> 1920c165b184SJames Collins // <!-- Comment --> 1921c165b184SJames Collins if (isset($tag[0]) && $tag[0] === '!') { 1922c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); 1923c165b184SJames Collins 1924c165b184SJames Collins if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--") 1925c165b184SJames Collins $node->nodetype = HDOM_TYPE_COMMENT; 1926c165b184SJames Collins $node->tag = 'comment'; 1927c165b184SJames Collins } else { // Could be doctype or CDATA but we don't care 1928c165b184SJames Collins $node->nodetype = HDOM_TYPE_UNKNOWN; 1929c165b184SJames Collins $node->tag = 'unknown'; 1930c165b184SJames Collins } 1931c165b184SJames Collins 1932c165b184SJames Collins if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1933c165b184SJames Collins 1934c165b184SJames Collins $this->link_nodes($node, true); 1935c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1936c165b184SJames Collins return true; 1937c165b184SJames Collins } 1938c165b184SJames Collins 1939c165b184SJames Collins // The start tag cannot contain another start tag, if so add as text 1940c165b184SJames Collins // i.e. 
"<<html>" 1941c165b184SJames Collins if ($pos = strpos($tag, '<') !== false) { 1942c165b184SJames Collins $tag = '<' . substr($tag, 0, -1); 1943c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = $tag; 1944c165b184SJames Collins $this->link_nodes($node, false); 1945c165b184SJames Collins $this->char = $this->doc[--$this->pos]; // prev 1946c165b184SJames Collins return true; 1947c165b184SJames Collins } 1948c165b184SJames Collins 1949c165b184SJames Collins // Handle invalid tag names (i.e. "<html#doc>") 1950c165b184SJames Collins if (!preg_match('/^\w[\w:-]*$/', $tag)) { 1951c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); 1952c165b184SJames Collins 1953c165b184SJames Collins // Next char is the beginning of a new tag, don't touch it. 1954c165b184SJames Collins if ($this->char === '<') { 1955c165b184SJames Collins $this->link_nodes($node, false); 1956c165b184SJames Collins return true; 1957c165b184SJames Collins } 1958c165b184SJames Collins 1959c165b184SJames Collins // Next char closes current tag, add and be done with it. 1960c165b184SJames Collins if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1961c165b184SJames Collins $this->link_nodes($node, false); 1962c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1963c165b184SJames Collins return true; 1964c165b184SJames Collins } 1965c165b184SJames Collins 1966c165b184SJames Collins // begin tag, add new node 1967c165b184SJames Collins $node->nodetype = HDOM_TYPE_ELEMENT; 1968c165b184SJames Collins $tag_lower = strtolower($tag); 1969c165b184SJames Collins $node->tag = ($this->lowercase) ? 
$tag_lower : $tag; 1970c165b184SJames Collins 1971c165b184SJames Collins // handle optional closing tags 1972c165b184SJames Collins if (isset($this->optional_closing_tags[$tag_lower])) { 1973c165b184SJames Collins // Traverse ancestors to close all optional closing tags 1974c165b184SJames Collins while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { 1975c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; 1976c165b184SJames Collins $this->parent = $this->parent->parent; 1977c165b184SJames Collins } 1978c165b184SJames Collins $node->parent = $this->parent; 1979c165b184SJames Collins } 1980c165b184SJames Collins 1981c165b184SJames Collins $guard = 0; // prevent infinity loop 1982c165b184SJames Collins 1983c165b184SJames Collins // [0] Space between tag and first attribute 1984c165b184SJames Collins $space = array($this->copy_skip($this->token_blank), '', ''); 1985c165b184SJames Collins 1986c165b184SJames Collins // attributes 1987c165b184SJames Collins do { 1988c165b184SJames Collins // Everything until the first equal sign should be the attribute name 1989c165b184SJames Collins $name = $this->copy_until($this->token_equal); 1990c165b184SJames Collins 1991c165b184SJames Collins if ($name === '' && $this->char !== null && $space[0] === '') { 1992c165b184SJames Collins break; 1993c165b184SJames Collins } 1994c165b184SJames Collins 1995c165b184SJames Collins if ($guard === $this->pos) { // Escape infinite loop 1996c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 1997c165b184SJames Collins continue; 1998c165b184SJames Collins } 1999c165b184SJames Collins 2000c165b184SJames Collins $guard = $this->pos; 2001c165b184SJames Collins 2002c165b184SJames Collins // handle endless '<' 2003c165b184SJames Collins // Out of bounds before the tag ended 2004c165b184SJames Collins if ($this->pos >= $this->size - 1 && $this->char !== '>') { 2005c165b184SJames Collins $node->nodetype = HDOM_TYPE_TEXT; 2006c165b184SJames Collins $node->_[HDOM_INFO_END] = 0; 2007c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name; 2008c165b184SJames Collins $node->tag = 'text'; 2009c165b184SJames Collins $this->link_nodes($node, false); 2010c165b184SJames Collins return true; 2011c165b184SJames Collins } 2012c165b184SJames Collins 2013c165b184SJames Collins // handle mismatch '<' 2014c165b184SJames Collins // Attributes cannot start after opening tag 2015c165b184SJames Collins if ($this->doc[$this->pos - 1] == '<') { 2016c165b184SJames Collins $node->nodetype = HDOM_TYPE_TEXT; 2017c165b184SJames Collins $node->tag = 'text'; 2018c165b184SJames Collins $node->attr = array(); 2019c165b184SJames Collins $node->_[HDOM_INFO_END] = 0; 2020c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = substr( 2021c165b184SJames Collins $this->doc, 2022c165b184SJames Collins $begin_tag_pos, 2023c165b184SJames Collins $this->pos - $begin_tag_pos - 1 2024c165b184SJames Collins ); 2025c165b184SJames Collins $this->pos -= 2; 2026c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 2027c165b184SJames Collins $this->link_nodes($node, false); 2028c165b184SJames Collins return true; 2029c165b184SJames Collins } 2030c165b184SJames Collins 2031c165b184SJames Collins if ($name !== '/' && $name !== '') { // this is a attribute name 2032c165b184SJames Collins // [1] Whitespace after attribute name 2033c165b184SJames Collins $space[1] = $this->copy_skip($this->token_blank); 2034c165b184SJames Collins 2035c165b184SJames Collins $name = $this->restore_noise($name); // might be a noisy name 2036c165b184SJames Collins 2037c165b184SJames Collins if ($this->lowercase) { $name = strtolower($name); } 2038c165b184SJames Collins 2039c165b184SJames Collins if ($this->char === '=') { // attribute with value 2040c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2041c165b184SJames Collins $this->parse_attr($node, $name, $space); // get attribute value 2042c165b184SJames Collins } else { 2043c165b184SJames Collins //no value attr: nowrap, checked selected... 
2044c165b184SJames Collins $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 2045c165b184SJames Collins $node->attr[$name] = true; 2046c165b184SJames Collins if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev 2047c165b184SJames Collins } 2048c165b184SJames Collins 2049c165b184SJames Collins $node->_[HDOM_INFO_SPACE][] = $space; 2050c165b184SJames Collins 2051c165b184SJames Collins // prepare for next attribute 2052c165b184SJames Collins $space = array( 2053c165b184SJames Collins $this->copy_skip($this->token_blank), 2054c165b184SJames Collins '', 2055c165b184SJames Collins '' 2056c165b184SJames Collins ); 2057c165b184SJames Collins } else { // no more attributes 2058c165b184SJames Collins break; 2059c165b184SJames Collins } 2060c165b184SJames Collins } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended 2061c165b184SJames Collins 2062c165b184SJames Collins $this->link_nodes($node, true); 2063c165b184SJames Collins $node->_[HDOM_INFO_ENDSPACE] = $space[0]; 2064c165b184SJames Collins 2065c165b184SJames Collins // handle empty tags (i.e. "<div/>") 2066c165b184SJames Collins if ($this->copy_until_char('>') === '/') { 2067c165b184SJames Collins $node->_[HDOM_INFO_ENDSPACE] .= '/'; 2068c165b184SJames Collins $node->_[HDOM_INFO_END] = 0; 2069c165b184SJames Collins } else { 2070c165b184SJames Collins // reset parent 2071c165b184SJames Collins if (!isset($this->self_closing_tags[strtolower($node->tag)])) { 2072c165b184SJames Collins $this->parent = $node; 2073c165b184SJames Collins } 2074c165b184SJames Collins } 2075c165b184SJames Collins 2076c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2077c165b184SJames Collins 2078c165b184SJames Collins // If it's a BR tag, we need to set it's text to the default text. 2079c165b184SJames Collins // This way when we see it in plaintext, we can generate formatting that the user wants. 
// since a br tag never has sub nodes, this works well.
        if ($node->tag === 'br') {
            $node->_[HDOM_INFO_INNER] = $this->default_br_text;
        }

        return true;
    }

    /**
     * Parse the value of attribute $name at the current read position and
     * store it on $node.
     *
     * read_tag() calls this right after stepping over the "=" that follows an
     * attribute name. Only the first occurrence of an attribute is stored;
     * the value of a duplicate is still consumed from the document but then
     * discarded.
     *
     * @param object $node   Node that receives the attribute
     * @param string $name   Attribute name (already lowercased by the caller
     *                       when lowercase mode is on)
     * @param array  &$space Whitespace triple of the attribute; index 2
     *                       receives the blanks between "=" and the value
     * @return void
     */
    protected function parse_attr($node, $name, &$space)
    {
        $is_duplicate = isset($node->attr[$name]);

        // Always consume the whitespace between "=" and the value, otherwise
        // a duplicate written as `name = "value"` leaves the cursor on the
        // blank and the quoted value is later misread as an attribute name.
        // The whitespace is only recorded for the first occurrence.
        $blank = $this->copy_skip($this->token_blank);

        if (!$is_duplicate) {
            $space[2] = $blank;
        }

        switch ($this->char) {
            case '"':
                $quote_type = HDOM_QUOTE_DOUBLE;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $value = $this->copy_until_char('"');
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                break;
            case '\'':
                $quote_type = HDOM_QUOTE_SINGLE;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $value = $this->copy_until_char('\'');
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                break;
            default:
                // Unquoted value: read up to the next attribute terminator.
                $quote_type = HDOM_QUOTE_NO;
                $value = $this->copy_until($this->token_attr);
        }

        $value = $this->restore_noise($value);

        // PaperG: Attributes should not have \r or \n in them, that counts as
        // html whitespace.

        // The following was commented out as it interferes with DokuWiki edit mode - nomadjimbob
        //
        // $value = str_replace("\r", '', $value);
        // $value = str_replace("\n", '', $value);

        // PaperG: If this is a "class" selector, lets get rid of the preceding
        // and trailing space since some people leave it in the multi class case.
        if ($name === 'class') {
            $value = trim($value);
        }

        if (!$is_duplicate) {
            $node->_[HDOM_INFO_QUOTE][] = $quote_type;
            $node->attr[$name] = $value;
        }
    }

    /**
     * Attach $node to the parent that is currently open.
     *
     * @param object &$node    Node to attach
     * @param bool   $is_child When true the node is also appended to the
     *                         parent's children list, not just its nodes list
     * @return void
     */
    protected function link_nodes(&$node, $is_child)
    {
        $node->parent = $this->parent;
        $this->parent->nodes[] = $node;
        if ($is_child) {
            $this->parent->children[] = $node;
        }
    }

    /**
     * Add the closing tag "</$tag>" to the tree as a plain node carrying that
     * text, then advance the cursor by one character.
     *
     * @param string $tag Tag name without brackets
     * @return bool Always true
     */
    protected function as_text_node($tag)
    {
        $node = new simple_html_dom_node($this);
        ++$this->cursor;
        $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    /**
     * Advance the cursor over every leading character contained in $chars.
     *
     * @param string $chars Set of characters to skip
     * @return void
     */
    protected function skip($chars)
    {
        $this->pos += strspn($this->doc, $chars, $this->pos);
        $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    }

    /**
     * Advance the cursor over every leading character contained in $chars and
     * return the skipped text.
     *
     * @param string $chars Set of characters to skip
     * @return string Skipped characters, '' when nothing was skipped
     */
    protected function copy_skip($chars)
    {
        $pos = $this->pos;
        $len = strspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        if ($len === 0) { return ''; }
        return substr($this->doc, $pos, $len);
    }

    /**
     * Advance the cursor up to (not including) the first character contained
     * in $chars and return the text that was passed over.
     *
     * @param string $chars Set of terminating characters
     * @return string Text in front of the terminator
     */
    protected function copy_until($chars)
    {
        $pos = $this->pos;
        $len = strcspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return substr($this->doc, $pos, $len);
    }

    /**
     * Advance the cursor up to (not including) the next occurrence of $char
     * and return the text that was passed over.
     *
     * When $char does not occur any more, the remainder of the document is
     * returned and the cursor is moved to the end of the document.
     *
     * @param string $char Terminating character
     * @return string Text in front of $char, '' when already at $char or at
     *                the end of the document
     */
    protected function copy_until_char($char)
    {
        if ($this->char === null) { return ''; }

        if (($pos = strpos($this->doc, $char, $this->pos)) === false) {
            $ret = substr($this->doc, $this->pos, $this->size - $this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $ret;
        }

        if ($pos === $this->pos) { return ''; }

        $pos_old = $this->pos;
        $this->char = $this->doc[$pos];
        $this->pos = $pos;
        return substr($this->doc, $pos_old, $pos - $pos_old);
    }

    /**
     * Replace every match of $pattern in the document with a placeholder key
     * and remember the removed text in $this->noise under that key.
     *
     * Keys are the literal "___noise___" followed by a number that is
     * space-padded to five characters, i.e. 16 characters in total.
     * Matches are processed from last to first so the captured offsets of
     * the earlier matches stay valid while the document is rewritten.
     *
     * @param string $pattern    PCRE pattern locating the noise
     * @param bool   $remove_tag When true the entire match is replaced,
     *                           otherwise only the first sub-match
     * @return void
     */
    protected function remove_noise($pattern, $remove_tag = false)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        $count = preg_match_all(
            $pattern,
            $this->doc,
            $matches,
            PREG_SET_ORDER | PREG_OFFSET_CAPTURE
        );

        for ($i = $count - 1; $i > -1; --$i) {
            $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'key is: ' . $key);
            }

            $idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch
            $this->noise[$key] = $matches[$i][$idx][0];
            $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
        }

        // reset the length of content
        $this->size = strlen($this->doc);

        if ($this->size > 0) {
            $this->char = $this->doc[0];
        }
    }

    /**
     * Replace every noise placeholder in $text with the text it stands for.
     *
     * Restored text is scanned again, so placeholders nested inside restored
     * noise are resolved as well. Placeholders that cannot be resolved are
     * replaced with a diagnostic message.
     *
     * NOTE(review): a stored noise value that contains its own key would
     * still expand without bound - confirm whether that needs a guard.
     *
     * @param string $text Text that may contain "___noise___" placeholders
     * @return string Text with the placeholders resolved
     */
    function restore_noise($text)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        $marker = '___noise___';
        $marker_len = strlen($marker);

        // Invariant: no marker starts in front of $offset (except markers
        // that were deliberately echoed inside a diagnostic message).
        $offset = 0;

        while (($pos = strpos($text, $marker, $offset)) !== false) {
            // Sometimes there is a broken piece of markup, and we don't GET the
            // pos+11 etc... token which indicates a problem outside of us...
            if (strlen($text) > $pos + 15) {
                // Marker plus the five characters that follow it.
                $key = substr($text, $pos, $marker_len + 5);

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2, 'located key of: ' . $key);
                }

                if (isset($this->noise[$key])) {
                    $text = substr($text, 0, $pos)
                        . $this->noise[$key]
                        . substr($text, $pos + 16);

                    // Rescan the restored text, including a marker that may
                    // have been completed across the left splice boundary.
                    $offset = max(0, $pos - ($marker_len - 1));
                } else {
                    $notice = 'UNDEFINED NOISE FOR KEY: ';
                    $text = substr($text, 0, $pos)
                        . $notice
                        . $key
                        . substr($text, $pos + 16);

                    // The message echoes the marker itself. Continue one
                    // character into that echo, otherwise the same unknown
                    // key is found again and the loop never terminates.
                    $offset = $pos + strlen($notice) + 1;
                }
            } else {
                // There is no valid key being given back to us... We must get
                // rid of the ___noise___ or we will have a problem.
                $notice = 'NO NUMERIC NOISE KEY';
                $text = substr($text, 0, $pos)
                    . $notice
                    . substr($text, $pos + 11);

                $offset = $pos + strlen($notice);
            }
        }

        return $text;
    }

    /**
     * Find the first stored noise element that contains $text.
     *
     * @param string $text Text to look for
     * @return string|null Matching noise element, null when there is none
     */
    function search_noise($text)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

        foreach ($this->noise as $noiseElement) {
            if (strpos($noiseElement, $text) !== false) {
                return $noiseElement;
            }
        }

        return null;
    }

    /**
     * String form of the document: the inner text of the root node.
     *
     * @return string
     */
    function __toString()
    {
        return $this->root->innertext();
    }

    /**
     * Read-only virtual properties of the document.
     *
     * "outertext" and "innertext" both return the root node's inner text.
     *
     * @param string $name Property name
     * @return mixed Property value, null for an unknown property
     */
    function __get($name)
    {
        switch ($name) {
            case 'outertext':
                return $this->root->innertext();
            case 'innertext':
                return $this->root->innertext();
            case 'plaintext':
                return $this->root->text();
            case 'charset':
                return $this->_charset;
            case 'target_charset':
                return $this->_target_charset;
            default:
                return null;
        }
    }

    /**
     * Camel-case alias forwarding to the root node's childNodes().
     *
     * @param int $idx Passed through unchanged
     * @return mixed Whatever the root node returns
     */
    function childNodes($idx = -1)
    {
        return $this->root->childNodes($idx);
    }

    /**
     * Camel-case alias forwarding to the root node's first_child().
     *
     * @return mixed Whatever the root node returns
     */
    function firstChild()
    {
        return $this->root->first_child();
    }

    /**
     * Camel-case alias forwarding to the root node's last_child().
     *
     * @return mixed Whatever the root node returns
     */
    function lastChild()
    {
        return $this->root->last_child();
    }

    /**
     * Create a detached element by parsing "<$name>$value</$name>".
     *
     * @param string      $name  Tag name
     * @param string|null $value Inner markup of the element
     * @return mixed First child of the parsed fragment, null when the
     *               fragment could not be parsed
     */
    function createElement($name, $value = null)
    {
        $dom = @str_get_html("<$name>$value</$name>");

        // The parser does not always hand back a document object; do not
        // call a method on anything else.
        if (!is_object($dom)) {
            return null;
        }

        return @$dom->firstChild();
    }

    /**
     * Create a detached text node by parsing $value.
     *
     * @param string $value Text of the node
     * @return mixed Last node of the parsed fragment, null when the fragment
     *               could not be parsed
     */
    function createTextNode($value)
    {
        $dom = @str_get_html($value);

        if (!is_object($dom) || !is_array($dom->nodes)) {
            return null;
        }

        // end() takes its argument by reference, so work on a local copy.
        $nodes = $dom->nodes;
        return end($nodes);
    }

    /**
     * Find the first element with the given id.
     *
     * @param string $id Element id
     * @return mixed Result of find("#$id", 0)
     */
    function getElementById($id)
    {
        return $this->find("#$id", 0);
    }

    /**
     * Find elements with the given id.
     *
     * @param string   $id  Element id
     * @param int|null $idx Passed through to find()
     * @return mixed Result of find("#$id", $idx)
     */
    function getElementsById($id, $idx = null)
    {
        return $this->find("#$id", $idx);
    }

    /**
     * Find the first element with the given tag name.
     *
     * @param string $name Tag name
     * @return mixed Result of find($name, 0)
     */
    function getElementByTagName($name)
    {
        return $this->find($name, 0);
    }

    /**
     * Find elements with the given tag name.
     *
     * @param string $name Tag name
     * @param int    $idx  Passed through to find()
     * @return mixed Result of find($name, $idx)
     */
    function getElementsByTagName($name, $idx = -1)
    {
        return $this->find($name, $idx);
    }

    /**
     * Camel-case alias of load_file(); every argument is forwarded.
     *
     * The arguments are passed on positionally instead of as one array.
     * NOTE(review): assumes load_file() reads its arguments positionally -
     * it is defined outside this part of the file, confirm.
     *
     * @return mixed Whatever load_file() returns
     */
    function loadFile()
    {
        $args = func_get_args();
        return call_user_func_array(array($this, 'load_file'), $args);
    }
}