<?php
/**
 * Website: http://sourceforge.net/projects/simplehtmldom/
 * Additional projects: http://sourceforge.net/projects/debugobject/
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
 *
 * Licensed under The MIT License
 * See the LICENSE file in the project root for more information.
 *
 * Authors:
 *   S.C. Chen
 *   John Schlick
 *   Rus Carroll
 *   logmanoriginal
 *
 * Contributors:
 *   Yousuke Kumakura
 *   Vadim Voituk
 *   Antcs
 *   James Collins (nomadjimbob)
 *
 * Based on Version Rev. 1.9.1 (291)
 * Version 1.9.1.1
 */

// Node types (simple_html_dom_node::$nodetype)
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles (values stored under HDOM_INFO_QUOTE)
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node info array simple_html_dom_node::$_
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Defaults that the including script may pre-define to override
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
define('HDOM_SMARTY_AS_TEXT', 1);

/**
 * Read a document with file_get_contents() and load it into a new
 * simple_html_dom object.
 *
 * The first five parameters are forwarded to file_get_contents(); a
 * non-positive $maxLen is replaced by MAX_FILE_SIZE. The remaining
 * parameters are forwarded to the simple_html_dom constructor.
 *
 * @return mixed The result of simple_html_dom::load(), or false if the
 * contents are empty or longer than $maxLen.
 */
function file_get_html(
	$url,
	$use_include_path = false,
	$context = null,
	$offset = 0,
	$maxLen = -1,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	/**
	 * For sourceforge users: uncomment the next line and comment the
	 * retrieve_url_contents line 2 lines down if it is not already done.
	 */
	$contents = file_get_contents(
		$url,
		$use_include_path,
		$context,
		$offset,
		$maxLen
	);
	// $contents = retrieve_url_contents($url);

	if (empty($contents) || strlen($contents) > $maxLen) {
		$dom->clear();
		return false;
	}

	return $dom->load($contents, $lowercase, $stripRN);
}

/**
 * Load an HTML string into a new simple_html_dom object.
 *
 * @return mixed The result of simple_html_dom::load(), or false if $str is
 * empty or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
	$str,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
		$dom->clear();
		return false;
	}

	return $dom->load($str, $lowercase, $stripRN);
}

/**
 * Echo the tree below $node using simple_html_dom_node::dump().
 *
 * @param simple_html_dom_node $node Node to start at
 * @param bool $show_attr Whether attributes are printed
 * @param int $deep Initial indentation depth (number of tabs)
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
	// Forward the caller's options; previously the node itself was passed
	// as $show_attr and $deep was silently ignored.
	$node->dump($show_attr, $deep);
}

/**
 * A single node (element, text, comment, ...) of a parsed document.
 */
class simple_html_dom_node
{
	// One of the HDOM_TYPE_* constants
	public $nodetype = HDOM_TYPE_TEXT;
	public $tag = 'text';
	// Attribute name => value
	public $attr = array();
	public $children = array();
	public $nodes = array();
	public $parent = null;
	// Parser bookkeeping, indexed by the HDOM_INFO_* constants
	public $_ = array();
	public $tag_start = 0;
	// Owning document object
	private $dom = null;

	/**
	 * Attach the node to $dom and register it in $dom->nodes.
	 */
	function __construct($dom)
	{
		$this->dom = $dom;
		$dom->nodes[] = $this;
	}

	function __destruct()
	{
		$this->clear();
	}

	function __toString()
	{
		return $this->outertext();
	}

	/**
	 * Drop the references to the document, parent and child nodes.
	 *
	 * Note: $nodes and $children are null (not empty arrays) afterwards.
	 */
	function clear()
	{
		$this->dom = null;
		$this->nodes = null;
		$this->parent = null;
		$this->children = null;
	}

	/**
	 * Echo this node's tag (and optionally its attributes), then recurse
	 * into $this->nodes with one more tab of indentation.
	 */
	function dump($show_attr = true, $depth = 0)
	{
		echo str_repeat("\t", $depth) . $this->tag;

		if ($show_attr && count($this->attr) > 0) {
			echo '(';
			foreach ($this->attr as $k => $v) {
				echo "[$k]=>\"$v\", ";
			}
			echo ')';
		}

		echo "\n";

		if ($this->nodes) {
			foreach ($this->nodes as $node) {
				$node->dump($show_attr, $depth + 1);
			}
		}
	}

	/**
	 * Build a one-line debug description of this node.
	 *
	 * @param bool $echo Echo the description (true) or return it (false)
	 * @return string|void The description when $echo is false
	 */
	function dump_node($echo = true)
	{
		$string = $this->tag;

		if (count($this->attr) > 0) {
			$string .= '(';
			foreach ($this->attr as $k => $v) {
				$string .= "[$k]=>\"$v\", ";
			}
			$string .= ')';
		}

		if (count($this->_) > 0) {
			$string .= ' $_ (';
			foreach ($this->_ as $k => $v) {
				if (is_array($v)) {
					$string .= "[$k]=>(";
					foreach ($v as $k2 => $v2) {
						$string .= "[$k2]=>\"$v2\", ";
					}
					$string .= ')';
				} else {
					$string .= "[$k]=>\"$v\", ";
				}
			}
			$string .= ')';
		}

		if (isset($this->text)) {
			$string .= " text: ({$this->text})";
		}

		$string .= ' HDOM_INNER_INFO: ';

		// Read from $this; the former reference to an undefined $node made
		// this branch unreachable, so ' NULL ' was always reported.
		if (isset($this->_[HDOM_INFO_INNER])) {
			$string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
		} else {
			$string .= ' NULL ';
		}

		// clear() sets these to null, which count() rejects on PHP 8
		$string .= ' children: ' . (is_array($this->children) ? count($this->children) : 0);
		$string .= ' nodes: ' . (is_array($this->nodes) ? count($this->nodes) : 0);
		$string .= ' tag_start: ' . $this->tag_start;
		$string .= "\n";

		if ($echo) {
			echo $string;
			return;
		} else {
			return $string;
		}
	}

	/**
	 * Get the parent node, or set it when $parent is given.
	 *
	 * @return simple_html_dom_node|null The (new) parent
	 */
	function parent($parent = null)
	{
		// I am SURE that this doesn't work properly.
		// It fails to unset the current node from its current parent's nodes
		// or children list first.
		if ($parent !== null) {
			$this->parent = $parent;
			$this->parent->nodes[] = $this;
			$this->parent->children[] = $this;
		}

		return $this->parent;
	}

	function has_child()
	{
		return !empty($this->children);
	}

	/**
	 * Return all children ($idx === -1), the child at $idx, or null if
	 * there is no child at that index.
	 */
	function children($idx = -1)
	{
		if ($idx === -1) {
			return $this->children;
		}

		if (isset($this->children[$idx])) {
			return $this->children[$idx];
		}

		return null;
	}

	/**
	 * Return the first child, or null if there are no children.
	 */
	function first_child()
	{
		// !empty() also covers $children === null (after clear())
		if (!empty($this->children)) {
			return $this->children[0];
		}
		return null;
	}

	/**
	 * Return the last child, or null if there are no children.
	 */
	function last_child()
	{
		// !empty() also covers $children === null (after clear())
		if (!empty($this->children)) {
			return end($this->children);
		}
		return null;
	}

	/**
	 * Return the node following this one in the parent's children list, or
	 * null if there is none (or no parent).
	 */
	function next_sibling()
	{
		if ($this->parent === null) {
			return null;
		}

		$idx = array_search($this, $this->parent->children, true);

		if ($idx !== false && isset($this->parent->children[$idx + 1])) {
			return $this->parent->children[$idx + 1];
		}

		return null;
	}

	/**
	 * Return the node preceding this one in the parent's children list, or
	 * null if there is none (or no parent).
	 */
	function prev_sibling()
	{
		if ($this->parent === null) {
			return null;
		}

		$idx = array_search($this, $this->parent->children, true);

		if ($idx !== false && $idx > 0) {
			return $this->parent->children[$idx - 1];
		}

		return null;
	}

	/**
	 * Walk up the parent chain and return the first ancestor whose tag
	 * equals $tag, or null if no ancestor matches.
	 */
	function find_ancestor_tag($tag)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		if ($this->parent === null) {
			return null;
		}

		$ancestor = $this->parent;

		while (!is_null($ancestor)) {
			if (is_object($debug_object)) {
				$debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
			}

			if ($ancestor->tag === $tag) {
				break;
			}

			$ancestor = $ancestor->parent;
		}

		return $ancestor;
	}

	/**
	 * Return the markup inside this node: the stored inner text if set,
	 * else the noise-restored text, else the concatenated outertext() of
	 * all nodes below it.
	 */
	function innertext()
	{
		if (isset($this->_[HDOM_INFO_INNER])) {
			return $this->_[HDOM_INFO_INNER];
		}

		if (isset($this->_[HDOM_INFO_TEXT])) {
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		$ret = '';

		foreach ($this->nodes as $n) {
			$ret .= $n->outertext();
		}

		return $ret;
	}

	/**
	 * Return the markup of this node including its own tags.
	 */
	function outertext()
	{
		global $debug_object;

		if (is_object($debug_object)) {
			$text = '';

			if ($this->tag === 'text') {
				if (!empty($this->text)) {
					$text = ' with text: ' . $this->text;
				}
			}

			$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
		}

		if ($this->tag === 'root') {
			return $this->innertext();
		}

		// todo: What is the use of this callback? Remove?
		if ($this->dom && $this->dom->callback !== null) {
			call_user_func_array($this->dom->callback, array($this));
		}

		if (isset($this->_[HDOM_INFO_OUTER])) {
			return $this->_[HDOM_INFO_OUTER];
		}

		if (isset($this->_[HDOM_INFO_TEXT])) {
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		$ret = '';

		// Opening tag
		if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
			$ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
		}

		// Content
		if (isset($this->_[HDOM_INFO_INNER])) {
			// todo: <br> should either never have HDOM_INFO_INNER or always
			if ($this->tag !== 'br') {
				$ret .= $this->_[HDOM_INFO_INNER];
			}
		} elseif ($this->nodes) {
			foreach ($this->nodes as $n) {
				$ret .= $this->convert_text($n->outertext());
			}
		}

		// Closing tag
		if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
			$ret .= '</' . $this->tag . '>';
		}

		return $ret;
	}

	/**
	 * Return the plain text of this node. Comment, unknown, script and
	 * style nodes yield an empty string.
	 */
	function text()
	{
		if (isset($this->_[HDOM_INFO_INNER])) {
			return $this->_[HDOM_INFO_INNER];
		}

		switch ($this->nodetype) {
			case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
			case HDOM_TYPE_COMMENT: return '';
			case HDOM_TYPE_UNKNOWN: return '';
		}

		if (strcasecmp($this->tag, 'script') === 0) { return ''; }
		if (strcasecmp($this->tag, 'style') === 0) { return ''; }

		$ret = '';

		// In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
		// for some span tags, and some p tags) $this->nodes is set to NULL.
		// NOTE: This indicates that there is a problem where it's set to NULL
		// without a clear happening.
		// WHY is this happening?
		if (!is_null($this->nodes)) {
			foreach ($this->nodes as $n) {
				// Start paragraph after a blank line
				if ($n->tag === 'p') {
					$ret = trim($ret) . "\n\n";
				}

				$ret .= $this->convert_text($n->text());

				// If this node is a span... add a space at the end of it so
				// multiple spans don't run into each other. This is plaintext
				// after all.
				if ($n->tag === 'span') {
					$ret .= $this->dom->default_span_text;
				}
			}
		}
		return $ret;
	}

	/**
	 * Return innertext() with the CDATA markers removed.
	 */
	function xmltext()
	{
		$ret = $this->innertext();
		$ret = str_ireplace('<![CDATA[', '', $ret);
		$ret = str_replace(']]>', '', $ret);
		return $ret;
	}

	/**
	 * Rebuild the opening tag of this node from its tag name, attributes
	 * and the recorded spacing/quoting information.
	 */
	function makeup()
	{
		// text, comment, unknown
		if (isset($this->_[HDOM_INFO_TEXT])) {
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		$ret = '<' . $this->tag;
		// Position of the current attribute; indexes the HDOM_INFO_SPACE
		// and HDOM_INFO_QUOTE lists
		$i = -1;

		foreach ($this->attr as $key => $val) {
			++$i;

			// skip removed attribute
			if ($val === null || $val === false) { continue; }

			$ret .= $this->_[HDOM_INFO_SPACE][$i][0];

			//no value attr: nowrap, checked selected...
			if ($val === true) {
				$ret .= $key;
			} else {
				switch ($this->_[HDOM_INFO_QUOTE][$i])
				{
					case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
					case HDOM_QUOTE_SINGLE: $quote = '\''; break;
					default: $quote = '';
				}

				$ret .= $key
				. $this->_[HDOM_INFO_SPACE][$i][1]
				. '='
				. $this->_[HDOM_INFO_SPACE][$i][2]
				. $quote
				. $val
				. $quote;
			}
		}

		$ret = $this->dom->restore_noise($ret);
		return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
	}

	/**
	 * Find nodes below this node that match a CSS-like selector.
	 *
	 * @param string $selector Selector string, handed to parse_selector()
	 * @param int|null $idx null for all matches, otherwise the index of
	 * the match to return (negative values count from the end)
	 * @param bool $lowercase Forwarded to seek()
	 * @return array|simple_html_dom_node|null Array of matches when $idx
	 * is null; otherwise the selected node or null
	 */
	function find($selector, $idx = null, $lowercase = false)
	{
		$selectors = $this->parse_selector($selector);
		if (($count = count($selectors)) === 0) { return array(); }
		$found_keys = array();

		// find each selector
		for ($c = 0; $c < $count; ++$c) {
			// The change on the below line was documented on the sourceforge
			// code tracker id 2788009
			// used to be: if (($levle=count($selectors[0]))===0) return array();
			if (($levle = count($selectors[$c])) === 0) { return array(); }
			if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

			$head = array($this->_[HDOM_INFO_BEGIN] => 1);
			$cmd = ' '; // Combinator

			// handle descendant selectors, no recursive!
			for ($l = 0; $l < $levle; ++$l) {
				$ret = array();

				foreach ($head as $k => $v) {
					$n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
					//PaperG - Pass this optional parameter on to the seek function.
					$n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
				}

				$head = $ret;
				$cmd = $selectors[$c][$l][4]; // Next Combinator
			}

			foreach ($head as $k => $v) {
				if (!isset($found_keys[$k])) {
					$found_keys[$k] = 1;
				}
			}
		}

		// sort keys
		ksort($found_keys);

		$found = array();
		foreach ($found_keys as $k => $v) {
			$found[] = $this->dom->nodes[$k];
		}

		// return nth-element or array
		if (is_null($idx)) { return $found; }
		elseif ($idx < 0) { $idx = count($found) + $idx; }
		return (isset($found[$idx])) ? $found[$idx] : null;
	}

	protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		list($tag, $id, $class, $attributes, $cmb) = $selector;
		$nodes = array();

		if ($parent_cmd === ' ') { // Descendant Combinator
			// Find parent closing tag if the current element doesn't have a closing
			// tag (i.e. void element)
			$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
			if ($end == 0) {
				$parent = $this->parent;
				while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) {
					$end -= 1;
					$parent = $parent->parent;
				}
				$end += $parent->_[HDOM_INFO_END];
			}

			// Get list of target nodes
			$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
			$nodes_count = $end - $nodes_start;
			$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
		} elseif ($parent_cmd === '>') { // Child Combinator
			$nodes = $this->children;
		} elseif ($parent_cmd === '+'
			&& $this->parent
			&& in_array($this, $this->parent->children)) { // Next-Sibling Combinator
			$index = array_search($this, $this->parent->children, true) + 1;
			if ($index < count($this->parent->children))
				$nodes[] = $this->parent->children[$index];
		} elseif ($parent_cmd === '~'
			&& $this->parent
			&& in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator
			$index = array_search($this, $this->parent->children, true);
			$nodes = array_slice($this->parent->children, $index);
		}

		// Go through each element starting at this element until the end tag
		// Note: If this element is a void tag, any previous void element is
		// skipped.
		foreach($nodes as $node) {
			$pass = true;

			// Skip root nodes
			if(!$node->parent) {
				$pass = false;
			}

			// Handle 'text' selector
			if($pass && $tag === 'text' && $node->tag === 'text') {
				$ret[array_search($node, $this->dom->nodes, true)] = 1;
				unset($node);
				continue;
			}

			// Skip if node isn't a child node (i.e. text nodes)
			if($pass && !in_array($node, $node->parent->children, true)) {
				$pass = false;
			}

			// Skip if tag doesn't match
			if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
				$pass = false;
			}

			// Skip if ID doesn't exist
			if ($pass && $id !== '' && !isset($node->attr['id'])) {
				$pass = false;
			}

			// Check if ID matches
			if ($pass && $id !== '' && isset($node->attr['id'])) {
				// Note: Only consider the first ID (as browsers do)
				$node_id = explode(' ', trim($node->attr['id']))[0];

				if($id !== $node_id) { $pass = false; }
			}

			// Check if all class(es) exist
			if ($pass && $class !== '' && is_array($class) && !empty($class)) {
				if (isset($node->attr['class'])) {
					$node_classes = explode(' ', $node->attr['class']);

					if ($lowercase) {
						$node_classes = array_map('strtolower', $node_classes);
					}

					foreach($class as $c) {
						if(!in_array($c, $node_classes)) {
							$pass = false;
							break;
						}
					}
				} else {
					$pass = false;
				}
			}

			// Check attributes
			if ($pass
				&& $attributes !== ''
				&& is_array($attributes)
				&& !empty($attributes)) {
				foreach($attributes as $a) {
					list (
						$att_name,
						$att_expr,
						$att_val,
						$att_inv,
						$att_case_sensitivity
					) = $a;

					// Handle indexing attributes (i.e. "[2]")
					/**
					 * Note: This is not supported by the CSS Standard but adds
					 * the ability to select items compatible to XPath (i.e.
					 * the 3rd element within its parent).
					 *
					 * Note: This doesn't conflict with the CSS Standard which
					 * doesn't work on numeric attributes anyway.
					 */
					if (is_numeric($att_name)
						&& $att_expr === ''
						&& $att_val === '') {
						$count = 0;

						// Find index of current element in parent
						foreach ($node->parent->children as $c) {
							if ($c->tag === $node->tag) ++$count;
							if ($c === $node) break;
						}

						// If this is the correct node, continue with next
						// attribute
						if ($count === (int)$att_name) continue;
					}

					// Check attribute availability
					if ($att_inv) { // Attribute should NOT be set
						if (isset($node->attr[$att_name])) {
							$pass = false;
							break;
						}
					} else { // Attribute should be set
						// todo: "plaintext" is not a valid CSS selector!
						if ($att_name !== 'plaintext'
							&& !isset($node->attr[$att_name])) {
							$pass = false;
							break;
						}
					}

					// Continue with next attribute if expression isn't defined
					if ($att_expr === '') continue;

					// If they have told us that this is a "plaintext"
					// search then we want the plaintext of the node - right?
					// todo "plaintext" is not a valid CSS selector!
					if ($att_name === 'plaintext') {
						$nodeKeyValue = $node->text();
					} else {
						$nodeKeyValue = $node->attr[$att_name];
					}

					if (is_object($debug_object)) {
						$debug_object->debug_log(2,
							'testing node: '
							. $node->tag
							. ' for attribute: '
							. $att_name
							. $att_expr
							. $att_val
							. ' where nodes value is: '
							. $nodeKeyValue
						);
					}

					// If lowercase is set, do a case insensitive test of
					// the value of the selector.
					if ($lowercase) {
						$check = $this->match(
							$att_expr,
							strtolower($att_val),
							strtolower($nodeKeyValue),
							$att_case_sensitivity
						);
					} else {
						$check = $this->match(
							$att_expr,
							$att_val,
							$nodeKeyValue,
							$att_case_sensitivity
						);
					}

					if (is_object($debug_object)) {
						$debug_object->debug_log(2,
							'after match: '
							. ($check ? 
'true' : 'false') 765c165b184SJames Collins ); 766c165b184SJames Collins } 767c165b184SJames Collins 768c165b184SJames Collins if (!$check) { 769c165b184SJames Collins $pass = false; 770c165b184SJames Collins break; 771c165b184SJames Collins } 772c165b184SJames Collins } 773c165b184SJames Collins } 774c165b184SJames Collins 775c165b184SJames Collins // Found a match. Add to list and clear node 776c165b184SJames Collins if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1; 777c165b184SJames Collins unset($node); 778c165b184SJames Collins } 779c165b184SJames Collins // It's passed by reference so this is actually what this function returns. 780c165b184SJames Collins if (is_object($debug_object)) { 781c165b184SJames Collins $debug_object->debug_log(1, 'EXIT - ret: ', $ret); 782c165b184SJames Collins } 783c165b184SJames Collins } 784c165b184SJames Collins 785c165b184SJames Collins protected function match($exp, $pattern, $value, $case_sensitivity) 786c165b184SJames Collins { 787c165b184SJames Collins global $debug_object; 788c165b184SJames Collins if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 789c165b184SJames Collins 790c165b184SJames Collins if ($case_sensitivity === 'i') { 791c165b184SJames Collins $pattern = strtolower($pattern); 792c165b184SJames Collins $value = strtolower($value); 793c165b184SJames Collins } 794c165b184SJames Collins 795c165b184SJames Collins switch ($exp) { 796c165b184SJames Collins case '=': 797c165b184SJames Collins return ($value === $pattern); 798c165b184SJames Collins case '!=': 799c165b184SJames Collins return ($value !== $pattern); 800c165b184SJames Collins case '^=': 801c165b184SJames Collins return preg_match('/^' . preg_quote($pattern, '/') . '/', $value); 802c165b184SJames Collins case '$=': 803c165b184SJames Collins return preg_match('/' . preg_quote($pattern, '/') . '$/', $value); 804c165b184SJames Collins case '*=': 805c165b184SJames Collins return preg_match('/' . preg_quote($pattern, '/') . 
'/', $value); 806c165b184SJames Collins case '|=': 807c165b184SJames Collins /** 808c165b184SJames Collins * [att|=val] 809c165b184SJames Collins * 810c165b184SJames Collins * Represents an element with the att attribute, its value 811c165b184SJames Collins * either being exactly "val" or beginning with "val" 812c165b184SJames Collins * immediately followed by "-" (U+002D). 813c165b184SJames Collins */ 814c165b184SJames Collins return strpos($value, $pattern) === 0; 815c165b184SJames Collins case '~=': 816c165b184SJames Collins /** 817c165b184SJames Collins * [att~=val] 818c165b184SJames Collins * 819c165b184SJames Collins * Represents an element with the att attribute whose value is a 820c165b184SJames Collins * whitespace-separated list of words, one of which is exactly 821c165b184SJames Collins * "val". If "val" contains whitespace, it will never represent 822c165b184SJames Collins * anything (since the words are separated by spaces). Also if 823c165b184SJames Collins * "val" is the empty string, it will never represent anything. 824c165b184SJames Collins */ 825c165b184SJames Collins return in_array($pattern, explode(' ', trim($value)), true); 826c165b184SJames Collins } 827c165b184SJames Collins return false; 828c165b184SJames Collins } 829c165b184SJames Collins 830c165b184SJames Collins protected function parse_selector($selector_string) 831c165b184SJames Collins { 832c165b184SJames Collins global $debug_object; 833c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 834c165b184SJames Collins 835c165b184SJames Collins /** 836c165b184SJames Collins * Pattern of CSS selectors, modified from mootools (https://mootools.net/) 837c165b184SJames Collins * 838c165b184SJames Collins * Paperg: Add the colon to the attribute, so that it properly finds 839c165b184SJames Collins * <tag attr:ibute="something" > like google does. 
840c165b184SJames Collins * 841c165b184SJames Collins * Note: if you try to look at this attribute, you MUST use getAttribute 842c165b184SJames Collins * since $dom->x:y will fail the php syntax check. 843c165b184SJames Collins * 844c165b184SJames Collins * Notice the \[ starting the attribute? and the @? following? This 845c165b184SJames Collins * implies that an attribute can begin with an @ sign that is not 846c165b184SJames Collins * captured. This implies that an html attribute specifier may start 847c165b184SJames Collins * with an @ sign that is NOT captured by the expression. Farther study 848c165b184SJames Collins * is required to determine of this should be documented or removed. 849c165b184SJames Collins * 850c165b184SJames Collins * Matches selectors in this order: 851c165b184SJames Collins * 852c165b184SJames Collins * [0] - full match 853c165b184SJames Collins * 854c165b184SJames Collins * [1] - tag name 855c165b184SJames Collins * ([\w:\*-]*) 856c165b184SJames Collins * Matches the tag name consisting of zero or more words, colons, 857c165b184SJames Collins * asterisks and hyphens. 858c165b184SJames Collins * 859c165b184SJames Collins * [2] - id name 860c165b184SJames Collins * (?:\#([\w-]+)) 861c165b184SJames Collins * Optionally matches a id name, consisting of an "#" followed by 862c165b184SJames Collins * the id name (one or more words and hyphens). 863c165b184SJames Collins * 864c165b184SJames Collins * [3] - class names (including dots) 865c165b184SJames Collins * (?:\.([\w\.-]+))? 866c165b184SJames Collins * Optionally matches a list of classs, consisting of an "." 867c165b184SJames Collins * followed by the class name (one or more words and hyphens) 868c165b184SJames Collins * where multiple classes can be chained (i.e. ".foo.bar.baz") 869c165b184SJames Collins * 870c165b184SJames Collins * [4] - attributes 871c165b184SJames Collins * ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)? 
872c165b184SJames Collins * Optionally matches the attributes list 873c165b184SJames Collins * 874c165b184SJames Collins * [5] - separator 875c165b184SJames Collins * ([\/, >+~]+) 876c165b184SJames Collins * Matches the selector list separator 877c165b184SJames Collins */ 878c165b184SJames Collins // phpcs:ignore Generic.Files.LineLength 879c165b184SJames Collins $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is"; 880c165b184SJames Collins 881c165b184SJames Collins preg_match_all( 882c165b184SJames Collins $pattern, 883c165b184SJames Collins trim($selector_string) . ' ', // Add final ' ' as pseudo separator 884c165b184SJames Collins $matches, 885c165b184SJames Collins PREG_SET_ORDER 886c165b184SJames Collins ); 887c165b184SJames Collins 888c165b184SJames Collins if (is_object($debug_object)) { 889c165b184SJames Collins $debug_object->debug_log(2, 'Matches Array: ', $matches); 890c165b184SJames Collins } 891c165b184SJames Collins 892c165b184SJames Collins $selectors = array(); 893c165b184SJames Collins $result = array(); 894c165b184SJames Collins 895c165b184SJames Collins foreach ($matches as $m) { 896c165b184SJames Collins $m[0] = trim($m[0]); 897c165b184SJames Collins 898c165b184SJames Collins // Skip NoOps 899c165b184SJames Collins if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; } 900c165b184SJames Collins 901c165b184SJames Collins // Convert to lowercase 902c165b184SJames Collins if ($this->dom->lowercase) { 903c165b184SJames Collins $m[1] = strtolower($m[1]); 904c165b184SJames Collins } 905c165b184SJames Collins 906c165b184SJames Collins // Extract classes 907c165b184SJames Collins if ($m[3] !== '') { $m[3] = explode('.', $m[3]); } 908c165b184SJames Collins 909c165b184SJames Collins /* Extract attributes (pattern based on the pattern above!) 
910c165b184SJames Collins 911c165b184SJames Collins * [0] - full match 912c165b184SJames Collins * [1] - attribute name 913c165b184SJames Collins * [2] - attribute expression 914c165b184SJames Collins * [3] - attribute value 915c165b184SJames Collins * [4] - case sensitivity 916c165b184SJames Collins * 917c165b184SJames Collins * Note: Attributes can be negated with a "!" prefix to their name 918c165b184SJames Collins */ 919c165b184SJames Collins if($m[4] !== '') { 920c165b184SJames Collins preg_match_all( 921c165b184SJames Collins "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is", 922c165b184SJames Collins trim($m[4]), 923c165b184SJames Collins $attributes, 924c165b184SJames Collins PREG_SET_ORDER 925c165b184SJames Collins ); 926c165b184SJames Collins 927c165b184SJames Collins // Replace element by array 928c165b184SJames Collins $m[4] = array(); 929c165b184SJames Collins 930c165b184SJames Collins foreach($attributes as $att) { 931c165b184SJames Collins // Skip empty matches 932c165b184SJames Collins if(trim($att[0]) === '') { continue; } 933c165b184SJames Collins 934c165b184SJames Collins $inverted = (isset($att[1][0]) && $att[1][0] === '!'); 935c165b184SJames Collins $m[4][] = array( 936c165b184SJames Collins $inverted ? substr($att[1], 1) : $att[1], // Name 937c165b184SJames Collins (isset($att[2])) ? $att[2] : '', // Expression 938c165b184SJames Collins (isset($att[3])) ? $att[3] : '', // Value 939c165b184SJames Collins $inverted, // Inverted Flag 940c165b184SJames Collins (isset($att[4])) ? 
strtolower($att[4]) : '', // Case-Sensitivity 941c165b184SJames Collins ); 942c165b184SJames Collins } 943c165b184SJames Collins } 944c165b184SJames Collins 945c165b184SJames Collins // Sanitize Separator 946c165b184SJames Collins if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator 947c165b184SJames Collins $m[5] = ' '; 948c165b184SJames Collins } else { // Other Separator 949c165b184SJames Collins $m[5] = trim($m[5]); 950c165b184SJames Collins } 951c165b184SJames Collins 952c165b184SJames Collins // Clear Separator if it's a Selector List 953c165b184SJames Collins if ($is_list = ($m[5] === ',')) { $m[5] = ''; } 954c165b184SJames Collins 955c165b184SJames Collins // Remove full match before adding to results 956c165b184SJames Collins array_shift($m); 957c165b184SJames Collins $result[] = $m; 958c165b184SJames Collins 959c165b184SJames Collins if ($is_list) { // Selector List 960c165b184SJames Collins $selectors[] = $result; 961c165b184SJames Collins $result = array(); 962c165b184SJames Collins } 963c165b184SJames Collins } 964c165b184SJames Collins 965c165b184SJames Collins if (count($result) > 0) { $selectors[] = $result; } 966c165b184SJames Collins return $selectors; 967c165b184SJames Collins } 968c165b184SJames Collins 969c165b184SJames Collins function __get($name) 970c165b184SJames Collins { 971c165b184SJames Collins if (isset($this->attr[$name])) { 972c165b184SJames Collins return $this->convert_text($this->attr[$name]); 973c165b184SJames Collins } 974c165b184SJames Collins switch ($name) { 975c165b184SJames Collins case 'outertext': return $this->outertext(); 976c165b184SJames Collins case 'innertext': return $this->innertext(); 977c165b184SJames Collins case 'plaintext': return $this->text(); 978c165b184SJames Collins case 'xmltext': return $this->xmltext(); 979c165b184SJames Collins default: return array_key_exists($name, $this->attr); 980c165b184SJames Collins } 981c165b184SJames Collins } 982c165b184SJames Collins 983c165b184SJames Collins 
/**
 * Magic setter: assign markup or an attribute through property syntax.
 *
 * - 'outertext' overwrites the stored outer markup (HDOM_INFO_OUTER).
 * - 'innertext' overwrites the text slot (HDOM_INFO_TEXT) when this node
 *   has one, otherwise the inner markup slot (HDOM_INFO_INNER).
 * - Any other name is stored as an attribute. An attribute that is not
 *   set yet also gets a spacing entry and a double-quote entry appended.
 *
 * @param string $name Property or attribute name
 * @param mixed $value Value to store
 * @return mixed The assigned value for 'outertext'/'innertext', otherwise
 * nothing
 */
function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    if ($name === 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }

    if ($name === 'innertext') {
        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->_[HDOM_INFO_TEXT] = $value;
        }
        return $this->_[HDOM_INFO_INNER] = $value;
    }

    // New attribute: register spacing and quoting information for it
    // (presumably consumed when the tag is rendered - defined elsewhere).
    if (!isset($this->attr[$name])) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }

    $this->attr[$name] = $value;
}

/**
 * Magic isset: the virtual text properties always exist; anything else
 * exists when the attribute key is present.
 *
 * array_key_exists() is used (not isset()) so that attributes stored with
 * a null value still count as present.
 *
 * @param string $name Property or attribute name
 * @return bool
 */
function __isset($name)
{
    switch ($name) {
        case 'outertext':
        case 'innertext':
        case 'plaintext':
            return true;
    }

    // no value attr: nowrap, checked selected...
    return array_key_exists($name, $this->attr);
}

/**
 * Magic unset: remove an attribute if it is currently set (non-null).
 *
 * @param string $name Attribute name
 * @return void
 */
function __unset($name)
{
    if (isset($this->attr[$name])) { unset($this->attr[$name]); }
}

/**
 * Convert text from the document charset to the target charset.
 *
 * Both charsets are read from the owning DOM object; without a DOM (or
 * when either charset is empty or both are equal) the text is returned
 * unconverted. When the target is UTF-8 a byte order mark at the start
 * and/or end of the result is stripped.
 *
 * @param string $text Text to convert
 * @return string|false Converted text; iconv()'s result (false on
 * failure) is passed through unchanged
 */
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $converted_text = $text;

    $sourceCharset = '';
    $targetCharset = '';

    if ($this->dom) {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }

    if (is_object($debug_object)) {
        $debug_object->debug_log(3,
            'source charset: '
            . $sourceCharset
            . ' target charaset: '
            . $targetCharset
        );
    }

    if (!empty($sourceCharset)
        && !empty($targetCharset)
        && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
        // The reported source encoding may be wrong: if the target is
        // UTF-8 and the text already is valid UTF-8, leave it untouched.
        if ((strcasecmp($targetCharset, 'UTF-8') == 0)
            && ($this->is_utf8($text))) {
            $converted_text = $text;
        } else {
            $converted_text = iconv($sourceCharset, $targetCharset, $text);
        }
    }

    // Make sure no BOM ends up in any of the UTF-8 text we output.
    if ($targetCharset === 'UTF-8') {
        if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 3);
        }

        if (substr($converted_text, -3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}

/**
 * Byte-level check whether a string is structurally valid UTF-8.
 *
 * Accepts ASCII bytes and multi-byte sequences of 2 to 6 bytes (lead
 * bytes 0xC0-0xFD) whose following bytes are all continuation bytes
 * (0x80-0xBF). Overlong encodings and surrogates are not detected.
 *
 * @param string $str Bytes to check
 * @return bool
 */
static function is_utf8($str)
{
    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);

        // Plain ASCII byte. Note: 0x80 itself is NOT ASCII, it is a
        // continuation byte and must be rejected below.
        if ($c < 128) { continue; }

        // Determine the sequence length from the lead byte.
        if ($c >= 254) { return false; }
        elseif ($c >= 252) { $bytes = 6; }
        elseif ($c >= 248) { $bytes = 5; }
        elseif ($c >= 240) { $bytes = 4; }
        elseif ($c >= 224) { $bytes = 3; }
        elseif ($c >= 192) { $bytes = 2; }
        else { return false; } // continuation byte without a lead byte

        // Sequence would run past the end of the string.
        if (($i + $bytes) > $len) { return false; }

        // Every remaining byte of the sequence must be 0x80-0xBF.
        while ($bytes > 1) {
            $i++;
            $b = ord($str[$i]);
            if ($b < 128 || $b > 191) { return false; }
            $bytes--;
        }
    }

    return true;
}

/**
 * Determine the display size of an image element.
 *
 * The "width"/"height" attributes win. For a dimension that has no
 * attribute, an inline style value is used if it is an integer number of
 * pixels (e.g. "width: 100px"). Dimensions that cannot be determined are
 * reported as -1.
 *
 * @return array|false array('height' => ..., 'width' => ...), or false if
 * this element is not an img tag
 */
function get_display_size()
{
    if ($this->tag !== 'img') {
        return false;
    }

    $size = array(
        'height' => -1,
        'width' => -1
    );

    // See if there is a height or width attribute in the tag itself.
    foreach (array_keys($size) as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style'])) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $styles = array();

        preg_match_all(
            '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
            $this->attr['style'],
            $matches,
            PREG_SET_ORDER
        );

        foreach ($matches as $match) {
            // The value group is greedy and may capture trailing
            // whitespace ("100px ;"), so trim it.
            $styles[$match[1]] = trim($match[2]);
        }

        foreach (array_keys($size) as $dimension) {
            // Only fall back to the style if no attribute was found.
            if (!isset($styles[$dimension]) || $size[$dimension] != -1) {
                continue;
            }

            // Check that the last two characters are px (pixels).
            if (strtolower(substr($styles[$dimension], -2)) !== 'px') {
                continue;
            }

            $proposed = substr($styles[$dimension], 0, -2);

            // Make sure that it's an integer. Compare against false
            // explicitly: filter_var() returns int 0 for "0", which is a
            // valid size but falsy.
            if (filter_var($proposed, FILTER_VALIDATE_INT) !== false) {
                $size[$dimension] = $proposed;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has
    // a height or width attribute to it.

    // Far future enhancement
    // Look at all the parent tags of this image to see if they specify a
    // class or id that has an img selector that specifies a height or width
    // Note that in this case, the class or id will have the img subselector
    // for it to apply to the image.

    // ridiculously far future development
    // If the class or id is specified in a SEPARATE css file thats not on
    // the page, go get it and do what we were just doing for the ones on
    // the page.

    return $size;
}

/**
 * Render this node to markup and optionally write it to a file.
 *
 * @param string $filepath Target file; nothing is written when empty
 * @return string The outer markup of this node
 */
function save($filepath = '')
{
    $ret = $this->outertext();

    if ($filepath !== '') {
        file_put_contents($filepath, $ret, LOCK_EX);
    }

    return $ret;
}

/**
 * Add one or more classes to the "class" attribute.
 *
 * Classes that are already present are not added again. Any other
 * argument type is only reported to the debug object (if one exists).
 *
 * @param string|array $class Space separated class names or a list of
 * class names
 * @return void
 */
function addClass($class)
{
    global $debug_object;

    if (is_string($class)) {
        $class = explode(' ', $class);
    }

    if (!is_array($class)) {
        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
        }
        return;
    }

    foreach ($class as $c) {
        // Repeated spaces make explode() yield empty tokens; skip them.
        if ($c === '') { continue; }

        if (!isset($this->class)) {
            $this->class = $c;
        } elseif (!$this->hasClass($c)) {
            $this->class .= ' ' . $c;
        }
    }
}

/**
 * Check whether the "class" attribute contains the given class name.
 *
 * @param string $class A single class name
 * @return bool False if the class is missing, no class attribute exists
 * or the argument is not a string
 */
function hasClass($class)
{
    global $debug_object;

    if (!is_string($class)) {
        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
        }
        return false;
    }

    if (isset($this->class)) {
        return in_array($class, explode(' ', $this->class), true);
    }

    return false;
}

/**
 * Remove classes from the "class" attribute.
 *
 * @param string|array|null $class Space separated class names or a list
 * of class names. Null removes the whole class attribute. The attribute
 * is also removed when no class is left.
 * @return void
 */
function removeClass($class = null)
{
    if (!isset($this->class)) {
        return;
    }

    if (is_null($class)) {
        $this->removeAttribute('class');
        return;
    }

    if (is_string($class)) {
        $class = explode(' ', $class);
    }

    if (is_array($class)) {
        $class = array_diff(explode(' ', $this->class), $class);
        if (empty($class)) {
            $this->removeAttribute('class');
        } else {
            $this->class = implode(' ', $class);
        }
    }
}

/**
 * Get the raw attribute array of this node.
 *
 * @return array
 */
function getAllAttributes()
{
    return $this->attr;
}

/**
 * DOM style alias of the magic getter.
 *
 * @param string $name Attribute name
 * @return mixed
 */
function getAttribute($name)
{
    return $this->__get($name);
}

/**
 * DOM style alias of the magic setter.
 *
 * @param string $name Attribute name
 * @param mixed $value Attribute value
 * @return void
 */
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

/**
 * DOM style alias of the magic isset.
 *
 * @param string $name Attribute name
 * @return bool
 */
function hasAttribute($name)
{
    return $this->__isset($name);
}

/**
 * Mark an attribute as removed by setting its value to null.
 *
 * Note: the key stays in the attribute array; the rendering code
 * (defined elsewhere) is presumably expected to skip null values.
 *
 * @param string $name Attribute name
 * @return void
 */
function removeAttribute($name)
{
    $this->__set($name, null);
}

/**
 * Remove this node from its parent. Does nothing without a parent.
 *
 * @return void
 */
function remove()
{
    if ($this->parent) {
        $this->parent->removeChild($this);
    }
}

/**
 * Remove a child node (including all its descendants) from this node
 * and from the DOM's node list, then clear it.
 *
 * Nothing happens unless the node is found in this node's "nodes" and
 * "children" lists as well as in the DOM's node list.
 *
 * @param object $node The child node to remove
 * @return void
 */
function removeChild($node)
{
    $nidx = array_search($node, $this->nodes, true);
    $cidx = array_search($node, $this->children, true);
    $didx = array_search($node, $this->dom->nodes, true);

    if ($nidx === false || $cidx === false || $didx === false) {
        return;
    }

    // Remove child elements recursively first.
    foreach ($node->children as $child) {
        $node->removeChild($child);
    }

    // Drop whatever is left in the node list (entries that are not in
    // the children list).
    foreach ($node->nodes as $entity) {
        $enidx = array_search($entity, $node->nodes, true);
        $edidx = array_search($entity, $node->dom->nodes, true);

        if ($enidx !== false && $edidx !== false) {
            unset($node->nodes[$enidx]);
            unset($node->dom->nodes[$edidx]);
        }
    }

    unset($this->nodes[$nidx]);
    unset($this->children[$cidx]);
    unset($this->dom->nodes[$didx]);

    $node->clear();
}

/**
 * Find the first element with the given id.
 *
 * @param string $id Element id
 * @return mixed Result of find() for index 0
 */
function getElementById($id)
{
    return $this->find("#$id", 0);
}

/**
 * Find elements with the given id.
 *
 * @param string $id Element id
 * @param int|null $idx Index passed through to find()
 * @return mixed Result of find()
 */
function getElementsById($id, $idx = null)
{
    return $this->find("#$id", $idx);
}

/**
 * Find the first element with the given tag name.
 *
 * @param string $name Tag name
 * @return mixed Result of find() for index 0
 */
function getElementByTagName($name)
{
    return $this->find($name, 0);
}

/**
 * Find elements with the given tag name.
 *
 * @param string $name Tag name
 * @param int|null $idx Index passed through to find()
 * @return mixed Result of find()
 */
function getElementsByTagName($name, $idx = null)
{
    return $this->find($name, $idx);
}

/**
 * DOM style alias of parent().
 *
 * @return mixed
 */
function parentNode()
{
    return $this->parent();
}
// DOM-style (camelCase) aliases that delegate to the snake_case navigation methods.

    /**
     * Alias of children().
     *
     * @param int $idx Forwarded unchanged to children().
     * @return mixed Result of children($idx).
     */
    function childNodes($idx = -1)
    {
        return $this->children($idx);
    }

    /** Alias of first_child(). */
    function firstChild()
    {
        return $this->first_child();
    }

    /** Alias of last_child(). */
    function lastChild()
    {
        return $this->last_child();
    }

    /** Alias of next_sibling(). */
    function nextSibling()
    {
        return $this->next_sibling();
    }

    /** Alias of prev_sibling(). */
    function previousSibling()
    {
        return $this->prev_sibling();
    }

    /** Alias of has_child(). */
    function hasChildNodes()
    {
        return $this->has_child();
    }

    /** Returns the node's tag name ($this->tag). */
    function nodeName()
    {
        return $this->tag;
    }

    /**
     * Attaches $node below this node by calling $node->parent($this).
     *
     * @param object $node Node to attach.
     * @return object The same $node that was passed in.
     */
    function appendChild($node)
    {
        $node->parent($this);
        return $node;
    }

}

/**
 * Document object: holds the source text, the parser state and the flat list
 * of all nodes created while parsing.
 */
class simple_html_dom
{
    public $root = null;
    public $nodes = array();
    public $callback = null;
    public $lowercase = false;
    public $original_size;
    public $size;

    public $stripRNAttrValues = true; // added option to ignore RN in attr values - nomadjimbob

    // Parser state: byte offset into $doc, the document text, and the
    // character at $pos (null once the end has been reached).
    protected $pos;
    protected $doc;
    protected $char;

    protected $cursor;
    protected $parent;
    protected $noise = array();

    // Character sets handed to the skip/copy helpers while scanning a tag.
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/>';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';

    public $_charset = '';
    public $_target_charset = '';

    protected $default_br_text = '';

    public $default_span_text = '';

    protected $self_closing_tags = array(
        'area' => 1,
        'base' => 1,
        'br' => 1,
        'col' => 1,
        'embed' => 1,
        'hr' => 1,
        'img' => 1,
        'input' => 1,
        'link' => 1,
        'meta' => 1,
        'param' => 1,
        'source' => 1,
        'track' => 1,
        'wbr' => 1
    );
    protected $block_tags = array(
        'body' => 1,
        'div' => 1,
        'form' => 1,
        'root' => 1,
        'span' => 1,
        'table' => 1
    );
    // Keyed by the tag being opened; the inner array lists the parent tags
    // that are closed implicitly when that tag starts (see read_tag()).
    protected $optional_closing_tags = array(
        // Not optional, see
        // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
        'b' => array('b' => 1),
        'dd' => array('dd' => 1, 'dt' => 1),
        // Not optional, see
        // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
        'dl' => array('dd' => 1, 'dt' => 1),
        'dt' => array('dd' => 1, 'dt' => 1),
        'li' => array('li' => 1),
        'optgroup' => array('optgroup' => 1, 'option' => 1),
        'option' => array('optgroup' => 1, 'option' => 1),
        'p' => array('p' => 1),
        'rp' => array('rp' => 1, 'rt' => 1),
        'rt' => array('rp' => 1, 'rt' => 1),
        'td' => array('td' => 1, 'th' => 1),
        'th' => array('td' => 1, 'th' => 1),
        'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
    );

    /**
     * Creates a document and, when $str is non-empty, loads it immediately.
     *
     * $str is treated as a location and passed to load_file() when it starts
     * with "http://" or names an existing file; otherwise it is parsed as
     * markup via load().
     * NOTE(review): only plain http:// is recognised as a URL here; a string
     * starting with "https://" is parsed as markup.
     *
     * @param string|null $str             Markup, file path or http:// URL.
     * @param bool        $lowercase       Forwarded to load() (markup path only).
     * @param bool        $forceTagsClosed When false, the optional-closing-tag
     *                                     table is emptied so no parent tag is
     *                                     closed implicitly.
     * @param string      $target_charset  Stored in $_target_charset.
     * @param bool        $stripRN         Forwarded to load() (markup path only).
     * @param string      $defaultBRText   Forwarded to load() (markup path only).
     * @param string      $defaultSpanText Forwarded to load() (markup path only).
     * @param int         $options         Forwarded to load() (markup path only).
     */
    function __construct(
        $str = null,
        $lowercase = true,
        $forceTagsClosed = true,
        $target_charset = DEFAULT_TARGET_CHARSET,
        $stripRN = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT,
        $options = 0)
    {
        // Forcing tags to be closed implies that we don't trust the html, but
        // it can lead to parsing errors if we SHOULD trust the html.
        // This has to happen before anything is parsed, and it has to target
        // the declared $optional_closing_tags table: the former assignment to
        // "optional_closing_array" only created an unused dynamic property.
        if (!$forceTagsClosed) {
            $this->optional_closing_tags = array();
        }

        if ($str) {
            if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
                $this->load_file($str);
            } else {
                $this->load(
                    $str,
                    $lowercase,
                    $stripRN,
                    $defaultBRText,
                    $defaultSpanText,
                    $options
                );
            }
        }

        $this->_target_charset = $target_charset;
    }

    /**
     * Releases the node graph when the document is destroyed.
     */
    function __destruct()
    {
        $this->clear();
    }

    /**
     * Parses markup from a string, replacing any previously loaded document.
     *
     * @param string $str             Markup to parse.
     * @param bool   $lowercase       Forwarded to prepare().
     * @param bool   $stripRN         When true, every "\r" and "\n" in the
     *                                document is replaced by a space.
     * @param string $defaultBRText   Forwarded to prepare().
     * @param string $defaultSpanText Forwarded to prepare().
     * @param int    $options         Bit mask; HDOM_SMARTY_AS_TEXT is the only
     *                                flag tested in this method.
     * @return simple_html_dom $this, so calls can be chained.
     */
    function load(
        $str,
        $lowercase = true,
        $stripRN = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT,
        $options = 0)
    {
        global $debug_object;

        // prepare
        $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

        // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
        // Script tag removal now precedes style tag removal.
// strip out <script> tags
        // prepare() runs clear(), which unsets $this->doc and $this->root, and
        // then reassigns both; on an already-cleared instance that reassignment
        // is not guaranteed to succeed, so stop here instead of dereferencing
        // the missing root/doc further down.
        if (!isset($this->root)) {
            return $this;
        }

        $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
        $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");

        // strip out the \r \n's if we are told to.
        if ($stripRN) {
            $this->doc = str_replace("\r", ' ', $this->doc);
            $this->doc = str_replace("\n", ' ', $this->doc);

            // set the length of content since we have changed it.
            $this->size = strlen($this->doc);
        }

        // strip out cdata
        $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
        // strip out comments
        $this->remove_noise("'<!--(.*?)-->'is");
        // strip out <style> tags
        $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
        $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
        // strip out preformatted tags
        $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
        // strip out server side scripts
        $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

        if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
            $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
        }

        // parsing
        $this->parse();
        // end
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        $this->parse_charset();

        // make load function chainable
        return $this;
    }

    /**
     * Reads a document with file_get_contents() and parses it.
     *
     * All arguments are forwarded unchanged to file_get_contents(). The
     * content is parsed with load($doc, true), i.e. with load()'s defaults
     * for every option except $lowercase.
     *
     * @return simple_html_dom|false $this on success (so the call chains like
     *                               load()), false when the content could not
     *                               be read.
     */
    function load_file()
    {
        $args = func_get_args();
        $doc = call_user_func_array('file_get_contents', $args);

        if ($doc === false) {
            return false;
        }

        return $this->load($doc, true);
    }

    /**
     * Stores a callback in $this->callback.
     *
     * @param callable|string $function_name Callback to store.
     */
    function set_callback($function_name)
    {
        $this->callback = $function_name;
    }

    /**
     * Clears the stored callback.
     */
    function remove_callback()
    {
        $this->callback = null;
    }

    /**
     * Serialises the document and optionally writes it to a file.
     *
     * @param string $filepath Target file; nothing is written when it is ''.
     *                         The file is written with an exclusive lock.
     * @return string The root node's inner text.
     */
    function save($filepath = '')
    {
        $ret = $this->root->innertext();
        if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); }
        return $ret;
    }

    /**
     * Runs a selector query against the root node.
     *
     * @param string   $selector  Selector expression.
     * @param int|null $idx       Forwarded to the root node's find().
     * @param bool     $lowercase Forwarded to the root node's find().
     * @return mixed Result of $this->root->find().
     */
    function find($selector, $idx = null, $lowercase = false)
    {
        return $this->root->find($selector, $idx, $lowercase);
    }

    /**
     * Clears every node and drops the references held by the document
     * ($parent, $root, $doc, $noise).
     */
    function clear()
    {
        if (isset($this->nodes)) {
            foreach ($this->nodes as $n) {
                $n->clear();
                $n = null;
            }
        }

        // This add next line is documented in the sourceforge repository.
        // 2977248 as a fix for ongoing memory leaks that occur even with the
        // use of clear.
        // NOTE(review): no $children property is declared among this class's
        // visible properties, so this branch looks inert unless one is
        // assigned elsewhere.
        if (isset($this->children)) {
            foreach ($this->children as $n) {
                $n->clear();
                $n = null;
            }
        }

        if (isset($this->parent)) {
            $this->parent->clear();
            unset($this->parent);
        }

        if (isset($this->root)) {
            $this->root->clear();
            unset($this->root);
        }

        unset($this->doc);
        unset($this->noise);
    }

    /**
     * Dumps the node tree by delegating to the root node's dump().
     *
     * @param bool $show_attr Forwarded to the root node's dump().
     */
    function dump($show_attr = true)
    {
        $this->root->dump($show_attr);
    }

    /**
     * Resets the parser state for a new document and creates the root node.
     *
     * @param string $str             Markup; it is trimmed before use.
     * @param bool   $lowercase       Stored in $this->lowercase.
     * @param string $defaultBRText   Stored in $this->default_br_text.
     * @param string $defaultSpanText Stored in $this->default_span_text.
     */
    protected function prepare(
        $str, $lowercase = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT)
    {
        $this->clear();

        $this->doc = trim($str);
        $this->size = strlen($this->doc);
        $this->original_size = $this->size; // original size of the html
        $this->pos = 0;
        $this->cursor = 1;
        $this->noise = array();
        $this->nodes = array();
        $this->lowercase = $lowercase;
        $this->default_br_text = $defaultBRText;
        $this->default_span_text = $defaultSpanText;
        $this->root = new simple_html_dom_node($this);
        $this->root->tag = 'root';
        $this->root->_[HDOM_INFO_BEGIN] = -1;
        $this->root->nodetype = HDOM_TYPE_ROOT;
        $this->parent = $this->root;
        // $this->char stays untouched for an empty document.
        if ($this->size > 0) { $this->char = $this->doc[0]; }
    }

    /**
     * Main parse loop: alternates between reading tags and creating text
     * nodes for the text found between tags.
     *
     * @return bool Always true; returned once read_tag() reports that there
     *              is nothing left to read.
     */
    protected function parse()
    {
        while (true) {
            // Read next tag if there is no text between current position and the
            // next opening tag.
            if (($s = $this->copy_until_char('<')) === '') {
                if($this->read_tag()) {
                    continue;
                } else {
                    return true;
                }
            }

            // Add a text node for text between tags
            $node = new simple_html_dom_node($this);
            ++$this->cursor;
            $node->_[HDOM_INFO_TEXT] = $s;
            $this->link_nodes($node, false);
        }
    }

    /**
     * Determines the document's character set and stores it in $_charset.
     *
     * Sources are tried in this order, and the first non-empty result wins:
     *  1. the Content-Type header of the last retrieval, when the optional
     *     function get_last_retrieve_url_contents_content_type() exists,
     *  2. <meta http-equiv="Content-Type" content="...charset=...">,
     *  3. <meta charset="...">,
     *  4. mb_detect_encoding(), when mbstring is available,
     *  5. 'UTF-8' as the final fallback.
     * ISO-8859-1 and its aliases are finally mapped to CP1252.
     *
     * @return string The detected charset (also stored in $this->_charset).
     */
    protected function parse_charset()
    {
        global $debug_object;

        $charset = null;

        if (function_exists('get_last_retrieve_url_contents_content_type')) {
            $contentTypeHeader = get_last_retrieve_url_contents_content_type();
            // Matched case-insensitively, like the meta-tag variant below:
            // media type parameter names are not case-sensitive.
            $success = preg_match('/charset=(.+)/i', $contentTypeHeader, $matches);
            if ($success) {
                $charset = $matches[1];
                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'header content-type found charset of: '
                        . $charset
                    );
                }
            }
        }

        if (empty($charset)) {
            // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
            $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

            if (!empty($el)) {
                $fullvalue = $el->content;
                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'meta content-type tag found'
                        . $fullvalue
                    );
                }

                if (!empty($fullvalue)) {
                    $success = preg_match(
                        '/charset=(.+)/i',
                        $fullvalue,
                        $matches
                    );

                    if ($success) {
                        $charset = $matches[1];
                    } else {
                        // If there is a meta tag, and they don't specify the
                        // character set, research says that it's typically
                        // ISO-8859-1
                        if (is_object($debug_object)) {
                            $debug_object->debug_log(2,
                                'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
                            );
                        }

                        $charset = 'ISO-8859-1';
                    }
                }
            }
        }

        if (empty($charset)) {
            // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
            if ($meta = $this->root->find('meta[charset]', 0)) {
                $charset = $meta->charset;
                if (is_object($debug_object)) {
                    $debug_object->debug_log(2, 'meta charset: ' . $charset);
                }
            }
        }

        if (empty($charset)) {
            // Try to guess the charset based on the content
            // Requires Multibyte String (mbstring) support (optional)
            if (function_exists('mb_detect_encoding')) {
                /**
                 * mb_detect_encoding() is not intended to distinguish between
                 * charsets, especially single-byte charsets. Its primary
                 * purpose is to detect which multibyte encoding is in use,
                 * i.e. UTF-8, UTF-16, shift-JIS, etc.
                 *
                 * -- https://bugs.php.net/bug.php?id=38138
                 *
                 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
                 * always result in CP1251/ISO-8859-5 and vice versa.
                 *
                 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
                 * to stay compatible.
                 */
                $encoding = mb_detect_encoding(
                    $this->doc,
                    array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
                );

                if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
                    // Due to a limitation of mb_detect_encoding
                    // 'CP1251'/'ISO-8859-5' will be detected as
                    // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
                    // which case we can simply assume it is the other charset.
                    if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
                        $encoding = 'CP1251';
                    }
                }

                if ($encoding !== false) {
                    $charset = $encoding;
                    if (is_object($debug_object)) {
                        $debug_object->debug_log(2, 'mb_detect: ' . $charset);
                    }
                }
            }
        }

        if (empty($charset)) {
            // Assume it's UTF-8 as it is the most likely charset to be used
            $charset = 'UTF-8';
            if (is_object($debug_object)) {
                $debug_object->debug_log(2, 'No match found, assume ' . $charset);
            }
        }

        // Since CP1252 is a superset, if we get one of its subsets, we want
        // it instead.
        $charset_lower = strtolower($charset);
        if ($charset_lower === 'iso-8859-1'
            || $charset_lower === 'latin1'
            || $charset_lower === 'latin-1') {
            // Log before overwriting, so the message names the charset that
            // is actually being replaced.
            if (is_object($debug_object)) {
                $debug_object->debug_log(2,
                    'replacing ' . $charset . ' with CP1252 as its a superset'
                );
            }
            $charset = 'CP1252';
        }

        if (is_object($debug_object)) {
            $debug_object->debug_log(1, 'EXIT - ' . $charset);
        }

        return $this->_charset = $charset;
    }

    protected function read_tag()
    {
        // Set end position if no further tags found
        if ($this->char !== '<') {
            $this->root->_[HDOM_INFO_END] = $this->cursor;
            return false;
        }

        $begin_tag_pos = $this->pos;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        // end tag
        if ($this->char === '/') {
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

            // Skip whitespace in end tags (i.e. in "</ html>")
            $this->skip($this->token_blank);
            $tag = $this->copy_until_char('>');

            // Skip attributes in end tags
            if (($pos = strpos($tag, ' ')) !== false) {
                $tag = substr($tag, 0, $pos);
            }

            $parent_lower = strtolower($this->parent->tag);
            $tag_lower = strtolower($tag);

            // The end tag is supposed to close the parent tag.
Handle situations 1839c165b184SJames Collins // when it doesn't 1840c165b184SJames Collins if ($parent_lower !== $tag_lower) { 1841c165b184SJames Collins // Parent tag does not have to be closed necessarily (optional closing tag) 1842c165b184SJames Collins // Current tag is a block tag, so it may close an ancestor 1843c165b184SJames Collins if (isset($this->optional_closing_tags[$parent_lower]) 1844c165b184SJames Collins && isset($this->block_tags[$tag_lower])) { 1845c165b184SJames Collins 1846c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; 1847c165b184SJames Collins $org_parent = $this->parent; 1848c165b184SJames Collins 1849c165b184SJames Collins // Traverse ancestors to find a matching opening tag 1850c165b184SJames Collins // Stop at root node 1851c165b184SJames Collins while (($this->parent->parent) 1852c165b184SJames Collins && strtolower($this->parent->tag) !== $tag_lower 1853c165b184SJames Collins ){ 1854c165b184SJames Collins $this->parent = $this->parent->parent; 1855c165b184SJames Collins } 1856c165b184SJames Collins 1857c165b184SJames Collins // If we don't have a match add current tag as text node 1858c165b184SJames Collins if (strtolower($this->parent->tag) !== $tag_lower) { 1859c165b184SJames Collins $this->parent = $org_parent; // restore origonal parent 1860c165b184SJames Collins 1861c165b184SJames Collins if ($this->parent->parent) { 1862c165b184SJames Collins $this->parent = $this->parent->parent; 1863c165b184SJames Collins } 1864c165b184SJames Collins 1865c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = $this->cursor; 1866c165b184SJames Collins return $this->as_text_node($tag); 1867c165b184SJames Collins } 1868c165b184SJames Collins } elseif (($this->parent->parent) 1869c165b184SJames Collins && isset($this->block_tags[$tag_lower]) 1870c165b184SJames Collins ) { 1871c165b184SJames Collins // Grandparent exists and current tag is a block tag, so our 1872c165b184SJames Collins // parent doesn't have an end tag 
1873c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; // No end tag 1874c165b184SJames Collins $org_parent = $this->parent; 1875c165b184SJames Collins 1876c165b184SJames Collins // Traverse ancestors to find a matching opening tag 1877c165b184SJames Collins // Stop at root node 1878c165b184SJames Collins while (($this->parent->parent) 1879c165b184SJames Collins && strtolower($this->parent->tag) !== $tag_lower 1880c165b184SJames Collins ) { 1881c165b184SJames Collins $this->parent = $this->parent->parent; 1882c165b184SJames Collins } 1883c165b184SJames Collins 1884c165b184SJames Collins // If we don't have a match add current tag as text node 1885c165b184SJames Collins if (strtolower($this->parent->tag) !== $tag_lower) { 1886c165b184SJames Collins $this->parent = $org_parent; // restore origonal parent 1887c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = $this->cursor; 1888c165b184SJames Collins return $this->as_text_node($tag); 1889c165b184SJames Collins } 1890c165b184SJames Collins } elseif (($this->parent->parent) 1891c165b184SJames Collins && strtolower($this->parent->parent->tag) === $tag_lower 1892c165b184SJames Collins ) { // Grandparent exists and current tag closes it 1893c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; 1894c165b184SJames Collins $this->parent = $this->parent->parent; 1895c165b184SJames Collins } else { // Random tag, add as text node 1896c165b184SJames Collins return $this->as_text_node($tag); 1897c165b184SJames Collins } 1898c165b184SJames Collins } 1899c165b184SJames Collins 1900c165b184SJames Collins // Set end position of parent tag to current cursor position 1901c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = $this->cursor; 1902c165b184SJames Collins 1903c165b184SJames Collins if ($this->parent->parent) { 1904c165b184SJames Collins $this->parent = $this->parent->parent; 1905c165b184SJames Collins } 1906c165b184SJames Collins 1907c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 1908c165b184SJames Collins return true; 1909c165b184SJames Collins } 1910c165b184SJames Collins 1911c165b184SJames Collins // start tag 1912c165b184SJames Collins $node = new simple_html_dom_node($this); 1913c165b184SJames Collins $node->_[HDOM_INFO_BEGIN] = $this->cursor; 1914c165b184SJames Collins ++$this->cursor; 1915c165b184SJames Collins $tag = $this->copy_until($this->token_slash); // Get tag name 1916c165b184SJames Collins $node->tag_start = $begin_tag_pos; 1917c165b184SJames Collins 1918c165b184SJames Collins // doctype, cdata & comments... 1919c165b184SJames Collins // <!DOCTYPE html> 1920c165b184SJames Collins // <![CDATA[ ... ]]> 1921c165b184SJames Collins // <!-- Comment --> 1922c165b184SJames Collins if (isset($tag[0]) && $tag[0] === '!') { 1923c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); 1924c165b184SJames Collins 1925c165b184SJames Collins if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--") 1926c165b184SJames Collins $node->nodetype = HDOM_TYPE_COMMENT; 1927c165b184SJames Collins $node->tag = 'comment'; 1928c165b184SJames Collins } else { // Could be doctype or CDATA but we don't care 1929c165b184SJames Collins $node->nodetype = HDOM_TYPE_UNKNOWN; 1930c165b184SJames Collins $node->tag = 'unknown'; 1931c165b184SJames Collins } 1932c165b184SJames Collins 1933c165b184SJames Collins if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1934c165b184SJames Collins 1935c165b184SJames Collins $this->link_nodes($node, true); 1936c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1937c165b184SJames Collins return true; 1938c165b184SJames Collins } 1939c165b184SJames Collins 1940c165b184SJames Collins // The start tag cannot contain another start tag, if so add as text 1941c165b184SJames Collins // i.e. 
"<<html>" 1942c165b184SJames Collins if ($pos = strpos($tag, '<') !== false) { 1943c165b184SJames Collins $tag = '<' . substr($tag, 0, -1); 1944c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = $tag; 1945c165b184SJames Collins $this->link_nodes($node, false); 1946c165b184SJames Collins $this->char = $this->doc[--$this->pos]; // prev 1947c165b184SJames Collins return true; 1948c165b184SJames Collins } 1949c165b184SJames Collins 1950c165b184SJames Collins // Handle invalid tag names (i.e. "<html#doc>") 1951c165b184SJames Collins if (!preg_match('/^\w[\w:-]*$/', $tag)) { 1952c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); 1953c165b184SJames Collins 1954c165b184SJames Collins // Next char is the beginning of a new tag, don't touch it. 1955c165b184SJames Collins if ($this->char === '<') { 1956c165b184SJames Collins $this->link_nodes($node, false); 1957c165b184SJames Collins return true; 1958c165b184SJames Collins } 1959c165b184SJames Collins 1960c165b184SJames Collins // Next char closes current tag, add and be done with it. 1961c165b184SJames Collins if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1962c165b184SJames Collins $this->link_nodes($node, false); 1963c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1964c165b184SJames Collins return true; 1965c165b184SJames Collins } 1966c165b184SJames Collins 1967c165b184SJames Collins // begin tag, add new node 1968c165b184SJames Collins $node->nodetype = HDOM_TYPE_ELEMENT; 1969c165b184SJames Collins $tag_lower = strtolower($tag); 1970c165b184SJames Collins $node->tag = ($this->lowercase) ? 
$tag_lower : $tag; 1971c165b184SJames Collins 1972c165b184SJames Collins // handle optional closing tags 1973c165b184SJames Collins if (isset($this->optional_closing_tags[$tag_lower])) { 1974c165b184SJames Collins // Traverse ancestors to close all optional closing tags 1975c165b184SJames Collins while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { 1976c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; 1977c165b184SJames Collins $this->parent = $this->parent->parent; 1978c165b184SJames Collins } 1979c165b184SJames Collins $node->parent = $this->parent; 1980c165b184SJames Collins } 1981c165b184SJames Collins 1982c165b184SJames Collins $guard = 0; // prevent infinity loop 1983c165b184SJames Collins 1984c165b184SJames Collins // [0] Space between tag and first attribute 1985c165b184SJames Collins $space = array($this->copy_skip($this->token_blank), '', ''); 1986c165b184SJames Collins 1987c165b184SJames Collins // attributes 1988c165b184SJames Collins do { 1989c165b184SJames Collins // Everything until the first equal sign should be the attribute name 1990c165b184SJames Collins $name = $this->copy_until($this->token_equal); 1991c165b184SJames Collins 1992c165b184SJames Collins if ($name === '' && $this->char !== null && $space[0] === '') { 1993c165b184SJames Collins break; 1994c165b184SJames Collins } 1995c165b184SJames Collins 1996c165b184SJames Collins if ($guard === $this->pos) { // Escape infinite loop 1997c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 1998c165b184SJames Collins continue; 1999c165b184SJames Collins } 2000c165b184SJames Collins 2001c165b184SJames Collins $guard = $this->pos; 2002c165b184SJames Collins 2003c165b184SJames Collins // handle endless '<' 2004c165b184SJames Collins // Out of bounds before the tag ended 2005c165b184SJames Collins if ($this->pos >= $this->size - 1 && $this->char !== '>') { 2006c165b184SJames Collins $node->nodetype = HDOM_TYPE_TEXT; 2007c165b184SJames Collins $node->_[HDOM_INFO_END] = 0; 2008c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name; 2009c165b184SJames Collins $node->tag = 'text'; 2010c165b184SJames Collins $this->link_nodes($node, false); 2011c165b184SJames Collins return true; 2012c165b184SJames Collins } 2013c165b184SJames Collins 2014c165b184SJames Collins // handle mismatch '<' 2015c165b184SJames Collins // Attributes cannot start after opening tag 2016c165b184SJames Collins if ($this->doc[$this->pos - 1] == '<') { 2017c165b184SJames Collins $node->nodetype = HDOM_TYPE_TEXT; 2018c165b184SJames Collins $node->tag = 'text'; 2019c165b184SJames Collins $node->attr = array(); 2020c165b184SJames Collins $node->_[HDOM_INFO_END] = 0; 2021c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = substr( 2022c165b184SJames Collins $this->doc, 2023c165b184SJames Collins $begin_tag_pos, 2024c165b184SJames Collins $this->pos - $begin_tag_pos - 1 2025c165b184SJames Collins ); 2026c165b184SJames Collins $this->pos -= 2; 2027c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 2028c165b184SJames Collins $this->link_nodes($node, false); 2029c165b184SJames Collins return true; 2030c165b184SJames Collins } 2031c165b184SJames Collins 2032c165b184SJames Collins if ($name !== '/' && $name !== '') { // this is a attribute name 2033c165b184SJames Collins // [1] Whitespace after attribute name 2034c165b184SJames Collins $space[1] = $this->copy_skip($this->token_blank); 2035c165b184SJames Collins 2036c165b184SJames Collins $name = $this->restore_noise($name); // might be a noisy name 2037c165b184SJames Collins 2038c165b184SJames Collins if ($this->lowercase) { $name = strtolower($name); } 2039c165b184SJames Collins 2040c165b184SJames Collins if ($this->char === '=') { // attribute with value 2041c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2042c165b184SJames Collins $this->parse_attr($node, $name, $space); // get attribute value 2043c165b184SJames Collins } else { 2044c165b184SJames Collins //no value attr: nowrap, checked selected... 
2045c165b184SJames Collins $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 2046c165b184SJames Collins $node->attr[$name] = true; 2047c165b184SJames Collins if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev 2048c165b184SJames Collins } 2049c165b184SJames Collins 2050c165b184SJames Collins $node->_[HDOM_INFO_SPACE][] = $space; 2051c165b184SJames Collins 2052c165b184SJames Collins // prepare for next attribute 2053c165b184SJames Collins $space = array( 2054c165b184SJames Collins $this->copy_skip($this->token_blank), 2055c165b184SJames Collins '', 2056c165b184SJames Collins '' 2057c165b184SJames Collins ); 2058c165b184SJames Collins } else { // no more attributes 2059c165b184SJames Collins break; 2060c165b184SJames Collins } 2061c165b184SJames Collins } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended 2062c165b184SJames Collins 2063c165b184SJames Collins $this->link_nodes($node, true); 2064c165b184SJames Collins $node->_[HDOM_INFO_ENDSPACE] = $space[0]; 2065c165b184SJames Collins 2066c165b184SJames Collins // handle empty tags (i.e. "<div/>") 2067c165b184SJames Collins if ($this->copy_until_char('>') === '/') { 2068c165b184SJames Collins $node->_[HDOM_INFO_ENDSPACE] .= '/'; 2069c165b184SJames Collins $node->_[HDOM_INFO_END] = 0; 2070c165b184SJames Collins } else { 2071c165b184SJames Collins // reset parent 2072c165b184SJames Collins if (!isset($this->self_closing_tags[strtolower($node->tag)])) { 2073c165b184SJames Collins $this->parent = $node; 2074c165b184SJames Collins } 2075c165b184SJames Collins } 2076c165b184SJames Collins 2077c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2078c165b184SJames Collins 2079c165b184SJames Collins // If it's a BR tag, we need to set it's text to the default text. 2080c165b184SJames Collins // This way when we see it in plaintext, we can generate formatting that the user wants. 
2081c165b184SJames Collins // since a br tag never has sub nodes, this works well. 2082c165b184SJames Collins if ($node->tag === 'br') { 2083c165b184SJames Collins $node->_[HDOM_INFO_INNER] = $this->default_br_text; 2084c165b184SJames Collins } 2085c165b184SJames Collins 2086c165b184SJames Collins return true; 2087c165b184SJames Collins } 2088c165b184SJames Collins 2089c165b184SJames Collins protected function parse_attr($node, $name, &$space) 2090c165b184SJames Collins { 2091c165b184SJames Collins $is_duplicate = isset($node->attr[$name]); 2092c165b184SJames Collins 2093c165b184SJames Collins if (!$is_duplicate) // Copy whitespace between "=" and value 2094c165b184SJames Collins $space[2] = $this->copy_skip($this->token_blank); 2095c165b184SJames Collins 2096c165b184SJames Collins switch ($this->char) { 2097c165b184SJames Collins case '"': 2098c165b184SJames Collins $quote_type = HDOM_QUOTE_DOUBLE; 2099c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2100c165b184SJames Collins $value = $this->copy_until_char('"'); 2101c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2102c165b184SJames Collins break; 2103c165b184SJames Collins case '\'': 2104c165b184SJames Collins $quote_type = HDOM_QUOTE_SINGLE; 2105c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2106c165b184SJames Collins $value = $this->copy_until_char('\''); 2107c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 2108c165b184SJames Collins break; 2109c165b184SJames Collins default: 2110c165b184SJames Collins $quote_type = HDOM_QUOTE_NO; 2111c165b184SJames Collins $value = $this->copy_until($this->token_attr); 2112c165b184SJames Collins } 2113c165b184SJames Collins 2114c165b184SJames Collins $value = $this->restore_noise($value); 2115c165b184SJames Collins 2116c165b184SJames Collins // PaperG: Attributes should not have \r or \n in them, that counts as 2117c165b184SJames Collins // html whitespace. 2118cdddb6f0SJames Collins 2119*bc1032d9SJames Collins // Added $stripRNAttrValues option for DokuWiki - nomadjimbob 2120*bc1032d9SJames Collins if($this->stripRNAttrValues) { 2121*bc1032d9SJames Collins $value = str_replace("\r", '', $value); 2122*bc1032d9SJames Collins $value = str_replace("\n", '', $value); 2123*bc1032d9SJames Collins } 2124c165b184SJames Collins 2125c165b184SJames Collins // PaperG: If this is a "class" selector, lets get rid of the preceeding 2126c165b184SJames Collins // and trailing space since some people leave it in the multi class case. 
2127c165b184SJames Collins if ($name === 'class') { 2128c165b184SJames Collins $value = trim($value); 2129c165b184SJames Collins } 2130c165b184SJames Collins 2131c165b184SJames Collins if (!$is_duplicate) { 2132c165b184SJames Collins $node->_[HDOM_INFO_QUOTE][] = $quote_type; 2133c165b184SJames Collins $node->attr[$name] = $value; 2134c165b184SJames Collins } 2135c165b184SJames Collins } 2136c165b184SJames Collins 2137c165b184SJames Collins protected function link_nodes(&$node, $is_child) 2138c165b184SJames Collins { 2139c165b184SJames Collins $node->parent = $this->parent; 2140c165b184SJames Collins $this->parent->nodes[] = $node; 2141c165b184SJames Collins if ($is_child) { 2142c165b184SJames Collins $this->parent->children[] = $node; 2143c165b184SJames Collins } 2144c165b184SJames Collins } 2145c165b184SJames Collins 2146c165b184SJames Collins protected function as_text_node($tag) 2147c165b184SJames Collins { 2148c165b184SJames Collins $node = new simple_html_dom_node($this); 2149c165b184SJames Collins ++$this->cursor; 2150c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; 2151c165b184SJames Collins $this->link_nodes($node, false); 2152c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2153c165b184SJames Collins return true; 2154c165b184SJames Collins } 2155c165b184SJames Collins 2156c165b184SJames Collins protected function skip($chars) 2157c165b184SJames Collins { 2158c165b184SJames Collins $this->pos += strspn($this->doc, $chars, $this->pos); 2159c165b184SJames Collins $this->char = ($this->pos < $this->size) ? 
$this->doc[$this->pos] : null; // next 2160c165b184SJames Collins } 2161c165b184SJames Collins 2162c165b184SJames Collins protected function copy_skip($chars) 2163c165b184SJames Collins { 2164c165b184SJames Collins $pos = $this->pos; 2165c165b184SJames Collins $len = strspn($this->doc, $chars, $pos); 2166c165b184SJames Collins $this->pos += $len; 2167c165b184SJames Collins $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2168c165b184SJames Collins if ($len === 0) { return ''; } 2169c165b184SJames Collins return substr($this->doc, $pos, $len); 2170c165b184SJames Collins } 2171c165b184SJames Collins 2172c165b184SJames Collins protected function copy_until($chars) 2173c165b184SJames Collins { 2174c165b184SJames Collins $pos = $this->pos; 2175c165b184SJames Collins $len = strcspn($this->doc, $chars, $pos); 2176c165b184SJames Collins $this->pos += $len; 2177c165b184SJames Collins $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2178c165b184SJames Collins return substr($this->doc, $pos, $len); 2179c165b184SJames Collins } 2180c165b184SJames Collins 2181c165b184SJames Collins protected function copy_until_char($char) 2182c165b184SJames Collins { 2183c165b184SJames Collins if ($this->char === null) { return ''; } 2184c165b184SJames Collins 2185c165b184SJames Collins if (($pos = strpos($this->doc, $char, $this->pos)) === false) { 2186c165b184SJames Collins $ret = substr($this->doc, $this->pos, $this->size - $this->pos); 2187c165b184SJames Collins $this->char = null; 2188c165b184SJames Collins $this->pos = $this->size; 2189c165b184SJames Collins return $ret; 2190c165b184SJames Collins } 2191c165b184SJames Collins 2192c165b184SJames Collins if ($pos === $this->pos) { return ''; } 2193c165b184SJames Collins 2194c165b184SJames Collins $pos_old = $this->pos; 2195c165b184SJames Collins $this->char = $this->doc[$pos]; 2196c165b184SJames Collins $this->pos = $pos; 2197c165b184SJames Collins return substr($this->doc, 
$pos_old, $pos - $pos_old); 2198c165b184SJames Collins } 2199c165b184SJames Collins 2200c165b184SJames Collins protected function remove_noise($pattern, $remove_tag = false) 2201c165b184SJames Collins { 2202c165b184SJames Collins global $debug_object; 2203c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2204c165b184SJames Collins 2205c165b184SJames Collins $count = preg_match_all( 2206c165b184SJames Collins $pattern, 2207c165b184SJames Collins $this->doc, 2208c165b184SJames Collins $matches, 2209c165b184SJames Collins PREG_SET_ORDER | PREG_OFFSET_CAPTURE 2210c165b184SJames Collins ); 2211c165b184SJames Collins 2212c165b184SJames Collins for ($i = $count - 1; $i > -1; --$i) { 2213c165b184SJames Collins $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000); 2214c165b184SJames Collins 2215c165b184SJames Collins if (is_object($debug_object)) { 2216c165b184SJames Collins $debug_object->debug_log(2, 'key is: ' . $key); 2217c165b184SJames Collins } 2218c165b184SJames Collins 2219c165b184SJames Collins $idx = ($remove_tag) ? 
0 : 1; // 0 = entire match, 1 = submatch 2220c165b184SJames Collins $this->noise[$key] = $matches[$i][$idx][0]; 2221c165b184SJames Collins $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 2222c165b184SJames Collins } 2223c165b184SJames Collins 2224c165b184SJames Collins // reset the length of content 2225c165b184SJames Collins $this->size = strlen($this->doc); 2226c165b184SJames Collins 2227c165b184SJames Collins if ($this->size > 0) { 2228c165b184SJames Collins $this->char = $this->doc[0]; 2229c165b184SJames Collins } 2230c165b184SJames Collins } 2231c165b184SJames Collins 2232c165b184SJames Collins function restore_noise($text) 2233c165b184SJames Collins { 2234c165b184SJames Collins global $debug_object; 2235c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2236c165b184SJames Collins 2237c165b184SJames Collins while (($pos = strpos($text, '___noise___')) !== false) { 2238c165b184SJames Collins // Sometimes there is a broken piece of markup, and we don't GET the 2239c165b184SJames Collins // pos+11 etc... token which indicates a problem outside of us... 2240c165b184SJames Collins 2241c165b184SJames Collins // todo: "___noise___1000" (or any number with four or more digits) 2242c165b184SJames Collins // in the DOM causes an infinite loop which could be utilized by 2243c165b184SJames Collins // malicious software 2244c165b184SJames Collins if (strlen($text) > $pos + 15) { 2245c165b184SJames Collins $key = '___noise___' 2246c165b184SJames Collins . $text[$pos + 11] 2247c165b184SJames Collins . $text[$pos + 12] 2248c165b184SJames Collins . $text[$pos + 13] 2249c165b184SJames Collins . $text[$pos + 14] 2250c165b184SJames Collins . $text[$pos + 15]; 2251c165b184SJames Collins 2252c165b184SJames Collins if (is_object($debug_object)) { 2253c165b184SJames Collins $debug_object->debug_log(2, 'located key of: ' . 
$key); 2254c165b184SJames Collins } 2255c165b184SJames Collins 2256c165b184SJames Collins if (isset($this->noise[$key])) { 2257c165b184SJames Collins $text = substr($text, 0, $pos) 2258c165b184SJames Collins . $this->noise[$key] 2259c165b184SJames Collins . substr($text, $pos + 16); 2260c165b184SJames Collins } else { 2261c165b184SJames Collins // do this to prevent an infinite loop. 2262c165b184SJames Collins $text = substr($text, 0, $pos) 2263c165b184SJames Collins . 'UNDEFINED NOISE FOR KEY: ' 2264c165b184SJames Collins . $key 2265c165b184SJames Collins . substr($text, $pos + 16); 2266c165b184SJames Collins } 2267c165b184SJames Collins } else { 2268c165b184SJames Collins // There is no valid key being given back to us... We must get 2269c165b184SJames Collins // rid of the ___noise___ or we will have a problem. 2270c165b184SJames Collins $text = substr($text, 0, $pos) 2271c165b184SJames Collins . 'NO NUMERIC NOISE KEY' 2272c165b184SJames Collins . substr($text, $pos + 11); 2273c165b184SJames Collins } 2274c165b184SJames Collins } 2275c165b184SJames Collins return $text; 2276c165b184SJames Collins } 2277c165b184SJames Collins 2278c165b184SJames Collins function search_noise($text) 2279c165b184SJames Collins { 2280c165b184SJames Collins global $debug_object; 2281c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2282c165b184SJames Collins 2283c165b184SJames Collins foreach($this->noise as $noiseElement) { 2284c165b184SJames Collins if (strpos($noiseElement, $text) !== false) { 2285c165b184SJames Collins return $noiseElement; 2286c165b184SJames Collins } 2287c165b184SJames Collins } 2288c165b184SJames Collins } 2289c165b184SJames Collins 2290c165b184SJames Collins function __toString() 2291c165b184SJames Collins { 2292c165b184SJames Collins return $this->root->innertext(); 2293c165b184SJames Collins } 2294c165b184SJames Collins 2295c165b184SJames Collins function __get($name) 2296c165b184SJames Collins { 2297c165b184SJames 
Collins switch ($name) { 2298c165b184SJames Collins case 'outertext': 2299c165b184SJames Collins return $this->root->innertext(); 2300c165b184SJames Collins case 'innertext': 2301c165b184SJames Collins return $this->root->innertext(); 2302c165b184SJames Collins case 'plaintext': 2303c165b184SJames Collins return $this->root->text(); 2304c165b184SJames Collins case 'charset': 2305c165b184SJames Collins return $this->_charset; 2306c165b184SJames Collins case 'target_charset': 2307c165b184SJames Collins return $this->_target_charset; 2308c165b184SJames Collins } 2309c165b184SJames Collins } 2310c165b184SJames Collins 2311c165b184SJames Collins function childNodes($idx = -1) 2312c165b184SJames Collins { 2313c165b184SJames Collins return $this->root->childNodes($idx); 2314c165b184SJames Collins } 2315c165b184SJames Collins 2316c165b184SJames Collins function firstChild() 2317c165b184SJames Collins { 2318c165b184SJames Collins return $this->root->first_child(); 2319c165b184SJames Collins } 2320c165b184SJames Collins 2321c165b184SJames Collins function lastChild() 2322c165b184SJames Collins { 2323c165b184SJames Collins return $this->root->last_child(); 2324c165b184SJames Collins } 2325c165b184SJames Collins 2326c165b184SJames Collins function createElement($name, $value = null) 2327c165b184SJames Collins { 2328c165b184SJames Collins return @str_get_html("<$name>$value</$name>")->firstChild(); 2329c165b184SJames Collins } 2330c165b184SJames Collins 2331c165b184SJames Collins function createTextNode($value) 2332c165b184SJames Collins { 2333c165b184SJames Collins return @end(str_get_html($value)->nodes); 2334c165b184SJames Collins } 2335c165b184SJames Collins 2336c165b184SJames Collins function getElementById($id) 2337c165b184SJames Collins { 2338c165b184SJames Collins return $this->find("#$id", 0); 2339c165b184SJames Collins } 2340c165b184SJames Collins 2341c165b184SJames Collins function getElementsById($id, $idx = null) 2342c165b184SJames Collins { 2343c165b184SJames 
Collins return $this->find("#$id", $idx); 2344c165b184SJames Collins } 2345c165b184SJames Collins 2346c165b184SJames Collins function getElementByTagName($name) 2347c165b184SJames Collins { 2348c165b184SJames Collins return $this->find($name, 0); 2349c165b184SJames Collins } 2350c165b184SJames Collins 2351c165b184SJames Collins function getElementsByTagName($name, $idx = -1) 2352c165b184SJames Collins { 2353c165b184SJames Collins return $this->find($name, $idx); 2354c165b184SJames Collins } 2355c165b184SJames Collins 2356c165b184SJames Collins function loadFile() 2357c165b184SJames Collins { 2358c165b184SJames Collins $args = func_get_args(); 2359c165b184SJames Collins $this->load_file($args); 2360c165b184SJames Collins } 2361c165b184SJames Collins} 2362