1*c165b184SJames Collins<?php 2*c165b184SJames Collins/** 3*c165b184SJames Collins * Website: http://sourceforge.net/projects/simplehtmldom/ 4*c165b184SJames Collins * Additional projects: http://sourceforge.net/projects/debugobject/ 5*c165b184SJames Collins * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) 6*c165b184SJames Collins * 7*c165b184SJames Collins * Licensed under The MIT License 8*c165b184SJames Collins * See the LICENSE file in the project root for more information. 9*c165b184SJames Collins * 10*c165b184SJames Collins * Authors: 11*c165b184SJames Collins * S.C. Chen 12*c165b184SJames Collins * John Schlick 13*c165b184SJames Collins * Rus Carroll 14*c165b184SJames Collins * logmanoriginal 15*c165b184SJames Collins * 16*c165b184SJames Collins * Contributors: 17*c165b184SJames Collins * Yousuke Kumakura 18*c165b184SJames Collins * Vadim Voituk 19*c165b184SJames Collins * Antcs 20*c165b184SJames Collins * 21*c165b184SJames Collins * Version Rev. 1.9.1 (291) 22*c165b184SJames Collins */ 23*c165b184SJames Collins 24*c165b184SJames Collinsdefine('HDOM_TYPE_ELEMENT', 1); 25*c165b184SJames Collinsdefine('HDOM_TYPE_COMMENT', 2); 26*c165b184SJames Collinsdefine('HDOM_TYPE_TEXT', 3); 27*c165b184SJames Collinsdefine('HDOM_TYPE_ENDTAG', 4); 28*c165b184SJames Collinsdefine('HDOM_TYPE_ROOT', 5); 29*c165b184SJames Collinsdefine('HDOM_TYPE_UNKNOWN', 6); 30*c165b184SJames Collinsdefine('HDOM_QUOTE_DOUBLE', 0); 31*c165b184SJames Collinsdefine('HDOM_QUOTE_SINGLE', 1); 32*c165b184SJames Collinsdefine('HDOM_QUOTE_NO', 3); 33*c165b184SJames Collinsdefine('HDOM_INFO_BEGIN', 0); 34*c165b184SJames Collinsdefine('HDOM_INFO_END', 1); 35*c165b184SJames Collinsdefine('HDOM_INFO_QUOTE', 2); 36*c165b184SJames Collinsdefine('HDOM_INFO_SPACE', 3); 37*c165b184SJames Collinsdefine('HDOM_INFO_TEXT', 4); 38*c165b184SJames Collinsdefine('HDOM_INFO_INNER', 5); 39*c165b184SJames Collinsdefine('HDOM_INFO_OUTER', 6); 40*c165b184SJames Collinsdefine('HDOM_INFO_ENDSPACE', 7); 41*c165b184SJames Collins 42*c165b184SJames Collinsdefined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8'); 43*c165b184SJames Collinsdefined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n"); 44*c165b184SJames Collinsdefined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' '); 45*c165b184SJames Collinsdefined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000); 46*c165b184SJames Collinsdefine('HDOM_SMARTY_AS_TEXT', 1); 47*c165b184SJames Collins 48*c165b184SJames Collinsfunction file_get_html( 49*c165b184SJames Collins $url, 50*c165b184SJames Collins $use_include_path = false, 51*c165b184SJames Collins $context = null, 52*c165b184SJames Collins $offset = 0, 53*c165b184SJames Collins $maxLen = -1, 54*c165b184SJames Collins $lowercase = true, 55*c165b184SJames Collins $forceTagsClosed = true, 56*c165b184SJames Collins $target_charset = DEFAULT_TARGET_CHARSET, 57*c165b184SJames Collins $stripRN = true, 58*c165b184SJames Collins $defaultBRText = DEFAULT_BR_TEXT, 59*c165b184SJames Collins $defaultSpanText = DEFAULT_SPAN_TEXT) 60*c165b184SJames Collins{ 61*c165b184SJames Collins if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; } 62*c165b184SJames Collins 63*c165b184SJames Collins $dom = new simple_html_dom( 64*c165b184SJames Collins null, 65*c165b184SJames Collins $lowercase, 66*c165b184SJames Collins $forceTagsClosed, 67*c165b184SJames Collins $target_charset, 68*c165b184SJames Collins $stripRN, 69*c165b184SJames Collins $defaultBRText, 70*c165b184SJames Collins $defaultSpanText 71*c165b184SJames Collins ); 72*c165b184SJames Collins 73*c165b184SJames Collins /** 74*c165b184SJames Collins * For sourceforge users: uncomment the next line and comment the 75*c165b184SJames Collins * retrieve_url_contents line 2 lines down if it is not already done. 76*c165b184SJames Collins */ 77*c165b184SJames Collins $contents = file_get_contents( 78*c165b184SJames Collins $url, 79*c165b184SJames Collins $use_include_path, 80*c165b184SJames Collins $context, 81*c165b184SJames Collins $offset, 82*c165b184SJames Collins $maxLen 83*c165b184SJames Collins ); 84*c165b184SJames Collins // $contents = retrieve_url_contents($url); 85*c165b184SJames Collins 86*c165b184SJames Collins if (empty($contents) || strlen($contents) > $maxLen) { 87*c165b184SJames Collins $dom->clear(); 88*c165b184SJames Collins return false; 89*c165b184SJames Collins } 90*c165b184SJames Collins 91*c165b184SJames Collins return $dom->load($contents, $lowercase, $stripRN); 92*c165b184SJames Collins} 93*c165b184SJames Collins 94*c165b184SJames Collinsfunction str_get_html( 95*c165b184SJames Collins $str, 96*c165b184SJames Collins $lowercase = true, 97*c165b184SJames Collins $forceTagsClosed = true, 98*c165b184SJames Collins $target_charset = DEFAULT_TARGET_CHARSET, 99*c165b184SJames Collins $stripRN = true, 100*c165b184SJames Collins $defaultBRText = DEFAULT_BR_TEXT, 101*c165b184SJames Collins $defaultSpanText = DEFAULT_SPAN_TEXT) 102*c165b184SJames Collins{ 103*c165b184SJames Collins $dom = new simple_html_dom( 104*c165b184SJames Collins null, 105*c165b184SJames Collins $lowercase, 106*c165b184SJames Collins $forceTagsClosed, 107*c165b184SJames Collins $target_charset, 108*c165b184SJames Collins $stripRN, 109*c165b184SJames Collins $defaultBRText, 110*c165b184SJames Collins $defaultSpanText 111*c165b184SJames Collins ); 112*c165b184SJames Collins 113*c165b184SJames Collins if (empty($str) || strlen($str) > MAX_FILE_SIZE) { 114*c165b184SJames Collins $dom->clear(); 115*c165b184SJames Collins return false; 116*c165b184SJames Collins } 117*c165b184SJames Collins 118*c165b184SJames Collins return $dom->load($str, $lowercase, $stripRN); 119*c165b184SJames Collins} 120*c165b184SJames Collins 121*c165b184SJames Collinsfunction dump_html_tree($node, $show_attr = true, $deep = 0) 122*c165b184SJames Collins{ 123*c165b184SJames Collins $node->dump($node); 124*c165b184SJames Collins} 125*c165b184SJames Collins 126*c165b184SJames Collinsclass simple_html_dom_node 127*c165b184SJames Collins{ 128*c165b184SJames Collins public $nodetype = HDOM_TYPE_TEXT; 129*c165b184SJames Collins public $tag = 'text'; 130*c165b184SJames Collins public $attr = array(); 131*c165b184SJames Collins public $children = array(); 132*c165b184SJames Collins public $nodes = array(); 133*c165b184SJames Collins public $parent = null; 134*c165b184SJames Collins public $_ = array(); 135*c165b184SJames Collins public $tag_start = 0; 136*c165b184SJames Collins private $dom = null; 137*c165b184SJames Collins 138*c165b184SJames Collins function __construct($dom) 139*c165b184SJames Collins { 140*c165b184SJames Collins $this->dom = $dom; 141*c165b184SJames Collins $dom->nodes[] = $this; 142*c165b184SJames Collins } 143*c165b184SJames Collins 144*c165b184SJames Collins function __destruct() 145*c165b184SJames Collins { 146*c165b184SJames Collins $this->clear(); 147*c165b184SJames Collins } 148*c165b184SJames Collins 149*c165b184SJames Collins function __toString() 150*c165b184SJames Collins { 151*c165b184SJames Collins return $this->outertext(); 152*c165b184SJames Collins } 153*c165b184SJames Collins 154*c165b184SJames Collins function clear() 155*c165b184SJames Collins { 156*c165b184SJames Collins $this->dom = null; 157*c165b184SJames Collins $this->nodes = null; 158*c165b184SJames Collins $this->parent = null; 159*c165b184SJames Collins $this->children = null; 160*c165b184SJames Collins } 161*c165b184SJames Collins 162*c165b184SJames Collins function dump($show_attr = true, $depth = 0) 163*c165b184SJames Collins { 164*c165b184SJames Collins echo str_repeat("\t", $depth) . $this->tag; 165*c165b184SJames Collins 166*c165b184SJames Collins if ($show_attr && count($this->attr) > 0) { 167*c165b184SJames Collins echo '('; 168*c165b184SJames Collins foreach ($this->attr as $k => $v) { 169*c165b184SJames Collins echo "[$k]=>\"$v\", "; 170*c165b184SJames Collins } 171*c165b184SJames Collins echo ')'; 172*c165b184SJames Collins } 173*c165b184SJames Collins 174*c165b184SJames Collins echo "\n"; 175*c165b184SJames Collins 176*c165b184SJames Collins if ($this->nodes) { 177*c165b184SJames Collins foreach ($this->nodes as $node) { 178*c165b184SJames Collins $node->dump($show_attr, $depth + 1); 179*c165b184SJames Collins } 180*c165b184SJames Collins } 181*c165b184SJames Collins } 182*c165b184SJames Collins 183*c165b184SJames Collins function dump_node($echo = true) 184*c165b184SJames Collins { 185*c165b184SJames Collins $string = $this->tag; 186*c165b184SJames Collins 187*c165b184SJames Collins if (count($this->attr) > 0) { 188*c165b184SJames Collins $string .= '('; 189*c165b184SJames Collins foreach ($this->attr as $k => $v) { 190*c165b184SJames Collins $string .= "[$k]=>\"$v\", "; 191*c165b184SJames Collins } 192*c165b184SJames Collins $string .= ')'; 193*c165b184SJames Collins } 194*c165b184SJames Collins 195*c165b184SJames Collins if (count($this->_) > 0) { 196*c165b184SJames Collins $string .= ' $_ ('; 197*c165b184SJames Collins foreach ($this->_ as $k => $v) { 198*c165b184SJames Collins if (is_array($v)) { 199*c165b184SJames Collins $string .= "[$k]=>("; 200*c165b184SJames Collins foreach ($v as $k2 => $v2) { 201*c165b184SJames Collins $string .= "[$k2]=>\"$v2\", "; 202*c165b184SJames Collins } 203*c165b184SJames Collins $string .= ')'; 204*c165b184SJames Collins } else { 205*c165b184SJames Collins $string .= "[$k]=>\"$v\", "; 206*c165b184SJames Collins } 207*c165b184SJames Collins } 208*c165b184SJames Collins $string .= ')'; 209*c165b184SJames Collins } 210*c165b184SJames Collins 211*c165b184SJames Collins if (isset($this->text)) { 212*c165b184SJames Collins $string .= " text: ({$this->text})"; 213*c165b184SJames Collins } 214*c165b184SJames Collins 215*c165b184SJames Collins $string .= ' HDOM_INNER_INFO: '; 216*c165b184SJames Collins 217*c165b184SJames Collins if (isset($node->_[HDOM_INFO_INNER])) { 218*c165b184SJames Collins $string .= "'" . $node->_[HDOM_INFO_INNER] . "'"; 219*c165b184SJames Collins } else { 220*c165b184SJames Collins $string .= ' NULL '; 221*c165b184SJames Collins } 222*c165b184SJames Collins 223*c165b184SJames Collins $string .= ' children: ' . count($this->children); 224*c165b184SJames Collins $string .= ' nodes: ' . count($this->nodes); 225*c165b184SJames Collins $string .= ' tag_start: ' . $this->tag_start; 226*c165b184SJames Collins $string .= "\n"; 227*c165b184SJames Collins 228*c165b184SJames Collins if ($echo) { 229*c165b184SJames Collins echo $string; 230*c165b184SJames Collins return; 231*c165b184SJames Collins } else { 232*c165b184SJames Collins return $string; 233*c165b184SJames Collins } 234*c165b184SJames Collins } 235*c165b184SJames Collins 236*c165b184SJames Collins function parent($parent = null) 237*c165b184SJames Collins { 238*c165b184SJames Collins // I am SURE that this doesn't work properly. 239*c165b184SJames Collins // It fails to unset the current node from it's current parents nodes or 240*c165b184SJames Collins // children list first. 241*c165b184SJames Collins if ($parent !== null) { 242*c165b184SJames Collins $this->parent = $parent; 243*c165b184SJames Collins $this->parent->nodes[] = $this; 244*c165b184SJames Collins $this->parent->children[] = $this; 245*c165b184SJames Collins } 246*c165b184SJames Collins 247*c165b184SJames Collins return $this->parent; 248*c165b184SJames Collins } 249*c165b184SJames Collins 250*c165b184SJames Collins function has_child() 251*c165b184SJames Collins { 252*c165b184SJames Collins return !empty($this->children); 253*c165b184SJames Collins } 254*c165b184SJames Collins 255*c165b184SJames Collins function children($idx = -1) 256*c165b184SJames Collins { 257*c165b184SJames Collins if ($idx === -1) { 258*c165b184SJames Collins return $this->children; 259*c165b184SJames Collins } 260*c165b184SJames Collins 261*c165b184SJames Collins if (isset($this->children[$idx])) { 262*c165b184SJames Collins return $this->children[$idx]; 263*c165b184SJames Collins } 264*c165b184SJames Collins 265*c165b184SJames Collins return null; 266*c165b184SJames Collins } 267*c165b184SJames Collins 268*c165b184SJames Collins function first_child() 269*c165b184SJames Collins { 270*c165b184SJames Collins if (count($this->children) > 0) { 271*c165b184SJames Collins return $this->children[0]; 272*c165b184SJames Collins } 273*c165b184SJames Collins return null; 274*c165b184SJames Collins } 275*c165b184SJames Collins 276*c165b184SJames Collins function last_child() 277*c165b184SJames Collins { 278*c165b184SJames Collins if (count($this->children) > 0) { 279*c165b184SJames Collins return end($this->children); 280*c165b184SJames Collins } 281*c165b184SJames Collins return null; 282*c165b184SJames Collins } 283*c165b184SJames Collins 284*c165b184SJames Collins function next_sibling() 285*c165b184SJames Collins { 286*c165b184SJames Collins if ($this->parent === null) { 287*c165b184SJames Collins return null; 288*c165b184SJames Collins } 289*c165b184SJames Collins 290*c165b184SJames Collins $idx = array_search($this, $this->parent->children, true); 291*c165b184SJames Collins 292*c165b184SJames Collins if ($idx !== false && isset($this->parent->children[$idx + 1])) { 293*c165b184SJames Collins return $this->parent->children[$idx + 1]; 294*c165b184SJames Collins } 295*c165b184SJames Collins 296*c165b184SJames Collins return null; 297*c165b184SJames Collins } 298*c165b184SJames Collins 299*c165b184SJames Collins function prev_sibling() 300*c165b184SJames Collins { 301*c165b184SJames Collins if ($this->parent === null) { 302*c165b184SJames Collins return null; 303*c165b184SJames Collins } 304*c165b184SJames Collins 305*c165b184SJames Collins $idx = array_search($this, $this->parent->children, true); 306*c165b184SJames Collins 307*c165b184SJames Collins if ($idx !== false && $idx > 0) { 308*c165b184SJames Collins return $this->parent->children[$idx - 1]; 309*c165b184SJames Collins } 310*c165b184SJames Collins 311*c165b184SJames Collins return null; 312*c165b184SJames Collins } 313*c165b184SJames Collins 314*c165b184SJames Collins function find_ancestor_tag($tag) 315*c165b184SJames Collins { 316*c165b184SJames Collins global $debug_object; 317*c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 318*c165b184SJames Collins 319*c165b184SJames Collins if ($this->parent === null) { 320*c165b184SJames Collins return null; 321*c165b184SJames Collins } 322*c165b184SJames Collins 323*c165b184SJames Collins $ancestor = $this->parent; 324*c165b184SJames Collins 325*c165b184SJames Collins while (!is_null($ancestor)) { 326*c165b184SJames Collins if (is_object($debug_object)) { 327*c165b184SJames Collins $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag); 328*c165b184SJames Collins } 329*c165b184SJames Collins 330*c165b184SJames Collins if ($ancestor->tag === $tag) { 331*c165b184SJames Collins break; 332*c165b184SJames Collins } 333*c165b184SJames Collins 334*c165b184SJames Collins $ancestor = $ancestor->parent; 335*c165b184SJames Collins } 336*c165b184SJames Collins 337*c165b184SJames Collins return $ancestor; 338*c165b184SJames Collins } 339*c165b184SJames Collins 340*c165b184SJames Collins function innertext() 341*c165b184SJames Collins { 342*c165b184SJames Collins if (isset($this->_[HDOM_INFO_INNER])) { 343*c165b184SJames Collins return $this->_[HDOM_INFO_INNER]; 344*c165b184SJames Collins } 345*c165b184SJames Collins 346*c165b184SJames Collins if (isset($this->_[HDOM_INFO_TEXT])) { 347*c165b184SJames Collins return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 348*c165b184SJames Collins } 349*c165b184SJames Collins 350*c165b184SJames Collins $ret = ''; 351*c165b184SJames Collins 352*c165b184SJames Collins foreach ($this->nodes as $n) { 353*c165b184SJames Collins $ret .= $n->outertext(); 354*c165b184SJames Collins } 355*c165b184SJames Collins 356*c165b184SJames Collins return $ret; 357*c165b184SJames Collins } 358*c165b184SJames Collins 359*c165b184SJames Collins function outertext() 360*c165b184SJames Collins { 361*c165b184SJames Collins global $debug_object; 362*c165b184SJames Collins 363*c165b184SJames Collins if (is_object($debug_object)) { 364*c165b184SJames Collins $text = ''; 365*c165b184SJames Collins 366*c165b184SJames Collins if ($this->tag === 'text') { 367*c165b184SJames Collins if (!empty($this->text)) { 368*c165b184SJames Collins $text = ' with text: ' . $this->text; 369*c165b184SJames Collins } 370*c165b184SJames Collins } 371*c165b184SJames Collins 372*c165b184SJames Collins $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); 373*c165b184SJames Collins } 374*c165b184SJames Collins 375*c165b184SJames Collins if ($this->tag === 'root') { 376*c165b184SJames Collins return $this->innertext(); 377*c165b184SJames Collins } 378*c165b184SJames Collins 379*c165b184SJames Collins // todo: What is the use of this callback? Remove? 380*c165b184SJames Collins if ($this->dom && $this->dom->callback !== null) { 381*c165b184SJames Collins call_user_func_array($this->dom->callback, array($this)); 382*c165b184SJames Collins } 383*c165b184SJames Collins 384*c165b184SJames Collins if (isset($this->_[HDOM_INFO_OUTER])) { 385*c165b184SJames Collins return $this->_[HDOM_INFO_OUTER]; 386*c165b184SJames Collins } 387*c165b184SJames Collins 388*c165b184SJames Collins if (isset($this->_[HDOM_INFO_TEXT])) { 389*c165b184SJames Collins return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 390*c165b184SJames Collins } 391*c165b184SJames Collins 392*c165b184SJames Collins $ret = ''; 393*c165b184SJames Collins 394*c165b184SJames Collins if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) { 395*c165b184SJames Collins $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); 396*c165b184SJames Collins } 397*c165b184SJames Collins 398*c165b184SJames Collins if (isset($this->_[HDOM_INFO_INNER])) { 399*c165b184SJames Collins // todo: <br> should either never have HDOM_INFO_INNER or always 400*c165b184SJames Collins if ($this->tag !== 'br') { 401*c165b184SJames Collins $ret .= $this->_[HDOM_INFO_INNER]; 402*c165b184SJames Collins } 403*c165b184SJames Collins } elseif ($this->nodes) { 404*c165b184SJames Collins foreach ($this->nodes as $n) { 405*c165b184SJames Collins $ret .= $this->convert_text($n->outertext()); 406*c165b184SJames Collins } 407*c165b184SJames Collins } 408*c165b184SJames Collins 409*c165b184SJames Collins if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) { 410*c165b184SJames Collins $ret .= '</' . $this->tag . '>'; 411*c165b184SJames Collins } 412*c165b184SJames Collins 413*c165b184SJames Collins return $ret; 414*c165b184SJames Collins } 415*c165b184SJames Collins 416*c165b184SJames Collins function text() 417*c165b184SJames Collins { 418*c165b184SJames Collins if (isset($this->_[HDOM_INFO_INNER])) { 419*c165b184SJames Collins return $this->_[HDOM_INFO_INNER]; 420*c165b184SJames Collins } 421*c165b184SJames Collins 422*c165b184SJames Collins switch ($this->nodetype) { 423*c165b184SJames Collins case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 424*c165b184SJames Collins case HDOM_TYPE_COMMENT: return ''; 425*c165b184SJames Collins case HDOM_TYPE_UNKNOWN: return ''; 426*c165b184SJames Collins } 427*c165b184SJames Collins 428*c165b184SJames Collins if (strcasecmp($this->tag, 'script') === 0) { return ''; } 429*c165b184SJames Collins if (strcasecmp($this->tag, 'style') === 0) { return ''; } 430*c165b184SJames Collins 431*c165b184SJames Collins $ret = ''; 432*c165b184SJames Collins 433*c165b184SJames Collins // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed 434*c165b184SJames Collins // for some span tags, and some p tags) $this->nodes is set to NULL. 435*c165b184SJames Collins // NOTE: This indicates that there is a problem where it's set to NULL 436*c165b184SJames Collins // without a clear happening. 437*c165b184SJames Collins // WHY is this happening? 438*c165b184SJames Collins if (!is_null($this->nodes)) { 439*c165b184SJames Collins foreach ($this->nodes as $n) { 440*c165b184SJames Collins // Start paragraph after a blank line 441*c165b184SJames Collins if ($n->tag === 'p') { 442*c165b184SJames Collins $ret = trim($ret) . "\n\n"; 443*c165b184SJames Collins } 444*c165b184SJames Collins 445*c165b184SJames Collins $ret .= $this->convert_text($n->text()); 446*c165b184SJames Collins 447*c165b184SJames Collins // If this node is a span... add a space at the end of it so 448*c165b184SJames Collins // multiple spans don't run into each other. This is plaintext 449*c165b184SJames Collins // after all. 450*c165b184SJames Collins if ($n->tag === 'span') { 451*c165b184SJames Collins $ret .= $this->dom->default_span_text; 452*c165b184SJames Collins } 453*c165b184SJames Collins } 454*c165b184SJames Collins } 455*c165b184SJames Collins return $ret; 456*c165b184SJames Collins } 457*c165b184SJames Collins 458*c165b184SJames Collins function xmltext() 459*c165b184SJames Collins { 460*c165b184SJames Collins $ret = $this->innertext(); 461*c165b184SJames Collins $ret = str_ireplace('<![CDATA[', '', $ret); 462*c165b184SJames Collins $ret = str_replace(']]>', '', $ret); 463*c165b184SJames Collins return $ret; 464*c165b184SJames Collins } 465*c165b184SJames Collins 466*c165b184SJames Collins function makeup() 467*c165b184SJames Collins { 468*c165b184SJames Collins // text, comment, unknown 469*c165b184SJames Collins if (isset($this->_[HDOM_INFO_TEXT])) { 470*c165b184SJames Collins return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 471*c165b184SJames Collins } 472*c165b184SJames Collins 473*c165b184SJames Collins $ret = '<' . $this->tag; 474*c165b184SJames Collins $i = -1; 475*c165b184SJames Collins 476*c165b184SJames Collins foreach ($this->attr as $key => $val) { 477*c165b184SJames Collins ++$i; 478*c165b184SJames Collins 479*c165b184SJames Collins // skip removed attribute 480*c165b184SJames Collins if ($val === null || $val === false) { continue; } 481*c165b184SJames Collins 482*c165b184SJames Collins $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; 483*c165b184SJames Collins 484*c165b184SJames Collins //no value attr: nowrap, checked selected... 485*c165b184SJames Collins if ($val === true) { 486*c165b184SJames Collins $ret .= $key; 487*c165b184SJames Collins } else { 488*c165b184SJames Collins switch ($this->_[HDOM_INFO_QUOTE][$i]) 489*c165b184SJames Collins { 490*c165b184SJames Collins case HDOM_QUOTE_DOUBLE: $quote = '"'; break; 491*c165b184SJames Collins case HDOM_QUOTE_SINGLE: $quote = '\''; break; 492*c165b184SJames Collins default: $quote = ''; 493*c165b184SJames Collins } 494*c165b184SJames Collins 495*c165b184SJames Collins $ret .= $key 496*c165b184SJames Collins . $this->_[HDOM_INFO_SPACE][$i][1] 497*c165b184SJames Collins . '=' 498*c165b184SJames Collins . $this->_[HDOM_INFO_SPACE][$i][2] 499*c165b184SJames Collins . $quote 500*c165b184SJames Collins . $val 501*c165b184SJames Collins . $quote; 502*c165b184SJames Collins } 503*c165b184SJames Collins } 504*c165b184SJames Collins 505*c165b184SJames Collins $ret = $this->dom->restore_noise($ret); 506*c165b184SJames Collins return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; 507*c165b184SJames Collins } 508*c165b184SJames Collins 509*c165b184SJames Collins function find($selector, $idx = null, $lowercase = false) 510*c165b184SJames Collins { 511*c165b184SJames Collins $selectors = $this->parse_selector($selector); 512*c165b184SJames Collins if (($count = count($selectors)) === 0) { return array(); } 513*c165b184SJames Collins $found_keys = array(); 514*c165b184SJames Collins 515*c165b184SJames Collins // find each selector 516*c165b184SJames Collins for ($c = 0; $c < $count; ++$c) { 517*c165b184SJames Collins // The change on the below line was documented on the sourceforge 518*c165b184SJames Collins // code tracker id 2788009 519*c165b184SJames Collins // used to be: if (($levle=count($selectors[0]))===0) return array(); 520*c165b184SJames Collins if (($levle = count($selectors[$c])) === 0) { return array(); } 521*c165b184SJames Collins if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); } 522*c165b184SJames Collins 523*c165b184SJames Collins $head = array($this->_[HDOM_INFO_BEGIN] => 1); 524*c165b184SJames Collins $cmd = ' '; // Combinator 525*c165b184SJames Collins 526*c165b184SJames Collins // handle descendant selectors, no recursive! 527*c165b184SJames Collins for ($l = 0; $l < $levle; ++$l) { 528*c165b184SJames Collins $ret = array(); 529*c165b184SJames Collins 530*c165b184SJames Collins foreach ($head as $k => $v) { 531*c165b184SJames Collins $n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k]; 532*c165b184SJames Collins //PaperG - Pass this optional parameter on to the seek function. 533*c165b184SJames Collins $n->seek($selectors[$c][$l], $ret, $cmd, $lowercase); 534*c165b184SJames Collins } 535*c165b184SJames Collins 536*c165b184SJames Collins $head = $ret; 537*c165b184SJames Collins $cmd = $selectors[$c][$l][4]; // Next Combinator 538*c165b184SJames Collins } 539*c165b184SJames Collins 540*c165b184SJames Collins foreach ($head as $k => $v) { 541*c165b184SJames Collins if (!isset($found_keys[$k])) { 542*c165b184SJames Collins $found_keys[$k] = 1; 543*c165b184SJames Collins } 544*c165b184SJames Collins } 545*c165b184SJames Collins } 546*c165b184SJames Collins 547*c165b184SJames Collins // sort keys 548*c165b184SJames Collins ksort($found_keys); 549*c165b184SJames Collins 550*c165b184SJames Collins $found = array(); 551*c165b184SJames Collins foreach ($found_keys as $k => $v) { 552*c165b184SJames Collins $found[] = $this->dom->nodes[$k]; 553*c165b184SJames Collins } 554*c165b184SJames Collins 555*c165b184SJames Collins // return nth-element or array 556*c165b184SJames Collins if (is_null($idx)) { return $found; } 557*c165b184SJames Collins elseif ($idx < 0) { $idx = count($found) + $idx; } 558*c165b184SJames Collins return (isset($found[$idx])) ? $found[$idx] : null; 559*c165b184SJames Collins } 560*c165b184SJames Collins 561*c165b184SJames Collins protected function seek($selector, &$ret, $parent_cmd, $lowercase = false) 562*c165b184SJames Collins { 563*c165b184SJames Collins global $debug_object; 564*c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 565*c165b184SJames Collins 566*c165b184SJames Collins list($tag, $id, $class, $attributes, $cmb) = $selector; 567*c165b184SJames Collins $nodes = array(); 568*c165b184SJames Collins 569*c165b184SJames Collins if ($parent_cmd === ' ') { // Descendant Combinator 570*c165b184SJames Collins // Find parent closing tag if the current element doesn't have a closing 571*c165b184SJames Collins // tag (i.e. void element) 572*c165b184SJames Collins $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; 573*c165b184SJames Collins if ($end == 0) { 574*c165b184SJames Collins $parent = $this->parent; 575*c165b184SJames Collins while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) { 576*c165b184SJames Collins $end -= 1; 577*c165b184SJames Collins $parent = $parent->parent; 578*c165b184SJames Collins } 579*c165b184SJames Collins $end += $parent->_[HDOM_INFO_END]; 580*c165b184SJames Collins } 581*c165b184SJames Collins 582*c165b184SJames Collins // Get list of target nodes 583*c165b184SJames Collins $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1; 584*c165b184SJames Collins $nodes_count = $end - $nodes_start; 585*c165b184SJames Collins $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true); 586*c165b184SJames Collins } elseif ($parent_cmd === '>') { // Child Combinator 587*c165b184SJames Collins $nodes = $this->children; 588*c165b184SJames Collins } elseif ($parent_cmd === '+' 589*c165b184SJames Collins && $this->parent 590*c165b184SJames Collins && in_array($this, $this->parent->children)) { // Next-Sibling Combinator 591*c165b184SJames Collins $index = array_search($this, $this->parent->children, true) + 1; 592*c165b184SJames Collins if ($index < count($this->parent->children)) 593*c165b184SJames Collins $nodes[] = $this->parent->children[$index]; 594*c165b184SJames Collins } elseif ($parent_cmd === '~' 595*c165b184SJames Collins && $this->parent 596*c165b184SJames Collins && in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator 597*c165b184SJames Collins $index = array_search($this, $this->parent->children, true); 598*c165b184SJames Collins $nodes = array_slice($this->parent->children, $index); 599*c165b184SJames Collins } 600*c165b184SJames Collins 601*c165b184SJames Collins // Go throgh each element starting at this element until the end tag 602*c165b184SJames Collins // Note: If this element is a void tag, any previous void element is 603*c165b184SJames Collins // skipped. 604*c165b184SJames Collins foreach($nodes as $node) { 605*c165b184SJames Collins $pass = true; 606*c165b184SJames Collins 607*c165b184SJames Collins // Skip root nodes 608*c165b184SJames Collins if(!$node->parent) { 609*c165b184SJames Collins $pass = false; 610*c165b184SJames Collins } 611*c165b184SJames Collins 612*c165b184SJames Collins // Handle 'text' selector 613*c165b184SJames Collins if($pass && $tag === 'text' && $node->tag === 'text') { 614*c165b184SJames Collins $ret[array_search($node, $this->dom->nodes, true)] = 1; 615*c165b184SJames Collins unset($node); 616*c165b184SJames Collins continue; 617*c165b184SJames Collins } 618*c165b184SJames Collins 619*c165b184SJames Collins // Skip if node isn't a child node (i.e. text nodes) 620*c165b184SJames Collins if($pass && !in_array($node, $node->parent->children, true)) { 621*c165b184SJames Collins $pass = false; 622*c165b184SJames Collins } 623*c165b184SJames Collins 624*c165b184SJames Collins // Skip if tag doesn't match 625*c165b184SJames Collins if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') { 626*c165b184SJames Collins $pass = false; 627*c165b184SJames Collins } 628*c165b184SJames Collins 629*c165b184SJames Collins // Skip if ID doesn't exist 630*c165b184SJames Collins if ($pass && $id !== '' && !isset($node->attr['id'])) { 631*c165b184SJames Collins $pass = false; 632*c165b184SJames Collins } 633*c165b184SJames Collins 634*c165b184SJames Collins // Check if ID matches 635*c165b184SJames Collins if ($pass && $id !== '' && isset($node->attr['id'])) { 636*c165b184SJames Collins // Note: Only consider the first ID (as browsers do) 637*c165b184SJames Collins $node_id = explode(' ', trim($node->attr['id']))[0]; 638*c165b184SJames Collins 639*c165b184SJames Collins if($id !== $node_id) { $pass = false; } 640*c165b184SJames Collins } 641*c165b184SJames Collins 642*c165b184SJames Collins // Check if all class(es) exist 643*c165b184SJames Collins if ($pass && $class !== '' && is_array($class) && !empty($class)) { 644*c165b184SJames Collins if (isset($node->attr['class'])) { 645*c165b184SJames Collins $node_classes = explode(' ', $node->attr['class']); 646*c165b184SJames Collins 647*c165b184SJames Collins if ($lowercase) { 648*c165b184SJames Collins $node_classes = array_map('strtolower', $node_classes); 649*c165b184SJames Collins } 650*c165b184SJames Collins 651*c165b184SJames Collins foreach($class as $c) { 652*c165b184SJames Collins if(!in_array($c, $node_classes)) { 653*c165b184SJames Collins $pass = false; 654*c165b184SJames Collins break; 655*c165b184SJames Collins } 656*c165b184SJames Collins } 657*c165b184SJames Collins } else { 658*c165b184SJames Collins $pass = false; 659*c165b184SJames Collins } 660*c165b184SJames Collins } 661*c165b184SJames Collins 662*c165b184SJames Collins // Check attributes 663*c165b184SJames Collins if ($pass 664*c165b184SJames Collins && $attributes !== '' 665*c165b184SJames Collins && is_array($attributes) 666*c165b184SJames Collins && !empty($attributes)) { 667*c165b184SJames Collins foreach($attributes as $a) { 668*c165b184SJames Collins list ( 669*c165b184SJames Collins $att_name, 670*c165b184SJames Collins $att_expr, 671*c165b184SJames Collins $att_val, 672*c165b184SJames Collins $att_inv, 673*c165b184SJames Collins $att_case_sensitivity 674*c165b184SJames Collins ) = $a; 675*c165b184SJames Collins 676*c165b184SJames Collins // Handle indexing attributes (i.e. "[2]") 677*c165b184SJames Collins /** 678*c165b184SJames Collins * Note: This is not supported by the CSS Standard but adds 679*c165b184SJames Collins * the ability to select items compatible to XPath (i.e. 680*c165b184SJames Collins * the 3rd element within it's parent). 681*c165b184SJames Collins * 682*c165b184SJames Collins * Note: This doesn't conflict with the CSS Standard which 683*c165b184SJames Collins * doesn't work on numeric attributes anyway. 684*c165b184SJames Collins */ 685*c165b184SJames Collins if (is_numeric($att_name) 686*c165b184SJames Collins && $att_expr === '' 687*c165b184SJames Collins && $att_val === '') { 688*c165b184SJames Collins $count = 0; 689*c165b184SJames Collins 690*c165b184SJames Collins // Find index of current element in parent 691*c165b184SJames Collins foreach ($node->parent->children as $c) { 692*c165b184SJames Collins if ($c->tag === $node->tag) ++$count; 693*c165b184SJames Collins if ($c === $node) break; 694*c165b184SJames Collins } 695*c165b184SJames Collins 696*c165b184SJames Collins // If this is the correct node, continue with next 697*c165b184SJames Collins // attribute 698*c165b184SJames Collins if ($count === (int)$att_name) continue; 699*c165b184SJames Collins } 700*c165b184SJames Collins 701*c165b184SJames Collins // Check attribute availability 702*c165b184SJames Collins if ($att_inv) { // Attribute should NOT be set 703*c165b184SJames Collins if (isset($node->attr[$att_name])) { 704*c165b184SJames Collins $pass = false; 705*c165b184SJames Collins break; 706*c165b184SJames Collins } 707*c165b184SJames Collins } else { // Attribute should be set 708*c165b184SJames Collins // todo: "plaintext" is not a valid CSS selector! 709*c165b184SJames Collins if ($att_name !== 'plaintext' 710*c165b184SJames Collins && !isset($node->attr[$att_name])) { 711*c165b184SJames Collins $pass = false; 712*c165b184SJames Collins break; 713*c165b184SJames Collins } 714*c165b184SJames Collins } 715*c165b184SJames Collins 716*c165b184SJames Collins // Continue with next attribute if expression isn't defined 717*c165b184SJames Collins if ($att_expr === '') continue; 718*c165b184SJames Collins 719*c165b184SJames Collins // If they have told us that this is a "plaintext" 720*c165b184SJames Collins // search then we want the plaintext of the node - right? 721*c165b184SJames Collins // todo "plaintext" is not a valid CSS selector! 722*c165b184SJames Collins if ($att_name === 'plaintext') { 723*c165b184SJames Collins $nodeKeyValue = $node->text(); 724*c165b184SJames Collins } else { 725*c165b184SJames Collins $nodeKeyValue = $node->attr[$att_name]; 726*c165b184SJames Collins } 727*c165b184SJames Collins 728*c165b184SJames Collins if (is_object($debug_object)) { 729*c165b184SJames Collins $debug_object->debug_log(2, 730*c165b184SJames Collins 'testing node: ' 731*c165b184SJames Collins . $node->tag 732*c165b184SJames Collins . ' for attribute: ' 733*c165b184SJames Collins . $att_name 734*c165b184SJames Collins . $att_expr 735*c165b184SJames Collins . $att_val 736*c165b184SJames Collins . ' where nodes value is: ' 737*c165b184SJames Collins . $nodeKeyValue 738*c165b184SJames Collins ); 739*c165b184SJames Collins } 740*c165b184SJames Collins 741*c165b184SJames Collins // If lowercase is set, do a case insensitive test of 742*c165b184SJames Collins // the value of the selector. 743*c165b184SJames Collins if ($lowercase) { 744*c165b184SJames Collins $check = $this->match( 745*c165b184SJames Collins $att_expr, 746*c165b184SJames Collins strtolower($att_val), 747*c165b184SJames Collins strtolower($nodeKeyValue), 748*c165b184SJames Collins $att_case_sensitivity 749*c165b184SJames Collins ); 750*c165b184SJames Collins } else { 751*c165b184SJames Collins $check = $this->match( 752*c165b184SJames Collins $att_expr, 753*c165b184SJames Collins $att_val, 754*c165b184SJames Collins $nodeKeyValue, 755*c165b184SJames Collins $att_case_sensitivity 756*c165b184SJames Collins ); 757*c165b184SJames Collins } 758*c165b184SJames Collins 759*c165b184SJames Collins if (is_object($debug_object)) { 760*c165b184SJames Collins $debug_object->debug_log(2, 761*c165b184SJames Collins 'after match: ' 762*c165b184SJames Collins . ($check ? 'true' : 'false') 763*c165b184SJames Collins ); 764*c165b184SJames Collins } 765*c165b184SJames Collins 766*c165b184SJames Collins if (!$check) { 767*c165b184SJames Collins $pass = false; 768*c165b184SJames Collins break; 769*c165b184SJames Collins } 770*c165b184SJames Collins } 771*c165b184SJames Collins } 772*c165b184SJames Collins 773*c165b184SJames Collins // Found a match. Add to list and clear node 774*c165b184SJames Collins if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1; 775*c165b184SJames Collins unset($node); 776*c165b184SJames Collins } 777*c165b184SJames Collins // It's passed by reference so this is actually what this function returns. 778*c165b184SJames Collins if (is_object($debug_object)) { 779*c165b184SJames Collins $debug_object->debug_log(1, 'EXIT - ret: ', $ret); 780*c165b184SJames Collins } 781*c165b184SJames Collins } 782*c165b184SJames Collins 783*c165b184SJames Collins protected function match($exp, $pattern, $value, $case_sensitivity) 784*c165b184SJames Collins { 785*c165b184SJames Collins global $debug_object; 786*c165b184SJames Collins if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 787*c165b184SJames Collins 788*c165b184SJames Collins if ($case_sensitivity === 'i') { 789*c165b184SJames Collins $pattern = strtolower($pattern); 790*c165b184SJames Collins $value = strtolower($value); 791*c165b184SJames Collins } 792*c165b184SJames Collins 793*c165b184SJames Collins switch ($exp) { 794*c165b184SJames Collins case '=': 795*c165b184SJames Collins return ($value === $pattern); 796*c165b184SJames Collins case '!=': 797*c165b184SJames Collins return ($value !== $pattern); 798*c165b184SJames Collins case '^=': 799*c165b184SJames Collins return preg_match('/^' . preg_quote($pattern, '/') . '/', $value); 800*c165b184SJames Collins case '$=': 801*c165b184SJames Collins return preg_match('/' . preg_quote($pattern, '/') . '$/', $value); 802*c165b184SJames Collins case '*=': 803*c165b184SJames Collins return preg_match('/' . preg_quote($pattern, '/') . '/', $value); 804*c165b184SJames Collins case '|=': 805*c165b184SJames Collins /** 806*c165b184SJames Collins * [att|=val] 807*c165b184SJames Collins * 808*c165b184SJames Collins * Represents an element with the att attribute, its value 809*c165b184SJames Collins * either being exactly "val" or beginning with "val" 810*c165b184SJames Collins * immediately followed by "-" (U+002D). 811*c165b184SJames Collins */ 812*c165b184SJames Collins return strpos($value, $pattern) === 0; 813*c165b184SJames Collins case '~=': 814*c165b184SJames Collins /** 815*c165b184SJames Collins * [att~=val] 816*c165b184SJames Collins * 817*c165b184SJames Collins * Represents an element with the att attribute whose value is a 818*c165b184SJames Collins * whitespace-separated list of words, one of which is exactly 819*c165b184SJames Collins * "val". If "val" contains whitespace, it will never represent 820*c165b184SJames Collins * anything (since the words are separated by spaces). Also if 821*c165b184SJames Collins * "val" is the empty string, it will never represent anything. 822*c165b184SJames Collins */ 823*c165b184SJames Collins return in_array($pattern, explode(' ', trim($value)), true); 824*c165b184SJames Collins } 825*c165b184SJames Collins return false; 826*c165b184SJames Collins } 827*c165b184SJames Collins 828*c165b184SJames Collins protected function parse_selector($selector_string) 829*c165b184SJames Collins { 830*c165b184SJames Collins global $debug_object; 831*c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 832*c165b184SJames Collins 833*c165b184SJames Collins /** 834*c165b184SJames Collins * Pattern of CSS selectors, modified from mootools (https://mootools.net/) 835*c165b184SJames Collins * 836*c165b184SJames Collins * Paperg: Add the colon to the attribute, so that it properly finds 837*c165b184SJames Collins * <tag attr:ibute="something" > like google does. 838*c165b184SJames Collins * 839*c165b184SJames Collins * Note: if you try to look at this attribute, you MUST use getAttribute 840*c165b184SJames Collins * since $dom->x:y will fail the php syntax check. 841*c165b184SJames Collins * 842*c165b184SJames Collins * Notice the \[ starting the attribute? and the @? following? This 843*c165b184SJames Collins * implies that an attribute can begin with an @ sign that is not 844*c165b184SJames Collins * captured. This implies that an html attribute specifier may start 845*c165b184SJames Collins * with an @ sign that is NOT captured by the expression. Farther study 846*c165b184SJames Collins * is required to determine of this should be documented or removed. 847*c165b184SJames Collins * 848*c165b184SJames Collins * Matches selectors in this order: 849*c165b184SJames Collins * 850*c165b184SJames Collins * [0] - full match 851*c165b184SJames Collins * 852*c165b184SJames Collins * [1] - tag name 853*c165b184SJames Collins * ([\w:\*-]*) 854*c165b184SJames Collins * Matches the tag name consisting of zero or more words, colons, 855*c165b184SJames Collins * asterisks and hyphens. 856*c165b184SJames Collins * 857*c165b184SJames Collins * [2] - id name 858*c165b184SJames Collins * (?:\#([\w-]+)) 859*c165b184SJames Collins * Optionally matches a id name, consisting of an "#" followed by 860*c165b184SJames Collins * the id name (one or more words and hyphens). 861*c165b184SJames Collins * 862*c165b184SJames Collins * [3] - class names (including dots) 863*c165b184SJames Collins * (?:\.([\w\.-]+))? 864*c165b184SJames Collins * Optionally matches a list of classs, consisting of an "." 865*c165b184SJames Collins * followed by the class name (one or more words and hyphens) 866*c165b184SJames Collins * where multiple classes can be chained (i.e. ".foo.bar.baz") 867*c165b184SJames Collins * 868*c165b184SJames Collins * [4] - attributes 869*c165b184SJames Collins * ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)? 870*c165b184SJames Collins * Optionally matches the attributes list 871*c165b184SJames Collins * 872*c165b184SJames Collins * [5] - separator 873*c165b184SJames Collins * ([\/, >+~]+) 874*c165b184SJames Collins * Matches the selector list separator 875*c165b184SJames Collins */ 876*c165b184SJames Collins // phpcs:ignore Generic.Files.LineLength 877*c165b184SJames Collins $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is"; 878*c165b184SJames Collins 879*c165b184SJames Collins preg_match_all( 880*c165b184SJames Collins $pattern, 881*c165b184SJames Collins trim($selector_string) . ' ', // Add final ' ' as pseudo separator 882*c165b184SJames Collins $matches, 883*c165b184SJames Collins PREG_SET_ORDER 884*c165b184SJames Collins ); 885*c165b184SJames Collins 886*c165b184SJames Collins if (is_object($debug_object)) { 887*c165b184SJames Collins $debug_object->debug_log(2, 'Matches Array: ', $matches); 888*c165b184SJames Collins } 889*c165b184SJames Collins 890*c165b184SJames Collins $selectors = array(); 891*c165b184SJames Collins $result = array(); 892*c165b184SJames Collins 893*c165b184SJames Collins foreach ($matches as $m) { 894*c165b184SJames Collins $m[0] = trim($m[0]); 895*c165b184SJames Collins 896*c165b184SJames Collins // Skip NoOps 897*c165b184SJames Collins if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; } 898*c165b184SJames Collins 899*c165b184SJames Collins // Convert to lowercase 900*c165b184SJames Collins if ($this->dom->lowercase) { 901*c165b184SJames Collins $m[1] = strtolower($m[1]); 902*c165b184SJames Collins } 903*c165b184SJames Collins 904*c165b184SJames Collins // Extract classes 905*c165b184SJames Collins if ($m[3] !== '') { $m[3] = explode('.', $m[3]); } 906*c165b184SJames Collins 907*c165b184SJames Collins /* Extract attributes (pattern based on the pattern above!) 908*c165b184SJames Collins 909*c165b184SJames Collins * [0] - full match 910*c165b184SJames Collins * [1] - attribute name 911*c165b184SJames Collins * [2] - attribute expression 912*c165b184SJames Collins * [3] - attribute value 913*c165b184SJames Collins * [4] - case sensitivity 914*c165b184SJames Collins * 915*c165b184SJames Collins * Note: Attributes can be negated with a "!" prefix to their name 916*c165b184SJames Collins */ 917*c165b184SJames Collins if($m[4] !== '') { 918*c165b184SJames Collins preg_match_all( 919*c165b184SJames Collins "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is", 920*c165b184SJames Collins trim($m[4]), 921*c165b184SJames Collins $attributes, 922*c165b184SJames Collins PREG_SET_ORDER 923*c165b184SJames Collins ); 924*c165b184SJames Collins 925*c165b184SJames Collins // Replace element by array 926*c165b184SJames Collins $m[4] = array(); 927*c165b184SJames Collins 928*c165b184SJames Collins foreach($attributes as $att) { 929*c165b184SJames Collins // Skip empty matches 930*c165b184SJames Collins if(trim($att[0]) === '') { continue; } 931*c165b184SJames Collins 932*c165b184SJames Collins $inverted = (isset($att[1][0]) && $att[1][0] === '!'); 933*c165b184SJames Collins $m[4][] = array( 934*c165b184SJames Collins $inverted ? substr($att[1], 1) : $att[1], // Name 935*c165b184SJames Collins (isset($att[2])) ? $att[2] : '', // Expression 936*c165b184SJames Collins (isset($att[3])) ? $att[3] : '', // Value 937*c165b184SJames Collins $inverted, // Inverted Flag 938*c165b184SJames Collins (isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity 939*c165b184SJames Collins ); 940*c165b184SJames Collins } 941*c165b184SJames Collins } 942*c165b184SJames Collins 943*c165b184SJames Collins // Sanitize Separator 944*c165b184SJames Collins if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator 945*c165b184SJames Collins $m[5] = ' '; 946*c165b184SJames Collins } else { // Other Separator 947*c165b184SJames Collins $m[5] = trim($m[5]); 948*c165b184SJames Collins } 949*c165b184SJames Collins 950*c165b184SJames Collins // Clear Separator if it's a Selector List 951*c165b184SJames Collins if ($is_list = ($m[5] === ',')) { $m[5] = ''; } 952*c165b184SJames Collins 953*c165b184SJames Collins // Remove full match before adding to results 954*c165b184SJames Collins array_shift($m); 955*c165b184SJames Collins $result[] = $m; 956*c165b184SJames Collins 957*c165b184SJames Collins if ($is_list) { // Selector List 958*c165b184SJames Collins $selectors[] = $result; 959*c165b184SJames Collins $result = array(); 960*c165b184SJames Collins } 961*c165b184SJames Collins } 962*c165b184SJames Collins 963*c165b184SJames Collins if (count($result) > 0) { $selectors[] = $result; } 964*c165b184SJames Collins return $selectors; 965*c165b184SJames Collins } 966*c165b184SJames Collins 967*c165b184SJames Collins function __get($name) 968*c165b184SJames Collins { 969*c165b184SJames Collins if (isset($this->attr[$name])) { 970*c165b184SJames Collins return $this->convert_text($this->attr[$name]); 971*c165b184SJames Collins } 972*c165b184SJames Collins switch ($name) { 973*c165b184SJames Collins case 'outertext': return $this->outertext(); 974*c165b184SJames Collins case 'innertext': return $this->innertext(); 975*c165b184SJames Collins case 'plaintext': return $this->text(); 976*c165b184SJames Collins case 'xmltext': return $this->xmltext(); 977*c165b184SJames Collins default: return array_key_exists($name, $this->attr); 978*c165b184SJames Collins } 979*c165b184SJames Collins } 980*c165b184SJames Collins 981*c165b184SJames Collins function __set($name, $value) 982*c165b184SJames Collins { 983*c165b184SJames Collins global $debug_object; 984*c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 985*c165b184SJames Collins 986*c165b184SJames Collins switch ($name) { 987*c165b184SJames Collins case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; 988*c165b184SJames Collins case 'innertext': 989*c165b184SJames Collins if (isset($this->_[HDOM_INFO_TEXT])) { 990*c165b184SJames Collins return $this->_[HDOM_INFO_TEXT] = $value; 991*c165b184SJames Collins } 992*c165b184SJames Collins return $this->_[HDOM_INFO_INNER] = $value; 993*c165b184SJames Collins } 994*c165b184SJames Collins 995*c165b184SJames Collins if (!isset($this->attr[$name])) { 996*c165b184SJames Collins $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); 997*c165b184SJames Collins $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 998*c165b184SJames Collins } 999*c165b184SJames Collins 1000*c165b184SJames Collins $this->attr[$name] = $value; 1001*c165b184SJames Collins } 1002*c165b184SJames Collins 1003*c165b184SJames Collins function __isset($name) 1004*c165b184SJames Collins { 1005*c165b184SJames Collins switch ($name) { 1006*c165b184SJames Collins case 'outertext': return true; 1007*c165b184SJames Collins case 'innertext': return true; 1008*c165b184SJames Collins case 'plaintext': return true; 1009*c165b184SJames Collins } 1010*c165b184SJames Collins //no value attr: nowrap, checked selected... 1011*c165b184SJames Collins return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); 1012*c165b184SJames Collins } 1013*c165b184SJames Collins 1014*c165b184SJames Collins function __unset($name) 1015*c165b184SJames Collins { 1016*c165b184SJames Collins if (isset($this->attr[$name])) { unset($this->attr[$name]); } 1017*c165b184SJames Collins } 1018*c165b184SJames Collins 1019*c165b184SJames Collins function convert_text($text) 1020*c165b184SJames Collins { 1021*c165b184SJames Collins global $debug_object; 1022*c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1023*c165b184SJames Collins 1024*c165b184SJames Collins $converted_text = $text; 1025*c165b184SJames Collins 1026*c165b184SJames Collins $sourceCharset = ''; 1027*c165b184SJames Collins $targetCharset = ''; 1028*c165b184SJames Collins 1029*c165b184SJames Collins if ($this->dom) { 1030*c165b184SJames Collins $sourceCharset = strtoupper($this->dom->_charset); 1031*c165b184SJames Collins $targetCharset = strtoupper($this->dom->_target_charset); 1032*c165b184SJames Collins } 1033*c165b184SJames Collins 1034*c165b184SJames Collins if (is_object($debug_object)) { 1035*c165b184SJames Collins $debug_object->debug_log(3, 1036*c165b184SJames Collins 'source charset: ' 1037*c165b184SJames Collins . $sourceCharset 1038*c165b184SJames Collins . ' target charaset: ' 1039*c165b184SJames Collins . $targetCharset 1040*c165b184SJames Collins ); 1041*c165b184SJames Collins } 1042*c165b184SJames Collins 1043*c165b184SJames Collins if (!empty($sourceCharset) 1044*c165b184SJames Collins && !empty($targetCharset) 1045*c165b184SJames Collins && (strcasecmp($sourceCharset, $targetCharset) != 0)) { 1046*c165b184SJames Collins // Check if the reported encoding could have been incorrect and the text is actually already UTF-8 1047*c165b184SJames Collins if ((strcasecmp($targetCharset, 'UTF-8') == 0) 1048*c165b184SJames Collins && ($this->is_utf8($text))) { 1049*c165b184SJames Collins $converted_text = $text; 1050*c165b184SJames Collins } else { 1051*c165b184SJames Collins $converted_text = iconv($sourceCharset, $targetCharset, $text); 1052*c165b184SJames Collins } 1053*c165b184SJames Collins } 1054*c165b184SJames Collins 1055*c165b184SJames Collins // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output. 1056*c165b184SJames Collins if ($targetCharset === 'UTF-8') { 1057*c165b184SJames Collins if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") { 1058*c165b184SJames Collins $converted_text = substr($converted_text, 3); 1059*c165b184SJames Collins } 1060*c165b184SJames Collins 1061*c165b184SJames Collins if (substr($converted_text, -3) === "\xef\xbb\xbf") { 1062*c165b184SJames Collins $converted_text = substr($converted_text, 0, -3); 1063*c165b184SJames Collins } 1064*c165b184SJames Collins } 1065*c165b184SJames Collins 1066*c165b184SJames Collins return $converted_text; 1067*c165b184SJames Collins } 1068*c165b184SJames Collins 1069*c165b184SJames Collins static function is_utf8($str) 1070*c165b184SJames Collins { 1071*c165b184SJames Collins $c = 0; $b = 0; 1072*c165b184SJames Collins $bits = 0; 1073*c165b184SJames Collins $len = strlen($str); 1074*c165b184SJames Collins for($i = 0; $i < $len; $i++) { 1075*c165b184SJames Collins $c = ord($str[$i]); 1076*c165b184SJames Collins if($c > 128) { 1077*c165b184SJames Collins if(($c >= 254)) { return false; } 1078*c165b184SJames Collins elseif($c >= 252) { $bits = 6; } 1079*c165b184SJames Collins elseif($c >= 248) { $bits = 5; } 1080*c165b184SJames Collins elseif($c >= 240) { $bits = 4; } 1081*c165b184SJames Collins elseif($c >= 224) { $bits = 3; } 1082*c165b184SJames Collins elseif($c >= 192) { $bits = 2; } 1083*c165b184SJames Collins else { return false; } 1084*c165b184SJames Collins if(($i + $bits) > $len) { return false; } 1085*c165b184SJames Collins while($bits > 1) { 1086*c165b184SJames Collins $i++; 1087*c165b184SJames Collins $b = ord($str[$i]); 1088*c165b184SJames Collins if($b < 128 || $b > 191) { return false; } 1089*c165b184SJames Collins $bits--; 1090*c165b184SJames Collins } 1091*c165b184SJames Collins } 1092*c165b184SJames Collins } 1093*c165b184SJames Collins return true; 1094*c165b184SJames Collins } 1095*c165b184SJames Collins 1096*c165b184SJames Collins function get_display_size() 1097*c165b184SJames Collins { 1098*c165b184SJames Collins global $debug_object; 1099*c165b184SJames Collins 1100*c165b184SJames Collins $width = -1; 1101*c165b184SJames Collins $height = -1; 1102*c165b184SJames Collins 1103*c165b184SJames Collins if ($this->tag !== 'img') { 1104*c165b184SJames Collins return false; 1105*c165b184SJames Collins } 1106*c165b184SJames Collins 1107*c165b184SJames Collins // See if there is aheight or width attribute in the tag itself. 1108*c165b184SJames Collins if (isset($this->attr['width'])) { 1109*c165b184SJames Collins $width = $this->attr['width']; 1110*c165b184SJames Collins } 1111*c165b184SJames Collins 1112*c165b184SJames Collins if (isset($this->attr['height'])) { 1113*c165b184SJames Collins $height = $this->attr['height']; 1114*c165b184SJames Collins } 1115*c165b184SJames Collins 1116*c165b184SJames Collins // Now look for an inline style. 1117*c165b184SJames Collins if (isset($this->attr['style'])) { 1118*c165b184SJames Collins // Thanks to user gnarf from stackoverflow for this regular expression. 1119*c165b184SJames Collins $attributes = array(); 1120*c165b184SJames Collins 1121*c165b184SJames Collins preg_match_all( 1122*c165b184SJames Collins '/([\w-]+)\s*:\s*([^;]+)\s*;?/', 1123*c165b184SJames Collins $this->attr['style'], 1124*c165b184SJames Collins $matches, 1125*c165b184SJames Collins PREG_SET_ORDER 1126*c165b184SJames Collins ); 1127*c165b184SJames Collins 1128*c165b184SJames Collins foreach ($matches as $match) { 1129*c165b184SJames Collins $attributes[$match[1]] = $match[2]; 1130*c165b184SJames Collins } 1131*c165b184SJames Collins 1132*c165b184SJames Collins // If there is a width in the style attributes: 1133*c165b184SJames Collins if (isset($attributes['width']) && $width == -1) { 1134*c165b184SJames Collins // check that the last two characters are px (pixels) 1135*c165b184SJames Collins if (strtolower(substr($attributes['width'], -2)) === 'px') { 1136*c165b184SJames Collins $proposed_width = substr($attributes['width'], 0, -2); 1137*c165b184SJames Collins // Now make sure that it's an integer and not something stupid. 1138*c165b184SJames Collins if (filter_var($proposed_width, FILTER_VALIDATE_INT)) { 1139*c165b184SJames Collins $width = $proposed_width; 1140*c165b184SJames Collins } 1141*c165b184SJames Collins } 1142*c165b184SJames Collins } 1143*c165b184SJames Collins 1144*c165b184SJames Collins // If there is a width in the style attributes: 1145*c165b184SJames Collins if (isset($attributes['height']) && $height == -1) { 1146*c165b184SJames Collins // check that the last two characters are px (pixels) 1147*c165b184SJames Collins if (strtolower(substr($attributes['height'], -2)) == 'px') { 1148*c165b184SJames Collins $proposed_height = substr($attributes['height'], 0, -2); 1149*c165b184SJames Collins // Now make sure that it's an integer and not something stupid. 1150*c165b184SJames Collins if (filter_var($proposed_height, FILTER_VALIDATE_INT)) { 1151*c165b184SJames Collins $height = $proposed_height; 1152*c165b184SJames Collins } 1153*c165b184SJames Collins } 1154*c165b184SJames Collins } 1155*c165b184SJames Collins 1156*c165b184SJames Collins } 1157*c165b184SJames Collins 1158*c165b184SJames Collins // Future enhancement: 1159*c165b184SJames Collins // Look in the tag to see if there is a class or id specified that has 1160*c165b184SJames Collins // a height or width attribute to it. 1161*c165b184SJames Collins 1162*c165b184SJames Collins // Far future enhancement 1163*c165b184SJames Collins // Look at all the parent tags of this image to see if they specify a 1164*c165b184SJames Collins // class or id that has an img selector that specifies a height or width 1165*c165b184SJames Collins // Note that in this case, the class or id will have the img subselector 1166*c165b184SJames Collins // for it to apply to the image. 1167*c165b184SJames Collins 1168*c165b184SJames Collins // ridiculously far future development 1169*c165b184SJames Collins // If the class or id is specified in a SEPARATE css file thats not on 1170*c165b184SJames Collins // the page, go get it and do what we were just doing for the ones on 1171*c165b184SJames Collins // the page. 1172*c165b184SJames Collins 1173*c165b184SJames Collins $result = array( 1174*c165b184SJames Collins 'height' => $height, 1175*c165b184SJames Collins 'width' => $width 1176*c165b184SJames Collins ); 1177*c165b184SJames Collins 1178*c165b184SJames Collins return $result; 1179*c165b184SJames Collins } 1180*c165b184SJames Collins 1181*c165b184SJames Collins function save($filepath = '') 1182*c165b184SJames Collins { 1183*c165b184SJames Collins $ret = $this->outertext(); 1184*c165b184SJames Collins 1185*c165b184SJames Collins if ($filepath !== '') { 1186*c165b184SJames Collins file_put_contents($filepath, $ret, LOCK_EX); 1187*c165b184SJames Collins } 1188*c165b184SJames Collins 1189*c165b184SJames Collins return $ret; 1190*c165b184SJames Collins } 1191*c165b184SJames Collins 1192*c165b184SJames Collins function addClass($class) 1193*c165b184SJames Collins { 1194*c165b184SJames Collins if (is_string($class)) { 1195*c165b184SJames Collins $class = explode(' ', $class); 1196*c165b184SJames Collins } 1197*c165b184SJames Collins 1198*c165b184SJames Collins if (is_array($class)) { 1199*c165b184SJames Collins foreach($class as $c) { 1200*c165b184SJames Collins if (isset($this->class)) { 1201*c165b184SJames Collins if ($this->hasClass($c)) { 1202*c165b184SJames Collins continue; 1203*c165b184SJames Collins } else { 1204*c165b184SJames Collins $this->class .= ' ' . $c; 1205*c165b184SJames Collins } 1206*c165b184SJames Collins } else { 1207*c165b184SJames Collins $this->class = $c; 1208*c165b184SJames Collins } 1209*c165b184SJames Collins } 1210*c165b184SJames Collins } else { 1211*c165b184SJames Collins if (is_object($debug_object)) { 1212*c165b184SJames Collins $debug_object->debug_log(2, 'Invalid type: ', gettype($class)); 1213*c165b184SJames Collins } 1214*c165b184SJames Collins } 1215*c165b184SJames Collins } 1216*c165b184SJames Collins 1217*c165b184SJames Collins function hasClass($class) 1218*c165b184SJames Collins { 1219*c165b184SJames Collins if (is_string($class)) { 1220*c165b184SJames Collins if (isset($this->class)) { 1221*c165b184SJames Collins return in_array($class, explode(' ', $this->class), true); 1222*c165b184SJames Collins } 1223*c165b184SJames Collins } else { 1224*c165b184SJames Collins if (is_object($debug_object)) { 1225*c165b184SJames Collins $debug_object->debug_log(2, 'Invalid type: ', gettype($class)); 1226*c165b184SJames Collins } 1227*c165b184SJames Collins } 1228*c165b184SJames Collins 1229*c165b184SJames Collins return false; 1230*c165b184SJames Collins } 1231*c165b184SJames Collins 1232*c165b184SJames Collins function removeClass($class = null) 1233*c165b184SJames Collins { 1234*c165b184SJames Collins if (!isset($this->class)) { 1235*c165b184SJames Collins return; 1236*c165b184SJames Collins } 1237*c165b184SJames Collins 1238*c165b184SJames Collins if (is_null($class)) { 1239*c165b184SJames Collins $this->removeAttribute('class'); 1240*c165b184SJames Collins return; 1241*c165b184SJames Collins } 1242*c165b184SJames Collins 1243*c165b184SJames Collins if (is_string($class)) { 1244*c165b184SJames Collins $class = explode(' ', $class); 1245*c165b184SJames Collins } 1246*c165b184SJames Collins 1247*c165b184SJames Collins if (is_array($class)) { 1248*c165b184SJames Collins $class = array_diff(explode(' ', $this->class), $class); 1249*c165b184SJames Collins if (empty($class)) { 1250*c165b184SJames Collins $this->removeAttribute('class'); 1251*c165b184SJames Collins } else { 1252*c165b184SJames Collins $this->class = implode(' ', $class); 1253*c165b184SJames Collins } 1254*c165b184SJames Collins } 1255*c165b184SJames Collins } 1256*c165b184SJames Collins 1257*c165b184SJames Collins function getAllAttributes() 1258*c165b184SJames Collins { 1259*c165b184SJames Collins return $this->attr; 1260*c165b184SJames Collins } 1261*c165b184SJames Collins 1262*c165b184SJames Collins function getAttribute($name) 1263*c165b184SJames Collins { 1264*c165b184SJames Collins return $this->__get($name); 1265*c165b184SJames Collins } 1266*c165b184SJames Collins 1267*c165b184SJames Collins function setAttribute($name, $value) 1268*c165b184SJames Collins { 1269*c165b184SJames Collins $this->__set($name, $value); 1270*c165b184SJames Collins } 1271*c165b184SJames Collins 1272*c165b184SJames Collins function hasAttribute($name) 1273*c165b184SJames Collins { 1274*c165b184SJames Collins return $this->__isset($name); 1275*c165b184SJames Collins } 1276*c165b184SJames Collins 1277*c165b184SJames Collins function removeAttribute($name) 1278*c165b184SJames Collins { 1279*c165b184SJames Collins $this->__set($name, null); 1280*c165b184SJames Collins } 1281*c165b184SJames Collins 1282*c165b184SJames Collins function remove() 1283*c165b184SJames Collins { 1284*c165b184SJames Collins if ($this->parent) { 1285*c165b184SJames Collins $this->parent->removeChild($this); 1286*c165b184SJames Collins } 1287*c165b184SJames Collins } 1288*c165b184SJames Collins 1289*c165b184SJames Collins function removeChild($node) 1290*c165b184SJames Collins { 1291*c165b184SJames Collins $nidx = array_search($node, $this->nodes, true); 1292*c165b184SJames Collins $cidx = array_search($node, $this->children, true); 1293*c165b184SJames Collins $didx = array_search($node, $this->dom->nodes, true); 1294*c165b184SJames Collins 1295*c165b184SJames Collins if ($nidx !== false && $cidx !== false && $didx !== false) { 1296*c165b184SJames Collins 1297*c165b184SJames Collins foreach($node->children as $child) { 1298*c165b184SJames Collins $node->removeChild($child); 1299*c165b184SJames Collins } 1300*c165b184SJames Collins 1301*c165b184SJames Collins foreach($node->nodes as $entity) { 1302*c165b184SJames Collins $enidx = array_search($entity, $node->nodes, true); 1303*c165b184SJames Collins $edidx = array_search($entity, $node->dom->nodes, true); 1304*c165b184SJames Collins 1305*c165b184SJames Collins if ($enidx !== false && $edidx !== false) { 1306*c165b184SJames Collins unset($node->nodes[$enidx]); 1307*c165b184SJames Collins unset($node->dom->nodes[$edidx]); 1308*c165b184SJames Collins } 1309*c165b184SJames Collins } 1310*c165b184SJames Collins 1311*c165b184SJames Collins unset($this->nodes[$nidx]); 1312*c165b184SJames Collins unset($this->children[$cidx]); 1313*c165b184SJames Collins unset($this->dom->nodes[$didx]); 1314*c165b184SJames Collins 1315*c165b184SJames Collins $node->clear(); 1316*c165b184SJames Collins 1317*c165b184SJames Collins } 1318*c165b184SJames Collins } 1319*c165b184SJames Collins 1320*c165b184SJames Collins function getElementById($id) 1321*c165b184SJames Collins { 1322*c165b184SJames Collins return $this->find("#$id", 0); 1323*c165b184SJames Collins } 1324*c165b184SJames Collins 1325*c165b184SJames Collins function getElementsById($id, $idx = null) 1326*c165b184SJames Collins { 1327*c165b184SJames Collins return $this->find("#$id", $idx); 1328*c165b184SJames Collins } 1329*c165b184SJames Collins 1330*c165b184SJames Collins function getElementByTagName($name) 1331*c165b184SJames Collins { 1332*c165b184SJames Collins return $this->find($name, 0); 1333*c165b184SJames Collins } 1334*c165b184SJames Collins 1335*c165b184SJames Collins function getElementsByTagName($name, $idx = null) 1336*c165b184SJames Collins { 1337*c165b184SJames Collins return $this->find($name, $idx); 1338*c165b184SJames Collins } 1339*c165b184SJames Collins 1340*c165b184SJames Collins function parentNode() 1341*c165b184SJames Collins { 1342*c165b184SJames Collins return $this->parent(); 1343*c165b184SJames Collins } 1344*c165b184SJames Collins 1345*c165b184SJames Collins function childNodes($idx = -1) 1346*c165b184SJames Collins { 1347*c165b184SJames Collins return $this->children($idx); 1348*c165b184SJames Collins } 1349*c165b184SJames Collins 1350*c165b184SJames Collins function firstChild() 1351*c165b184SJames Collins { 1352*c165b184SJames Collins return $this->first_child(); 1353*c165b184SJames Collins } 1354*c165b184SJames Collins 1355*c165b184SJames Collins function lastChild() 1356*c165b184SJames Collins { 1357*c165b184SJames Collins return $this->last_child(); 1358*c165b184SJames Collins } 1359*c165b184SJames Collins 1360*c165b184SJames Collins function nextSibling() 1361*c165b184SJames Collins { 1362*c165b184SJames Collins return $this->next_sibling(); 1363*c165b184SJames Collins } 1364*c165b184SJames Collins 1365*c165b184SJames Collins function previousSibling() 1366*c165b184SJames Collins { 1367*c165b184SJames Collins return $this->prev_sibling(); 1368*c165b184SJames Collins } 1369*c165b184SJames Collins 1370*c165b184SJames Collins function hasChildNodes() 1371*c165b184SJames Collins { 1372*c165b184SJames Collins return $this->has_child(); 1373*c165b184SJames Collins } 1374*c165b184SJames Collins 1375*c165b184SJames Collins function nodeName() 1376*c165b184SJames Collins { 1377*c165b184SJames Collins return $this->tag; 1378*c165b184SJames Collins } 1379*c165b184SJames Collins 1380*c165b184SJames Collins function appendChild($node) 1381*c165b184SJames Collins { 1382*c165b184SJames Collins $node->parent($this); 1383*c165b184SJames Collins return $node; 1384*c165b184SJames Collins } 1385*c165b184SJames Collins 1386*c165b184SJames Collins} 1387*c165b184SJames Collins 1388*c165b184SJames Collinsclass simple_html_dom 1389*c165b184SJames Collins{ 1390*c165b184SJames Collins public $root = null; 1391*c165b184SJames Collins public $nodes = array(); 1392*c165b184SJames Collins public $callback = null; 1393*c165b184SJames Collins public $lowercase = false; 1394*c165b184SJames Collins public $original_size; 1395*c165b184SJames Collins public $size; 1396*c165b184SJames Collins 1397*c165b184SJames Collins protected $pos; 1398*c165b184SJames Collins protected $doc; 1399*c165b184SJames Collins protected $char; 1400*c165b184SJames Collins 1401*c165b184SJames Collins protected $cursor; 1402*c165b184SJames Collins protected $parent; 1403*c165b184SJames Collins protected $noise = array(); 1404*c165b184SJames Collins protected $token_blank = " \t\r\n"; 1405*c165b184SJames Collins protected $token_equal = ' =/>'; 1406*c165b184SJames Collins protected $token_slash = " />\r\n\t"; 1407*c165b184SJames Collins protected $token_attr = ' >'; 1408*c165b184SJames Collins 1409*c165b184SJames Collins public $_charset = ''; 1410*c165b184SJames Collins public $_target_charset = ''; 1411*c165b184SJames Collins 1412*c165b184SJames Collins protected $default_br_text = ''; 1413*c165b184SJames Collins 1414*c165b184SJames Collins public $default_span_text = ''; 1415*c165b184SJames Collins 1416*c165b184SJames Collins protected $self_closing_tags = array( 1417*c165b184SJames Collins 'area' => 1, 1418*c165b184SJames Collins 'base' => 1, 1419*c165b184SJames Collins 'br' => 1, 1420*c165b184SJames Collins 'col' => 1, 1421*c165b184SJames Collins 'embed' => 1, 1422*c165b184SJames Collins 'hr' => 1, 1423*c165b184SJames Collins 'img' => 1, 1424*c165b184SJames Collins 'input' => 1, 1425*c165b184SJames Collins 'link' => 1, 1426*c165b184SJames Collins 'meta' => 1, 1427*c165b184SJames Collins 'param' => 1, 1428*c165b184SJames Collins 'source' => 1, 1429*c165b184SJames Collins 'track' => 1, 1430*c165b184SJames Collins 'wbr' => 1 1431*c165b184SJames Collins ); 1432*c165b184SJames Collins protected $block_tags = array( 1433*c165b184SJames Collins 'body' => 1, 1434*c165b184SJames Collins 'div' => 1, 1435*c165b184SJames Collins 'form' => 1, 1436*c165b184SJames Collins 'root' => 1, 1437*c165b184SJames Collins 'span' => 1, 1438*c165b184SJames Collins 'table' => 1 1439*c165b184SJames Collins ); 1440*c165b184SJames Collins protected $optional_closing_tags = array( 1441*c165b184SJames Collins // Not optional, see 1442*c165b184SJames Collins // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element 1443*c165b184SJames Collins 'b' => array('b' => 1), 1444*c165b184SJames Collins 'dd' => array('dd' => 1, 'dt' => 1), 1445*c165b184SJames Collins // Not optional, see 1446*c165b184SJames Collins // https://www.w3.org/TR/html/grouping-content.html#the-dl-element 1447*c165b184SJames Collins 'dl' => array('dd' => 1, 'dt' => 1), 1448*c165b184SJames Collins 'dt' => array('dd' => 1, 'dt' => 1), 1449*c165b184SJames Collins 'li' => array('li' => 1), 1450*c165b184SJames Collins 'optgroup' => array('optgroup' => 1, 'option' => 1), 1451*c165b184SJames Collins 'option' => array('optgroup' => 1, 'option' => 1), 1452*c165b184SJames Collins 'p' => array('p' => 1), 1453*c165b184SJames Collins 'rp' => array('rp' => 1, 'rt' => 1), 1454*c165b184SJames Collins 'rt' => array('rp' => 1, 'rt' => 1), 1455*c165b184SJames Collins 'td' => array('td' => 1, 'th' => 1), 1456*c165b184SJames Collins 'th' => array('td' => 1, 'th' => 1), 1457*c165b184SJames Collins 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1), 1458*c165b184SJames Collins ); 1459*c165b184SJames Collins 1460*c165b184SJames Collins function __construct( 1461*c165b184SJames Collins $str = null, 1462*c165b184SJames Collins $lowercase = true, 1463*c165b184SJames Collins $forceTagsClosed = true, 1464*c165b184SJames Collins $target_charset = DEFAULT_TARGET_CHARSET, 1465*c165b184SJames Collins $stripRN = true, 1466*c165b184SJames Collins $defaultBRText = DEFAULT_BR_TEXT, 1467*c165b184SJames Collins $defaultSpanText = DEFAULT_SPAN_TEXT, 1468*c165b184SJames Collins $options = 0) 1469*c165b184SJames Collins { 1470*c165b184SJames Collins if ($str) { 1471*c165b184SJames Collins if (preg_match('/^http:\/\//i', $str) || is_file($str)) { 1472*c165b184SJames Collins $this->load_file($str); 1473*c165b184SJames Collins } else { 1474*c165b184SJames Collins $this->load( 1475*c165b184SJames Collins $str, 1476*c165b184SJames Collins $lowercase, 1477*c165b184SJames Collins $stripRN, 1478*c165b184SJames Collins $defaultBRText, 1479*c165b184SJames Collins $defaultSpanText, 1480*c165b184SJames Collins $options 1481*c165b184SJames Collins ); 1482*c165b184SJames Collins } 1483*c165b184SJames Collins } 1484*c165b184SJames Collins // Forcing tags to be closed implies that we don't trust the html, but 1485*c165b184SJames Collins // it can lead to parsing errors if we SHOULD trust the html. 1486*c165b184SJames Collins if (!$forceTagsClosed) { 1487*c165b184SJames Collins $this->optional_closing_array = array(); 1488*c165b184SJames Collins } 1489*c165b184SJames Collins 1490*c165b184SJames Collins $this->_target_charset = $target_charset; 1491*c165b184SJames Collins } 1492*c165b184SJames Collins 1493*c165b184SJames Collins function __destruct() 1494*c165b184SJames Collins { 1495*c165b184SJames Collins $this->clear(); 1496*c165b184SJames Collins } 1497*c165b184SJames Collins 1498*c165b184SJames Collins function load( 1499*c165b184SJames Collins $str, 1500*c165b184SJames Collins $lowercase = true, 1501*c165b184SJames Collins $stripRN = true, 1502*c165b184SJames Collins $defaultBRText = DEFAULT_BR_TEXT, 1503*c165b184SJames Collins $defaultSpanText = DEFAULT_SPAN_TEXT, 1504*c165b184SJames Collins $options = 0) 1505*c165b184SJames Collins { 1506*c165b184SJames Collins global $debug_object; 1507*c165b184SJames Collins 1508*c165b184SJames Collins // prepare 1509*c165b184SJames Collins $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText); 1510*c165b184SJames Collins 1511*c165b184SJames Collins // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1512*c165b184SJames Collins // Script tags removal now preceeds style tag removal. 1513*c165b184SJames Collins // strip out <script> tags 1514*c165b184SJames Collins $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"); 1515*c165b184SJames Collins $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"); 1516*c165b184SJames Collins 1517*c165b184SJames Collins // strip out the \r \n's if we are told to. 1518*c165b184SJames Collins if ($stripRN) { 1519*c165b184SJames Collins $this->doc = str_replace("\r", ' ', $this->doc); 1520*c165b184SJames Collins $this->doc = str_replace("\n", ' ', $this->doc); 1521*c165b184SJames Collins 1522*c165b184SJames Collins // set the length of content since we have changed it. 1523*c165b184SJames Collins $this->size = strlen($this->doc); 1524*c165b184SJames Collins } 1525*c165b184SJames Collins 1526*c165b184SJames Collins // strip out cdata 1527*c165b184SJames Collins $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1528*c165b184SJames Collins // strip out comments 1529*c165b184SJames Collins $this->remove_noise("'<!--(.*?)-->'is"); 1530*c165b184SJames Collins // strip out <style> tags 1531*c165b184SJames Collins $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); 1532*c165b184SJames Collins $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); 1533*c165b184SJames Collins // strip out preformatted tags 1534*c165b184SJames Collins $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); 1535*c165b184SJames Collins // strip out server side scripts 1536*c165b184SJames Collins $this->remove_noise("'(<\?)(.*?)(\?>)'s", true); 1537*c165b184SJames Collins 1538*c165b184SJames Collins if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts 1539*c165b184SJames Collins $this->remove_noise("'(\{\w)(.*?)(\})'s", true); 1540*c165b184SJames Collins } 1541*c165b184SJames Collins 1542*c165b184SJames Collins // parsing 1543*c165b184SJames Collins $this->parse(); 1544*c165b184SJames Collins // end 1545*c165b184SJames Collins $this->root->_[HDOM_INFO_END] = $this->cursor; 1546*c165b184SJames Collins $this->parse_charset(); 1547*c165b184SJames Collins 1548*c165b184SJames Collins // make load function chainable 1549*c165b184SJames Collins return $this; 1550*c165b184SJames Collins } 1551*c165b184SJames Collins 1552*c165b184SJames Collins function load_file() 1553*c165b184SJames Collins { 1554*c165b184SJames Collins $args = func_get_args(); 1555*c165b184SJames Collins 1556*c165b184SJames Collins if(($doc = call_user_func_array('file_get_contents', $args)) !== false) { 1557*c165b184SJames Collins $this->load($doc, true); 1558*c165b184SJames Collins } else { 1559*c165b184SJames Collins return false; 1560*c165b184SJames Collins } 1561*c165b184SJames Collins } 1562*c165b184SJames Collins 1563*c165b184SJames Collins function set_callback($function_name) 1564*c165b184SJames Collins { 1565*c165b184SJames Collins $this->callback = $function_name; 1566*c165b184SJames Collins } 1567*c165b184SJames Collins 1568*c165b184SJames Collins function remove_callback() 1569*c165b184SJames Collins { 1570*c165b184SJames Collins $this->callback = null; 1571*c165b184SJames Collins } 1572*c165b184SJames Collins 1573*c165b184SJames Collins function save($filepath = '') 1574*c165b184SJames Collins { 1575*c165b184SJames Collins $ret = $this->root->innertext(); 1576*c165b184SJames Collins if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); } 1577*c165b184SJames Collins return $ret; 1578*c165b184SJames Collins } 1579*c165b184SJames Collins 1580*c165b184SJames Collins function find($selector, $idx = null, $lowercase = false) 1581*c165b184SJames Collins { 1582*c165b184SJames Collins return $this->root->find($selector, $idx, $lowercase); 1583*c165b184SJames Collins } 1584*c165b184SJames Collins 1585*c165b184SJames Collins function clear() 1586*c165b184SJames Collins { 1587*c165b184SJames Collins if (isset($this->nodes)) { 1588*c165b184SJames Collins foreach ($this->nodes as $n) { 1589*c165b184SJames Collins $n->clear(); 1590*c165b184SJames Collins $n = null; 1591*c165b184SJames Collins } 1592*c165b184SJames Collins } 1593*c165b184SJames Collins 1594*c165b184SJames Collins // This add next line is documented in the sourceforge repository. 1595*c165b184SJames Collins // 2977248 as a fix for ongoing memory leaks that occur even with the 1596*c165b184SJames Collins // use of clear. 1597*c165b184SJames Collins if (isset($this->children)) { 1598*c165b184SJames Collins foreach ($this->children as $n) { 1599*c165b184SJames Collins $n->clear(); 1600*c165b184SJames Collins $n = null; 1601*c165b184SJames Collins } 1602*c165b184SJames Collins } 1603*c165b184SJames Collins 1604*c165b184SJames Collins if (isset($this->parent)) { 1605*c165b184SJames Collins $this->parent->clear(); 1606*c165b184SJames Collins unset($this->parent); 1607*c165b184SJames Collins } 1608*c165b184SJames Collins 1609*c165b184SJames Collins if (isset($this->root)) { 1610*c165b184SJames Collins $this->root->clear(); 1611*c165b184SJames Collins unset($this->root); 1612*c165b184SJames Collins } 1613*c165b184SJames Collins 1614*c165b184SJames Collins unset($this->doc); 1615*c165b184SJames Collins unset($this->noise); 1616*c165b184SJames Collins } 1617*c165b184SJames Collins 1618*c165b184SJames Collins function dump($show_attr = true) 1619*c165b184SJames Collins { 1620*c165b184SJames Collins $this->root->dump($show_attr); 1621*c165b184SJames Collins } 1622*c165b184SJames Collins 1623*c165b184SJames Collins protected function prepare( 1624*c165b184SJames Collins $str, $lowercase = true, 1625*c165b184SJames Collins $defaultBRText = DEFAULT_BR_TEXT, 1626*c165b184SJames Collins $defaultSpanText = DEFAULT_SPAN_TEXT) 1627*c165b184SJames Collins { 1628*c165b184SJames Collins $this->clear(); 1629*c165b184SJames Collins 1630*c165b184SJames Collins $this->doc = trim($str); 1631*c165b184SJames Collins $this->size = strlen($this->doc); 1632*c165b184SJames Collins $this->original_size = $this->size; // original size of the html 1633*c165b184SJames Collins $this->pos = 0; 1634*c165b184SJames Collins $this->cursor = 1; 1635*c165b184SJames Collins $this->noise = array(); 1636*c165b184SJames Collins $this->nodes = array(); 1637*c165b184SJames Collins $this->lowercase = $lowercase; 1638*c165b184SJames Collins $this->default_br_text = $defaultBRText; 1639*c165b184SJames Collins $this->default_span_text = $defaultSpanText; 1640*c165b184SJames Collins $this->root = new simple_html_dom_node($this); 1641*c165b184SJames Collins $this->root->tag = 'root'; 1642*c165b184SJames Collins $this->root->_[HDOM_INFO_BEGIN] = -1; 1643*c165b184SJames Collins $this->root->nodetype = HDOM_TYPE_ROOT; 1644*c165b184SJames Collins $this->parent = $this->root; 1645*c165b184SJames Collins if ($this->size > 0) { $this->char = $this->doc[0]; } 1646*c165b184SJames Collins } 1647*c165b184SJames Collins 1648*c165b184SJames Collins protected function parse() 1649*c165b184SJames Collins { 1650*c165b184SJames Collins while (true) { 1651*c165b184SJames Collins // Read next tag if there is no text between current position and the 1652*c165b184SJames Collins // next opening tag. 1653*c165b184SJames Collins if (($s = $this->copy_until_char('<')) === '') { 1654*c165b184SJames Collins if($this->read_tag()) { 1655*c165b184SJames Collins continue; 1656*c165b184SJames Collins } else { 1657*c165b184SJames Collins return true; 1658*c165b184SJames Collins } 1659*c165b184SJames Collins } 1660*c165b184SJames Collins 1661*c165b184SJames Collins // Add a text node for text between tags 1662*c165b184SJames Collins $node = new simple_html_dom_node($this); 1663*c165b184SJames Collins ++$this->cursor; 1664*c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = $s; 1665*c165b184SJames Collins $this->link_nodes($node, false); 1666*c165b184SJames Collins } 1667*c165b184SJames Collins } 1668*c165b184SJames Collins 1669*c165b184SJames Collins protected function parse_charset() 1670*c165b184SJames Collins { 1671*c165b184SJames Collins global $debug_object; 1672*c165b184SJames Collins 1673*c165b184SJames Collins $charset = null; 1674*c165b184SJames Collins 1675*c165b184SJames Collins if (function_exists('get_last_retrieve_url_contents_content_type')) { 1676*c165b184SJames Collins $contentTypeHeader = get_last_retrieve_url_contents_content_type(); 1677*c165b184SJames Collins $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); 1678*c165b184SJames Collins if ($success) { 1679*c165b184SJames Collins $charset = $matches[1]; 1680*c165b184SJames Collins if (is_object($debug_object)) { 1681*c165b184SJames Collins $debug_object->debug_log(2, 1682*c165b184SJames Collins 'header content-type found charset of: ' 1683*c165b184SJames Collins . $charset 1684*c165b184SJames Collins ); 1685*c165b184SJames Collins } 1686*c165b184SJames Collins } 1687*c165b184SJames Collins } 1688*c165b184SJames Collins 1689*c165b184SJames Collins if (empty($charset)) { 1690*c165b184SJames Collins // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type 1691*c165b184SJames Collins $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true); 1692*c165b184SJames Collins 1693*c165b184SJames Collins if (!empty($el)) { 1694*c165b184SJames Collins $fullvalue = $el->content; 1695*c165b184SJames Collins if (is_object($debug_object)) { 1696*c165b184SJames Collins $debug_object->debug_log(2, 1697*c165b184SJames Collins 'meta content-type tag found' 1698*c165b184SJames Collins . $fullvalue 1699*c165b184SJames Collins ); 1700*c165b184SJames Collins } 1701*c165b184SJames Collins 1702*c165b184SJames Collins if (!empty($fullvalue)) { 1703*c165b184SJames Collins $success = preg_match( 1704*c165b184SJames Collins '/charset=(.+)/i', 1705*c165b184SJames Collins $fullvalue, 1706*c165b184SJames Collins $matches 1707*c165b184SJames Collins ); 1708*c165b184SJames Collins 1709*c165b184SJames Collins if ($success) { 1710*c165b184SJames Collins $charset = $matches[1]; 1711*c165b184SJames Collins } else { 1712*c165b184SJames Collins // If there is a meta tag, and they don't specify the 1713*c165b184SJames Collins // character set, research says that it's typically 1714*c165b184SJames Collins // ISO-8859-1 1715*c165b184SJames Collins if (is_object($debug_object)) { 1716*c165b184SJames Collins $debug_object->debug_log(2, 1717*c165b184SJames Collins 'meta content-type tag couldn\'t be parsed. using iso-8859 default.' 1718*c165b184SJames Collins ); 1719*c165b184SJames Collins } 1720*c165b184SJames Collins 1721*c165b184SJames Collins $charset = 'ISO-8859-1'; 1722*c165b184SJames Collins } 1723*c165b184SJames Collins } 1724*c165b184SJames Collins } 1725*c165b184SJames Collins } 1726*c165b184SJames Collins 1727*c165b184SJames Collins if (empty($charset)) { 1728*c165b184SJames Collins // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration 1729*c165b184SJames Collins if ($meta = $this->root->find('meta[charset]', 0)) { 1730*c165b184SJames Collins $charset = $meta->charset; 1731*c165b184SJames Collins if (is_object($debug_object)) { 1732*c165b184SJames Collins $debug_object->debug_log(2, 'meta charset: ' . $charset); 1733*c165b184SJames Collins } 1734*c165b184SJames Collins } 1735*c165b184SJames Collins } 1736*c165b184SJames Collins 1737*c165b184SJames Collins if (empty($charset)) { 1738*c165b184SJames Collins // Try to guess the charset based on the content 1739*c165b184SJames Collins // Requires Multibyte String (mbstring) support (optional) 1740*c165b184SJames Collins if (function_exists('mb_detect_encoding')) { 1741*c165b184SJames Collins /** 1742*c165b184SJames Collins * mb_detect_encoding() is not intended to distinguish between 1743*c165b184SJames Collins * charsets, especially single-byte charsets. Its primary 1744*c165b184SJames Collins * purpose is to detect which multibyte encoding is in use, 1745*c165b184SJames Collins * i.e. UTF-8, UTF-16, shift-JIS, etc. 1746*c165b184SJames Collins * 1747*c165b184SJames Collins * -- https://bugs.php.net/bug.php?id=38138 1748*c165b184SJames Collins * 1749*c165b184SJames Collins * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will 1750*c165b184SJames Collins * always result in CP1251/ISO-8859-5 and vice versa. 1751*c165b184SJames Collins * 1752*c165b184SJames Collins * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1 1753*c165b184SJames Collins * to stay compatible. 1754*c165b184SJames Collins */ 1755*c165b184SJames Collins $encoding = mb_detect_encoding( 1756*c165b184SJames Collins $this->doc, 1757*c165b184SJames Collins array( 'UTF-8', 'CP1252', 'ISO-8859-1' ) 1758*c165b184SJames Collins ); 1759*c165b184SJames Collins 1760*c165b184SJames Collins if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') { 1761*c165b184SJames Collins // Due to a limitation of mb_detect_encoding 1762*c165b184SJames Collins // 'CP1251'/'ISO-8859-5' will be detected as 1763*c165b184SJames Collins // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in 1764*c165b184SJames Collins // which case we can simply assume it is the other charset. 1765*c165b184SJames Collins if (!@iconv('CP1252', 'UTF-8', $this->doc)) { 1766*c165b184SJames Collins $encoding = 'CP1251'; 1767*c165b184SJames Collins } 1768*c165b184SJames Collins } 1769*c165b184SJames Collins 1770*c165b184SJames Collins if ($encoding !== false) { 1771*c165b184SJames Collins $charset = $encoding; 1772*c165b184SJames Collins if (is_object($debug_object)) { 1773*c165b184SJames Collins $debug_object->debug_log(2, 'mb_detect: ' . $charset); 1774*c165b184SJames Collins } 1775*c165b184SJames Collins } 1776*c165b184SJames Collins } 1777*c165b184SJames Collins } 1778*c165b184SJames Collins 1779*c165b184SJames Collins if (empty($charset)) { 1780*c165b184SJames Collins // Assume it's UTF-8 as it is the most likely charset to be used 1781*c165b184SJames Collins $charset = 'UTF-8'; 1782*c165b184SJames Collins if (is_object($debug_object)) { 1783*c165b184SJames Collins $debug_object->debug_log(2, 'No match found, assume ' . $charset); 1784*c165b184SJames Collins } 1785*c165b184SJames Collins } 1786*c165b184SJames Collins 1787*c165b184SJames Collins // Since CP1252 is a superset, if we get one of it's subsets, we want 1788*c165b184SJames Collins // it instead. 1789*c165b184SJames Collins if ((strtolower($charset) == 'iso-8859-1') 1790*c165b184SJames Collins || (strtolower($charset) == 'latin1') 1791*c165b184SJames Collins || (strtolower($charset) == 'latin-1')) { 1792*c165b184SJames Collins $charset = 'CP1252'; 1793*c165b184SJames Collins if (is_object($debug_object)) { 1794*c165b184SJames Collins $debug_object->debug_log(2, 1795*c165b184SJames Collins 'replacing ' . $charset . ' with CP1252 as its a superset' 1796*c165b184SJames Collins ); 1797*c165b184SJames Collins } 1798*c165b184SJames Collins } 1799*c165b184SJames Collins 1800*c165b184SJames Collins if (is_object($debug_object)) { 1801*c165b184SJames Collins $debug_object->debug_log(1, 'EXIT - ' . $charset); 1802*c165b184SJames Collins } 1803*c165b184SJames Collins 1804*c165b184SJames Collins return $this->_charset = $charset; 1805*c165b184SJames Collins } 1806*c165b184SJames Collins 1807*c165b184SJames Collins protected function read_tag() 1808*c165b184SJames Collins { 1809*c165b184SJames Collins // Set end position if no further tags found 1810*c165b184SJames Collins if ($this->char !== '<') { 1811*c165b184SJames Collins $this->root->_[HDOM_INFO_END] = $this->cursor; 1812*c165b184SJames Collins return false; 1813*c165b184SJames Collins } 1814*c165b184SJames Collins 1815*c165b184SJames Collins $begin_tag_pos = $this->pos; 1816*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1817*c165b184SJames Collins 1818*c165b184SJames Collins // end tag 1819*c165b184SJames Collins if ($this->char === '/') { 1820*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1821*c165b184SJames Collins 1822*c165b184SJames Collins // Skip whitespace in end tags (i.e. in "</ html>") 1823*c165b184SJames Collins $this->skip($this->token_blank); 1824*c165b184SJames Collins $tag = $this->copy_until_char('>'); 1825*c165b184SJames Collins 1826*c165b184SJames Collins // Skip attributes in end tags 1827*c165b184SJames Collins if (($pos = strpos($tag, ' ')) !== false) { 1828*c165b184SJames Collins $tag = substr($tag, 0, $pos); 1829*c165b184SJames Collins } 1830*c165b184SJames Collins 1831*c165b184SJames Collins $parent_lower = strtolower($this->parent->tag); 1832*c165b184SJames Collins $tag_lower = strtolower($tag); 1833*c165b184SJames Collins 1834*c165b184SJames Collins // The end tag is supposed to close the parent tag. Handle situations 1835*c165b184SJames Collins // when it doesn't 1836*c165b184SJames Collins if ($parent_lower !== $tag_lower) { 1837*c165b184SJames Collins // Parent tag does not have to be closed necessarily (optional closing tag) 1838*c165b184SJames Collins // Current tag is a block tag, so it may close an ancestor 1839*c165b184SJames Collins if (isset($this->optional_closing_tags[$parent_lower]) 1840*c165b184SJames Collins && isset($this->block_tags[$tag_lower])) { 1841*c165b184SJames Collins 1842*c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; 1843*c165b184SJames Collins $org_parent = $this->parent; 1844*c165b184SJames Collins 1845*c165b184SJames Collins // Traverse ancestors to find a matching opening tag 1846*c165b184SJames Collins // Stop at root node 1847*c165b184SJames Collins while (($this->parent->parent) 1848*c165b184SJames Collins && strtolower($this->parent->tag) !== $tag_lower 1849*c165b184SJames Collins ){ 1850*c165b184SJames Collins $this->parent = $this->parent->parent; 1851*c165b184SJames Collins } 1852*c165b184SJames Collins 1853*c165b184SJames Collins // If we don't have a match add current tag as text node 1854*c165b184SJames Collins if (strtolower($this->parent->tag) !== $tag_lower) { 1855*c165b184SJames Collins $this->parent = $org_parent; // restore origonal parent 1856*c165b184SJames Collins 1857*c165b184SJames Collins if ($this->parent->parent) { 1858*c165b184SJames Collins $this->parent = $this->parent->parent; 1859*c165b184SJames Collins } 1860*c165b184SJames Collins 1861*c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = $this->cursor; 1862*c165b184SJames Collins return $this->as_text_node($tag); 1863*c165b184SJames Collins } 1864*c165b184SJames Collins } elseif (($this->parent->parent) 1865*c165b184SJames Collins && isset($this->block_tags[$tag_lower]) 1866*c165b184SJames Collins ) { 1867*c165b184SJames Collins // Grandparent exists and current tag is a block tag, so our 1868*c165b184SJames Collins // parent doesn't have an end tag 1869*c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; // No end tag 1870*c165b184SJames Collins $org_parent = $this->parent; 1871*c165b184SJames Collins 1872*c165b184SJames Collins // Traverse ancestors to find a matching opening tag 1873*c165b184SJames Collins // Stop at root node 1874*c165b184SJames Collins while (($this->parent->parent) 1875*c165b184SJames Collins && strtolower($this->parent->tag) !== $tag_lower 1876*c165b184SJames Collins ) { 1877*c165b184SJames Collins $this->parent = $this->parent->parent; 1878*c165b184SJames Collins } 1879*c165b184SJames Collins 1880*c165b184SJames Collins // If we don't have a match add current tag as text node 1881*c165b184SJames Collins if (strtolower($this->parent->tag) !== $tag_lower) { 1882*c165b184SJames Collins $this->parent = $org_parent; // restore origonal parent 1883*c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = $this->cursor; 1884*c165b184SJames Collins return $this->as_text_node($tag); 1885*c165b184SJames Collins } 1886*c165b184SJames Collins } elseif (($this->parent->parent) 1887*c165b184SJames Collins && strtolower($this->parent->parent->tag) === $tag_lower 1888*c165b184SJames Collins ) { // Grandparent exists and current tag closes it 1889*c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; 1890*c165b184SJames Collins $this->parent = $this->parent->parent; 1891*c165b184SJames Collins } else { // Random tag, add as text node 1892*c165b184SJames Collins return $this->as_text_node($tag); 1893*c165b184SJames Collins } 1894*c165b184SJames Collins } 1895*c165b184SJames Collins 1896*c165b184SJames Collins // Set end position of parent tag to current cursor position 1897*c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = $this->cursor; 1898*c165b184SJames Collins 1899*c165b184SJames Collins if ($this->parent->parent) { 1900*c165b184SJames Collins $this->parent = $this->parent->parent; 1901*c165b184SJames Collins } 1902*c165b184SJames Collins 1903*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1904*c165b184SJames Collins return true; 1905*c165b184SJames Collins } 1906*c165b184SJames Collins 1907*c165b184SJames Collins // start tag 1908*c165b184SJames Collins $node = new simple_html_dom_node($this); 1909*c165b184SJames Collins $node->_[HDOM_INFO_BEGIN] = $this->cursor; 1910*c165b184SJames Collins ++$this->cursor; 1911*c165b184SJames Collins $tag = $this->copy_until($this->token_slash); // Get tag name 1912*c165b184SJames Collins $node->tag_start = $begin_tag_pos; 1913*c165b184SJames Collins 1914*c165b184SJames Collins // doctype, cdata & comments... 1915*c165b184SJames Collins // <!DOCTYPE html> 1916*c165b184SJames Collins // <![CDATA[ ... ]]> 1917*c165b184SJames Collins // <!-- Comment --> 1918*c165b184SJames Collins if (isset($tag[0]) && $tag[0] === '!') { 1919*c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); 1920*c165b184SJames Collins 1921*c165b184SJames Collins if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--") 1922*c165b184SJames Collins $node->nodetype = HDOM_TYPE_COMMENT; 1923*c165b184SJames Collins $node->tag = 'comment'; 1924*c165b184SJames Collins } else { // Could be doctype or CDATA but we don't care 1925*c165b184SJames Collins $node->nodetype = HDOM_TYPE_UNKNOWN; 1926*c165b184SJames Collins $node->tag = 'unknown'; 1927*c165b184SJames Collins } 1928*c165b184SJames Collins 1929*c165b184SJames Collins if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1930*c165b184SJames Collins 1931*c165b184SJames Collins $this->link_nodes($node, true); 1932*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1933*c165b184SJames Collins return true; 1934*c165b184SJames Collins } 1935*c165b184SJames Collins 1936*c165b184SJames Collins // The start tag cannot contain another start tag, if so add as text 1937*c165b184SJames Collins // i.e. "<<html>" 1938*c165b184SJames Collins if ($pos = strpos($tag, '<') !== false) { 1939*c165b184SJames Collins $tag = '<' . substr($tag, 0, -1); 1940*c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = $tag; 1941*c165b184SJames Collins $this->link_nodes($node, false); 1942*c165b184SJames Collins $this->char = $this->doc[--$this->pos]; // prev 1943*c165b184SJames Collins return true; 1944*c165b184SJames Collins } 1945*c165b184SJames Collins 1946*c165b184SJames Collins // Handle invalid tag names (i.e. "<html#doc>") 1947*c165b184SJames Collins if (!preg_match('/^\w[\w:-]*$/', $tag)) { 1948*c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); 1949*c165b184SJames Collins 1950*c165b184SJames Collins // Next char is the beginning of a new tag, don't touch it. 1951*c165b184SJames Collins if ($this->char === '<') { 1952*c165b184SJames Collins $this->link_nodes($node, false); 1953*c165b184SJames Collins return true; 1954*c165b184SJames Collins } 1955*c165b184SJames Collins 1956*c165b184SJames Collins // Next char closes current tag, add and be done with it. 1957*c165b184SJames Collins if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; } 1958*c165b184SJames Collins $this->link_nodes($node, false); 1959*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1960*c165b184SJames Collins return true; 1961*c165b184SJames Collins } 1962*c165b184SJames Collins 1963*c165b184SJames Collins // begin tag, add new node 1964*c165b184SJames Collins $node->nodetype = HDOM_TYPE_ELEMENT; 1965*c165b184SJames Collins $tag_lower = strtolower($tag); 1966*c165b184SJames Collins $node->tag = ($this->lowercase) ? $tag_lower : $tag; 1967*c165b184SJames Collins 1968*c165b184SJames Collins // handle optional closing tags 1969*c165b184SJames Collins if (isset($this->optional_closing_tags[$tag_lower])) { 1970*c165b184SJames Collins // Traverse ancestors to close all optional closing tags 1971*c165b184SJames Collins while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { 1972*c165b184SJames Collins $this->parent->_[HDOM_INFO_END] = 0; 1973*c165b184SJames Collins $this->parent = $this->parent->parent; 1974*c165b184SJames Collins } 1975*c165b184SJames Collins $node->parent = $this->parent; 1976*c165b184SJames Collins } 1977*c165b184SJames Collins 1978*c165b184SJames Collins $guard = 0; // prevent infinity loop 1979*c165b184SJames Collins 1980*c165b184SJames Collins // [0] Space between tag and first attribute 1981*c165b184SJames Collins $space = array($this->copy_skip($this->token_blank), '', ''); 1982*c165b184SJames Collins 1983*c165b184SJames Collins // attributes 1984*c165b184SJames Collins do { 1985*c165b184SJames Collins // Everything until the first equal sign should be the attribute name 1986*c165b184SJames Collins $name = $this->copy_until($this->token_equal); 1987*c165b184SJames Collins 1988*c165b184SJames Collins if ($name === '' && $this->char !== null && $space[0] === '') { 1989*c165b184SJames Collins break; 1990*c165b184SJames Collins } 1991*c165b184SJames Collins 1992*c165b184SJames Collins if ($guard === $this->pos) { // Escape infinite loop 1993*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 1994*c165b184SJames Collins continue; 1995*c165b184SJames Collins } 1996*c165b184SJames Collins 1997*c165b184SJames Collins $guard = $this->pos; 1998*c165b184SJames Collins 1999*c165b184SJames Collins // handle endless '<' 2000*c165b184SJames Collins // Out of bounds before the tag ended 2001*c165b184SJames Collins if ($this->pos >= $this->size - 1 && $this->char !== '>') { 2002*c165b184SJames Collins $node->nodetype = HDOM_TYPE_TEXT; 2003*c165b184SJames Collins $node->_[HDOM_INFO_END] = 0; 2004*c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name; 2005*c165b184SJames Collins $node->tag = 'text'; 2006*c165b184SJames Collins $this->link_nodes($node, false); 2007*c165b184SJames Collins return true; 2008*c165b184SJames Collins } 2009*c165b184SJames Collins 2010*c165b184SJames Collins // handle mismatch '<' 2011*c165b184SJames Collins // Attributes cannot start after opening tag 2012*c165b184SJames Collins if ($this->doc[$this->pos - 1] == '<') { 2013*c165b184SJames Collins $node->nodetype = HDOM_TYPE_TEXT; 2014*c165b184SJames Collins $node->tag = 'text'; 2015*c165b184SJames Collins $node->attr = array(); 2016*c165b184SJames Collins $node->_[HDOM_INFO_END] = 0; 2017*c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = substr( 2018*c165b184SJames Collins $this->doc, 2019*c165b184SJames Collins $begin_tag_pos, 2020*c165b184SJames Collins $this->pos - $begin_tag_pos - 1 2021*c165b184SJames Collins ); 2022*c165b184SJames Collins $this->pos -= 2; 2023*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2024*c165b184SJames Collins $this->link_nodes($node, false); 2025*c165b184SJames Collins return true; 2026*c165b184SJames Collins } 2027*c165b184SJames Collins 2028*c165b184SJames Collins if ($name !== '/' && $name !== '') { // this is a attribute name 2029*c165b184SJames Collins // [1] Whitespace after attribute name 2030*c165b184SJames Collins $space[1] = $this->copy_skip($this->token_blank); 2031*c165b184SJames Collins 2032*c165b184SJames Collins $name = $this->restore_noise($name); // might be a noisy name 2033*c165b184SJames Collins 2034*c165b184SJames Collins if ($this->lowercase) { $name = strtolower($name); } 2035*c165b184SJames Collins 2036*c165b184SJames Collins if ($this->char === '=') { // attribute with value 2037*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2038*c165b184SJames Collins $this->parse_attr($node, $name, $space); // get attribute value 2039*c165b184SJames Collins } else { 2040*c165b184SJames Collins //no value attr: nowrap, checked selected... 2041*c165b184SJames Collins $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 2042*c165b184SJames Collins $node->attr[$name] = true; 2043*c165b184SJames Collins if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev 2044*c165b184SJames Collins } 2045*c165b184SJames Collins 2046*c165b184SJames Collins $node->_[HDOM_INFO_SPACE][] = $space; 2047*c165b184SJames Collins 2048*c165b184SJames Collins // prepare for next attribute 2049*c165b184SJames Collins $space = array( 2050*c165b184SJames Collins $this->copy_skip($this->token_blank), 2051*c165b184SJames Collins '', 2052*c165b184SJames Collins '' 2053*c165b184SJames Collins ); 2054*c165b184SJames Collins } else { // no more attributes 2055*c165b184SJames Collins break; 2056*c165b184SJames Collins } 2057*c165b184SJames Collins } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended 2058*c165b184SJames Collins 2059*c165b184SJames Collins $this->link_nodes($node, true); 2060*c165b184SJames Collins $node->_[HDOM_INFO_ENDSPACE] = $space[0]; 2061*c165b184SJames Collins 2062*c165b184SJames Collins // handle empty tags (i.e. "<div/>") 2063*c165b184SJames Collins if ($this->copy_until_char('>') === '/') { 2064*c165b184SJames Collins $node->_[HDOM_INFO_ENDSPACE] .= '/'; 2065*c165b184SJames Collins $node->_[HDOM_INFO_END] = 0; 2066*c165b184SJames Collins } else { 2067*c165b184SJames Collins // reset parent 2068*c165b184SJames Collins if (!isset($this->self_closing_tags[strtolower($node->tag)])) { 2069*c165b184SJames Collins $this->parent = $node; 2070*c165b184SJames Collins } 2071*c165b184SJames Collins } 2072*c165b184SJames Collins 2073*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2074*c165b184SJames Collins 2075*c165b184SJames Collins // If it's a BR tag, we need to set it's text to the default text. 2076*c165b184SJames Collins // This way when we see it in plaintext, we can generate formatting that the user wants. 2077*c165b184SJames Collins // since a br tag never has sub nodes, this works well. 2078*c165b184SJames Collins if ($node->tag === 'br') { 2079*c165b184SJames Collins $node->_[HDOM_INFO_INNER] = $this->default_br_text; 2080*c165b184SJames Collins } 2081*c165b184SJames Collins 2082*c165b184SJames Collins return true; 2083*c165b184SJames Collins } 2084*c165b184SJames Collins 2085*c165b184SJames Collins protected function parse_attr($node, $name, &$space) 2086*c165b184SJames Collins { 2087*c165b184SJames Collins $is_duplicate = isset($node->attr[$name]); 2088*c165b184SJames Collins 2089*c165b184SJames Collins if (!$is_duplicate) // Copy whitespace between "=" and value 2090*c165b184SJames Collins $space[2] = $this->copy_skip($this->token_blank); 2091*c165b184SJames Collins 2092*c165b184SJames Collins switch ($this->char) { 2093*c165b184SJames Collins case '"': 2094*c165b184SJames Collins $quote_type = HDOM_QUOTE_DOUBLE; 2095*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2096*c165b184SJames Collins $value = $this->copy_until_char('"'); 2097*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2098*c165b184SJames Collins break; 2099*c165b184SJames Collins case '\'': 2100*c165b184SJames Collins $quote_type = HDOM_QUOTE_SINGLE; 2101*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2102*c165b184SJames Collins $value = $this->copy_until_char('\''); 2103*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2104*c165b184SJames Collins break; 2105*c165b184SJames Collins default: 2106*c165b184SJames Collins $quote_type = HDOM_QUOTE_NO; 2107*c165b184SJames Collins $value = $this->copy_until($this->token_attr); 2108*c165b184SJames Collins } 2109*c165b184SJames Collins 2110*c165b184SJames Collins $value = $this->restore_noise($value); 2111*c165b184SJames Collins 2112*c165b184SJames Collins // PaperG: Attributes should not have \r or \n in them, that counts as 2113*c165b184SJames Collins // html whitespace. 2114*c165b184SJames Collins $value = str_replace("\r", '', $value); 2115*c165b184SJames Collins $value = str_replace("\n", '', $value); 2116*c165b184SJames Collins 2117*c165b184SJames Collins // PaperG: If this is a "class" selector, lets get rid of the preceeding 2118*c165b184SJames Collins // and trailing space since some people leave it in the multi class case. 2119*c165b184SJames Collins if ($name === 'class') { 2120*c165b184SJames Collins $value = trim($value); 2121*c165b184SJames Collins } 2122*c165b184SJames Collins 2123*c165b184SJames Collins if (!$is_duplicate) { 2124*c165b184SJames Collins $node->_[HDOM_INFO_QUOTE][] = $quote_type; 2125*c165b184SJames Collins $node->attr[$name] = $value; 2126*c165b184SJames Collins } 2127*c165b184SJames Collins } 2128*c165b184SJames Collins 2129*c165b184SJames Collins protected function link_nodes(&$node, $is_child) 2130*c165b184SJames Collins { 2131*c165b184SJames Collins $node->parent = $this->parent; 2132*c165b184SJames Collins $this->parent->nodes[] = $node; 2133*c165b184SJames Collins if ($is_child) { 2134*c165b184SJames Collins $this->parent->children[] = $node; 2135*c165b184SJames Collins } 2136*c165b184SJames Collins } 2137*c165b184SJames Collins 2138*c165b184SJames Collins protected function as_text_node($tag) 2139*c165b184SJames Collins { 2140*c165b184SJames Collins $node = new simple_html_dom_node($this); 2141*c165b184SJames Collins ++$this->cursor; 2142*c165b184SJames Collins $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; 2143*c165b184SJames Collins $this->link_nodes($node, false); 2144*c165b184SJames Collins $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2145*c165b184SJames Collins return true; 2146*c165b184SJames Collins } 2147*c165b184SJames Collins 2148*c165b184SJames Collins protected function skip($chars) 2149*c165b184SJames Collins { 2150*c165b184SJames Collins $this->pos += strspn($this->doc, $chars, $this->pos); 2151*c165b184SJames Collins $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2152*c165b184SJames Collins } 2153*c165b184SJames Collins 2154*c165b184SJames Collins protected function copy_skip($chars) 2155*c165b184SJames Collins { 2156*c165b184SJames Collins $pos = $this->pos; 2157*c165b184SJames Collins $len = strspn($this->doc, $chars, $pos); 2158*c165b184SJames Collins $this->pos += $len; 2159*c165b184SJames Collins $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2160*c165b184SJames Collins if ($len === 0) { return ''; } 2161*c165b184SJames Collins return substr($this->doc, $pos, $len); 2162*c165b184SJames Collins } 2163*c165b184SJames Collins 2164*c165b184SJames Collins protected function copy_until($chars) 2165*c165b184SJames Collins { 2166*c165b184SJames Collins $pos = $this->pos; 2167*c165b184SJames Collins $len = strcspn($this->doc, $chars, $pos); 2168*c165b184SJames Collins $this->pos += $len; 2169*c165b184SJames Collins $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next 2170*c165b184SJames Collins return substr($this->doc, $pos, $len); 2171*c165b184SJames Collins } 2172*c165b184SJames Collins 2173*c165b184SJames Collins protected function copy_until_char($char) 2174*c165b184SJames Collins { 2175*c165b184SJames Collins if ($this->char === null) { return ''; } 2176*c165b184SJames Collins 2177*c165b184SJames Collins if (($pos = strpos($this->doc, $char, $this->pos)) === false) { 2178*c165b184SJames Collins $ret = substr($this->doc, $this->pos, $this->size - $this->pos); 2179*c165b184SJames Collins $this->char = null; 2180*c165b184SJames Collins $this->pos = $this->size; 2181*c165b184SJames Collins return $ret; 2182*c165b184SJames Collins } 2183*c165b184SJames Collins 2184*c165b184SJames Collins if ($pos === $this->pos) { return ''; } 2185*c165b184SJames Collins 2186*c165b184SJames Collins $pos_old = $this->pos; 2187*c165b184SJames Collins $this->char = $this->doc[$pos]; 2188*c165b184SJames Collins $this->pos = $pos; 2189*c165b184SJames Collins return substr($this->doc, $pos_old, $pos - $pos_old); 2190*c165b184SJames Collins } 2191*c165b184SJames Collins 2192*c165b184SJames Collins protected function remove_noise($pattern, $remove_tag = false) 2193*c165b184SJames Collins { 2194*c165b184SJames Collins global $debug_object; 2195*c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2196*c165b184SJames Collins 2197*c165b184SJames Collins $count = preg_match_all( 2198*c165b184SJames Collins $pattern, 2199*c165b184SJames Collins $this->doc, 2200*c165b184SJames Collins $matches, 2201*c165b184SJames Collins PREG_SET_ORDER | PREG_OFFSET_CAPTURE 2202*c165b184SJames Collins ); 2203*c165b184SJames Collins 2204*c165b184SJames Collins for ($i = $count - 1; $i > -1; --$i) { 2205*c165b184SJames Collins $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000); 2206*c165b184SJames Collins 2207*c165b184SJames Collins if (is_object($debug_object)) { 2208*c165b184SJames Collins $debug_object->debug_log(2, 'key is: ' . $key); 2209*c165b184SJames Collins } 2210*c165b184SJames Collins 2211*c165b184SJames Collins $idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch 2212*c165b184SJames Collins $this->noise[$key] = $matches[$i][$idx][0]; 2213*c165b184SJames Collins $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 2214*c165b184SJames Collins } 2215*c165b184SJames Collins 2216*c165b184SJames Collins // reset the length of content 2217*c165b184SJames Collins $this->size = strlen($this->doc); 2218*c165b184SJames Collins 2219*c165b184SJames Collins if ($this->size > 0) { 2220*c165b184SJames Collins $this->char = $this->doc[0]; 2221*c165b184SJames Collins } 2222*c165b184SJames Collins } 2223*c165b184SJames Collins 2224*c165b184SJames Collins function restore_noise($text) 2225*c165b184SJames Collins { 2226*c165b184SJames Collins global $debug_object; 2227*c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2228*c165b184SJames Collins 2229*c165b184SJames Collins while (($pos = strpos($text, '___noise___')) !== false) { 2230*c165b184SJames Collins // Sometimes there is a broken piece of markup, and we don't GET the 2231*c165b184SJames Collins // pos+11 etc... token which indicates a problem outside of us... 2232*c165b184SJames Collins 2233*c165b184SJames Collins // todo: "___noise___1000" (or any number with four or more digits) 2234*c165b184SJames Collins // in the DOM causes an infinite loop which could be utilized by 2235*c165b184SJames Collins // malicious software 2236*c165b184SJames Collins if (strlen($text) > $pos + 15) { 2237*c165b184SJames Collins $key = '___noise___' 2238*c165b184SJames Collins . $text[$pos + 11] 2239*c165b184SJames Collins . $text[$pos + 12] 2240*c165b184SJames Collins . $text[$pos + 13] 2241*c165b184SJames Collins . $text[$pos + 14] 2242*c165b184SJames Collins . $text[$pos + 15]; 2243*c165b184SJames Collins 2244*c165b184SJames Collins if (is_object($debug_object)) { 2245*c165b184SJames Collins $debug_object->debug_log(2, 'located key of: ' . $key); 2246*c165b184SJames Collins } 2247*c165b184SJames Collins 2248*c165b184SJames Collins if (isset($this->noise[$key])) { 2249*c165b184SJames Collins $text = substr($text, 0, $pos) 2250*c165b184SJames Collins . $this->noise[$key] 2251*c165b184SJames Collins . substr($text, $pos + 16); 2252*c165b184SJames Collins } else { 2253*c165b184SJames Collins // do this to prevent an infinite loop. 2254*c165b184SJames Collins $text = substr($text, 0, $pos) 2255*c165b184SJames Collins . 'UNDEFINED NOISE FOR KEY: ' 2256*c165b184SJames Collins . $key 2257*c165b184SJames Collins . substr($text, $pos + 16); 2258*c165b184SJames Collins } 2259*c165b184SJames Collins } else { 2260*c165b184SJames Collins // There is no valid key being given back to us... We must get 2261*c165b184SJames Collins // rid of the ___noise___ or we will have a problem. 2262*c165b184SJames Collins $text = substr($text, 0, $pos) 2263*c165b184SJames Collins . 'NO NUMERIC NOISE KEY' 2264*c165b184SJames Collins . substr($text, $pos + 11); 2265*c165b184SJames Collins } 2266*c165b184SJames Collins } 2267*c165b184SJames Collins return $text; 2268*c165b184SJames Collins } 2269*c165b184SJames Collins 2270*c165b184SJames Collins function search_noise($text) 2271*c165b184SJames Collins { 2272*c165b184SJames Collins global $debug_object; 2273*c165b184SJames Collins if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 2274*c165b184SJames Collins 2275*c165b184SJames Collins foreach($this->noise as $noiseElement) { 2276*c165b184SJames Collins if (strpos($noiseElement, $text) !== false) { 2277*c165b184SJames Collins return $noiseElement; 2278*c165b184SJames Collins } 2279*c165b184SJames Collins } 2280*c165b184SJames Collins } 2281*c165b184SJames Collins 2282*c165b184SJames Collins function __toString() 2283*c165b184SJames Collins { 2284*c165b184SJames Collins return $this->root->innertext(); 2285*c165b184SJames Collins } 2286*c165b184SJames Collins 2287*c165b184SJames Collins function __get($name) 2288*c165b184SJames Collins { 2289*c165b184SJames Collins switch ($name) { 2290*c165b184SJames Collins case 'outertext': 2291*c165b184SJames Collins return $this->root->innertext(); 2292*c165b184SJames Collins case 'innertext': 2293*c165b184SJames Collins return $this->root->innertext(); 2294*c165b184SJames Collins case 'plaintext': 2295*c165b184SJames Collins return $this->root->text(); 2296*c165b184SJames Collins case 'charset': 2297*c165b184SJames Collins return $this->_charset; 2298*c165b184SJames Collins case 'target_charset': 2299*c165b184SJames Collins return $this->_target_charset; 2300*c165b184SJames Collins } 2301*c165b184SJames Collins } 2302*c165b184SJames Collins 2303*c165b184SJames Collins function childNodes($idx = -1) 2304*c165b184SJames Collins { 2305*c165b184SJames Collins return $this->root->childNodes($idx); 2306*c165b184SJames Collins } 2307*c165b184SJames Collins 2308*c165b184SJames Collins function firstChild() 2309*c165b184SJames Collins { 2310*c165b184SJames Collins return $this->root->first_child(); 2311*c165b184SJames Collins } 2312*c165b184SJames Collins 2313*c165b184SJames Collins function lastChild() 2314*c165b184SJames Collins { 2315*c165b184SJames Collins return $this->root->last_child(); 2316*c165b184SJames Collins } 2317*c165b184SJames Collins 2318*c165b184SJames Collins function createElement($name, $value = null) 2319*c165b184SJames Collins { 2320*c165b184SJames Collins return @str_get_html("<$name>$value</$name>")->firstChild(); 2321*c165b184SJames Collins } 2322*c165b184SJames Collins 2323*c165b184SJames Collins function createTextNode($value) 2324*c165b184SJames Collins { 2325*c165b184SJames Collins return @end(str_get_html($value)->nodes); 2326*c165b184SJames Collins } 2327*c165b184SJames Collins 2328*c165b184SJames Collins function getElementById($id) 2329*c165b184SJames Collins { 2330*c165b184SJames Collins return $this->find("#$id", 0); 2331*c165b184SJames Collins } 2332*c165b184SJames Collins 2333*c165b184SJames Collins function getElementsById($id, $idx = null) 2334*c165b184SJames Collins { 2335*c165b184SJames Collins return $this->find("#$id", $idx); 2336*c165b184SJames Collins } 2337*c165b184SJames Collins 2338*c165b184SJames Collins function getElementByTagName($name) 2339*c165b184SJames Collins { 2340*c165b184SJames Collins return $this->find($name, 0); 2341*c165b184SJames Collins } 2342*c165b184SJames Collins 2343*c165b184SJames Collins function getElementsByTagName($name, $idx = -1) 2344*c165b184SJames Collins { 2345*c165b184SJames Collins return $this->find($name, $idx); 2346*c165b184SJames Collins } 2347*c165b184SJames Collins 2348*c165b184SJames Collins function loadFile() 2349*c165b184SJames Collins { 2350*c165b184SJames Collins $args = func_get_args(); 2351*c165b184SJames Collins $this->load_file($args); 2352*c165b184SJames Collins } 2353*c165b184SJames Collins} 2354