xref: /template/mikio/inc/simple_html_dom.php (revision ab45ba7165dc08ce8ac280c60e4318ef9df9d00d)
1<?php
2/**
3 * Website: http://sourceforge.net/projects/simplehtmldom/
4 * Additional projects: http://sourceforge.net/projects/debugobject/
5 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
6 *
7 * Licensed under The MIT License
8 * See the LICENSE file in the project root for more information.
9 *
10 * Authors:
11 *   S.C. Chen
12 *   John Schlick
13 *   Rus Carroll
14 *   logmanoriginal
15 *
16 * Contributors:
17 *   Yousuke Kumakura
18 *   Vadim Voituk
19 *   Antcs
20 *
21 * Version Rev. 1.9.1 (291)
22 */
23
// Node type identifiers (stored in a node's $nodetype property).
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles (looked up per attribute when a tag is rebuilt).
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node bookkeeping array ($node->_).
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Defaults that the including script may pre-define to override.
if (!defined('DEFAULT_TARGET_CHARSET')) {
	define('DEFAULT_TARGET_CHARSET', 'UTF-8');
}
if (!defined('DEFAULT_BR_TEXT')) {
	define('DEFAULT_BR_TEXT', "\r\n");
}
if (!defined('DEFAULT_SPAN_TEXT')) {
	define('DEFAULT_SPAN_TEXT', ' ');
}
if (!defined('MAX_FILE_SIZE')) {
	define('MAX_FILE_SIZE', 600000);
}

// NOTE(review): not referenced in the visible part of this file.
define('HDOM_SMARTY_AS_TEXT', 1);
47
/**
 * Read a file or URL with file_get_contents() and load it into a new
 * simple_html_dom object.
 *
 * @param string $url Path or URL handed to file_get_contents().
 * @param bool $use_include_path Forwarded to file_get_contents().
 * @param resource|null $context Stream context forwarded to file_get_contents().
 * @param int $offset Read offset forwarded to file_get_contents().
 * @param int $maxLen Maximum number of bytes to read; values <= 0 select
 * MAX_FILE_SIZE.
 * @param bool $lowercase Forwarded to the simple_html_dom constructor and load().
 * @param bool $forceTagsClosed Forwarded to the simple_html_dom constructor.
 * @param string $target_charset Forwarded to the simple_html_dom constructor.
 * @param bool $stripRN Forwarded to the simple_html_dom constructor and load().
 * @param string $defaultBRText Forwarded to the simple_html_dom constructor.
 * @param string $defaultSpanText Forwarded to the simple_html_dom constructor.
 * @return mixed Whatever simple_html_dom::load() returns, or false when the
 * source could not be read, is empty, or exceeds $maxLen bytes.
 */
function file_get_html(
	$url,
	$use_include_path = false,
	$context = null,
	$offset = 0,
	$maxLen = -1,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	$contents = file_get_contents(
		$url,
		$use_include_path,
		$context,
		$offset,
		$maxLen
	);

	// empty() alone would also reject a document whose entire content is the
	// string "0"; that is valid input, so it is let through explicitly.
	$unusable = empty($contents) && $contents !== '0';

	if ($unusable || strlen($contents) > $maxLen) {
		$dom->clear();
		return false;
	}

	return $dom->load($contents, $lowercase, $stripRN);
}
93
/**
 * Load an HTML string into a new simple_html_dom object.
 *
 * @param string $str Markup to parse.
 * @param bool $lowercase Forwarded to the simple_html_dom constructor and load().
 * @param bool $forceTagsClosed Forwarded to the simple_html_dom constructor.
 * @param string $target_charset Forwarded to the simple_html_dom constructor.
 * @param bool $stripRN Forwarded to the simple_html_dom constructor and load().
 * @param string $defaultBRText Forwarded to the simple_html_dom constructor.
 * @param string $defaultSpanText Forwarded to the simple_html_dom constructor.
 * @return mixed Whatever simple_html_dom::load() returns, or false when $str
 * is empty or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
	$str,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	// empty() alone would also reject the one-character document "0"; that
	// is valid input, so it is let through explicitly.
	$unusable = empty($str) && $str !== '0';

	if ($unusable || strlen($str) > MAX_FILE_SIZE) {
		$dom->clear();
		return false;
	}

	return $dom->load($str, $lowercase, $stripRN);
}
120
/**
 * Print the tree below $node by delegating to the node's dump() method.
 *
 * @param object $node Node whose dump() method is called.
 * @param bool $show_attr Whether attributes are printed next to each tag.
 * @param int $deep Indentation depth used for the first printed line.
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
	// Forward the caller's options; previously the node itself was passed as
	// $show_attr and both options were silently ignored.
	$node->dump($show_attr, $deep);
}
125
126class simple_html_dom_node
127{
128	public $nodetype = HDOM_TYPE_TEXT;
129	public $tag = 'text';
130	public $attr = array();
131	public $children = array();
132	public $nodes = array();
133	public $parent = null;
134	public $_ = array();
135	public $tag_start = 0;
136	private $dom = null;
137
/**
 * Register the new node in the owning DOM's node list and remember the DOM.
 *
 * @param object $dom Owning DOM object; must expose a public $nodes array.
 */
function __construct($dom)
{
	$dom->nodes[] = $this;
	$this->dom = $dom;
}
143
/**
 * Drop all references held by this node when it is destroyed.
 */
function __destruct()
{
	// clear() nulls dom, nodes, parent and children.
	$this->clear();
}
148
/**
 * String conversion yields the node's outer text.
 *
 * @return string
 */
function __toString()
{
	$markup = $this->outertext();

	return $markup;
}
153
/**
 * Release the references to the DOM, the parent and all child nodes.
 *
 * All four properties are set to null (not to empty arrays).
 */
function clear()
{
	$this->children = null;
	$this->parent = null;
	$this->nodes = null;
	$this->dom = null;
}
161
/**
 * Echo this node's tag (optionally with attributes) and recurse into its
 * nodes, indenting each level with one tab.
 *
 * @param bool $show_attr Print the attribute list after the tag name.
 * @param int $depth Number of leading tabs for this node's line.
 * @return void
 */
function dump($show_attr = true, $depth = 0)
{
	$line = str_repeat("\t", $depth) . $this->tag;

	if ($show_attr && count($this->attr) > 0) {
		$line .= '(';
		foreach ($this->attr as $name => $value) {
			$line .= "[$name]=>\"$value\", ";
		}
		$line .= ')';
	}

	echo $line . "\n";

	// $this->nodes may be null or empty; nothing to descend into then.
	if (!$this->nodes) {
		return;
	}

	foreach ($this->nodes as $child) {
		$child->dump($show_attr, $depth + 1);
	}
}
182
/**
 * Build a one-line debug description of this node: tag, attributes, the
 * bookkeeping array ($_), text, inner info, child/node counts and tag start.
 *
 * @param bool $echo When true the line is echoed and nothing is returned;
 * when false the line is returned instead.
 * @return string|null
 */
function dump_node($echo = true)
{
	$string = $this->tag;

	if (count($this->attr) > 0) {
		$string .= '(';
		foreach ($this->attr as $k => $v) {
			$string .= "[$k]=>\"$v\", ";
		}
		$string .= ')';
	}

	if (count($this->_) > 0) {
		$string .= ' $_ (';
		foreach ($this->_ as $k => $v) {
			if (is_array($v)) {
				$string .= "[$k]=>(";
				foreach ($v as $k2 => $v2) {
					$string .= "[$k2]=>\"$v2\", ";
				}
				$string .= ')';
			} else {
				$string .= "[$k]=>\"$v\", ";
			}
		}
		$string .= ')';
	}

	if (isset($this->text)) {
		$string .= " text: ({$this->text})";
	}

	$string .= ' HDOM_INNER_INFO: ';

	// Inspect this node; an undefined $node variable was read here before,
	// so the inner info was always reported as NULL.
	if (isset($this->_[HDOM_INFO_INNER])) {
		$string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
	} else {
		$string .= ' NULL ';
	}

	// clear() sets children/nodes to null; the cast keeps count() valid.
	$string .= ' children: ' . count((array)$this->children);
	$string .= ' nodes: ' . count((array)$this->nodes);
	$string .= ' tag_start: ' . $this->tag_start;
	$string .= "\n";

	if ($echo) {
		echo $string;
		return;
	} else {
		return $string;
	}
}
235
/**
 * Get the parent node, or set it when $parent is given.
 *
 * When setting, this node is appended to the new parent's nodes and
 * children lists. It is NOT removed from the lists of its previous parent.
 *
 * @param object|null $parent New parent node, or null to only read.
 * @return object|null The (possibly updated) parent.
 */
function parent($parent = null)
{
	if ($parent === null) {
		return $this->parent;
	}

	$this->parent = $parent;
	$this->parent->nodes[] = $this;
	$this->parent->children[] = $this;

	return $this->parent;
}
249
/**
 * Tell whether this node has at least one child.
 *
 * @return bool
 */
function has_child()
{
	if (empty($this->children)) {
		return false;
	}

	return true;
}
254
/**
 * Return all children, or the child at position $idx.
 *
 * @param int $idx Child index; -1 (the default) selects the whole list.
 * @return mixed The children list, a single child, or null when no child
 * exists at $idx.
 */
function children($idx = -1)
{
	if ($idx === -1) {
		return $this->children;
	}

	return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
267
/**
 * Return the first child, or null when there are no children.
 *
 * @return object|null
 */
function first_child()
{
	return (count($this->children) > 0) ? $this->children[0] : null;
}
275
/**
 * Return the last child, or null when there are no children.
 *
 * @return object|null
 */
function last_child()
{
	if (count($this->children) <= 0) {
		return null;
	}

	// end() also moves the array's internal pointer to the last element.
	return end($this->children);
}
283
/**
 * Return the child that follows this node in its parent's children list.
 *
 * @return object|null Null when there is no parent, this node is not found
 * among the parent's children, or it is the last child.
 */
function next_sibling()
{
	if ($this->parent === null) {
		return null;
	}

	$position = array_search($this, $this->parent->children, true);

	if ($position === false || !isset($this->parent->children[$position + 1])) {
		return null;
	}

	return $this->parent->children[$position + 1];
}
298
/**
 * Return the child that precedes this node in its parent's children list.
 *
 * @return object|null Null when there is no parent, this node is not found
 * among the parent's children, or it is the first child.
 */
function prev_sibling()
{
	if ($this->parent === null) {
		return null;
	}

	$position = array_search($this, $this->parent->children, true);

	if ($position === false || $position <= 0) {
		return null;
	}

	return $this->parent->children[$position - 1];
}
313
/**
 * Walk up the parent chain and return the nearest ancestor whose tag is
 * exactly $tag.
 *
 * @param string $tag Tag name to look for (compared with ===).
 * @return object|null The matching ancestor, or null when none exists.
 */
function find_ancestor_tag($tag)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	for ($ancestor = $this->parent; $ancestor !== null; $ancestor = $ancestor->parent) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
		}

		if ($ancestor->tag === $tag) {
			return $ancestor;
		}
	}

	return null;
}
339
/**
 * Return the markup inside this node.
 *
 * Precedence: an explicitly stored inner text, then the stored text of a
 * text-like node (passed through the DOM's restore_noise()), then the
 * concatenated outer text of all nodes below this one.
 *
 * @return string
 */
function innertext()
{
	if (isset($this->_[HDOM_INFO_INNER])) {
		return $this->_[HDOM_INFO_INNER];
	}

	if (isset($this->_[HDOM_INFO_TEXT])) {
		$stored = $this->_[HDOM_INFO_TEXT];
		return $this->dom->restore_noise($stored);
	}

	$inner = '';
	foreach ($this->nodes as $child) {
		$inner = $inner . $child->outertext();
	}

	return $inner;
}
358
/**
 * Return the markup of this node including its own tags.
 *
 * The root node returns its inner text. Otherwise the DOM callback (if one
 * is set) is invoked with this node first; then an explicitly stored outer
 * text or stored text wins; failing that the markup is assembled from the
 * opening tag, the inner content and, if the node has an end, a closing tag.
 *
 * @return string
 */
function outertext()
{
	global $debug_object;

	if (is_object($debug_object)) {
		$suffix = '';

		if ($this->tag === 'text' && !empty($this->text)) {
			$suffix = ' with text: ' . $this->text;
		}

		$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
	}

	if ($this->tag === 'root') {
		return $this->innertext();
	}

	// todo: What is the use of this callback? Remove?
	if ($this->dom && $this->dom->callback !== null) {
		call_user_func_array($this->dom->callback, array($this));
	}

	if (isset($this->_[HDOM_INFO_OUTER])) {
		return $this->_[HDOM_INFO_OUTER];
	}

	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	// Opening tag, rebuilt by the node registered at this node's begin index.
	$markup = '';
	if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
		$markup = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
	}

	// Inner content: stored inner text takes precedence over child nodes.
	if (isset($this->_[HDOM_INFO_INNER])) {
		// todo: <br> should either never have HDOM_INFO_INNER or always
		if ($this->tag !== 'br') {
			$markup .= $this->_[HDOM_INFO_INNER];
		}
	} elseif ($this->nodes) {
		foreach ($this->nodes as $child) {
			$markup .= $this->convert_text($child->outertext());
		}
	}

	// Closing tag only when an end position is recorded and non-zero.
	$has_end = isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0;
	if ($has_end) {
		$markup .= '</' . $this->tag . '>';
	}

	return $markup;
}
415
/**
 * Return the plain text of this node.
 *
 * A stored inner text is returned as-is. Text nodes return their stored
 * text; comment and unknown nodes, as well as script and style elements,
 * return an empty string. For every other node the text of all nodes below
 * it is concatenated: a "p" node starts after a blank line and a "span"
 * node is followed by the DOM's default span text.
 *
 * @return string
 */
function text()
{
	if (isset($this->_[HDOM_INFO_INNER])) {
		return $this->_[HDOM_INFO_INNER];
	}

	if ($this->nodetype == HDOM_TYPE_TEXT) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	if ($this->nodetype == HDOM_TYPE_COMMENT || $this->nodetype == HDOM_TYPE_UNKNOWN) {
		return '';
	}

	if (strcasecmp($this->tag, 'script') === 0 || strcasecmp($this->tag, 'style') === 0) {
		return '';
	}

	$plain = '';

	// $this->nodes can be null (clear() sets it so); treat that as "no nodes".
	if (is_null($this->nodes)) {
		return $plain;
	}

	foreach ($this->nodes as $child) {
		// Start paragraph after a blank line
		if ($child->tag === 'p') {
			$plain = trim($plain) . "\n\n";
		}

		$plain .= $this->convert_text($child->text());

		// Keep consecutive spans from running into each other.
		if ($child->tag === 'span') {
			$plain .= $this->dom->default_span_text;
		}
	}

	return $plain;
}
457
/**
 * Return the inner text with CDATA markers removed.
 *
 * The opening marker is removed case-insensitively, the closing marker
 * case-sensitively.
 *
 * @return string
 */
function xmltext()
{
	$without_open = str_ireplace('<![CDATA[', '', $this->innertext());

	return str_replace(']]>', '', $without_open);
}
465
/**
 * Rebuild the opening tag of this node from its tag name and attributes.
 *
 * Text-like nodes (those with stored text) return that text instead.
 * Attributes whose value is null or false are skipped; a value of true is
 * written as a bare attribute name. Spacing and quoting are taken from the
 * per-attribute bookkeeping entries, indexed by attribute position.
 *
 * @return string
 */
function makeup()
{
	// text, comment, unknown
	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$markup = '<' . $this->tag;
	$position = 0;

	foreach ($this->attr as $name => $value) {
		// The slot advances for every attribute, including skipped ones.
		$slot = $position++;

		// skip removed attribute
		if ($value === null || $value === false) {
			continue;
		}

		$markup .= $this->_[HDOM_INFO_SPACE][$slot][0];

		// no value attr: nowrap, checked selected...
		if ($value === true) {
			$markup .= $name;
			continue;
		}

		$quote_style = $this->_[HDOM_INFO_QUOTE][$slot];
		if ($quote_style == HDOM_QUOTE_DOUBLE) {
			$quote = '"';
		} elseif ($quote_style == HDOM_QUOTE_SINGLE) {
			$quote = '\'';
		} else {
			$quote = '';
		}

		$markup .= $name
			. $this->_[HDOM_INFO_SPACE][$slot][1]
			. '='
			. $this->_[HDOM_INFO_SPACE][$slot][2]
			. $quote
			. $value
			. $quote;
	}

	$markup = $this->dom->restore_noise($markup);

	return $markup . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
508
/**
 * Find nodes below this node that match a CSS-like selector.
 *
 * @param string $selector Selector string, parsed by parse_selector().
 * @param int|null $idx Null returns every match; otherwise the match at
 * that position is returned (negative values count from the end).
 * @param bool $lowercase Forwarded to seek().
 * @return mixed Array of nodes when $idx is null, otherwise a single node
 * or null when no match exists at that position.
 */
function find($selector, $idx = null, $lowercase = false)
{
	$selector_list = $this->parse_selector($selector);
	if (count($selector_list) === 0) { return array(); }

	$found_keys = array();

	// Each list entry is a chain of compound selectors joined by combinators.
	foreach ($selector_list as $chain) {
		if (count($chain) === 0) { return array(); }
		if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

		$head = array($this->_[HDOM_INFO_BEGIN] => 1);
		$combinator = ' ';

		// Resolve the chain iteratively: the matches of one step become the
		// starting points of the next.
		foreach ($chain as $step) {
			$matches = array();

			foreach ($head as $key => $unused) {
				$start = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
				$start->seek($step, $matches, $combinator, $lowercase);
			}

			$head = $matches;
			$combinator = $step[4]; // Next Combinator
		}

		// Union of keys; keys that are already present are kept.
		$found_keys += $head;
	}

	// sort keys
	ksort($found_keys);

	$found = array();
	foreach ($found_keys as $key => $unused) {
		$found[] = $this->dom->nodes[$key];
	}

	// return nth-element or array
	if (is_null($idx)) {
		return $found;
	}

	if ($idx < 0) {
		$idx = count($found) + $idx;
	}

	return isset($found[$idx]) ? $found[$idx] : null;
}
560
/**
 * Collect the nodes that are related to this node by $parent_cmd and that
 * satisfy one compound selector.
 *
 * @param array $selector One parsed selector step: tag, id, classes,
 * attributes and the following combinator (the last one is not used here).
 * @param array $ret Result set, passed by reference. For every match the
 * key is the node's index in the DOM node list and the value is 1.
 * @param string $parent_cmd Combinator relating candidates to this node:
 * ' ' (descendant), '>' (child), '+' (next sibling), '~' (subsequent
 * siblings). Any other value yields no candidates.
 * @param bool $lowercase Compare class names and attribute values
 * case-insensitively.
 * @return void
 */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	list($tag, $id, $class, $attributes) = $selector;
	$nodes = array();

	if ($parent_cmd === ' ') { // Descendant Combinator
		// Find parent closing tag if the current element doesn't have a closing
		// tag (i.e. void element)
		$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
		if ($end == 0) {
			$parent = $this->parent;
			// Test for null BEFORE looking at the parent's bookkeeping.
			while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
				$end -= 1;
				$parent = $parent->parent;
			}
			// No ancestor with an end position: leave $end as counted.
			if ($parent !== null) {
				$end += $parent->_[HDOM_INFO_END];
			}
		}

		// Get list of target nodes
		$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
		$nodes_count = $end - $nodes_start;
		$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
	} elseif ($parent_cmd === '>') { // Child Combinator
		$nodes = $this->children;
	} elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
		// Identity lookup; a loose comparison would compare node objects
		// property by property.
		$index = array_search($this, $this->parent->children, true);

		if ($index !== false) {
			if ($parent_cmd === '+') { // Next-Sibling Combinator
				if ($index + 1 < count($this->parent->children)) {
					$nodes[] = $this->parent->children[$index + 1];
				}
			} else { // Subsequent Sibling Combinator
				// Only siblings AFTER this node qualify, not the node itself.
				$nodes = array_slice($this->parent->children, $index + 1);
			}
		}
	}

	// Go through each candidate node.
	// Note: If this element is a void tag, any previous void element is
	// skipped.
	foreach($nodes as $node) {
		$pass = true;

		// Skip root nodes
		if(!$node->parent) {
			$pass = false;
		}

		// Handle 'text' selector
		if($pass && $tag === 'text' && $node->tag === 'text') {
			$ret[array_search($node, $this->dom->nodes, true)] = 1;
			unset($node);
			continue;
		}

		// Skip if node isn't a child node (i.e. text nodes)
		if($pass && !in_array($node, $node->parent->children, true)) {
			$pass = false;
		}

		// Skip if tag doesn't match
		if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
			$pass = false;
		}

		// Skip if ID doesn't exist
		if ($pass && $id !== '' && !isset($node->attr['id'])) {
			$pass = false;
		}

		// Check if ID matches
		if ($pass && $id !== '' && isset($node->attr['id'])) {
			// Note: Only consider the first ID (as browsers do)
			$node_id = explode(' ', trim($node->attr['id']))[0];

			if($id !== $node_id) { $pass = false; }
		}

		// Check if all class(es) exist
		if ($pass && $class !== '' && is_array($class) && !empty($class)) {
			if (isset($node->attr['class'])) {
				$node_classes = explode(' ', $node->attr['class']);

				if ($lowercase) {
					$node_classes = array_map('strtolower', $node_classes);
				}

				foreach($class as $c) {
					// Strict: loose comparison treats numeric-looking class
					// names such as "10" and "1e1" as equal.
					if(!in_array($c, $node_classes, true)) {
						$pass = false;
						break;
					}
				}
			} else {
				$pass = false;
			}
		}

		// Check attributes
		if ($pass
			&& $attributes !== ''
			&& is_array($attributes)
			&& !empty($attributes)) {
				foreach($attributes as $a) {
					list (
						$att_name,
						$att_expr,
						$att_val,
						$att_inv,
						$att_case_sensitivity
					) = $a;

					// Handle indexing attributes (i.e. "[2]")
					/**
					 * Note: This is not supported by the CSS Standard but adds
					 * the ability to select items compatible to XPath (i.e.
					 * the 3rd element within its parent).
					 *
					 * Note: This doesn't conflict with the CSS Standard which
					 * doesn't work on numeric attributes anyway.
					 */
					if (is_numeric($att_name)
						&& $att_expr === ''
						&& $att_val === '') {
							$count = 0;

							// Find index of current element in parent
							foreach ($node->parent->children as $c) {
								if ($c->tag === $node->tag) ++$count;
								if ($c === $node) break;
							}

							// If this is the correct node, continue with next
							// attribute
							if ($count === (int)$att_name) continue;
					}

					// Check attribute availability
					if ($att_inv) { // Attribute should NOT be set
						if (isset($node->attr[$att_name])) {
							$pass = false;
							break;
						}
					} else { // Attribute should be set
						// todo: "plaintext" is not a valid CSS selector!
						if ($att_name !== 'plaintext'
							&& !isset($node->attr[$att_name])) {
								$pass = false;
								break;
						}
					}

					// Continue with next attribute if expression isn't defined
					if ($att_expr === '') continue;

					// A "plaintext" pseudo attribute is matched against the
					// node's plain text instead of a real attribute.
					// todo "plaintext" is not a valid CSS selector!
					if ($att_name === 'plaintext') {
						$nodeKeyValue = $node->text();
					} else {
						$nodeKeyValue = $node->attr[$att_name];
					}

					if (is_object($debug_object)) {
						$debug_object->debug_log(2,
							'testing node: '
							. $node->tag
							. ' for attribute: '
							. $att_name
							. $att_expr
							. $att_val
							. ' where nodes value is: '
							. $nodeKeyValue
						);
					}

					// If lowercase is set, do a case insensitive test of
					// the value of the selector.
					if ($lowercase) {
						$check = $this->match(
							$att_expr,
							strtolower($att_val),
							strtolower($nodeKeyValue),
							$att_case_sensitivity
						);
					} else {
						$check = $this->match(
							$att_expr,
							$att_val,
							$nodeKeyValue,
							$att_case_sensitivity
						);
					}

					if (is_object($debug_object)) {
						$debug_object->debug_log(2,
							'after match: '
							. ($check ? 'true' : 'false')
						);
					}

					if (!$check) {
						$pass = false;
						break;
					}
				}
		}

		// Found a match. Add to list and clear node
		if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
		unset($node);
	}
	// It's passed by reference so this is actually what this function returns.
	if (is_object($debug_object)) {
		$debug_object->debug_log(1, 'EXIT - ret: ', $ret);
	}
}
782
/**
 * Test an attribute value against a selector value using one of the CSS
 * attribute operators.
 *
 * @param string $exp Operator: '=', '!=', '^=', '$=', '*=', '|=' or '~='.
 * @param string $pattern Value given in the selector.
 * @param string $value Value found on the node.
 * @param string $case_sensitivity 'i' lowercases both operands first.
 * @return bool|int Truthy on match. The regex-based operators return the
 * integer result of preg_match(); unknown operators return false.
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
	global $debug_object;
	if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

	if ($case_sensitivity === 'i') {
		$pattern = strtolower($pattern);
		$value = strtolower($value);
	}

	switch ($exp) {
		case '=':
			return ($value === $pattern);
		case '!=':
			return ($value !== $pattern);
		case '^=':
			return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
		case '$=':
			return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
		case '*=':
			return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
		case '|=':
			/**
			 * [att|=val]
			 *
			 * Represents an element with the att attribute, its value
			 * either being exactly "val" or beginning with "val"
			 * immediately followed by "-" (U+002D).
			 *
			 * A bare prefix test would also accept e.g. "english" for
			 * "en", so the hyphen is part of the comparison.
			 */
			return $value === $pattern
				|| strpos($value, $pattern . '-') === 0;
		case '~=':
			/**
			 * [att~=val]
			 *
			 * Represents an element with the att attribute whose value is a
			 * whitespace-separated list of words, one of which is exactly
			 * "val". If "val" contains whitespace, it will never represent
			 * anything (since the words are separated by spaces). Also if
			 * "val" is the empty string, it will never represent anything.
			 *
			 * Splitting on any whitespace run and dropping empty pieces
			 * guarantees that an empty "val" can never be found.
			 */
			$words = preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY);
			return in_array($pattern, $words, true);
	}
	return false;
}
827
/**
 * Split a selector string into a list of selector chains.
 *
 * The result is an array with one entry per comma-separated selector. Each
 * entry is a list of steps, and each step is an array of five elements:
 *
 * [0] tag name (lowercased when the DOM's lowercase flag is set)
 * [1] id name
 * [2] class names: array of names, or '' when none were given
 * [3] attributes: array of [name, expression, value, inverted flag,
 *     case-sensitivity flag] entries, or '' when none were given
 * [4] combinator leading to the next step (' ', '>', '+', '~', ...),
 *     or '' for the last step of a comma-terminated selector
 *
 * The main pattern is derived from mootools (https://mootools.net/). The
 * attribute part allows a colon in the name and an optional leading "@"
 * that is not captured; a leading "!" in the name negates the attribute.
 *
 * @param string $selector_string Raw selector.
 * @return array
 */
protected function parse_selector($selector_string)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	// Capture groups: 1 tag, 2 id, 3 classes, 4 attribute list, 5 separator.
	// phpcs:ignore Generic.Files.LineLength
	$pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

	// The appended space serves as separator for the last step.
	$subject = trim($selector_string) . ' ';

	preg_match_all($pattern, $subject, $matches, PREG_SET_ORDER);

	if (is_object($debug_object)) {
		$debug_object->debug_log(2, 'Matches Array: ', $matches);
	}

	$selectors = array();
	$chain = array();

	foreach ($matches as $m) {
		$full_match = trim($m[0]);

		// Skip NoOps
		if ($full_match === '' || $full_match === '/' || $full_match === '//') {
			continue;
		}

		$tag = $this->dom->lowercase ? strtolower($m[1]) : $m[1];
		$id = $m[2];
		$classes = ($m[3] !== '') ? explode('.', $m[3]) : $m[3];

		// Attribute list: groups are 1 name, 2 expression, 3 value,
		// 4 case sensitivity (same structure as in the main pattern).
		$attributes = $m[4];
		if ($attributes !== '') {
			preg_match_all(
				"/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
				trim($attributes),
				$parsed,
				PREG_SET_ORDER
			);

			$attributes = array();

			foreach ($parsed as $att) {
				// Skip empty matches
				if (trim($att[0]) === '') { continue; }

				$inverted = (isset($att[1][0]) && $att[1][0] === '!');
				$attributes[] = array(
					$inverted ? substr($att[1], 1) : $att[1], // Name
					(isset($att[2])) ? $att[2] : '', // Expression
					(isset($att[3])) ? $att[3] : '', // Value
					$inverted, // Inverted Flag
					(isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
				);
			}
		}

		// A separator made only of whitespace is the descendant combinator;
		// anything else is used with surrounding whitespace removed.
		$combinator = trim($m[5]);
		if ($m[5] !== '' && $combinator === '') {
			$combinator = ' ';
		}

		// A comma ends the current chain and carries no combinator.
		$is_list = ($combinator === ',');
		if ($is_list) {
			$combinator = '';
		}

		$chain[] = array($tag, $id, $classes, $attributes, $combinator);

		if ($is_list) {
			$selectors[] = $chain;
			$chain = array();
		}
	}

	if (count($chain) > 0) {
		$selectors[] = $chain;
	}

	return $selectors;
}
966
/**
 * Magic read access.
 *
 * A set attribute of that name wins and is returned charset-converted.
 * Otherwise the virtual properties outertext, innertext, plaintext and
 * xmltext are served; any other name yields whether the attribute key
 * exists at all.
 *
 * @param string $name Property or attribute name.
 * @return mixed
 */
function __get($name)
{
	if (isset($this->attr[$name])) {
		return $this->convert_text($this->attr[$name]);
	}

	if ($name == 'outertext') { return $this->outertext(); }
	if ($name == 'innertext') { return $this->innertext(); }
	if ($name == 'plaintext') { return $this->text(); }
	if ($name == 'xmltext') { return $this->xmltext(); }

	return array_key_exists($name, $this->attr);
}
980
/**
 * Magic write access.
 *
 * "outertext" and "innertext" replace the stored outer/inner text (for a
 * node with stored text, "innertext" replaces that text instead). Every
 * other name sets an attribute; a new attribute also gets default spacing
 * and double-quote bookkeeping entries.
 *
 * @param string $name Property or attribute name.
 * @param mixed $value New value.
 * @return mixed
 */
function __set($name, $value)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	if ($name == 'outertext') {
		return $this->_[HDOM_INFO_OUTER] = $value;
	}

	if ($name == 'innertext') {
		if (isset($this->_[HDOM_INFO_TEXT])) {
			return $this->_[HDOM_INFO_TEXT] = $value;
		}
		return $this->_[HDOM_INFO_INNER] = $value;
	}

	$is_new_attribute = !isset($this->attr[$name]);
	if ($is_new_attribute) {
		$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
		$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
	}

	$this->attr[$name] = $value;
}
1002
/**
 * Magic isset(): the virtual properties outertext, innertext and plaintext
 * always exist; any other name exists when it is a key of the attribute
 * array (even with a null value).
 *
 * @param string $name Property or attribute name.
 * @return bool
 */
function __isset($name)
{
	if ($name == 'outertext' || $name == 'innertext' || $name == 'plaintext') {
		return true;
	}

	// no value attr: nowrap, checked selected...
	// array_key_exists() alone decides: a key that does not exist can never
	// satisfy isset() either.
	return array_key_exists($name, $this->attr);
}
1013
/**
 * Magic unset(): remove the attribute when it is set to a non-null value.
 *
 * @param string $name Attribute name.
 * @return void
 */
function __unset($name)
{
	if (!isset($this->attr[$name])) {
		return;
	}

	unset($this->attr[$name]);
}
1018
/**
 * Convert text from the DOM's source charset to its target charset.
 *
 * Conversion via iconv() happens only when both charsets are known and
 * differ. When the target is UTF-8 and the text already passes is_utf8(),
 * it is left untouched. For a UTF-8 target a byte order mark at the start
 * and at the end of the text is removed.
 *
 * @param string $text Text to convert.
 * @return mixed Converted text (the result of iconv() when it was used).
 */
function convert_text($text)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	$sourceCharset = '';
	$targetCharset = '';

	if ($this->dom) {
		$sourceCharset = strtoupper($this->dom->_charset);
		$targetCharset = strtoupper($this->dom->_target_charset);
	}

	if (is_object($debug_object)) {
		$debug_object->debug_log(3,
			'source charset: '
			. $sourceCharset
			. ' target charaset: '
			. $targetCharset
		);
	}

	$converted_text = $text;

	$charsets_differ = !empty($sourceCharset)
		&& !empty($targetCharset)
		&& (strcasecmp($sourceCharset, $targetCharset) != 0);

	if ($charsets_differ) {
		// The reported encoding may be wrong: text that already is UTF-8
		// must not be converted to UTF-8 again.
		$already_utf8 = (strcasecmp($targetCharset, 'UTF-8') == 0)
			&& ($this->is_utf8($text));

		if (!$already_utf8) {
			$converted_text = iconv($sourceCharset, $targetCharset, $text);
		}
	}

	// Strip a UTF-8 byte order mark from either end of the output.
	if ($targetCharset === 'UTF-8') {
		$bom = "\xef\xbb\xbf";

		if (substr($converted_text, 0, 3) === $bom) {
			$converted_text = substr($converted_text, 3);
		}

		if (substr($converted_text, -3) === $bom) {
			$converted_text = substr($converted_text, 0, -3);
		}
	}

	return $converted_text;
}
1068
/**
 * Check whether a byte string is structurally valid UTF-8.
 *
 * Every byte >= 0x80 must be a lead byte announcing a 2- to 6-byte
 * sequence, followed by the right number of continuation bytes
 * (0x80-0xBF). Overlong forms and code point ranges are not checked.
 *
 * @param string $str Bytes to inspect.
 * @return bool
 */
static function is_utf8($str)
{
	$len = strlen($str);

	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);

		// Plain ASCII. The boundary is 0x80 inclusive: byte 128 is a
		// continuation byte and therefore invalid at this position.
		if ($c < 128) {
			continue;
		}

		if ($c >= 254) { return false; }
		elseif ($c >= 252) { $bits = 6; }
		elseif ($c >= 248) { $bits = 5; }
		elseif ($c >= 240) { $bits = 4; }
		elseif ($c >= 224) { $bits = 3; }
		elseif ($c >= 192) { $bits = 2; }
		else { return false; } // stray continuation byte

		// Sequence would run past the end of the string.
		if (($i + $bits) > $len) { return false; }

		while ($bits > 1) {
			$i++;
			$b = ord($str[$i]);
			if ($b < 128 || $b > 191) { return false; }
			$bits--;
		}
	}

	return true;
}
1095
/**
 * Determine the display size of an img element.
 *
 * The width/height attributes are used when set. For a dimension without
 * attribute, an inline style declaration in whole pixels ("NNpx") is used
 * instead. A dimension that cannot be determined is reported as -1.
 *
 * @return array|false array('height' => ..., 'width' => ...), or false
 * when this node is not an img element.
 */
function get_display_size()
{
	if ($this->tag !== 'img') {
		return false;
	}

	$size = array(
		'height' => -1,
		'width' => -1
	);

	// Attributes on the tag itself take precedence.
	if (isset($this->attr['width'])) {
		$size['width'] = $this->attr['width'];
	}

	if (isset($this->attr['height'])) {
		$size['height'] = $this->attr['height'];
	}

	// Now look for an inline style.
	if (isset($this->attr['style'])) {
		// Thanks to user gnarf from stackoverflow for this regular expression.
		$styles = array();

		preg_match_all(
			'/([\w-]+)\s*:\s*([^;]+)\s*;?/',
			$this->attr['style'],
			$matches,
			PREG_SET_ORDER
		);

		foreach ($matches as $match) {
			// The value group also swallows whitespace before the ";".
			$styles[$match[1]] = trim($match[2]);
		}

		foreach (array('width', 'height') as $dimension) {
			if (!isset($styles[$dimension]) || $size[$dimension] != -1) {
				continue;
			}

			// Only pixel values are understood.
			if (strtolower(substr($styles[$dimension], -2)) !== 'px') {
				continue;
			}

			$proposed = substr($styles[$dimension], 0, -2);

			// filter_var() returns the integer on success, so "0" yields 0;
			// only a strict false means the value is not an integer.
			if (filter_var($proposed, FILTER_VALIDATE_INT) !== false) {
				$size[$dimension] = $proposed;
			}
		}
	}

	// Future enhancement:
	// Look in the tag to see if there is a class or id specified that has
	// a height or width attribute to it.

	// Far future enhancement
	// Look at all the parent tags of this image to see if they specify a
	// class or id that has an img selector that specifies a height or width
	// Note that in this case, the class or id will have the img subselector
	// for it to apply to the image.

	// ridiculously far future development
	// If the class or id is specified in a SEPARATE css file that's not on
	// the page, go get it and do what we were just doing for the ones on
	// the page.

	return $size;
}
1180
/**
 * Return the outer text of this node and optionally write it to a file.
 *
 * @param string $filepath Target file; nothing is written if empty ('')
 * @return mixed The result of outertext()
 */
function save($filepath = '')
{
	$html = $this->outertext();

	if ($filepath === '') {
		return $html;
	}

	// Exclusive lock while writing
	file_put_contents($filepath, $html, LOCK_EX);

	return $html;
}
1191
/**
 * Add one or more classes to the "class" attribute of this node.
 *
 * Classes that are already present (see hasClass()) are not added again.
 *
 * @param string|array $class Space separated class names or an array of
 * class names. Any other type is reported to the debug object (if one is
 * installed) and ignored.
 * @return void
 */
function addClass($class)
{
	// Without this declaration $debug_object is an undefined local and
	// the invalid-type message below could never be logged.
	global $debug_object;

	if (is_string($class)) {
		$class = explode(' ', $class);
	}

	if (!is_array($class)) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
		}

		return;
	}

	foreach ($class as $c) {
		// explode() yields empty strings for leading, trailing or repeated
		// spaces; those are not class names.
		if ($c === '') {
			continue;
		}

		if (!isset($this->class)) {
			$this->class = $c;
		} elseif (!$this->hasClass($c)) {
			$this->class .= ' ' . $c;
		}
	}
}
1216
/**
 * Check whether the "class" attribute of this node contains a class name.
 *
 * @param string $class A single class name. Any other type is reported to
 * the debug object (if one is installed).
 * @return bool True if $class is one of the space separated entries of the
 * "class" attribute, false otherwise (including invalid input).
 */
function hasClass($class)
{
	// Without this declaration $debug_object is an undefined local and
	// the invalid-type message below could never be logged.
	global $debug_object;

	if (!is_string($class)) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
		}

		return false;
	}

	if (!isset($this->class)) {
		return false;
	}

	return in_array($class, explode(' ', $this->class), true);
}
1231
/**
 * Remove classes from the "class" attribute of this node.
 *
 * @param string|array|null $class Space separated class names or an array
 * of class names to remove. With null the whole attribute is removed.
 * Values of any other type are ignored.
 * @return void
 */
function removeClass($class = null)
{
	// Nothing to remove without a class attribute
	if (!isset($this->class)) {
		return;
	}

	if ($class === null) {
		$this->removeAttribute('class');
		return;
	}

	$unwanted = is_string($class) ? explode(' ', $class) : $class;

	if (!is_array($unwanted)) {
		return;
	}

	$remaining = array_diff(explode(' ', $this->class), $unwanted);

	if (count($remaining) === 0) {
		// No class left, drop the attribute entirely
		$this->removeAttribute('class');
		return;
	}

	$this->class = implode(' ', $remaining);
}
1256
/**
 * Return the attribute list ($attr) of this node.
 *
 * @return mixed The value of the $attr property
 */
function getAllAttributes()
{
	$attributes = $this->attr;

	return $attributes;
}
1261
/**
 * Read an attribute through the magic getter.
 *
 * @param string $name Attribute name
 * @return mixed Whatever __get() returns for $name
 */
function getAttribute($name)
{
	$value = $this->__get($name);

	return $value;
}
1266
/**
 * Write an attribute through the magic setter.
 *
 * @param string $name Attribute name
 * @param mixed $value New value, passed on to __set() unchanged
 * @return void
 */
function setAttribute($name, $value)
{
	$this->__set($name, $value);
}
1271
/**
 * Check for an attribute through the magic isset handler.
 *
 * @param string $name Attribute name
 * @return mixed Whatever __isset() returns for $name
 */
function hasAttribute($name)
{
	$present = $this->__isset($name);

	return $present;
}
1276
/**
 * Remove an attribute by passing null to the magic setter.
 *
 * @param string $name Attribute name
 * @return void
 */
function removeAttribute($name)
{
	$this->__set($name, null);
}
1281
/**
 * Remove this node from its parent.
 *
 * Does nothing if the node has no parent.
 *
 * @return void
 */
function remove()
{
	if (!$this->parent) {
		return;
	}

	$this->parent->removeChild($this);
}
1288
/**
 * Remove a child element, including everything below it, from this node.
 *
 * The node is only removed if it is found in $nodes and $children of this
 * node as well as in the node list of the DOM; otherwise nothing happens.
 *
 * @param object $node The child node to remove
 * @return void
 */
function removeChild($node)
{
	$nidx = array_search($node, $this->nodes, true);
	$cidx = array_search($node, $this->children, true);
	$didx = array_search($node, $this->dom->nodes, true);

	if ($nidx === false || $cidx === false || $didx === false) {
		return;
	}

	// Remove all child elements of the node first (recursively)
	foreach ($node->children as $child) {
		$node->removeChild($child);
	}

	// Detach whatever is left in the node (entries that are not listed
	// in $children)
	foreach ($node->nodes as $entity) {
		$enidx = array_search($entity, $node->nodes, true);
		$edidx = array_search($entity, $node->dom->nodes, true);

		if ($enidx !== false && $edidx !== false) {
			unset($node->nodes[$enidx]);
			unset($node->dom->nodes[$edidx]);
		}
	}

	unset($this->nodes[$nidx]);
	unset($this->children[$cidx]);
	unset($this->dom->nodes[$didx]);

	// unset() leaves gaps in the keys. Both lists are built by appending
	// and are addressed by position (i.e. via childNodes($idx)), so close
	// the gaps again.
	// NOTE(review): the positional accessors live outside this block -
	// confirm that none of them relies on the old keys.
	$this->nodes = array_values($this->nodes);
	$this->children = array_values($this->children);

	$node->clear();
}
1319
/**
 * Find the first element matching the id selector "#$id".
 *
 * @param string $id Element id
 * @return mixed The result of find() for index 0
 */
function getElementById($id)
{
	$selector = '#' . $id;

	return $this->find($selector, 0);
}
1324
/**
 * Find elements matching the id selector "#$id".
 *
 * @param string $id Element id
 * @param int|null $idx Index passed on to find()
 * @return mixed The result of find()
 */
function getElementsById($id, $idx = null)
{
	$selector = '#' . $id;

	return $this->find($selector, $idx);
}
1329
/**
 * Find the first element matching the selector $name.
 *
 * @param string $name Selector (tag name) passed on to find()
 * @return mixed The result of find() for index 0
 */
function getElementByTagName($name)
{
	$first = $this->find($name, 0);

	return $first;
}
1334
/**
 * Find elements matching the selector $name.
 *
 * @param string $name Selector (tag name) passed on to find()
 * @param int|null $idx Index passed on to find()
 * @return mixed The result of find()
 */
function getElementsByTagName($name, $idx = null)
{
	$found = $this->find($name, $idx);

	return $found;
}
1339
/**
 * Alias of parent().
 *
 * @return mixed The result of parent()
 */
function parentNode()
{
	$parent = $this->parent();

	return $parent;
}
1344
/**
 * Alias of children().
 *
 * @param int $idx Index passed on to children(), -1 by default
 * @return mixed The result of children()
 */
function childNodes($idx = -1)
{
	$children = $this->children($idx);

	return $children;
}
1349
/**
 * Alias of first_child().
 *
 * @return mixed The result of first_child()
 */
function firstChild()
{
	$first = $this->first_child();

	return $first;
}
1354
/**
 * Alias of last_child().
 *
 * @return mixed The result of last_child()
 */
function lastChild()
{
	$last = $this->last_child();

	return $last;
}
1359
/**
 * Alias of next_sibling().
 *
 * @return mixed The result of next_sibling()
 */
function nextSibling()
{
	$next = $this->next_sibling();

	return $next;
}
1364
/**
 * Alias of prev_sibling().
 *
 * @return mixed The result of prev_sibling()
 */
function previousSibling()
{
	$previous = $this->prev_sibling();

	return $previous;
}
1369
/**
 * Alias of has_child().
 *
 * @return mixed The result of has_child()
 */
function hasChildNodes()
{
	$has_child = $this->has_child();

	return $has_child;
}
1374
/**
 * Return the tag name of this node.
 *
 * @return mixed The value of the $tag property
 */
function nodeName()
{
	$name = $this->tag;

	return $name;
}
1379
/**
 * Make this node the parent of $node.
 *
 * The actual work is done by $node->parent($this).
 *
 * @param object $node The node to attach
 * @return object The node that was passed in
 */
function appendChild($node)
{
	$new_parent = $this;
	$node->parent($new_parent);

	return $node;
}
1385
1386}
1387
1388class simple_html_dom
1389{
1390	public $root = null;
1391	public $nodes = array();
1392	public $callback = null;
1393	public $lowercase = false;
1394	public $original_size;
1395	public $size;
1396
1397	protected $pos;
1398	protected $doc;
1399	protected $char;
1400
1401	protected $cursor;
1402	protected $parent;
1403	protected $noise = array();
1404	protected $token_blank = " \t\r\n";
1405	protected $token_equal = ' =/>';
1406	protected $token_slash = " />\r\n\t";
1407	protected $token_attr = ' >';
1408
1409	public $_charset = '';
1410	public $_target_charset = '';
1411
1412	protected $default_br_text = '';
1413
1414	public $default_span_text = '';
1415
1416	protected $self_closing_tags = array(
1417		'area' => 1,
1418		'base' => 1,
1419		'br' => 1,
1420		'col' => 1,
1421		'embed' => 1,
1422		'hr' => 1,
1423		'img' => 1,
1424		'input' => 1,
1425		'link' => 1,
1426		'meta' => 1,
1427		'param' => 1,
1428		'source' => 1,
1429		'track' => 1,
1430		'wbr' => 1
1431	);
1432	protected $block_tags = array(
1433		'body' => 1,
1434		'div' => 1,
1435		'form' => 1,
1436		'root' => 1,
1437		'span' => 1,
1438		'table' => 1
1439	);
1440	protected $optional_closing_tags = array(
1441		// Not optional, see
1442		// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1443		'b' => array('b' => 1),
1444		'dd' => array('dd' => 1, 'dt' => 1),
1445		// Not optional, see
1446		// https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1447		'dl' => array('dd' => 1, 'dt' => 1),
1448		'dt' => array('dd' => 1, 'dt' => 1),
1449		'li' => array('li' => 1),
1450		'optgroup' => array('optgroup' => 1, 'option' => 1),
1451		'option' => array('optgroup' => 1, 'option' => 1),
1452		'p' => array('p' => 1),
1453		'rp' => array('rp' => 1, 'rt' => 1),
1454		'rt' => array('rp' => 1, 'rt' => 1),
1455		'td' => array('td' => 1, 'th' => 1),
1456		'th' => array('td' => 1, 'th' => 1),
1457		'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
1458	);
1459
/**
 * Create a DOM object and optionally load a document right away.
 *
 * @param string|null $str Markup to parse. If it starts with "http://" or
 * names an existing file, the contents are loaded via load_file() instead.
 * @param bool $lowercase Passed on to load()
 * @param bool $forceTagsClosed If false, the list of optional closing tags
 * is emptied, i.e. the markup is trusted to close its tags properly.
 * @param string $target_charset Stored in $_target_charset
 * @param bool $stripRN Passed on to load()
 * @param string $defaultBRText Passed on to load()
 * @param string $defaultSpanText Passed on to load()
 * @param int $options Passed on to load()
 */
function __construct(
	$str = null,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT,
	$options = 0)
{
	// Apply the settings before loading, otherwise they have no effect on
	// a document that is passed to the constructor.

	// Forcing tags to be closed implies that we don't trust the html, but
	// it can lead to parsing errors if we SHOULD trust the html.
	if (!$forceTagsClosed) {
		// The parser reads $optional_closing_tags; the property
		// "optional_closing_array" that used to be assigned here does not
		// exist, so the flag was silently ignored.
		$this->optional_closing_tags = array();
	}

	$this->_target_charset = $target_charset;

	if ($str) {
		if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
			$this->load_file($str);
		} else {
			$this->load(
				$str,
				$lowercase,
				$stripRN,
				$defaultBRText,
				$defaultSpanText,
				$options
			);
		}
	}
}
1492
/**
 * Release the document by calling clear() when the object is destroyed.
 */
function __destruct()
{
	$this->clear();
}
1497
/**
 * Parse a document from a string.
 *
 * Script blocks, CDATA sections, comments, style blocks, code blocks and
 * server side scripts are replaced by noise placeholders (see
 * remove_noise()) before the remaining markup is parsed.
 *
 * @param string $str The markup to parse
 * @param bool $lowercase Passed on to prepare()
 * @param bool $stripRN Replace "\r" and "\n" by spaces before parsing
 * @param string $defaultBRText Passed on to prepare()
 * @param string $defaultSpanText Passed on to prepare()
 * @param int $options If HDOM_SMARTY_AS_TEXT is set, Smarty scripts are
 * treated as noise, too
 * @return $this
 */
function load(
	$str,
	$lowercase = true,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT,
	$options = 0)
{
	global $debug_object;

	$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

	// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
	// Script tags removal now precedes style tag removal.
	$script_patterns = array(
		"'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is",
		"'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"
	);

	foreach ($script_patterns as $pattern) {
		$this->remove_noise($pattern);
	}

	if ($stripRN) {
		// Line breaks become spaces
		$this->doc = str_replace(array("\r", "\n"), ' ', $this->doc);

		// The content changed, so update its length
		$this->size = strlen($this->doc);
	}

	// Each entry: pattern, whether the entire match (true) or only the
	// first submatch (false) is replaced
	$noise_patterns = array(
		// cdata
		array("'<!\[CDATA\[(.*?)\]\]>'is", true),
		// comments
		array("'<!--(.*?)-->'is", false),
		// <style> tags
		array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false),
		array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),
		// preformatted tags
		array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false),
		// server side scripts
		array("'(<\?)(.*?)(\?>)'s", true)
	);

	if ($options & HDOM_SMARTY_AS_TEXT) {
		// Smarty scripts
		$noise_patterns[] = array("'(\{\w)(.*?)(\})'s", true);
	}

	foreach ($noise_patterns as $entry) {
		$this->remove_noise($entry[0], $entry[1]);
	}

	$this->parse();
	// The root node ends where the parser stopped
	$this->root->_[HDOM_INFO_END] = $this->cursor;
	$this->parse_charset();

	// Returning the object makes load() chainable
	return $this;
}
1551
/**
 * Load a document from a file or URL.
 *
 * All arguments are passed on to file_get_contents() unchanged.
 *
 * @return $this|false The DOM object (as returned by load()) on success,
 * false if the contents could not be read. Previously nothing was returned
 * on success, which made the result impossible to tell apart from a
 * failure with a loose comparison.
 */
function load_file()
{
	$args = func_get_args();
	$doc = call_user_func_array('file_get_contents', $args);

	if ($doc === false) {
		return false;
	}

	return $this->load($doc, true);
}
1562
/**
 * Register a callback by storing it in $callback.
 *
 * @param mixed $function_name The callback to store
 * @return void
 */
function set_callback($function_name)
{
	$this->callback = $function_name;
}
1567
/**
 * Unregister the callback by resetting $callback to null.
 *
 * @return void
 */
function remove_callback()
{
	$this->callback = null;
}
1572
/**
 * Return the inner text of the root node and optionally write it to a file.
 *
 * @param string $filepath Target file; nothing is written if empty ('')
 * @return mixed The result of innertext() of the root node
 */
function save($filepath = '')
{
	$html = $this->root->innertext();

	if ($filepath === '') {
		return $html;
	}

	// Exclusive lock while writing
	file_put_contents($filepath, $html, LOCK_EX);

	return $html;
}
1579
/**
 * Search the document by delegating to find() of the root node.
 *
 * @param string $selector Selector passed on to the root node
 * @param int|null $idx Index passed on to the root node
 * @param bool $lowercase Passed on to the root node
 * @return mixed The result of find() of the root node
 */
function find($selector, $idx = null, $lowercase = false)
{
	$root = $this->root;

	return $root->find($selector, $idx, $lowercase);
}
1584
/**
 * Release everything this object references.
 *
 * clear() is called on all entries of $nodes and $children as well as on
 * $parent and $root (each only if set). $parent, $root, $doc and $noise
 * are unset afterwards.
 *
 * @return void
 */
function clear()
{
	// The "children" list is documented in the sourceforge repository
	// (2977248) as a fix for ongoing memory leaks that occur even with the
	// use of clear.
	foreach (array('nodes', 'children') as $list) {
		if (!isset($this->$list)) {
			continue;
		}

		foreach ($this->$list as $n) {
			$n->clear();
			$n = null; // drop the loop's handle on the node
		}
	}

	foreach (array('parent', 'root') as $reference) {
		if (!isset($this->$reference)) {
			continue;
		}

		$this->$reference->clear();
		unset($this->$reference);
	}

	unset($this->doc);
	unset($this->noise);
}
1617
/**
 * Dump the document by delegating to dump() of the root node.
 *
 * @param bool $show_attr Passed on to the root node
 * @return void
 */
function dump($show_attr = true)
{
	$root = $this->root;
	$root->dump($show_attr);
}
1622
/**
 * Reset the parser state and set up a new root node for a document.
 *
 * @param string $str The markup; it is trimmed before it is stored
 * @param bool $lowercase Stored in $lowercase
 * @param string $defaultBRText Stored in $default_br_text
 * @param string $defaultSpanText Stored in $default_span_text
 * @return void
 */
protected function prepare(
	$str, $lowercase = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	// Drop whatever was loaded before
	$this->clear();

	// Document and its length; original_size keeps the length of the
	// html before any noise is removed
	$this->doc = trim($str);
	$this->size = strlen($this->doc);
	$this->original_size = $this->size;

	// Parser state
	$this->pos = 0;
	$this->cursor = 1;
	$this->noise = array();
	$this->nodes = array();

	// Settings
	$this->lowercase = $lowercase;
	$this->default_br_text = $defaultBRText;
	$this->default_span_text = $defaultSpanText;

	// The root node is the initial parent of everything that is parsed
	$this->root = $root = new simple_html_dom_node($this);
	$root->tag = 'root';
	$root->_[HDOM_INFO_BEGIN] = -1;
	$root->nodetype = HDOM_TYPE_ROOT;
	$this->parent = $root;

	if ($this->size > 0) {
		$this->char = $this->doc[0];
	}
}
1647
/**
 * Parse the prepared document into tags and text nodes.
 *
 * Runs until read_tag() reports that there is no further tag.
 *
 * @return bool Always true
 */
protected function parse()
{
	for (;;) {
		$text = $this->copy_until_char('<');

		// No text between the current position and the next opening tag:
		// read the tag, or stop if there is none.
		if ($text === '') {
			if (!$this->read_tag()) {
				return true;
			}

			continue;
		}

		// Text between tags becomes a text node
		$node = new simple_html_dom_node($this);
		++$this->cursor;
		$node->_[HDOM_INFO_TEXT] = $text;
		$this->link_nodes($node, false);
	}
}
1668
/**
 * Determine the charset of the loaded document and store it in $_charset.
 *
 * The following sources are consulted until one of them yields a value:
 * 1. get_last_retrieve_url_contents_content_type(), if that function exists
 * 2. a meta element with http-equiv=Content-Type
 * 3. a meta element with a charset attribute
 * 4. mb_detect_encoding(), if available
 * 5. "UTF-8" as a last resort
 *
 * ISO-8859-1, latin1 and latin-1 are reported as CP1252.
 *
 * @return string The charset that was stored in $_charset
 */
protected function parse_charset()
{
	global $debug_object;

	$charset = null;

	if (function_exists('get_last_retrieve_url_contents_content_type')) {
		$contentTypeHeader = get_last_retrieve_url_contents_content_type();
		// Matched case-insensitively, like the meta tag below
		$success = preg_match('/charset=(.+)/i', $contentTypeHeader, $matches);
		if ($success) {
			$charset = $matches[1];
			if (is_object($debug_object)) {
				$debug_object->debug_log(2,
					'header content-type found charset of: '
					. $charset
				);
			}
		}
	}

	if (empty($charset)) {
		// https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
		$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

		if (!empty($el)) {
			$fullvalue = $el->content;
			if (is_object($debug_object)) {
				$debug_object->debug_log(2,
					'meta content-type tag found'
					. $fullvalue
				);
			}

			if (!empty($fullvalue)) {
				$success = preg_match(
					'/charset=(.+)/i',
					$fullvalue,
					$matches
				);

				if ($success) {
					$charset = $matches[1];
				} else {
					// If there is a meta tag, and they don't specify the
					// character set, research says that it's typically
					// ISO-8859-1
					if (is_object($debug_object)) {
						$debug_object->debug_log(2,
							'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
						);
					}

					$charset = 'ISO-8859-1';
				}
			}
		}
	}

	if (empty($charset)) {
		// https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
		if ($meta = $this->root->find('meta[charset]', 0)) {
			$charset = $meta->charset;
			if (is_object($debug_object)) {
				$debug_object->debug_log(2, 'meta charset: ' . $charset);
			}
		}
	}

	if (empty($charset)) {
		// Try to guess the charset based on the content
		// Requires Multibyte String (mbstring) support (optional)
		if (function_exists('mb_detect_encoding')) {
			/**
			 * mb_detect_encoding() is not intended to distinguish between
			 * charsets, especially single-byte charsets. Its primary
			 * purpose is to detect which multibyte encoding is in use,
			 * i.e. UTF-8, UTF-16, shift-JIS, etc.
			 *
			 * -- https://bugs.php.net/bug.php?id=38138
			 *
			 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
			 * always result in CP1251/ISO-8859-5 and vice versa.
			 *
			 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
			 * to stay compatible.
			 */
			$encoding = mb_detect_encoding(
				$this->doc,
				array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
			);

			if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
				// Due to a limitation of mb_detect_encoding
				// 'CP1251'/'ISO-8859-5' will be detected as
				// 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
				// which case we can simply assume it is the other charset.
				if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
					$encoding = 'CP1251';
				}
			}

			if ($encoding !== false) {
				$charset = $encoding;
				if (is_object($debug_object)) {
					$debug_object->debug_log(2, 'mb_detect: ' . $charset);
				}
			}
		}
	}

	if (empty($charset)) {
		// Assume it's UTF-8 as it is the most likely charset to be used
		$charset = 'UTF-8';
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'No match found, assume ' . $charset);
		}
	}

	// Since CP1252 is a superset, if we get one of it's subsets, we want
	// it instead.
	$charset_lower = strtolower($charset);

	if ($charset_lower == 'iso-8859-1'
		|| $charset_lower == 'latin1'
		|| $charset_lower == 'latin-1') {
		// Log before the value is overwritten, otherwise the message
		// always reads "replacing CP1252 with CP1252".
		if (is_object($debug_object)) {
			$debug_object->debug_log(2,
				'replacing ' . $charset . ' with CP1252 as its a superset'
			);
		}

		$charset = 'CP1252';
	}

	if (is_object($debug_object)) {
		$debug_object->debug_log(1, 'EXIT - ' . $charset);
	}

	return $this->_charset = $charset;
}
1806
/**
 * Read the tag at the current position and add it to the node tree.
 *
 * Handles end tags, doctype/cdata/comment tags, malformed tags (which are
 * added as text) and start tags including their attributes.
 *
 * @return bool False if the current character is not "<" (no further
 * tag), true otherwise
 */
protected function read_tag()
{
	// Set end position if no further tags found
	if ($this->char !== '<') {
		$this->root->_[HDOM_INFO_END] = $this->cursor;
		return false;
	}

	$begin_tag_pos = $this->pos;
	$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

	// end tag
	if ($this->char === '/') {
		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

		// Skip whitespace in end tags (i.e. in "</   html>")
		$this->skip($this->token_blank);
		$tag = $this->copy_until_char('>');

		// Skip attributes in end tags
		if (($pos = strpos($tag, ' ')) !== false) {
			$tag = substr($tag, 0, $pos);
		}

		$parent_lower = strtolower($this->parent->tag);
		$tag_lower = strtolower($tag);

		// The end tag is supposed to close the parent tag. Handle situations
		// when it doesn't
		if ($parent_lower !== $tag_lower) {
			// Parent tag does not have to be closed necessarily (optional closing tag)
			// Current tag is a block tag, so it may close an ancestor
			if (isset($this->optional_closing_tags[$parent_lower])
				&& isset($this->block_tags[$tag_lower])) {

				$this->parent->_[HDOM_INFO_END] = 0;
				$org_parent = $this->parent;

				// Traverse ancestors to find a matching opening tag
				// Stop at root node
				while (($this->parent->parent)
					&& strtolower($this->parent->tag) !== $tag_lower
				){
					$this->parent = $this->parent->parent;
				}

				// If we don't have a match add current tag as text node
				if (strtolower($this->parent->tag) !== $tag_lower) {
					$this->parent = $org_parent; // restore original parent

					if ($this->parent->parent) {
						$this->parent = $this->parent->parent;
					}

					$this->parent->_[HDOM_INFO_END] = $this->cursor;
					return $this->as_text_node($tag);
				}
			} elseif (($this->parent->parent)
				&& isset($this->block_tags[$tag_lower])
			) {
				// Grandparent exists and current tag is a block tag, so our
				// parent doesn't have an end tag
				$this->parent->_[HDOM_INFO_END] = 0; // No end tag
				$org_parent = $this->parent;

				// Traverse ancestors to find a matching opening tag
				// Stop at root node
				while (($this->parent->parent)
					&& strtolower($this->parent->tag) !== $tag_lower
				) {
					$this->parent = $this->parent->parent;
				}

				// If we don't have a match add current tag as text node
				if (strtolower($this->parent->tag) !== $tag_lower) {
					$this->parent = $org_parent; // restore original parent
					$this->parent->_[HDOM_INFO_END] = $this->cursor;
					return $this->as_text_node($tag);
				}
			} elseif (($this->parent->parent)
				&& strtolower($this->parent->parent->tag) === $tag_lower
			) { // Grandparent exists and current tag closes it
				$this->parent->_[HDOM_INFO_END] = 0;
				$this->parent = $this->parent->parent;
			} else { // Random tag, add as text node
				return $this->as_text_node($tag);
			}
		}

		// Set end position of parent tag to current cursor position
		$this->parent->_[HDOM_INFO_END] = $this->cursor;

		if ($this->parent->parent) {
			$this->parent = $this->parent->parent;
		}

		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
		return true;
	}

	// start tag
	$node = new simple_html_dom_node($this);
	$node->_[HDOM_INFO_BEGIN] = $this->cursor;
	++$this->cursor;
	$tag = $this->copy_until($this->token_slash); // Get tag name
	$node->tag_start = $begin_tag_pos;

	// doctype, cdata & comments...
	// <!DOCTYPE html>
	// <![CDATA[ ... ]]>
	// <!-- Comment -->
	if (isset($tag[0]) && $tag[0] === '!') {
		$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

		if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
			$node->nodetype = HDOM_TYPE_COMMENT;
			$node->tag = 'comment';
		} else { // Could be doctype or CDATA but we don't care
			$node->nodetype = HDOM_TYPE_UNKNOWN;
			$node->tag = 'unknown';
		}

		if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }

		$this->link_nodes($node, true);
		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
		return true;
	}

	// The start tag cannot contain another start tag, if so add as text
	// i.e. "<<html>"
	// (The position of the "<" is not needed. The former
	// "$pos = strpos(...) !== false" only worked because of operator
	// precedence: it assigned the result of the comparison to $pos.)
	if (strpos($tag, '<') !== false) {
		$tag = '<' . substr($tag, 0, -1);
		$node->_[HDOM_INFO_TEXT] = $tag;
		$this->link_nodes($node, false);
		$this->char = $this->doc[--$this->pos]; // prev
		return true;
	}

	// Handle invalid tag names (i.e. "<html#doc>")
	if (!preg_match('/^\w[\w:-]*$/', $tag)) {
		$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

		// Next char is the beginning of a new tag, don't touch it.
		if ($this->char === '<') {
			$this->link_nodes($node, false);
			return true;
		}

		// Next char closes current tag, add and be done with it.
		if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
		$this->link_nodes($node, false);
		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
		return true;
	}

	// begin tag, add new node
	$node->nodetype = HDOM_TYPE_ELEMENT;
	$tag_lower = strtolower($tag);
	$node->tag = ($this->lowercase) ? $tag_lower : $tag;

	// handle optional closing tags
	if (isset($this->optional_closing_tags[$tag_lower])) {
		// Traverse ancestors to close all optional closing tags
		while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
			$this->parent->_[HDOM_INFO_END] = 0;
			$this->parent = $this->parent->parent;
		}
		$node->parent = $this->parent;
	}

	$guard = 0; // prevent infinity loop

	// [0] Space between tag and first attribute
	$space = array($this->copy_skip($this->token_blank), '', '');

	// attributes
	do {
		// Everything until the first equal sign should be the attribute name
		$name = $this->copy_until($this->token_equal);

		if ($name === '' && $this->char !== null && $space[0] === '') {
			break;
		}

		if ($guard === $this->pos) { // Escape infinite loop
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
			continue;
		}

		$guard = $this->pos;

		// handle endless '<'
		// Out of bounds before the tag ended
		if ($this->pos >= $this->size - 1 && $this->char !== '>') {
			$node->nodetype = HDOM_TYPE_TEXT;
			$node->_[HDOM_INFO_END] = 0;
			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
			$node->tag = 'text';
			$this->link_nodes($node, false);
			return true;
		}

		// handle mismatch '<'
		// Attributes cannot start after opening tag
		if ($this->doc[$this->pos - 1] == '<') {
			$node->nodetype = HDOM_TYPE_TEXT;
			$node->tag = 'text';
			$node->attr = array();
			$node->_[HDOM_INFO_END] = 0;
			$node->_[HDOM_INFO_TEXT] = substr(
				$this->doc,
				$begin_tag_pos,
				$this->pos - $begin_tag_pos - 1
			);
			$this->pos -= 2;
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
			$this->link_nodes($node, false);
			return true;
		}

		if ($name !== '/' && $name !== '') { // this is a attribute name
			// [1] Whitespace after attribute name
			$space[1] = $this->copy_skip($this->token_blank);

			$name = $this->restore_noise($name); // might be a noisy name

			if ($this->lowercase) { $name = strtolower($name); }

			if ($this->char === '=') { // attribute with value
				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
				$this->parse_attr($node, $name, $space); // get attribute value
			} else {
				//no value attr: nowrap, checked selected...
				$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
				$node->attr[$name] = true;
				if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
			}

			$node->_[HDOM_INFO_SPACE][] = $space;

			// prepare for next attribute
			$space = array(
				$this->copy_skip($this->token_blank),
				'',
				''
			);
		} else { // no more attributes
			break;
		}
	} while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended

	$this->link_nodes($node, true);
	$node->_[HDOM_INFO_ENDSPACE] = $space[0];

	// handle empty tags (i.e. "<div/>")
	if ($this->copy_until_char('>') === '/') {
		$node->_[HDOM_INFO_ENDSPACE] .= '/';
		$node->_[HDOM_INFO_END] = 0;
	} else {
		// reset parent
		if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
			$this->parent = $node;
		}
	}

	$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

	// If it's a BR tag, we need to set it's text to the default text.
	// This way when we see it in plaintext, we can generate formatting that the user wants.
	// since a br tag never has sub nodes, this works well.
	if ($node->tag === 'br') {
		$node->_[HDOM_INFO_INNER] = $this->default_br_text;
	}

	return true;
}
2084
/**
 * Read the value of an attribute at the current position.
 *
 * The value is only stored if the node doesn't have an attribute of that
 * name yet; the value of a duplicate attribute is read and discarded.
 *
 * @param object $node The node the attribute belongs to
 * @param string $name Attribute name
 * @param array $space Whitespace around the attribute; [2] receives the
 * whitespace between "=" and the value (not for duplicates)
 * @return void
 */
protected function parse_attr($node, $name, &$space)
{
	$is_duplicate = isset($node->attr[$name]);

	// Always consume the whitespace between "=" and the value. If it is
	// left in place for a duplicate attribute, the unquoted branch below
	// stops right at the blank and the actual value is later misread as
	// the name of a new attribute.
	$blank = $this->copy_skip($this->token_blank);

	if (!$is_duplicate) {
		$space[2] = $blank;
	}

	switch ($this->char) {
		case '"':
			$quote_type = HDOM_QUOTE_DOUBLE;
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
			$value = $this->copy_until_char('"');
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
			break;
		case '\'':
			$quote_type = HDOM_QUOTE_SINGLE;
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
			$value = $this->copy_until_char('\'');
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
			break;
		default:
			$quote_type = HDOM_QUOTE_NO;
			$value = $this->copy_until($this->token_attr);
	}

	$value = $this->restore_noise($value);

	// PaperG: Attributes should not have \r or \n in them, that counts as
	// html whitespace.
	$value = str_replace("\r", '', $value);
	$value = str_replace("\n", '', $value);

	// PaperG: If this is a "class" selector, lets get rid of the preceding
	// and trailing space since some people leave it in the multi class case.
	if ($name === 'class') {
		$value = trim($value);
	}

	if (!$is_duplicate) {
		$node->_[HDOM_INFO_QUOTE][] = $quote_type;
		$node->attr[$name] = $value;
	}
}
2128
/**
 * Attach a node to the current parent node.
 *
 * @param object $node The node to attach
 * @param bool $is_child If true the node is added to the list of children
 * of the parent, too; otherwise only to its list of nodes
 * @return void
 */
protected function link_nodes(&$node, $is_child)
{
	$owner = $this->parent;

	$node->parent = $owner;
	$owner->nodes[] = $node;

	if (!$is_child) {
		return;
	}

	$owner->children[] = $node;
}
2137
/**
 * Add an end tag as text node and move on to the next character.
 *
 * @param string $tag Tag name; the text of the node is "</$tag>"
 * @return bool Always true
 */
protected function as_text_node($tag)
{
	$text_node = new simple_html_dom_node($this);
	++$this->cursor;
	$text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
	$this->link_nodes($text_node, false);

	// next
	++$this->pos;

	if ($this->pos < $this->size) {
		$this->char = $this->doc[$this->pos];
	} else {
		$this->char = null;
	}

	return true;
}
2147
/**
 * Move the current position past a run of the given characters.
 *
 * @param string $chars The characters to skip
 * @return void
 */
protected function skip($chars)
{
	$span = strspn($this->doc, $chars, $this->pos);
	$this->pos = $this->pos + $span;

	// Current character, null at the end of the document
	if ($this->pos < $this->size) {
		$this->char = $this->doc[$this->pos];
	} else {
		$this->char = null;
	}
}
2153
/**
 * Move the current position past a run of the given characters and return
 * the characters that were skipped.
 *
 * @param string $chars The characters to skip
 * @return string The skipped characters, '' if there are none
 */
protected function copy_skip($chars)
{
	$start = $this->pos;
	$span = strspn($this->doc, $chars, $start);

	$this->pos = $start + $span;

	// Current character, null at the end of the document
	if ($this->pos < $this->size) {
		$this->char = $this->doc[$this->pos];
	} else {
		$this->char = null;
	}

	return ($span === 0) ? '' : substr($this->doc, $start, $span);
}
2163
/**
 * Move the current position to the next occurrence of any of the given
 * characters and return everything in between.
 *
 * @param string $chars The characters to stop at
 * @return string The text between the old and the new position
 */
protected function copy_until($chars)
{
	$start = $this->pos;
	$span = strcspn($this->doc, $chars, $start);

	$this->pos = $start + $span;

	// Current character, null at the end of the document
	if ($this->pos < $this->size) {
		$this->char = $this->doc[$this->pos];
	} else {
		$this->char = null;
	}

	return substr($this->doc, $start, $span);
}
2172
/**
 * Move the current position to the next occurrence of a character and
 * return everything in between.
 *
 * If the character doesn't occur anymore, the rest of the document is
 * returned and the position is moved to the end of the document.
 *
 * @param string $char The character to stop at
 * @return string The text between the old and the new position, '' at the
 * end of the document or if $char is at the current position
 */
protected function copy_until_char($char)
{
	// End of document
	if ($this->char === null) {
		return '';
	}

	$start = $this->pos;
	$found = strpos($this->doc, $char, $start);

	if ($found === false) {
		// Not found, consume the rest of the document
		$rest = substr($this->doc, $start, $this->size - $start);
		$this->char = null;
		$this->pos = $this->size;
		return $rest;
	}

	if ($found === $start) {
		return '';
	}

	$this->pos = $found;
	$this->char = $this->doc[$found];

	return substr($this->doc, $start, $found - $start);
}
2191
/**
 * Replace all matches of a pattern in the document by noise placeholders.
 *
 * The replaced text is stored in $noise using the placeholder as key, see
 * restore_noise() for the opposite direction.
 *
 * @param string $pattern Regular expression with at least one subpattern
 * @param bool $remove_tag True to replace the entire match, false to
 * replace the first submatch only
 * @return void
 */
protected function remove_noise($pattern, $remove_tag = false)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	$count = preg_match_all(
		$pattern,
		$this->doc,
		$matches,
		PREG_SET_ORDER | PREG_OFFSET_CAPTURE
	);

	// 0 = entire match, 1 = submatch
	$group = $remove_tag ? 0 : 1;

	// Replace from the last match to the first one, so the offsets of the
	// remaining matches stay valid.
	$i = $count - 1;

	while ($i > -1) {
		$key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'key is: ' . $key);
		}

		list($fragment, $offset) = $matches[$i][$group];

		$this->noise[$key] = $fragment;
		$this->doc = substr_replace($this->doc, $key, $offset, strlen($fragment));

		--$i;
	}

	// The content changed, so update its length
	$this->size = strlen($this->doc);

	if ($this->size > 0) {
		$this->char = $this->doc[0];
	}
}
2223
2224	function restore_noise($text)
2225	{
2226		global $debug_object;
2227		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2228
2229		while (($pos = strpos($text, '___noise___')) !== false) {
2230			// Sometimes there is a broken piece of markup, and we don't GET the
2231			// pos+11 etc... token which indicates a problem outside of us...
2232
2233			// todo: "___noise___1000" (or any number with four or more digits)
2234			// in the DOM causes an infinite loop which could be utilized by
2235			// malicious software
2236			if (strlen($text) > $pos + 15) {
2237				$key = '___noise___'
2238				. $text[$pos + 11]
2239				. $text[$pos + 12]
2240				. $text[$pos + 13]
2241				. $text[$pos + 14]
2242				. $text[$pos + 15];
2243
2244				if (is_object($debug_object)) {
2245					$debug_object->debug_log(2, 'located key of: ' . $key);
2246				}
2247
2248				if (isset($this->noise[$key])) {
2249					$text = substr($text, 0, $pos)
2250					. $this->noise[$key]
2251					. substr($text, $pos + 16);
2252				} else {
2253					// do this to prevent an infinite loop.
2254					$text = substr($text, 0, $pos)
2255					. 'UNDEFINED NOISE FOR KEY: '
2256					. $key
2257					. substr($text, $pos + 16);
2258				}
2259			} else {
2260				// There is no valid key being given back to us... We must get
2261				// rid of the ___noise___ or we will have a problem.
2262				$text = substr($text, 0, $pos)
2263				. 'NO NUMERIC NOISE KEY'
2264				. substr($text, $pos + 11);
2265			}
2266		}
2267		return $text;
2268	}
2269
2270	function search_noise($text)
2271	{
2272		global $debug_object;
2273		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2274
2275		foreach($this->noise as $noiseElement) {
2276			if (strpos($noiseElement, $text) !== false) {
2277				return $noiseElement;
2278			}
2279		}
2280	}
2281
2282	function __toString()
2283	{
2284		return $this->root->innertext();
2285	}
2286
2287	function __get($name)
2288	{
2289		switch ($name) {
2290			case 'outertext':
2291				return $this->root->innertext();
2292			case 'innertext':
2293				return $this->root->innertext();
2294			case 'plaintext':
2295				return $this->root->text();
2296			case 'charset':
2297				return $this->_charset;
2298			case 'target_charset':
2299				return $this->_target_charset;
2300		}
2301	}
2302
2303	function childNodes($idx = -1)
2304	{
2305		return $this->root->childNodes($idx);
2306	}
2307
2308	function firstChild()
2309	{
2310		return $this->root->first_child();
2311	}
2312
2313	function lastChild()
2314	{
2315		return $this->root->last_child();
2316	}
2317
2318	function createElement($name, $value = null)
2319	{
2320		return @str_get_html("<$name>$value</$name>")->firstChild();
2321	}
2322
	/**
	 * Parse $value with str_get_html() and return the last entry of the
	 * resulting document's ->nodes list.
	 *
	 * The expression runs under the @ operator, so warnings and notices
	 * raised while parsing or while calling end() are suppressed.
	 *
	 * NOTE(review): presumably the last node of the parsed document is the
	 * text node for $value — confirm against the parser.
	 *
	 * @param string $value Text to parse.
	 * @return mixed Result of end() on the parsed document's nodes.
	 */
	function createTextNode($value)
	{
		// end() takes its argument by reference; here it is applied directly
		// to the nodes property of the freshly parsed, temporary document.
		return @end(str_get_html($value)->nodes);
	}
2327
2328	function getElementById($id)
2329	{
2330		return $this->find("#$id", 0);
2331	}
2332
2333	function getElementsById($id, $idx = null)
2334	{
2335		return $this->find("#$id", $idx);
2336	}
2337
2338	function getElementByTagName($name)
2339	{
2340		return $this->find($name, 0);
2341	}
2342
2343	function getElementsByTagName($name, $idx = -1)
2344	{
2345		return $this->find($name, $idx);
2346	}
2347
2348	function loadFile()
2349	{
2350		$args = func_get_args();
2351		$this->load_file($args);
2352	}
2353}
2354