xref: /template/mikio/inc/simple_html_dom.php (revision c165b1846fc5c512505dc577e9428a1a76926c7e)
1*c165b184SJames Collins<?php
2*c165b184SJames Collins/**
3*c165b184SJames Collins * Website: http://sourceforge.net/projects/simplehtmldom/
4*c165b184SJames Collins * Additional projects: http://sourceforge.net/projects/debugobject/
5*c165b184SJames Collins * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
6*c165b184SJames Collins *
7*c165b184SJames Collins * Licensed under The MIT License
8*c165b184SJames Collins * See the LICENSE file in the project root for more information.
9*c165b184SJames Collins *
10*c165b184SJames Collins * Authors:
11*c165b184SJames Collins *   S.C. Chen
12*c165b184SJames Collins *   John Schlick
13*c165b184SJames Collins *   Rus Carroll
14*c165b184SJames Collins *   logmanoriginal
15*c165b184SJames Collins *
16*c165b184SJames Collins * Contributors:
17*c165b184SJames Collins *   Yousuke Kumakura
18*c165b184SJames Collins *   Vadim Voituk
19*c165b184SJames Collins *   Antcs
20*c165b184SJames Collins *
21*c165b184SJames Collins * Version Rev. 1.9.1 (291)
22*c165b184SJames Collins */
23*c165b184SJames Collins
// Node type identifiers (stored in simple_html_dom_node::$nodetype).
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles. Note that HDOM_QUOTE_NO is 3, not 2.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys into the per-node info array simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Overridable defaults: each is only defined when the including code has
// not already defined a constant of the same name.
if (!defined('DEFAULT_TARGET_CHARSET')) {
	define('DEFAULT_TARGET_CHARSET', 'UTF-8');
}
if (!defined('DEFAULT_BR_TEXT')) {
	define('DEFAULT_BR_TEXT', "\r\n");
}
if (!defined('DEFAULT_SPAN_TEXT')) {
	define('DEFAULT_SPAN_TEXT', ' ');
}
if (!defined('MAX_FILE_SIZE')) {
	// Upper bound, in bytes, on documents accepted by the loader functions.
	define('MAX_FILE_SIZE', 600000);
}

// NOTE(review): not referenced in this part of the file; presumably a
// parser option flag - confirm against simple_html_dom::load().
define('HDOM_SMARTY_AS_TEXT', 1);
47*c165b184SJames Collins
/**
 * Read a document from a file or URL and parse it into a DOM.
 *
 * The first five parameters are forwarded to file_get_contents(); the
 * remaining ones configure the simple_html_dom instance.
 *
 * @param string        $url              File name or URL to read.
 * @param bool          $use_include_path Forwarded to file_get_contents().
 * @param resource|null $context          Stream context, forwarded to file_get_contents().
 * @param int           $offset           Read offset, forwarded to file_get_contents().
 * @param int           $maxLen           Maximum accepted document size in bytes;
 *                                        values <= 0 select MAX_FILE_SIZE.
 * @param bool          $lowercase        Passed to the DOM constructor and to load().
 * @param bool          $forceTagsClosed  Passed to the DOM constructor.
 * @param string        $target_charset   Passed to the DOM constructor.
 * @param bool          $stripRN          Passed to the DOM constructor and to load().
 * @param string        $defaultBRText    Passed to the DOM constructor.
 * @param string        $defaultSpanText  Passed to the DOM constructor.
 *
 * @return mixed The result of simple_html_dom::load(), or false when the
 *               document could not be read, is empty, or is larger than
 *               $maxLen bytes.
 */
function file_get_html(
	$url,
	$use_include_path = false,
	$context = null,
	$offset = 0,
	$maxLen = -1,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	// Request one byte more than the limit. Reading exactly $maxLen bytes
	// would silently truncate an oversized document and the size check below
	// could never fire. (Skip the extra byte if it would overflow an int.)
	$readLen = ($maxLen < PHP_INT_MAX) ? $maxLen + 1 : $maxLen;

	$contents = file_get_contents(
		$url,
		$use_include_path,
		$context,
		$offset,
		$readLen
	);

	// empty() also covers the `false` that file_get_contents() returns on
	// failure.
	if (empty($contents) || strlen($contents) > $maxLen) {
		$dom->clear();
		return false;
	}

	return $dom->load($contents, $lowercase, $stripRN);
}
93*c165b184SJames Collins
/**
 * Parse an HTML string into a DOM.
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase       Passed to the DOM constructor and to load().
 * @param bool   $forceTagsClosed Passed to the DOM constructor.
 * @param string $target_charset  Passed to the DOM constructor.
 * @param bool   $stripRN         Passed to the DOM constructor and to load().
 * @param string $defaultBRText   Passed to the DOM constructor.
 * @param string $defaultSpanText Passed to the DOM constructor.
 *
 * @return mixed The result of simple_html_dom::load(), or false when $str is
 *               empty or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
	$str,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	$document = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	// Only non-empty input within the size limit is handed to the parser.
	$is_parsable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;

	if (!$is_parsable) {
		$document->clear();
		return false;
	}

	return $document->load($str, $lowercase, $stripRN);
}
120*c165b184SJames Collins
/**
 * Print the tree rooted at $node by delegating to the node's dump() method.
 *
 * @param object $node      Node whose dump() method is called.
 * @param bool   $show_attr Whether attributes are printed next to each tag.
 * @param int    $deep      Indentation depth of the first printed line.
 *
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
	// Forward the caller's options. Passing $node itself here (as the code
	// previously did) made $show_attr and $deep dead parameters.
	$node->dump($show_attr, $deep);
}
125*c165b184SJames Collins
126*c165b184SJames Collinsclass simple_html_dom_node
127*c165b184SJames Collins{
	// One of the HDOM_TYPE_* constants.
	public $nodetype = HDOM_TYPE_TEXT;
	// Tag name; 'text' and 'root' are treated specially by the methods below.
	public $tag = 'text';
	// Attribute name => value. makeup() skips null/false values (removed
	// attributes) and prints `true` values as a bare attribute name.
	public $attr = array();
	// Child nodes used for child/sibling navigation; seek() treats text
	// nodes as not being part of this list.
	public $children = array();
	// Nodes concatenated by innertext(), outertext() and text().
	public $nodes = array();
	// Parent node, or null when there is none.
	public $parent = null;
	// Parser bookkeeping, keyed by the HDOM_INFO_* constants.
	public $_ = array();
	// NOTE(review): only printed by dump_node() in this part of the file;
	// presumably the tag's offset in the source document - confirm.
	public $tag_start = 0;
	// Owning document; set by the constructor, released by clear().
	private $dom = null;
137*c165b184SJames Collins
138*c165b184SJames Collins	function __construct($dom)
139*c165b184SJames Collins	{
140*c165b184SJames Collins		$this->dom = $dom;
141*c165b184SJames Collins		$dom->nodes[] = $this;
142*c165b184SJames Collins	}
143*c165b184SJames Collins
144*c165b184SJames Collins	function __destruct()
145*c165b184SJames Collins	{
146*c165b184SJames Collins		$this->clear();
147*c165b184SJames Collins	}
148*c165b184SJames Collins
149*c165b184SJames Collins	function __toString()
150*c165b184SJames Collins	{
151*c165b184SJames Collins		return $this->outertext();
152*c165b184SJames Collins	}
153*c165b184SJames Collins
154*c165b184SJames Collins	function clear()
155*c165b184SJames Collins	{
156*c165b184SJames Collins		$this->dom = null;
157*c165b184SJames Collins		$this->nodes = null;
158*c165b184SJames Collins		$this->parent = null;
159*c165b184SJames Collins		$this->children = null;
160*c165b184SJames Collins	}
161*c165b184SJames Collins
162*c165b184SJames Collins	function dump($show_attr = true, $depth = 0)
163*c165b184SJames Collins	{
164*c165b184SJames Collins		echo str_repeat("\t", $depth) . $this->tag;
165*c165b184SJames Collins
166*c165b184SJames Collins		if ($show_attr && count($this->attr) > 0) {
167*c165b184SJames Collins			echo '(';
168*c165b184SJames Collins			foreach ($this->attr as $k => $v) {
169*c165b184SJames Collins				echo "[$k]=>\"$v\", ";
170*c165b184SJames Collins			}
171*c165b184SJames Collins			echo ')';
172*c165b184SJames Collins		}
173*c165b184SJames Collins
174*c165b184SJames Collins		echo "\n";
175*c165b184SJames Collins
176*c165b184SJames Collins		if ($this->nodes) {
177*c165b184SJames Collins			foreach ($this->nodes as $node) {
178*c165b184SJames Collins				$node->dump($show_attr, $depth + 1);
179*c165b184SJames Collins			}
180*c165b184SJames Collins		}
181*c165b184SJames Collins	}
182*c165b184SJames Collins
183*c165b184SJames Collins	function dump_node($echo = true)
184*c165b184SJames Collins	{
185*c165b184SJames Collins		$string = $this->tag;
186*c165b184SJames Collins
187*c165b184SJames Collins		if (count($this->attr) > 0) {
188*c165b184SJames Collins			$string .= '(';
189*c165b184SJames Collins			foreach ($this->attr as $k => $v) {
190*c165b184SJames Collins				$string .= "[$k]=>\"$v\", ";
191*c165b184SJames Collins			}
192*c165b184SJames Collins			$string .= ')';
193*c165b184SJames Collins		}
194*c165b184SJames Collins
195*c165b184SJames Collins		if (count($this->_) > 0) {
196*c165b184SJames Collins			$string .= ' $_ (';
197*c165b184SJames Collins			foreach ($this->_ as $k => $v) {
198*c165b184SJames Collins				if (is_array($v)) {
199*c165b184SJames Collins					$string .= "[$k]=>(";
200*c165b184SJames Collins					foreach ($v as $k2 => $v2) {
201*c165b184SJames Collins						$string .= "[$k2]=>\"$v2\", ";
202*c165b184SJames Collins					}
203*c165b184SJames Collins					$string .= ')';
204*c165b184SJames Collins				} else {
205*c165b184SJames Collins					$string .= "[$k]=>\"$v\", ";
206*c165b184SJames Collins				}
207*c165b184SJames Collins			}
208*c165b184SJames Collins			$string .= ')';
209*c165b184SJames Collins		}
210*c165b184SJames Collins
211*c165b184SJames Collins		if (isset($this->text)) {
212*c165b184SJames Collins			$string .= " text: ({$this->text})";
213*c165b184SJames Collins		}
214*c165b184SJames Collins
215*c165b184SJames Collins		$string .= ' HDOM_INNER_INFO: ';
216*c165b184SJames Collins
217*c165b184SJames Collins		if (isset($node->_[HDOM_INFO_INNER])) {
218*c165b184SJames Collins			$string .= "'" . $node->_[HDOM_INFO_INNER] . "'";
219*c165b184SJames Collins		} else {
220*c165b184SJames Collins			$string .= ' NULL ';
221*c165b184SJames Collins		}
222*c165b184SJames Collins
223*c165b184SJames Collins		$string .= ' children: ' . count($this->children);
224*c165b184SJames Collins		$string .= ' nodes: ' . count($this->nodes);
225*c165b184SJames Collins		$string .= ' tag_start: ' . $this->tag_start;
226*c165b184SJames Collins		$string .= "\n";
227*c165b184SJames Collins
228*c165b184SJames Collins		if ($echo) {
229*c165b184SJames Collins			echo $string;
230*c165b184SJames Collins			return;
231*c165b184SJames Collins		} else {
232*c165b184SJames Collins			return $string;
233*c165b184SJames Collins		}
234*c165b184SJames Collins	}
235*c165b184SJames Collins
236*c165b184SJames Collins	function parent($parent = null)
237*c165b184SJames Collins	{
238*c165b184SJames Collins		// I am SURE that this doesn't work properly.
239*c165b184SJames Collins		// It fails to unset the current node from it's current parents nodes or
240*c165b184SJames Collins		// children list first.
241*c165b184SJames Collins		if ($parent !== null) {
242*c165b184SJames Collins			$this->parent = $parent;
243*c165b184SJames Collins			$this->parent->nodes[] = $this;
244*c165b184SJames Collins			$this->parent->children[] = $this;
245*c165b184SJames Collins		}
246*c165b184SJames Collins
247*c165b184SJames Collins		return $this->parent;
248*c165b184SJames Collins	}
249*c165b184SJames Collins
250*c165b184SJames Collins	function has_child()
251*c165b184SJames Collins	{
252*c165b184SJames Collins		return !empty($this->children);
253*c165b184SJames Collins	}
254*c165b184SJames Collins
255*c165b184SJames Collins	function children($idx = -1)
256*c165b184SJames Collins	{
257*c165b184SJames Collins		if ($idx === -1) {
258*c165b184SJames Collins			return $this->children;
259*c165b184SJames Collins		}
260*c165b184SJames Collins
261*c165b184SJames Collins		if (isset($this->children[$idx])) {
262*c165b184SJames Collins			return $this->children[$idx];
263*c165b184SJames Collins		}
264*c165b184SJames Collins
265*c165b184SJames Collins		return null;
266*c165b184SJames Collins	}
267*c165b184SJames Collins
268*c165b184SJames Collins	function first_child()
269*c165b184SJames Collins	{
270*c165b184SJames Collins		if (count($this->children) > 0) {
271*c165b184SJames Collins			return $this->children[0];
272*c165b184SJames Collins		}
273*c165b184SJames Collins		return null;
274*c165b184SJames Collins	}
275*c165b184SJames Collins
276*c165b184SJames Collins	function last_child()
277*c165b184SJames Collins	{
278*c165b184SJames Collins		if (count($this->children) > 0) {
279*c165b184SJames Collins			return end($this->children);
280*c165b184SJames Collins		}
281*c165b184SJames Collins		return null;
282*c165b184SJames Collins	}
283*c165b184SJames Collins
284*c165b184SJames Collins	function next_sibling()
285*c165b184SJames Collins	{
286*c165b184SJames Collins		if ($this->parent === null) {
287*c165b184SJames Collins			return null;
288*c165b184SJames Collins		}
289*c165b184SJames Collins
290*c165b184SJames Collins		$idx = array_search($this, $this->parent->children, true);
291*c165b184SJames Collins
292*c165b184SJames Collins		if ($idx !== false && isset($this->parent->children[$idx + 1])) {
293*c165b184SJames Collins			return $this->parent->children[$idx + 1];
294*c165b184SJames Collins		}
295*c165b184SJames Collins
296*c165b184SJames Collins		return null;
297*c165b184SJames Collins	}
298*c165b184SJames Collins
299*c165b184SJames Collins	function prev_sibling()
300*c165b184SJames Collins	{
301*c165b184SJames Collins		if ($this->parent === null) {
302*c165b184SJames Collins			return null;
303*c165b184SJames Collins		}
304*c165b184SJames Collins
305*c165b184SJames Collins		$idx = array_search($this, $this->parent->children, true);
306*c165b184SJames Collins
307*c165b184SJames Collins		if ($idx !== false && $idx > 0) {
308*c165b184SJames Collins			return $this->parent->children[$idx - 1];
309*c165b184SJames Collins		}
310*c165b184SJames Collins
311*c165b184SJames Collins		return null;
312*c165b184SJames Collins	}
313*c165b184SJames Collins
314*c165b184SJames Collins	function find_ancestor_tag($tag)
315*c165b184SJames Collins	{
316*c165b184SJames Collins		global $debug_object;
317*c165b184SJames Collins		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
318*c165b184SJames Collins
319*c165b184SJames Collins		if ($this->parent === null) {
320*c165b184SJames Collins			return null;
321*c165b184SJames Collins		}
322*c165b184SJames Collins
323*c165b184SJames Collins		$ancestor = $this->parent;
324*c165b184SJames Collins
325*c165b184SJames Collins		while (!is_null($ancestor)) {
326*c165b184SJames Collins			if (is_object($debug_object)) {
327*c165b184SJames Collins				$debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
328*c165b184SJames Collins			}
329*c165b184SJames Collins
330*c165b184SJames Collins			if ($ancestor->tag === $tag) {
331*c165b184SJames Collins				break;
332*c165b184SJames Collins			}
333*c165b184SJames Collins
334*c165b184SJames Collins			$ancestor = $ancestor->parent;
335*c165b184SJames Collins		}
336*c165b184SJames Collins
337*c165b184SJames Collins		return $ancestor;
338*c165b184SJames Collins	}
339*c165b184SJames Collins
340*c165b184SJames Collins	function innertext()
341*c165b184SJames Collins	{
342*c165b184SJames Collins		if (isset($this->_[HDOM_INFO_INNER])) {
343*c165b184SJames Collins			return $this->_[HDOM_INFO_INNER];
344*c165b184SJames Collins		}
345*c165b184SJames Collins
346*c165b184SJames Collins		if (isset($this->_[HDOM_INFO_TEXT])) {
347*c165b184SJames Collins			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
348*c165b184SJames Collins		}
349*c165b184SJames Collins
350*c165b184SJames Collins		$ret = '';
351*c165b184SJames Collins
352*c165b184SJames Collins		foreach ($this->nodes as $n) {
353*c165b184SJames Collins			$ret .= $n->outertext();
354*c165b184SJames Collins		}
355*c165b184SJames Collins
356*c165b184SJames Collins		return $ret;
357*c165b184SJames Collins	}
358*c165b184SJames Collins
359*c165b184SJames Collins	function outertext()
360*c165b184SJames Collins	{
361*c165b184SJames Collins		global $debug_object;
362*c165b184SJames Collins
363*c165b184SJames Collins		if (is_object($debug_object)) {
364*c165b184SJames Collins			$text = '';
365*c165b184SJames Collins
366*c165b184SJames Collins			if ($this->tag === 'text') {
367*c165b184SJames Collins				if (!empty($this->text)) {
368*c165b184SJames Collins					$text = ' with text: ' . $this->text;
369*c165b184SJames Collins				}
370*c165b184SJames Collins			}
371*c165b184SJames Collins
372*c165b184SJames Collins			$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
373*c165b184SJames Collins		}
374*c165b184SJames Collins
375*c165b184SJames Collins		if ($this->tag === 'root') {
376*c165b184SJames Collins			return $this->innertext();
377*c165b184SJames Collins		}
378*c165b184SJames Collins
379*c165b184SJames Collins		// todo: What is the use of this callback? Remove?
380*c165b184SJames Collins		if ($this->dom && $this->dom->callback !== null) {
381*c165b184SJames Collins			call_user_func_array($this->dom->callback, array($this));
382*c165b184SJames Collins		}
383*c165b184SJames Collins
384*c165b184SJames Collins		if (isset($this->_[HDOM_INFO_OUTER])) {
385*c165b184SJames Collins			return $this->_[HDOM_INFO_OUTER];
386*c165b184SJames Collins		}
387*c165b184SJames Collins
388*c165b184SJames Collins		if (isset($this->_[HDOM_INFO_TEXT])) {
389*c165b184SJames Collins			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
390*c165b184SJames Collins		}
391*c165b184SJames Collins
392*c165b184SJames Collins		$ret = '';
393*c165b184SJames Collins
394*c165b184SJames Collins		if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
395*c165b184SJames Collins			$ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
396*c165b184SJames Collins		}
397*c165b184SJames Collins
398*c165b184SJames Collins		if (isset($this->_[HDOM_INFO_INNER])) {
399*c165b184SJames Collins			// todo: <br> should either never have HDOM_INFO_INNER or always
400*c165b184SJames Collins			if ($this->tag !== 'br') {
401*c165b184SJames Collins				$ret .= $this->_[HDOM_INFO_INNER];
402*c165b184SJames Collins			}
403*c165b184SJames Collins		} elseif ($this->nodes) {
404*c165b184SJames Collins			foreach ($this->nodes as $n) {
405*c165b184SJames Collins				$ret .= $this->convert_text($n->outertext());
406*c165b184SJames Collins			}
407*c165b184SJames Collins		}
408*c165b184SJames Collins
409*c165b184SJames Collins		if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
410*c165b184SJames Collins			$ret .= '</' . $this->tag . '>';
411*c165b184SJames Collins		}
412*c165b184SJames Collins
413*c165b184SJames Collins		return $ret;
414*c165b184SJames Collins	}
415*c165b184SJames Collins
416*c165b184SJames Collins	function text()
417*c165b184SJames Collins	{
418*c165b184SJames Collins		if (isset($this->_[HDOM_INFO_INNER])) {
419*c165b184SJames Collins			return $this->_[HDOM_INFO_INNER];
420*c165b184SJames Collins		}
421*c165b184SJames Collins
422*c165b184SJames Collins		switch ($this->nodetype) {
423*c165b184SJames Collins			case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
424*c165b184SJames Collins			case HDOM_TYPE_COMMENT: return '';
425*c165b184SJames Collins			case HDOM_TYPE_UNKNOWN: return '';
426*c165b184SJames Collins		}
427*c165b184SJames Collins
428*c165b184SJames Collins		if (strcasecmp($this->tag, 'script') === 0) { return ''; }
429*c165b184SJames Collins		if (strcasecmp($this->tag, 'style') === 0) { return ''; }
430*c165b184SJames Collins
431*c165b184SJames Collins		$ret = '';
432*c165b184SJames Collins
433*c165b184SJames Collins		// In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
434*c165b184SJames Collins		// for some span tags, and some p tags) $this->nodes is set to NULL.
435*c165b184SJames Collins		// NOTE: This indicates that there is a problem where it's set to NULL
436*c165b184SJames Collins		// without a clear happening.
437*c165b184SJames Collins		// WHY is this happening?
438*c165b184SJames Collins		if (!is_null($this->nodes)) {
439*c165b184SJames Collins			foreach ($this->nodes as $n) {
440*c165b184SJames Collins				// Start paragraph after a blank line
441*c165b184SJames Collins				if ($n->tag === 'p') {
442*c165b184SJames Collins					$ret = trim($ret) . "\n\n";
443*c165b184SJames Collins				}
444*c165b184SJames Collins
445*c165b184SJames Collins				$ret .= $this->convert_text($n->text());
446*c165b184SJames Collins
447*c165b184SJames Collins				// If this node is a span... add a space at the end of it so
448*c165b184SJames Collins				// multiple spans don't run into each other.  This is plaintext
449*c165b184SJames Collins				// after all.
450*c165b184SJames Collins				if ($n->tag === 'span') {
451*c165b184SJames Collins					$ret .= $this->dom->default_span_text;
452*c165b184SJames Collins				}
453*c165b184SJames Collins			}
454*c165b184SJames Collins		}
455*c165b184SJames Collins		return $ret;
456*c165b184SJames Collins	}
457*c165b184SJames Collins
458*c165b184SJames Collins	function xmltext()
459*c165b184SJames Collins	{
460*c165b184SJames Collins		$ret = $this->innertext();
461*c165b184SJames Collins		$ret = str_ireplace('<![CDATA[', '', $ret);
462*c165b184SJames Collins		$ret = str_replace(']]>', '', $ret);
463*c165b184SJames Collins		return $ret;
464*c165b184SJames Collins	}
465*c165b184SJames Collins
466*c165b184SJames Collins	function makeup()
467*c165b184SJames Collins	{
468*c165b184SJames Collins		// text, comment, unknown
469*c165b184SJames Collins		if (isset($this->_[HDOM_INFO_TEXT])) {
470*c165b184SJames Collins			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
471*c165b184SJames Collins		}
472*c165b184SJames Collins
473*c165b184SJames Collins		$ret = '<' . $this->tag;
474*c165b184SJames Collins		$i = -1;
475*c165b184SJames Collins
476*c165b184SJames Collins		foreach ($this->attr as $key => $val) {
477*c165b184SJames Collins			++$i;
478*c165b184SJames Collins
479*c165b184SJames Collins			// skip removed attribute
480*c165b184SJames Collins			if ($val === null || $val === false) { continue; }
481*c165b184SJames Collins
482*c165b184SJames Collins			$ret .= $this->_[HDOM_INFO_SPACE][$i][0];
483*c165b184SJames Collins
484*c165b184SJames Collins			//no value attr: nowrap, checked selected...
485*c165b184SJames Collins			if ($val === true) {
486*c165b184SJames Collins				$ret .= $key;
487*c165b184SJames Collins			} else {
488*c165b184SJames Collins				switch ($this->_[HDOM_INFO_QUOTE][$i])
489*c165b184SJames Collins				{
490*c165b184SJames Collins					case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
491*c165b184SJames Collins					case HDOM_QUOTE_SINGLE: $quote = '\''; break;
492*c165b184SJames Collins					default: $quote = '';
493*c165b184SJames Collins				}
494*c165b184SJames Collins
495*c165b184SJames Collins				$ret .= $key
496*c165b184SJames Collins				. $this->_[HDOM_INFO_SPACE][$i][1]
497*c165b184SJames Collins				. '='
498*c165b184SJames Collins				. $this->_[HDOM_INFO_SPACE][$i][2]
499*c165b184SJames Collins				. $quote
500*c165b184SJames Collins				. $val
501*c165b184SJames Collins				. $quote;
502*c165b184SJames Collins			}
503*c165b184SJames Collins		}
504*c165b184SJames Collins
505*c165b184SJames Collins		$ret = $this->dom->restore_noise($ret);
506*c165b184SJames Collins		return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
507*c165b184SJames Collins	}
508*c165b184SJames Collins
509*c165b184SJames Collins	function find($selector, $idx = null, $lowercase = false)
510*c165b184SJames Collins	{
511*c165b184SJames Collins		$selectors = $this->parse_selector($selector);
512*c165b184SJames Collins		if (($count = count($selectors)) === 0) { return array(); }
513*c165b184SJames Collins		$found_keys = array();
514*c165b184SJames Collins
515*c165b184SJames Collins		// find each selector
516*c165b184SJames Collins		for ($c = 0; $c < $count; ++$c) {
517*c165b184SJames Collins			// The change on the below line was documented on the sourceforge
518*c165b184SJames Collins			// code tracker id 2788009
519*c165b184SJames Collins			// used to be: if (($levle=count($selectors[0]))===0) return array();
520*c165b184SJames Collins			if (($levle = count($selectors[$c])) === 0) { return array(); }
521*c165b184SJames Collins			if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }
522*c165b184SJames Collins
523*c165b184SJames Collins			$head = array($this->_[HDOM_INFO_BEGIN] => 1);
524*c165b184SJames Collins			$cmd = ' '; // Combinator
525*c165b184SJames Collins
526*c165b184SJames Collins			// handle descendant selectors, no recursive!
527*c165b184SJames Collins			for ($l = 0; $l < $levle; ++$l) {
528*c165b184SJames Collins				$ret = array();
529*c165b184SJames Collins
530*c165b184SJames Collins				foreach ($head as $k => $v) {
531*c165b184SJames Collins					$n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
532*c165b184SJames Collins					//PaperG - Pass this optional parameter on to the seek function.
533*c165b184SJames Collins					$n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
534*c165b184SJames Collins				}
535*c165b184SJames Collins
536*c165b184SJames Collins				$head = $ret;
537*c165b184SJames Collins				$cmd = $selectors[$c][$l][4]; // Next Combinator
538*c165b184SJames Collins			}
539*c165b184SJames Collins
540*c165b184SJames Collins			foreach ($head as $k => $v) {
541*c165b184SJames Collins				if (!isset($found_keys[$k])) {
542*c165b184SJames Collins					$found_keys[$k] = 1;
543*c165b184SJames Collins				}
544*c165b184SJames Collins			}
545*c165b184SJames Collins		}
546*c165b184SJames Collins
547*c165b184SJames Collins		// sort keys
548*c165b184SJames Collins		ksort($found_keys);
549*c165b184SJames Collins
550*c165b184SJames Collins		$found = array();
551*c165b184SJames Collins		foreach ($found_keys as $k => $v) {
552*c165b184SJames Collins			$found[] = $this->dom->nodes[$k];
553*c165b184SJames Collins		}
554*c165b184SJames Collins
555*c165b184SJames Collins		// return nth-element or array
556*c165b184SJames Collins		if (is_null($idx)) { return $found; }
557*c165b184SJames Collins		elseif ($idx < 0) { $idx = count($found) + $idx; }
558*c165b184SJames Collins		return (isset($found[$idx])) ? $found[$idx] : null;
559*c165b184SJames Collins	}
560*c165b184SJames Collins
561*c165b184SJames Collins	protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
562*c165b184SJames Collins	{
563*c165b184SJames Collins		global $debug_object;
564*c165b184SJames Collins		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
565*c165b184SJames Collins
566*c165b184SJames Collins		list($tag, $id, $class, $attributes, $cmb) = $selector;
567*c165b184SJames Collins		$nodes = array();
568*c165b184SJames Collins
569*c165b184SJames Collins		if ($parent_cmd === ' ') { // Descendant Combinator
570*c165b184SJames Collins			// Find parent closing tag if the current element doesn't have a closing
571*c165b184SJames Collins			// tag (i.e. void element)
572*c165b184SJames Collins			$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
573*c165b184SJames Collins			if ($end == 0) {
574*c165b184SJames Collins				$parent = $this->parent;
575*c165b184SJames Collins				while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) {
576*c165b184SJames Collins					$end -= 1;
577*c165b184SJames Collins					$parent = $parent->parent;
578*c165b184SJames Collins				}
579*c165b184SJames Collins				$end += $parent->_[HDOM_INFO_END];
580*c165b184SJames Collins			}
581*c165b184SJames Collins
582*c165b184SJames Collins			// Get list of target nodes
583*c165b184SJames Collins			$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
584*c165b184SJames Collins			$nodes_count = $end - $nodes_start;
585*c165b184SJames Collins			$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
586*c165b184SJames Collins		} elseif ($parent_cmd === '>') { // Child Combinator
587*c165b184SJames Collins			$nodes = $this->children;
588*c165b184SJames Collins		} elseif ($parent_cmd === '+'
589*c165b184SJames Collins			&& $this->parent
590*c165b184SJames Collins			&& in_array($this, $this->parent->children)) { // Next-Sibling Combinator
591*c165b184SJames Collins				$index = array_search($this, $this->parent->children, true) + 1;
592*c165b184SJames Collins				if ($index < count($this->parent->children))
593*c165b184SJames Collins					$nodes[] = $this->parent->children[$index];
594*c165b184SJames Collins		} elseif ($parent_cmd === '~'
595*c165b184SJames Collins			&& $this->parent
596*c165b184SJames Collins			&& in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator
597*c165b184SJames Collins				$index = array_search($this, $this->parent->children, true);
598*c165b184SJames Collins				$nodes = array_slice($this->parent->children, $index);
599*c165b184SJames Collins		}
600*c165b184SJames Collins
601*c165b184SJames Collins		// Go throgh each element starting at this element until the end tag
602*c165b184SJames Collins		// Note: If this element is a void tag, any previous void element is
603*c165b184SJames Collins		// skipped.
604*c165b184SJames Collins		foreach($nodes as $node) {
605*c165b184SJames Collins			$pass = true;
606*c165b184SJames Collins
607*c165b184SJames Collins			// Skip root nodes
608*c165b184SJames Collins			if(!$node->parent) {
609*c165b184SJames Collins				$pass = false;
610*c165b184SJames Collins			}
611*c165b184SJames Collins
612*c165b184SJames Collins			// Handle 'text' selector
613*c165b184SJames Collins			if($pass && $tag === 'text' && $node->tag === 'text') {
614*c165b184SJames Collins				$ret[array_search($node, $this->dom->nodes, true)] = 1;
615*c165b184SJames Collins				unset($node);
616*c165b184SJames Collins				continue;
617*c165b184SJames Collins			}
618*c165b184SJames Collins
619*c165b184SJames Collins			// Skip if node isn't a child node (i.e. text nodes)
620*c165b184SJames Collins			if($pass && !in_array($node, $node->parent->children, true)) {
621*c165b184SJames Collins				$pass = false;
622*c165b184SJames Collins			}
623*c165b184SJames Collins
624*c165b184SJames Collins			// Skip if tag doesn't match
625*c165b184SJames Collins			if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
626*c165b184SJames Collins				$pass = false;
627*c165b184SJames Collins			}
628*c165b184SJames Collins
629*c165b184SJames Collins			// Skip if ID doesn't exist
630*c165b184SJames Collins			if ($pass && $id !== '' && !isset($node->attr['id'])) {
631*c165b184SJames Collins				$pass = false;
632*c165b184SJames Collins			}
633*c165b184SJames Collins
634*c165b184SJames Collins			// Check if ID matches
635*c165b184SJames Collins			if ($pass && $id !== '' && isset($node->attr['id'])) {
636*c165b184SJames Collins				// Note: Only consider the first ID (as browsers do)
637*c165b184SJames Collins				$node_id = explode(' ', trim($node->attr['id']))[0];
638*c165b184SJames Collins
639*c165b184SJames Collins				if($id !== $node_id) { $pass = false; }
640*c165b184SJames Collins			}
641*c165b184SJames Collins
642*c165b184SJames Collins			// Check if all class(es) exist
643*c165b184SJames Collins			if ($pass && $class !== '' && is_array($class) && !empty($class)) {
644*c165b184SJames Collins				if (isset($node->attr['class'])) {
645*c165b184SJames Collins					$node_classes = explode(' ', $node->attr['class']);
646*c165b184SJames Collins
647*c165b184SJames Collins					if ($lowercase) {
648*c165b184SJames Collins						$node_classes = array_map('strtolower', $node_classes);
649*c165b184SJames Collins					}
650*c165b184SJames Collins
651*c165b184SJames Collins					foreach($class as $c) {
652*c165b184SJames Collins						if(!in_array($c, $node_classes)) {
653*c165b184SJames Collins							$pass = false;
654*c165b184SJames Collins							break;
655*c165b184SJames Collins						}
656*c165b184SJames Collins					}
657*c165b184SJames Collins				} else {
658*c165b184SJames Collins					$pass = false;
659*c165b184SJames Collins				}
660*c165b184SJames Collins			}
661*c165b184SJames Collins
662*c165b184SJames Collins			// Check attributes
663*c165b184SJames Collins			if ($pass
664*c165b184SJames Collins				&& $attributes !== ''
665*c165b184SJames Collins				&& is_array($attributes)
666*c165b184SJames Collins				&& !empty($attributes)) {
667*c165b184SJames Collins					foreach($attributes as $a) {
668*c165b184SJames Collins						list (
669*c165b184SJames Collins							$att_name,
670*c165b184SJames Collins							$att_expr,
671*c165b184SJames Collins							$att_val,
672*c165b184SJames Collins							$att_inv,
673*c165b184SJames Collins							$att_case_sensitivity
674*c165b184SJames Collins						) = $a;
675*c165b184SJames Collins
676*c165b184SJames Collins						// Handle indexing attributes (i.e. "[2]")
677*c165b184SJames Collins						/**
678*c165b184SJames Collins						 * Note: This is not supported by the CSS Standard but adds
679*c165b184SJames Collins						 * the ability to select items compatible to XPath (i.e.
680*c165b184SJames Collins						 * the 3rd element within it's parent).
681*c165b184SJames Collins						 *
682*c165b184SJames Collins						 * Note: This doesn't conflict with the CSS Standard which
683*c165b184SJames Collins						 * doesn't work on numeric attributes anyway.
684*c165b184SJames Collins						 */
685*c165b184SJames Collins						if (is_numeric($att_name)
686*c165b184SJames Collins							&& $att_expr === ''
687*c165b184SJames Collins							&& $att_val === '') {
688*c165b184SJames Collins								$count = 0;
689*c165b184SJames Collins
690*c165b184SJames Collins								// Find index of current element in parent
691*c165b184SJames Collins								foreach ($node->parent->children as $c) {
692*c165b184SJames Collins									if ($c->tag === $node->tag) ++$count;
693*c165b184SJames Collins									if ($c === $node) break;
694*c165b184SJames Collins								}
695*c165b184SJames Collins
696*c165b184SJames Collins								// If this is the correct node, continue with next
697*c165b184SJames Collins								// attribute
698*c165b184SJames Collins								if ($count === (int)$att_name) continue;
699*c165b184SJames Collins						}
700*c165b184SJames Collins
701*c165b184SJames Collins						// Check attribute availability
702*c165b184SJames Collins						if ($att_inv) { // Attribute should NOT be set
703*c165b184SJames Collins							if (isset($node->attr[$att_name])) {
704*c165b184SJames Collins								$pass = false;
705*c165b184SJames Collins								break;
706*c165b184SJames Collins							}
707*c165b184SJames Collins						} else { // Attribute should be set
708*c165b184SJames Collins							// todo: "plaintext" is not a valid CSS selector!
709*c165b184SJames Collins							if ($att_name !== 'plaintext'
710*c165b184SJames Collins								&& !isset($node->attr[$att_name])) {
711*c165b184SJames Collins									$pass = false;
712*c165b184SJames Collins									break;
713*c165b184SJames Collins							}
714*c165b184SJames Collins						}
715*c165b184SJames Collins
716*c165b184SJames Collins						// Continue with next attribute if expression isn't defined
717*c165b184SJames Collins						if ($att_expr === '') continue;
718*c165b184SJames Collins
719*c165b184SJames Collins						// If they have told us that this is a "plaintext"
720*c165b184SJames Collins						// search then we want the plaintext of the node - right?
721*c165b184SJames Collins						// todo "plaintext" is not a valid CSS selector!
722*c165b184SJames Collins						if ($att_name === 'plaintext') {
723*c165b184SJames Collins							$nodeKeyValue = $node->text();
724*c165b184SJames Collins						} else {
725*c165b184SJames Collins							$nodeKeyValue = $node->attr[$att_name];
726*c165b184SJames Collins						}
727*c165b184SJames Collins
728*c165b184SJames Collins						if (is_object($debug_object)) {
729*c165b184SJames Collins							$debug_object->debug_log(2,
730*c165b184SJames Collins								'testing node: '
731*c165b184SJames Collins								. $node->tag
732*c165b184SJames Collins								. ' for attribute: '
733*c165b184SJames Collins								. $att_name
734*c165b184SJames Collins								. $att_expr
735*c165b184SJames Collins								. $att_val
736*c165b184SJames Collins								. ' where nodes value is: '
737*c165b184SJames Collins								. $nodeKeyValue
738*c165b184SJames Collins							);
739*c165b184SJames Collins						}
740*c165b184SJames Collins
741*c165b184SJames Collins						// If lowercase is set, do a case insensitive test of
742*c165b184SJames Collins						// the value of the selector.
743*c165b184SJames Collins						if ($lowercase) {
744*c165b184SJames Collins							$check = $this->match(
745*c165b184SJames Collins								$att_expr,
746*c165b184SJames Collins								strtolower($att_val),
747*c165b184SJames Collins								strtolower($nodeKeyValue),
748*c165b184SJames Collins								$att_case_sensitivity
749*c165b184SJames Collins							);
750*c165b184SJames Collins						} else {
751*c165b184SJames Collins							$check = $this->match(
752*c165b184SJames Collins								$att_expr,
753*c165b184SJames Collins								$att_val,
754*c165b184SJames Collins								$nodeKeyValue,
755*c165b184SJames Collins								$att_case_sensitivity
756*c165b184SJames Collins							);
757*c165b184SJames Collins						}
758*c165b184SJames Collins
759*c165b184SJames Collins						if (is_object($debug_object)) {
760*c165b184SJames Collins							$debug_object->debug_log(2,
761*c165b184SJames Collins								'after match: '
762*c165b184SJames Collins								. ($check ? 'true' : 'false')
763*c165b184SJames Collins							);
764*c165b184SJames Collins						}
765*c165b184SJames Collins
766*c165b184SJames Collins						if (!$check) {
767*c165b184SJames Collins							$pass = false;
768*c165b184SJames Collins							break;
769*c165b184SJames Collins						}
770*c165b184SJames Collins					}
771*c165b184SJames Collins			}
772*c165b184SJames Collins
773*c165b184SJames Collins			// Found a match. Add to list and clear node
774*c165b184SJames Collins			if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
775*c165b184SJames Collins			unset($node);
776*c165b184SJames Collins		}
777*c165b184SJames Collins		// It's passed by reference so this is actually what this function returns.
778*c165b184SJames Collins		if (is_object($debug_object)) {
779*c165b184SJames Collins			$debug_object->debug_log(1, 'EXIT - ret: ', $ret);
780*c165b184SJames Collins		}
781*c165b184SJames Collins	}
782*c165b184SJames Collins
/**
 * Test a node's attribute value against one attribute-selector expression.
 *
 * @param string $exp Selector operator: '=', '!=', '^=', '$=', '*=', '|='
 * or '~='. Any other operator never matches.
 * @param string $pattern Value written in the selector.
 * @param string $value Value found on the node.
 * @param string $case_sensitivity 'i' lowercases both sides before
 * comparing; every other value leaves them untouched.
 * @return bool True if the value satisfies the expression.
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
	global $debug_object;
	if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

	if ($case_sensitivity === 'i') {
		$pattern = strtolower($pattern);
		$value = strtolower($value);
	}

	switch ($exp) {
		case '=':
			return ($value === $pattern);
		case '!=':
			return ($value !== $pattern);
		case '^=':
			// preg_match() yields 1/0/false; normalise to a boolean
			return preg_match('/^' . preg_quote($pattern, '/') . '/', $value) === 1;
		case '$=':
			return preg_match('/' . preg_quote($pattern, '/') . '$/', $value) === 1;
		case '*=':
			return preg_match('/' . preg_quote($pattern, '/') . '/', $value) === 1;
		case '|=':
			/**
			 * [att|=val]
			 *
			 * Represents an element with the att attribute, its value
			 * either being exactly "val" or beginning with "val"
			 * immediately followed by "-" (U+002D).
			 *
			 * A plain prefix test is not enough here: "en" must match
			 * "en" and "en-US" but not "english".
			 */
			return $value === $pattern
				|| strpos($value, $pattern . '-') === 0;
		case '~=':
			/**
			 * [att~=val]
			 *
			 * Represents an element with the att attribute whose value is a
			 * whitespace-separated list of words, one of which is exactly
			 * "val". If "val" contains whitespace, it will never represent
			 * anything (since the words are separated by spaces). Also if
			 * "val" is the empty string, it will never represent anything.
			 *
			 * The explicit empty check is required because splitting an
			 * empty value yields a single empty word.
			 */
			return $pattern !== ''
				&& in_array($pattern, explode(' ', trim($value)), true);
	}
	return false;
}
827*c165b184SJames Collins
/**
 * Break a selector string down into selector lists and their parts.
 *
 * @param string $selector_string One or more comma separated selectors.
 * @return array One entry per comma separated selector. Each entry is a list
 * of parts, each part being array(tag, id, classes, attributes, separator):
 * - classes is '' or a list of class names
 * - attributes is '' or a list of
 *   array(name, expression, value, inverted, case-sensitivity)
 * - separator is ' ' for whitespace, the trimmed combinator otherwise, and
 *   '' for the part that was followed by a comma
 */
protected function parse_selector($selector_string)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	/*
	 * Selector pattern, modified from mootools (https://mootools.net/).
	 *
	 * Paperg: the colon is accepted inside attribute names so that
	 * <tag attr:ibute="something" > is found. Such attributes have to be
	 * read with getAttribute since $dom->x:y is not valid PHP.
	 *
	 * An attribute specifier may start with an "@" that is not captured.
	 * Further study is required to decide whether this should be
	 * documented or removed.
	 *
	 * Capture groups:
	 *
	 * [0] full match
	 * [1] tag name: zero or more word characters, colons, asterisks and
	 *     hyphens
	 * [2] id name (optional): "#" followed by word characters and hyphens
	 * [3] class names (optional): "." followed by the class name, where
	 *     multiple classes can be chained (i.e. ".foo.bar.baz"); the inner
	 *     dots are part of the capture
	 * [4] attribute list (optional): one or more "[...]" specifiers
	 * [5] separator: one or more of "/", ",", " ", ">", "+", "~"
	 */
	// phpcs:ignore Generic.Files.LineLength
	$selector_regex = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

	/*
	 * Attribute pattern (based on the attribute group of the selector
	 * pattern). Capture groups:
	 *
	 * [0] full match
	 * [1] attribute name, a leading "!" negates the attribute
	 * [2] attribute expression
	 * [3] attribute value
	 * [4] case sensitivity
	 */
	$attribute_regex = "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is";

	// The appended ' ' acts as pseudo separator for the last part
	$subject = trim($selector_string) . ' ';
	preg_match_all($selector_regex, $subject, $matches, PREG_SET_ORDER);

	if (is_object($debug_object)) {
		$debug_object->debug_log(2, 'Matches Array: ', $matches);
	}

	$selectors = array();
	$parts = array();

	foreach ($matches as $m) {
		// NoOps carry no selector information
		$whole = trim($m[0]);
		if ($whole === '' || $whole === '/' || $whole === '//') {
			continue;
		}

		// Tag name, lowercased on request of the DOM
		$tag = $m[1];
		if ($this->dom->lowercase) {
			$tag = strtolower($tag);
		}

		// Class chain "foo.bar" becomes array('foo', 'bar')
		$classes = ($m[3] === '') ? $m[3] : explode('.', $m[3]);

		// Attribute list becomes a list of attribute descriptions
		$attribute_list = $m[4];
		if ($attribute_list !== '') {
			preg_match_all(
				$attribute_regex,
				trim($attribute_list),
				$attributes,
				PREG_SET_ORDER
			);

			$attribute_list = array();

			foreach ($attributes as $att) {
				// Skip empty matches
				if (trim($att[0]) === '') {
					continue;
				}

				$att_name = $att[1];
				$inverted = (strncmp($att_name, '!', 1) === 0);
				if ($inverted) {
					$att_name = substr($att_name, 1);
				}

				// Trailing groups are absent when they did not take part
				$att_expr = isset($att[2]) ? $att[2] : '';
				$att_val = isset($att[3]) ? $att[3] : '';
				$att_case = isset($att[4]) ? strtolower($att[4]) : '';

				$attribute_list[] = array(
					$att_name,
					$att_expr,
					$att_val,
					$inverted,
					$att_case,
				);
			}
		}

		// Whitespace only means descendant, anything else is trimmed
		$separator = trim($m[5]);
		if ($separator === '' && $m[5] !== '') {
			$separator = ' ';
		}

		// A comma ends the current selector and carries no combinator
		$ends_selector = ($separator === ',');
		if ($ends_selector) {
			$separator = '';
		}

		$parts[] = array($tag, $m[2], $classes, $attribute_list, $separator);

		if ($ends_selector) {
			$selectors[] = $parts;
			$parts = array();
		}
	}

	// Parts collected after the last comma form the final selector
	if (count($parts) > 0) {
		$selectors[] = $parts;
	}

	return $selectors;
}
966*c165b184SJames Collins
/**
 * Read an attribute or one of the virtual text properties.
 *
 * @param string $name Attribute name, or 'outertext', 'innertext',
 * 'plaintext' or 'xmltext'.
 * @return mixed The charset converted attribute value when the attribute is
 * set; otherwise the requested text; otherwise a boolean telling whether
 * the name exists as key in the attribute list.
 */
function __get($name)
{
	// A set attribute takes precedence over the virtual properties
	if (isset($this->attr[$name])) {
		return $this->convert_text($this->attr[$name]);
	}

	switch ($name) {
		case 'outertext':
			$getter = 'outertext';
			break;
		case 'innertext':
			$getter = 'innertext';
			break;
		case 'plaintext':
			$getter = 'text';
			break;
		case 'xmltext':
			$getter = 'xmltext';
			break;
		default:
			return array_key_exists($name, $this->attr);
	}

	return $this->$getter();
}
980*c165b184SJames Collins
/**
 * Write an attribute or replace the outer/inner text of the node.
 *
 * @param string $name Attribute name, 'outertext' or 'innertext'.
 * @param mixed $value New value.
 * @return mixed The assigned value for 'outertext' and 'innertext',
 * nothing for attributes.
 */
function __set($name, $value)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	switch ($name) {
		case 'outertext':
			$this->_[HDOM_INFO_OUTER] = $value;
			return $value;
		case 'innertext':
			// Replace the text slot if there is one, the inner slot otherwise
			$slot = isset($this->_[HDOM_INFO_TEXT])
				? HDOM_INFO_TEXT
				: HDOM_INFO_INNER;
			$this->_[$slot] = $value;
			return $value;
	}

	// Attributes that are not set yet get default spacing and quoting
	$is_new = !isset($this->attr[$name]);

	if ($is_new) {
		$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
		$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
	}

	$this->attr[$name] = $value;
}
1002*c165b184SJames Collins
/**
 * Tell whether a virtual text property or an attribute exists.
 *
 * @param string $name Attribute name, 'outertext', 'innertext' or
 * 'plaintext'.
 * @return bool Always true for the three text properties, otherwise whether
 * the name is present in the attribute list.
 */
function __isset($name)
{
	switch ($name) {
		case 'outertext':
		case 'innertext':
		case 'plaintext':
			return true;
	}

	// The key test also covers attributes without value (nowrap, checked,
	// selected...)
	return array_key_exists($name, $this->attr) || isset($this->attr[$name]);
}
1013*c165b184SJames Collins
/**
 * Drop an attribute from the attribute list.
 *
 * Only attributes that pass isset() are dropped.
 *
 * @param string $name Attribute name.
 * @return void
 */
function __unset($name)
{
	if (!isset($this->attr[$name])) {
		return;
	}

	unset($this->attr[$name]);
}
1018*c165b184SJames Collins
/**
 * Convert text from the charset of the DOM to its target charset.
 *
 * The text is left as it is when the node has no DOM, when either charset
 * is empty, when both charsets are equal, when the target is UTF-8 and the
 * text already looks like UTF-8, or when the conversion fails. For UTF-8
 * targets a byte order mark is removed from both ends of the result.
 *
 * @param string $text Text to convert.
 * @return string Converted text.
 */
function convert_text($text)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	$converted_text = $text;

	$sourceCharset = '';
	$targetCharset = '';

	if ($this->dom) {
		$sourceCharset = strtoupper($this->dom->_charset);
		$targetCharset = strtoupper($this->dom->_target_charset);
	}

	if (is_object($debug_object)) {
		$debug_object->debug_log(3,
			'source charset: '
			. $sourceCharset
			. ' target charaset: '
			. $targetCharset
		);
	}

	if (!empty($sourceCharset)
		&& !empty($targetCharset)
		&& (strcasecmp($sourceCharset, $targetCharset) != 0)) {
		// Check if the reported encoding could have been incorrect and the
		// text is actually already UTF-8
		$already_utf8 = (strcasecmp($targetCharset, 'UTF-8') == 0)
			&& ($this->is_utf8($text));

		if (!$already_utf8) {
			$iconv_result = iconv($sourceCharset, $targetCharset, $text);

			// iconv() returns false on failure. Keep the unconverted text
			// in that case instead of handing false to the caller.
			if ($iconv_result !== false) {
				$converted_text = $iconv_result;
			}
		}
	}

	// Lets make sure that we don't have that silly BOM issue with any of
	// the utf-8 text we output.
	if ($targetCharset === 'UTF-8') {
		if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
			$converted_text = substr($converted_text, 3);
		}

		if (substr($converted_text, -3) === "\xef\xbb\xbf") {
			$converted_text = substr($converted_text, 0, -3);
		}
	}

	return $converted_text;
}
1068*c165b184SJames Collins
/**
 * Check whether a byte string is structurally valid UTF-8.
 *
 * Only the lead byte / continuation byte structure is verified. Legacy five
 * and six byte sequences are accepted, overlong encodings are not detected.
 *
 * @param string $str Bytes to check.
 * @return bool False if a lead byte is invalid, a sequence is truncated or
 * a continuation byte is out of range, true otherwise (including for the
 * empty string).
 */
static function is_utf8($str)
{
	// Bytes are read by offset, which only works on strings
	$str = (string)$str;
	$len = strlen($str);

	for($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);

		// 0x00-0x7F: single byte character
		if($c < 128) { continue; }

		// Number of bytes announced by the lead byte
		if($c >= 254) { return false; }
		elseif($c >= 252) { $bits = 6; }
		elseif($c >= 248) { $bits = 5; }
		elseif($c >= 240) { $bits = 4; }
		elseif($c >= 224) { $bits = 3; }
		elseif($c >= 192) { $bits = 2; }
		// 0x80-0xBF: continuation byte without lead byte. This includes
		// 0x80 itself, which must not be mistaken for a single byte
		// character.
		else { return false; }

		// Sequence runs past the end of the string
		if(($i + $bits) > $len) { return false; }

		// All remaining bytes of the sequence must be 0x80-0xBF
		while($bits > 1) {
			$i++;
			$b = ord($str[$i]);
			if($b < 128 || $b > 191) { return false; }
			$bits--;
		}
	}

	return true;
}
1095*c165b184SJames Collins
/**
 * Determine the display size of an image node.
 *
 * The width/height attributes of the tag are used when they are set.
 * Dimensions that are still unknown afterwards are taken from the inline
 * style, provided the declaration is an integer pixel value (i.e.
 * "width: 120px").
 *
 * @return array|false False if the node is not an "img" tag, otherwise
 * array('height' => ..., 'width' => ...). Values are returned as found in
 * the document; -1 marks a dimension that could not be determined.
 */
function get_display_size()
{
	if ($this->tag !== 'img') {
		return false;
	}

	$size = array(
		'height' => -1,
		'width' => -1
	);

	// See if there is a height or width attribute in the tag itself.
	foreach (array('height', 'width') as $dimension) {
		if (isset($this->attr[$dimension])) {
			$size[$dimension] = $this->attr[$dimension];
		}
	}

	// Now look for an inline style.
	if (isset($this->attr['style'])) {
		// Thanks to user gnarf from stackoverflow for this regular expression.
		$declarations = array();

		preg_match_all(
			'/([\w-]+)\s*:\s*([^;]+)\s*;?/',
			$this->attr['style'],
			$matches,
			PREG_SET_ORDER
		);

		foreach ($matches as $match) {
			// Property names are case-insensitive. The value group is
			// greedy and may end in whitespace ("10px ;"), which would
			// hide the unit from the check below.
			$declarations[strtolower($match[1])] = trim($match[2]);
		}

		foreach (array('height', 'width') as $dimension) {
			// The tag attribute wins over the style declaration
			if ($size[$dimension] != -1 || !isset($declarations[$dimension])) {
				continue;
			}

			// Check that the last two characters are px (pixels)
			$declared = $declarations[$dimension];

			if (strtolower(substr($declared, -2)) !== 'px') {
				continue;
			}

			// Now make sure that it's an integer and not something stupid.
			// filter_var() returns the integer on success, so "0" has to
			// be told apart from failure with a strict comparison.
			$proposed = substr($declared, 0, -2);

			if (filter_var($proposed, FILTER_VALIDATE_INT) !== false) {
				$size[$dimension] = $proposed;
			}
		}
	}

	// Future enhancement:
	// Look in the tag to see if there is a class or id specified that has
	// a height or width attribute to it.

	// Far future enhancement
	// Look at all the parent tags of this image to see if they specify a
	// class or id that has an img selector that specifies a height or width
	// Note that in this case, the class or id will have the img subselector
	// for it to apply to the image.

	// ridiculously far future development
	// If the class or id is specified in a SEPARATE css file thats not on
	// the page, go get it and do what we were just doing for the ones on
	// the page.

	return $size;
}
1180*c165b184SJames Collins
/**
 * Render the node and optionally write the result to a file.
 *
 * @param string $filepath Target file; nothing is written for ''.
 * @return string The outer text of the node.
 */
function save($filepath = '')
{
	$markup = $this->outertext();

	if ($filepath === '') {
		return $markup;
	}

	// Exclusive lock while writing
	file_put_contents($filepath, $markup, LOCK_EX);

	return $markup;
}
1191*c165b184SJames Collins
/**
 * Add one or more classes to the class attribute of the node.
 *
 * Classes the node already has and empty names are skipped.
 *
 * @param string|array $class Space separated class names or a list of
 * class names. Other types are reported to the debug object and ignored.
 * @return void
 */
function addClass($class)
{
	// Without this declaration $debug_object is an undefined local
	global $debug_object;

	if (is_string($class)) {
		$class = explode(' ', $class);
	}

	if (!is_array($class)) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
		}

		return;
	}

	foreach($class as $c) {
		// Consecutive spaces in the input produce empty names
		if ($c === '') {
			continue;
		}

		// Only a non-empty string can be extended. An attribute without
		// string value (i.e. null after removeAttribute()) would be read
		// back as boolean and end up as class "1".
		$has_classes = isset($this->attr['class'])
			&& is_string($this->attr['class'])
			&& trim($this->attr['class']) !== '';

		if (!$has_classes) {
			$this->class = $c;
		} elseif (!$this->hasClass($c)) {
			$this->class .= ' ' . $c;
		}
	}
}
1216*c165b184SJames Collins
/**
 * Check whether the class attribute of the node contains a class.
 *
 * @param string $class Class name. Other types are reported to the debug
 * object.
 * @return bool True if the class attribute is a string that contains the
 * class name as one of its space separated words.
 */
function hasClass($class)
{
	// Without this declaration $debug_object is an undefined local
	global $debug_object;

	if (!is_string($class)) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
		}

		return false;
	}

	// An attribute without string value (i.e. null after
	// removeAttribute()) would be read back as boolean and report the
	// class "1".
	if (!isset($this->attr['class']) || !is_string($this->attr['class'])) {
		return false;
	}

	return in_array($class, explode(' ', $this->class), true);
}
1231*c165b184SJames Collins
/**
 * Remove classes from the class attribute of the node.
 *
 * @param string|array|null $class Space separated class names or a list of
 * class names. Null removes the class attribute as a whole. Other types
 * are ignored.
 * @return void
 */
function removeClass($class = null)
{
	if (!isset($this->class)) {
		return;
	}

	if (is_null($class)) {
		$this->removeAttribute('class');
		return;
	}

	// Nothing to take classes from. An attribute without string value
	// (i.e. null after removeAttribute()) would be read back as boolean
	// and be written back as class "1".
	if (!is_string($this->attr['class'])) {
		return;
	}

	if (is_string($class)) {
		$class = explode(' ', $class);
	}

	if (is_array($class)) {
		$class = array_diff(explode(' ', $this->class), $class);
		if (empty($class)) {
			$this->removeAttribute('class');
		} else {
			$this->class = implode(' ', $class);
		}
	}
}
1256*c165b184SJames Collins
/**
 * Get the attribute list of the node.
 *
 * @return array The attribute list, indexed by attribute name.
 */
function getAllAttributes()
{
	$attributes = $this->attr;

	return $attributes;
}
1261*c165b184SJames Collins
/**
 * Read an attribute by name; method form of the magic getter.
 *
 * @param string $name Attribute name.
 * @return mixed Whatever __get() returns for the name.
 */
function getAttribute($name)
{
	$value = $this->__get($name);

	return $value;
}
1266*c165b184SJames Collins
/**
 * DOM-style alias that writes a value through the magic setter.
 *
 * @param string $name Name passed to __set().
 * @param mixed $value Value passed to __set().
 * @return void
 */
function setAttribute($name, $value)
{
	// Call the magic method explicitly so that it is used for every name.
	$this->__set($name, $value);
}
1271*c165b184SJames Collins
/**
 * DOM-style alias that tests a name through the magic isset handler.
 *
 * @param string $name Name passed to __isset().
 * @return mixed Whatever __isset() returns for $name.
 */
function hasAttribute($name)
{
	$present = $this->__isset($name);

	return $present;
}
1276*c165b184SJames Collins
/**
 * Remove an attribute by assigning null to it through the magic setter.
 *
 * @param string $name Name of the attribute to remove.
 * @return void
 */
function removeAttribute($name)
{
	$cleared = null;

	$this->__set($name, $cleared);
}
1281*c165b184SJames Collins
/**
 * Ask the parent node to remove this node.
 *
 * Does nothing when the node has no parent.
 *
 * @return void
 */
function remove()
{
	if (!$this->parent) {
		return;
	}

	$this->parent->removeChild($this);
}
1288*c165b184SJames Collins
/**
 * Remove a child node, together with everything below it, from this node
 * and from the node list of the owning DOM.
 *
 * The call is a no-op unless $node is found (by identity) in this node's
 * `nodes` list, in this node's `children` list and in `$this->dom->nodes`.
 *
 * @param object $node Node to remove.
 * @return void
 */
function removeChild($node)
{
	$nodeKey = array_search($node, $this->nodes, true);
	$childKey = array_search($node, $this->children, true);
	$domKey = array_search($node, $this->dom->nodes, true);

	if ($nodeKey === false || $childKey === false || $domKey === false) {
		return;
	}

	// Recurse into the child elements first.
	foreach ($node->children as $child) {
		$node->removeChild($child);
	}

	// Then unregister every entry of the node's own list that is still
	// known to both the node and the DOM.
	foreach ($node->nodes as $entity) {
		$ownKey = array_search($entity, $node->nodes, true);
		$sharedKey = array_search($entity, $node->dom->nodes, true);

		if ($ownKey === false || $sharedKey === false) {
			continue;
		}

		unset($node->nodes[$ownKey], $node->dom->nodes[$sharedKey]);
	}

	// Finally unregister the node itself and release its contents.
	unset(
		$this->nodes[$nodeKey],
		$this->children[$childKey],
		$this->dom->nodes[$domKey]
	);

	$node->clear();
}
1319*c165b184SJames Collins
/**
 * DOM-style alias: look up an id by delegating to find("#<id>", 0).
 *
 * @param string $id Element id, without the leading "#".
 * @return mixed Whatever find() returns for index 0.
 */
function getElementById($id)
{
	$selector = '#' . $id;

	return $this->find($selector, 0);
}
1324*c165b184SJames Collins
/**
 * DOM-style alias: look up an id by delegating to find("#<id>", $idx).
 *
 * @param string $id Element id, without the leading "#".
 * @param int|null $idx Index forwarded unchanged to find().
 * @return mixed Whatever find() returns.
 */
function getElementsById($id, $idx = null)
{
	$selector = '#' . $id;

	return $this->find($selector, $idx);
}
1329*c165b184SJames Collins
/**
 * DOM-style alias: delegates to find($name, 0).
 *
 * @param string $name Tag name, used verbatim as the selector.
 * @return mixed Whatever find() returns for index 0.
 */
function getElementByTagName($name)
{
	$match = $this->find($name, 0);

	return $match;
}
1334*c165b184SJames Collins
/**
 * DOM-style alias: delegates to find($name, $idx).
 *
 * @param string $name Tag name, used verbatim as the selector.
 * @param int|null $idx Index forwarded unchanged to find().
 * @return mixed Whatever find() returns.
 */
function getElementsByTagName($name, $idx = null)
{
	$matches = $this->find($name, $idx);

	return $matches;
}
1339*c165b184SJames Collins
/**
 * DOM-style alias of parent() called without arguments.
 *
 * @return mixed Whatever parent() returns.
 */
function parentNode()
{
	$parent = $this->parent();

	return $parent;
}
1344*c165b184SJames Collins
/**
 * DOM-style alias of children().
 *
 * @param int $idx Index forwarded unchanged to children(); defaults to -1.
 * @return mixed Whatever children() returns.
 */
function childNodes($idx = -1)
{
	$result = $this->children($idx);

	return $result;
}
1349*c165b184SJames Collins
/**
 * DOM-style alias of first_child().
 *
 * @return mixed Whatever first_child() returns.
 */
function firstChild()
{
	$first = $this->first_child();

	return $first;
}
1354*c165b184SJames Collins
/**
 * DOM-style alias of last_child().
 *
 * @return mixed Whatever last_child() returns.
 */
function lastChild()
{
	$last = $this->last_child();

	return $last;
}
1359*c165b184SJames Collins
/**
 * DOM-style alias of next_sibling().
 *
 * @return mixed Whatever next_sibling() returns.
 */
function nextSibling()
{
	$next = $this->next_sibling();

	return $next;
}
1364*c165b184SJames Collins
/**
 * DOM-style alias of prev_sibling().
 *
 * @return mixed Whatever prev_sibling() returns.
 */
function previousSibling()
{
	$previous = $this->prev_sibling();

	return $previous;
}
1369*c165b184SJames Collins
/**
 * DOM-style alias of has_child().
 *
 * @return mixed Whatever has_child() returns.
 */
function hasChildNodes()
{
	$hasChildren = $this->has_child();

	return $hasChildren;
}
1374*c165b184SJames Collins
/**
 * DOM-style accessor for the node's tag.
 *
 * @return mixed Current value of the `tag` property.
 */
function nodeName()
{
	$name = $this->tag;

	return $name;
}
1379*c165b184SJames Collins
/**
 * Attach $node below this node by calling $node->parent($this).
 *
 * @param object $node Node to attach; must provide a parent() method.
 * @return object The node that was passed in, to allow chaining.
 */
function appendChild($node)
{
	// The actual re-linking is done by the child's parent() method.
	$node->parent($this);

	return $node;
}
1385*c165b184SJames Collins
1386*c165b184SJames Collins}
1387*c165b184SJames Collins
1388*c165b184SJames Collinsclass simple_html_dom
1389*c165b184SJames Collins{
// Root node of the parsed document; created by prepare() with tag 'root'.
public $root = null;
// Node list of the document; reset by prepare() and walked by clear().
public $nodes = array();
// Callback installed by set_callback() and removed by remove_callback().
public $callback = null;
// Copy of the $lowercase argument given to load()/prepare().
public $lowercase = false;
// Byte length of the trimmed document as passed to prepare().
public $original_size;
// Byte length of the document currently held in $doc.
public $size;

// Current byte offset into $doc while parsing.
protected $pos;
// The document text that is being parsed.
protected $doc;
// Character at $pos; read_tag() sets it to null past the end of $doc.
protected $char;

// Counter that starts at 1 in prepare() and is incremented for every node
// created by parse()/read_tag().
protected $cursor;
// Node that currently acts as parent while parsing; starts as $root.
protected $parent;
// Reset to an empty array by prepare().
// NOTE(review): presumably filled by remove_noise() - confirm.
protected $noise = array();
// Characters skipped as whitespace inside end tags (see read_tag()).
protected $token_blank = " \t\r\n";
protected $token_equal = ' =/>';
// Characters that terminate a tag name (see read_tag()).
protected $token_slash = " />\r\n\t";
protected $token_attr = ' >';

// Charset determined by parse_charset().
public $_charset = '';
// Target charset as passed to the constructor.
public $_target_charset = '';

// Copy of the $defaultBRText argument given to load()/prepare().
protected $default_br_text = '';

// Copy of the $defaultSpanText argument given to load()/prepare().
public $default_span_text = '';

// Lookup table (tag name => 1).
// NOTE(review): presumably the tags that never have an end tag - confirm.
protected $self_closing_tags = array(
	'area' => 1,
	'base' => 1,
	'br' => 1,
	'col' => 1,
	'embed' => 1,
	'hr' => 1,
	'img' => 1,
	'input' => 1,
	'link' => 1,
	'meta' => 1,
	'param' => 1,
	'source' => 1,
	'track' => 1,
	'wbr' => 1
);
// Lookup table (tag name => 1). read_tag() lets an end tag of one of these
// tags close an ancestor instead of only the current parent.
protected $block_tags = array(
	'body' => 1,
	'div' => 1,
	'form' => 1,
	'root' => 1,
	'span' => 1,
	'table' => 1
);
// Keyed by the lower-case tag name of the current parent; read_tag() checks
// for the presence of that key when an end tag does not match the parent.
protected $optional_closing_tags = array(
	// Not optional according to the specification, see
	// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
	'b' => array('b' => 1),
	'dd' => array('dd' => 1, 'dt' => 1),
	// Not optional according to the specification, see
	// https://www.w3.org/TR/html/grouping-content.html#the-dl-element
	'dl' => array('dd' => 1, 'dt' => 1),
	'dt' => array('dd' => 1, 'dt' => 1),
	'li' => array('li' => 1),
	'optgroup' => array('optgroup' => 1, 'option' => 1),
	'option' => array('optgroup' => 1, 'option' => 1),
	'p' => array('p' => 1),
	'rp' => array('rp' => 1, 'rt' => 1),
	'rt' => array('rp' => 1, 'rt' => 1),
	'td' => array('td' => 1, 'th' => 1),
	'th' => array('td' => 1, 'th' => 1),
	'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
);
1459*c165b184SJames Collins
/**
 * Create a DOM object and optionally load a document right away.
 *
 * @param string|null $str Document source. When it starts with "http://"
 * or names an existing file it is loaded through load_file(), which parses
 * with its own defaults; any other non-empty string is parsed as markup
 * through load(). Empty values create an empty object.
 * @param bool $lowercase Forwarded to load().
 * @param bool $forceTagsClosed When false, the table of optional closing
 * tags is emptied so that it no longer influences parsing.
 * @param string $target_charset Stored in $_target_charset.
 * @param bool $stripRN Forwarded to load().
 * @param string $defaultBRText Forwarded to load().
 * @param string $defaultSpanText Forwarded to load().
 * @param int $options Forwarded to load().
 */
function __construct(
	$str = null,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT,
	$options = 0)
{
	// Forcing tags to be closed implies that we don't trust the html, but
	// it can lead to parsing errors if we SHOULD trust the html.
	//
	// This must target the declared $optional_closing_tags property (a
	// differently named property would be created dynamically and never be
	// read by the parser) and it must happen before a document is parsed,
	// otherwise the setting cannot affect a document given in $str.
	if (!$forceTagsClosed) {
		$this->optional_closing_tags = array();
	}

	$this->_target_charset = $target_charset;

	if ($str) {
		if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
			$this->load_file($str);
		} else {
			$this->load(
				$str,
				$lowercase,
				$stripRN,
				$defaultBRText,
				$defaultSpanText,
				$options
			);
		}
	}
}
1492*c165b184SJames Collins
/**
 * Release everything held by this object when it is destroyed.
 *
 * @return void
 */
function __destruct()
{
	// All clean-up is implemented in clear().
	$this->clear();
}
1497*c165b184SJames Collins
/**
 * Parse a document given as a string.
 *
 * The document is prepared, stripped of "noise" (scripts, CDATA, comments,
 * styles, code blocks, server side scripts and optionally Smarty tags),
 * parsed, and finally inspected for its charset.
 *
 * @param string $str Document source.
 * @param bool $lowercase Forwarded to prepare().
 * @param bool $stripRN Replace every "\r" and "\n" with a space.
 * @param string $defaultBRText Forwarded to prepare().
 * @param string $defaultSpanText Forwarded to prepare().
 * @param int $options Bit mask; HDOM_SMARTY_AS_TEXT strips Smarty scripts.
 * @return simple_html_dom This object, to allow chaining.
 */
function load(
	$str,
	$lowercase = true,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT,
	$options = 0)
{
	global $debug_object;

	// Reset the parser state and install the new document.
	$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

	// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
	// script tags have to be removed before style tags.
	$scriptNoise = array(
		"'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is",
		"'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is",
	);

	foreach ($scriptNoise as $pattern) {
		$this->remove_noise($pattern);
	}

	// Replace line breaks with spaces if we are told to.
	if ($stripRN) {
		$this->doc = str_replace(array("\r", "\n"), ' ', $this->doc);

		// The content has changed, so its length has to be updated.
		$this->size = strlen($this->doc);
	}

	// CDATA sections
	$this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);

	$markupNoise = array(
		// comments
		"'<!--(.*?)-->'is",
		// <style> tags
		"'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is",
		"'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is",
		// preformatted tags
		"'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is",
	);

	foreach ($markupNoise as $pattern) {
		$this->remove_noise($pattern);
	}

	// server side scripts
	$this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

	// Smarty scripts, only on request
	if ($options & HDOM_SMARTY_AS_TEXT) {
		$this->remove_noise("'(\{\w)(.*?)(\})'s", true);
	}

	$this->parse();

	// The root node ends where the cursor stopped.
	$this->root->_[HDOM_INFO_END] = $this->cursor;
	$this->parse_charset();

	return $this;
}
1551*c165b184SJames Collins
/**
 * Read a document with file_get_contents() and parse it.
 *
 * All arguments are forwarded unchanged to file_get_contents(). The
 * contents are parsed with load($doc, true).
 *
 * @return false|null False when the contents cannot be read; nothing is
 * returned on success.
 */
function load_file()
{
	$contents = call_user_func_array('file_get_contents', func_get_args());

	if ($contents === false) {
		return false;
	}

	$this->load($contents, true);
}
1562*c165b184SJames Collins
/**
 * Install a callback by storing it in the public $callback property.
 *
 * @param mixed $function_name Value to store as the callback.
 * @return void
 */
function set_callback($function_name)
{
	// Stored as given; no validation happens here.
	$this->callback = $function_name;
}
1567*c165b184SJames Collins
/**
 * Remove a callback installed with set_callback().
 *
 * @return void
 */
function remove_callback()
{
	// null is also the initial value of the property.
	$this->callback = null;
}
1572*c165b184SJames Collins
/**
 * Serialize the document and optionally write it to a file.
 *
 * @param string $filepath Target file; the empty string (default) skips
 * writing. The file is written with an exclusive lock (LOCK_EX).
 * @return mixed The inner text of the root node.
 */
function save($filepath = '')
{
	$html = $this->root->innertext();

	if ($filepath === '') {
		return $html;
	}

	file_put_contents($filepath, $html, LOCK_EX);

	return $html;
}
1579*c165b184SJames Collins
/**
 * Run a selector against the whole document by delegating to the root node.
 *
 * @param string $selector Forwarded unchanged to the root node's find().
 * @param int|null $idx Forwarded unchanged.
 * @param bool $lowercase Forwarded unchanged.
 * @return mixed Whatever the root node's find() returns.
 */
function find($selector, $idx = null, $lowercase = false)
{
	$result = $this->root->find($selector, $idx, $lowercase);

	return $result;
}
1584*c165b184SJames Collins
/**
 * Release all nodes and buffers held by this object.
 *
 * Calls clear() on every entry of `nodes` and `children` and on `parent`
 * and `root` (whichever of them are set), then unsets `parent`, `root`,
 * `doc` and `noise`.
 *
 * @return void
 */
function clear()
{
	// The 'children' pass is documented in the sourceforge repository
	// (2977248) as a fix for ongoing memory leaks that occur even with the
	// use of clear.
	foreach (array('nodes', 'children') as $listName) {
		if (!isset($this->$listName)) {
			continue;
		}

		foreach ($this->$listName as $entry) {
			$entry->clear();
			$entry = null;
		}
	}

	foreach (array('parent', 'root') as $linkName) {
		if (!isset($this->$linkName)) {
			continue;
		}

		$this->$linkName->clear();
		unset($this->$linkName);
	}

	unset($this->doc, $this->noise);
}
1617*c165b184SJames Collins
/**
 * Dump the document by delegating to the root node's dump().
 *
 * @param bool $show_attr Forwarded unchanged to the root node.
 * @return void
 */
function dump($show_attr = true)
{
	$rootNode = $this->root;

	$rootNode->dump($show_attr);
}
1622*c165b184SJames Collins
/**
 * Reset the parser state and install a new document.
 *
 * @param string $str Document source; surrounding whitespace is trimmed.
 * @param bool $lowercase Stored in $lowercase.
 * @param string $defaultBRText Stored in $default_br_text.
 * @param string $defaultSpanText Stored in $default_span_text.
 * @return void
 */
protected function prepare(
	$str, $lowercase = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	// Drop whatever a previous document left behind.
	$this->clear();

	$this->doc = trim($str);
	$this->size = strlen($this->doc);
	$this->original_size = $this->size; // original size of the html
	$this->pos = 0;
	$this->cursor = 1;
	$this->noise = array();
	$this->nodes = array();
	$this->lowercase = $lowercase;
	$this->default_br_text = $defaultBRText;
	$this->default_span_text = $defaultSpanText;

	// Every document gets an artificial root node that owns all others.
	$this->root = new simple_html_dom_node($this);
	$this->root->tag = 'root';
	$this->root->_[HDOM_INFO_BEGIN] = -1;
	$this->root->nodetype = HDOM_TYPE_ROOT;
	$this->parent = $this->root;

	// Always reset the current character. For an empty document it has to
	// become null (the value read_tag() uses past the end of the document),
	// otherwise a reused object would keep the first character of the
	// previously loaded document.
	$this->char = ($this->size > 0) ? $this->doc[0] : null;
}
1647*c165b184SJames Collins
/**
 * Main parser loop: alternately collect text and read tags until
 * read_tag() reports that nothing is left.
 *
 * @return bool Always true; returned once read_tag() returns a falsy value.
 */
protected function parse()
{
	for (;;) {
		$text = $this->copy_until_char('<');

		// No text before the next opening bracket: read the tag itself.
		if ($text === '') {
			if (!$this->read_tag()) {
				return true;
			}

			continue;
		}

		// Text between tags becomes a node of its own.
		$node = new simple_html_dom_node($this);
		++$this->cursor;
		$node->_[HDOM_INFO_TEXT] = $text;
		$this->link_nodes($node, false);
	}
}
1668*c165b184SJames Collins
/**
 * Determine the charset of the loaded document and store it in $_charset.
 *
 * Sources are tried in this order until one yields a value:
 *  1. the content-type header reported by
 *     get_last_retrieve_url_contents_content_type(), if that function exists
 *  2. a <meta http-equiv="Content-Type"> element
 *  3. a <meta charset="..."> element
 *  4. mb_detect_encoding(), if the mbstring extension is available
 *  5. the fixed default 'UTF-8'
 *
 * 'iso-8859-1', 'latin1' and 'latin-1' are reported as 'CP1252'.
 *
 * @return string The detected charset.
 */
protected function parse_charset()
{
	global $debug_object;

	$charset = null;

	if (function_exists('get_last_retrieve_url_contents_content_type')) {
		$contentTypeHeader = get_last_retrieve_url_contents_content_type();
		// Matched case-insensitively, like the meta tag below.
		$success = preg_match('/charset=(.+)/i', $contentTypeHeader, $matches);
		if ($success) {
			$charset = $matches[1];
			if (is_object($debug_object)) {
				$debug_object->debug_log(2,
					'header content-type found charset of: '
					. $charset
				);
			}
		}
	}

	if (empty($charset)) {
		// https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
		$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

		if (!empty($el)) {
			$fullvalue = $el->content;
			if (is_object($debug_object)) {
				$debug_object->debug_log(2,
					'meta content-type tag found'
					. $fullvalue
				);
			}

			if (!empty($fullvalue)) {
				$success = preg_match(
					'/charset=(.+)/i',
					$fullvalue,
					$matches
				);

				if ($success) {
					$charset = $matches[1];
				} else {
					// If there is a meta tag, and they don't specify the
					// character set, research says that it's typically
					// ISO-8859-1
					if (is_object($debug_object)) {
						$debug_object->debug_log(2,
							'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
						);
					}

					$charset = 'ISO-8859-1';
				}
			}
		}
	}

	if (empty($charset)) {
		// https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
		if ($meta = $this->root->find('meta[charset]', 0)) {
			$charset = $meta->charset;
			if (is_object($debug_object)) {
				$debug_object->debug_log(2, 'meta charset: ' . $charset);
			}
		}
	}

	if (empty($charset)) {
		// Try to guess the charset based on the content
		// Requires Multibyte String (mbstring) support (optional)
		if (function_exists('mb_detect_encoding')) {
			/**
			 * mb_detect_encoding() is not intended to distinguish between
			 * charsets, especially single-byte charsets. Its primary
			 * purpose is to detect which multibyte encoding is in use,
			 * i.e. UTF-8, UTF-16, shift-JIS, etc.
			 *
			 * -- https://bugs.php.net/bug.php?id=38138
			 *
			 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
			 * always result in CP1251/ISO-8859-5 and vice versa.
			 *
			 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
			 * to stay compatible.
			 */
			$encoding = mb_detect_encoding(
				$this->doc,
				array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
			);

			if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
				// Due to a limitation of mb_detect_encoding
				// 'CP1251'/'ISO-8859-5' will be detected as
				// 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
				// which case we can simply assume it is the other charset.
				//
				// iconv is an optional extension, so the probe is skipped
				// when it is missing. Failure is reported as false; the
				// strict comparison avoids mistaking a falsy conversion
				// result for a failure.
				if (function_exists('iconv')
					&& @iconv('CP1252', 'UTF-8', $this->doc) === false) {
					$encoding = 'CP1251';
				}
			}

			if ($encoding !== false) {
				$charset = $encoding;
				if (is_object($debug_object)) {
					$debug_object->debug_log(2, 'mb_detect: ' . $charset);
				}
			}
		}
	}

	if (empty($charset)) {
		// Assume it's UTF-8 as it is the most likely charset to be used
		$charset = 'UTF-8';
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'No match found, assume ' . $charset);
		}
	}

	// Since CP1252 is a superset, if we get one of it's subsets, we want
	// it instead.
	$normalized = strtolower($charset);

	if (in_array($normalized, array('iso-8859-1', 'latin1', 'latin-1'), true)) {
		// Log before overwriting so that the message names the charset
		// that is actually being replaced.
		if (is_object($debug_object)) {
			$debug_object->debug_log(2,
				'replacing ' . $charset . ' with CP1252 as its a superset'
			);
		}

		$charset = 'CP1252';
	}

	if (is_object($debug_object)) {
		$debug_object->debug_log(1, 'EXIT - ' . $charset);
	}

	return $this->_charset = $charset;
}
1806*c165b184SJames Collins
1807*c165b184SJames Collins	protected function read_tag()
1808*c165b184SJames Collins	{
1809*c165b184SJames Collins		// Set end position if no further tags found
1810*c165b184SJames Collins		if ($this->char !== '<') {
1811*c165b184SJames Collins			$this->root->_[HDOM_INFO_END] = $this->cursor;
1812*c165b184SJames Collins			return false;
1813*c165b184SJames Collins		}
1814*c165b184SJames Collins
1815*c165b184SJames Collins		$begin_tag_pos = $this->pos;
1816*c165b184SJames Collins		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1817*c165b184SJames Collins
1818*c165b184SJames Collins		// end tag
1819*c165b184SJames Collins		if ($this->char === '/') {
1820*c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1821*c165b184SJames Collins
1822*c165b184SJames Collins			// Skip whitespace in end tags (i.e. in "</   html>")
1823*c165b184SJames Collins			$this->skip($this->token_blank);
1824*c165b184SJames Collins			$tag = $this->copy_until_char('>');
1825*c165b184SJames Collins
1826*c165b184SJames Collins			// Skip attributes in end tags
1827*c165b184SJames Collins			if (($pos = strpos($tag, ' ')) !== false) {
1828*c165b184SJames Collins				$tag = substr($tag, 0, $pos);
1829*c165b184SJames Collins			}
1830*c165b184SJames Collins
1831*c165b184SJames Collins			$parent_lower = strtolower($this->parent->tag);
1832*c165b184SJames Collins			$tag_lower = strtolower($tag);
1833*c165b184SJames Collins
1834*c165b184SJames Collins			// The end tag is supposed to close the parent tag. Handle situations
1835*c165b184SJames Collins			// when it doesn't
1836*c165b184SJames Collins			if ($parent_lower !== $tag_lower) {
1837*c165b184SJames Collins				// Parent tag does not have to be closed necessarily (optional closing tag)
1838*c165b184SJames Collins				// Current tag is a block tag, so it may close an ancestor
1839*c165b184SJames Collins				if (isset($this->optional_closing_tags[$parent_lower])
1840*c165b184SJames Collins					&& isset($this->block_tags[$tag_lower])) {
1841*c165b184SJames Collins
1842*c165b184SJames Collins					$this->parent->_[HDOM_INFO_END] = 0;
1843*c165b184SJames Collins					$org_parent = $this->parent;
1844*c165b184SJames Collins
1845*c165b184SJames Collins					// Traverse ancestors to find a matching opening tag
1846*c165b184SJames Collins					// Stop at root node
1847*c165b184SJames Collins					while (($this->parent->parent)
1848*c165b184SJames Collins						&& strtolower($this->parent->tag) !== $tag_lower
1849*c165b184SJames Collins					){
1850*c165b184SJames Collins						$this->parent = $this->parent->parent;
1851*c165b184SJames Collins					}
1852*c165b184SJames Collins
1853*c165b184SJames Collins					// If we don't have a match add current tag as text node
1854*c165b184SJames Collins					if (strtolower($this->parent->tag) !== $tag_lower) {
1855*c165b184SJames Collins						$this->parent = $org_parent; // restore origonal parent
1856*c165b184SJames Collins
1857*c165b184SJames Collins						if ($this->parent->parent) {
1858*c165b184SJames Collins							$this->parent = $this->parent->parent;
1859*c165b184SJames Collins						}
1860*c165b184SJames Collins
1861*c165b184SJames Collins						$this->parent->_[HDOM_INFO_END] = $this->cursor;
1862*c165b184SJames Collins						return $this->as_text_node($tag);
1863*c165b184SJames Collins					}
1864*c165b184SJames Collins				} elseif (($this->parent->parent)
1865*c165b184SJames Collins					&& isset($this->block_tags[$tag_lower])
1866*c165b184SJames Collins				) {
1867*c165b184SJames Collins					// Grandparent exists and current tag is a block tag, so our
1868*c165b184SJames Collins					// parent doesn't have an end tag
1869*c165b184SJames Collins					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
1870*c165b184SJames Collins					$org_parent = $this->parent;
1871*c165b184SJames Collins
1872*c165b184SJames Collins					// Traverse ancestors to find a matching opening tag
1873*c165b184SJames Collins					// Stop at root node
1874*c165b184SJames Collins					while (($this->parent->parent)
1875*c165b184SJames Collins						&& strtolower($this->parent->tag) !== $tag_lower
1876*c165b184SJames Collins					) {
1877*c165b184SJames Collins						$this->parent = $this->parent->parent;
1878*c165b184SJames Collins					}
1879*c165b184SJames Collins
1880*c165b184SJames Collins					// If we don't have a match add current tag as text node
1881*c165b184SJames Collins					if (strtolower($this->parent->tag) !== $tag_lower) {
1882*c165b184SJames Collins						$this->parent = $org_parent; // restore origonal parent
1883*c165b184SJames Collins						$this->parent->_[HDOM_INFO_END] = $this->cursor;
1884*c165b184SJames Collins						return $this->as_text_node($tag);
1885*c165b184SJames Collins					}
1886*c165b184SJames Collins				} elseif (($this->parent->parent)
1887*c165b184SJames Collins					&& strtolower($this->parent->parent->tag) === $tag_lower
1888*c165b184SJames Collins				) { // Grandparent exists and current tag closes it
1889*c165b184SJames Collins					$this->parent->_[HDOM_INFO_END] = 0;
1890*c165b184SJames Collins					$this->parent = $this->parent->parent;
1891*c165b184SJames Collins				} else { // Random tag, add as text node
1892*c165b184SJames Collins					return $this->as_text_node($tag);
1893*c165b184SJames Collins				}
1894*c165b184SJames Collins			}
1895*c165b184SJames Collins
1896*c165b184SJames Collins			// Set end position of parent tag to current cursor position
1897*c165b184SJames Collins			$this->parent->_[HDOM_INFO_END] = $this->cursor;
1898*c165b184SJames Collins
1899*c165b184SJames Collins			if ($this->parent->parent) {
1900*c165b184SJames Collins				$this->parent = $this->parent->parent;
1901*c165b184SJames Collins			}
1902*c165b184SJames Collins
1903*c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1904*c165b184SJames Collins			return true;
1905*c165b184SJames Collins		}
1906*c165b184SJames Collins
1907*c165b184SJames Collins		// start tag
1908*c165b184SJames Collins		$node = new simple_html_dom_node($this);
1909*c165b184SJames Collins		$node->_[HDOM_INFO_BEGIN] = $this->cursor;
1910*c165b184SJames Collins		++$this->cursor;
1911*c165b184SJames Collins		$tag = $this->copy_until($this->token_slash); // Get tag name
1912*c165b184SJames Collins		$node->tag_start = $begin_tag_pos;
1913*c165b184SJames Collins
1914*c165b184SJames Collins		// doctype, cdata & comments...
1915*c165b184SJames Collins		// <!DOCTYPE html>
1916*c165b184SJames Collins		// <![CDATA[ ... ]]>
1917*c165b184SJames Collins		// <!-- Comment -->
1918*c165b184SJames Collins		if (isset($tag[0]) && $tag[0] === '!') {
1919*c165b184SJames Collins			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
1920*c165b184SJames Collins
1921*c165b184SJames Collins			if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
1922*c165b184SJames Collins				$node->nodetype = HDOM_TYPE_COMMENT;
1923*c165b184SJames Collins				$node->tag = 'comment';
1924*c165b184SJames Collins			} else { // Could be doctype or CDATA but we don't care
1925*c165b184SJames Collins				$node->nodetype = HDOM_TYPE_UNKNOWN;
1926*c165b184SJames Collins				$node->tag = 'unknown';
1927*c165b184SJames Collins			}
1928*c165b184SJames Collins
1929*c165b184SJames Collins			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
1930*c165b184SJames Collins
1931*c165b184SJames Collins			$this->link_nodes($node, true);
1932*c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1933*c165b184SJames Collins			return true;
1934*c165b184SJames Collins		}
1935*c165b184SJames Collins
1936*c165b184SJames Collins		// The start tag cannot contain another start tag, if so add as text
1937*c165b184SJames Collins		// i.e. "<<html>"
1938*c165b184SJames Collins		if ($pos = strpos($tag, '<') !== false) {
1939*c165b184SJames Collins			$tag = '<' . substr($tag, 0, -1);
1940*c165b184SJames Collins			$node->_[HDOM_INFO_TEXT] = $tag;
1941*c165b184SJames Collins			$this->link_nodes($node, false);
1942*c165b184SJames Collins			$this->char = $this->doc[--$this->pos]; // prev
1943*c165b184SJames Collins			return true;
1944*c165b184SJames Collins		}
1945*c165b184SJames Collins
1946*c165b184SJames Collins		// Handle invalid tag names (i.e. "<html#doc>")
1947*c165b184SJames Collins		if (!preg_match('/^\w[\w:-]*$/', $tag)) {
1948*c165b184SJames Collins			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
1949*c165b184SJames Collins
1950*c165b184SJames Collins			// Next char is the beginning of a new tag, don't touch it.
1951*c165b184SJames Collins			if ($this->char === '<') {
1952*c165b184SJames Collins				$this->link_nodes($node, false);
1953*c165b184SJames Collins				return true;
1954*c165b184SJames Collins			}
1955*c165b184SJames Collins
1956*c165b184SJames Collins			// Next char closes current tag, add and be done with it.
1957*c165b184SJames Collins			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
1958*c165b184SJames Collins			$this->link_nodes($node, false);
1959*c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1960*c165b184SJames Collins			return true;
1961*c165b184SJames Collins		}
1962*c165b184SJames Collins
1963*c165b184SJames Collins		// begin tag, add new node
1964*c165b184SJames Collins		$node->nodetype = HDOM_TYPE_ELEMENT;
1965*c165b184SJames Collins		$tag_lower = strtolower($tag);
1966*c165b184SJames Collins		$node->tag = ($this->lowercase) ? $tag_lower : $tag;
1967*c165b184SJames Collins
1968*c165b184SJames Collins		// handle optional closing tags
1969*c165b184SJames Collins		if (isset($this->optional_closing_tags[$tag_lower])) {
1970*c165b184SJames Collins			// Traverse ancestors to close all optional closing tags
1971*c165b184SJames Collins			while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
1972*c165b184SJames Collins				$this->parent->_[HDOM_INFO_END] = 0;
1973*c165b184SJames Collins				$this->parent = $this->parent->parent;
1974*c165b184SJames Collins			}
1975*c165b184SJames Collins			$node->parent = $this->parent;
1976*c165b184SJames Collins		}
1977*c165b184SJames Collins
1978*c165b184SJames Collins		$guard = 0; // prevent infinity loop
1979*c165b184SJames Collins
1980*c165b184SJames Collins		// [0] Space between tag and first attribute
1981*c165b184SJames Collins		$space = array($this->copy_skip($this->token_blank), '', '');
1982*c165b184SJames Collins
1983*c165b184SJames Collins		// attributes
1984*c165b184SJames Collins		do {
1985*c165b184SJames Collins			// Everything until the first equal sign should be the attribute name
1986*c165b184SJames Collins			$name = $this->copy_until($this->token_equal);
1987*c165b184SJames Collins
1988*c165b184SJames Collins			if ($name === '' && $this->char !== null && $space[0] === '') {
1989*c165b184SJames Collins				break;
1990*c165b184SJames Collins			}
1991*c165b184SJames Collins
1992*c165b184SJames Collins			if ($guard === $this->pos) { // Escape infinite loop
1993*c165b184SJames Collins				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1994*c165b184SJames Collins				continue;
1995*c165b184SJames Collins			}
1996*c165b184SJames Collins
1997*c165b184SJames Collins			$guard = $this->pos;
1998*c165b184SJames Collins
1999*c165b184SJames Collins			// handle endless '<'
2000*c165b184SJames Collins			// Out of bounds before the tag ended
2001*c165b184SJames Collins			if ($this->pos >= $this->size - 1 && $this->char !== '>') {
2002*c165b184SJames Collins				$node->nodetype = HDOM_TYPE_TEXT;
2003*c165b184SJames Collins				$node->_[HDOM_INFO_END] = 0;
2004*c165b184SJames Collins				$node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
2005*c165b184SJames Collins				$node->tag = 'text';
2006*c165b184SJames Collins				$this->link_nodes($node, false);
2007*c165b184SJames Collins				return true;
2008*c165b184SJames Collins			}
2009*c165b184SJames Collins
2010*c165b184SJames Collins			// handle mismatch '<'
2011*c165b184SJames Collins			// Attributes cannot start after opening tag
2012*c165b184SJames Collins			if ($this->doc[$this->pos - 1] == '<') {
2013*c165b184SJames Collins				$node->nodetype = HDOM_TYPE_TEXT;
2014*c165b184SJames Collins				$node->tag = 'text';
2015*c165b184SJames Collins				$node->attr = array();
2016*c165b184SJames Collins				$node->_[HDOM_INFO_END] = 0;
2017*c165b184SJames Collins				$node->_[HDOM_INFO_TEXT] = substr(
2018*c165b184SJames Collins					$this->doc,
2019*c165b184SJames Collins					$begin_tag_pos,
2020*c165b184SJames Collins					$this->pos - $begin_tag_pos - 1
2021*c165b184SJames Collins				);
2022*c165b184SJames Collins				$this->pos -= 2;
2023*c165b184SJames Collins				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2024*c165b184SJames Collins				$this->link_nodes($node, false);
2025*c165b184SJames Collins				return true;
2026*c165b184SJames Collins			}
2027*c165b184SJames Collins
2028*c165b184SJames Collins			if ($name !== '/' && $name !== '') { // this is a attribute name
2029*c165b184SJames Collins				// [1] Whitespace after attribute name
2030*c165b184SJames Collins				$space[1] = $this->copy_skip($this->token_blank);
2031*c165b184SJames Collins
2032*c165b184SJames Collins				$name = $this->restore_noise($name); // might be a noisy name
2033*c165b184SJames Collins
2034*c165b184SJames Collins				if ($this->lowercase) { $name = strtolower($name); }
2035*c165b184SJames Collins
2036*c165b184SJames Collins				if ($this->char === '=') { // attribute with value
2037*c165b184SJames Collins					$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2038*c165b184SJames Collins					$this->parse_attr($node, $name, $space); // get attribute value
2039*c165b184SJames Collins				} else {
2040*c165b184SJames Collins					//no value attr: nowrap, checked selected...
2041*c165b184SJames Collins					$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2042*c165b184SJames Collins					$node->attr[$name] = true;
2043*c165b184SJames Collins					if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
2044*c165b184SJames Collins				}
2045*c165b184SJames Collins
2046*c165b184SJames Collins				$node->_[HDOM_INFO_SPACE][] = $space;
2047*c165b184SJames Collins
2048*c165b184SJames Collins				// prepare for next attribute
2049*c165b184SJames Collins				$space = array(
2050*c165b184SJames Collins					$this->copy_skip($this->token_blank),
2051*c165b184SJames Collins					'',
2052*c165b184SJames Collins					''
2053*c165b184SJames Collins				);
2054*c165b184SJames Collins			} else { // no more attributes
2055*c165b184SJames Collins				break;
2056*c165b184SJames Collins			}
2057*c165b184SJames Collins		} while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
2058*c165b184SJames Collins
2059*c165b184SJames Collins		$this->link_nodes($node, true);
2060*c165b184SJames Collins		$node->_[HDOM_INFO_ENDSPACE] = $space[0];
2061*c165b184SJames Collins
2062*c165b184SJames Collins		// handle empty tags (i.e. "<div/>")
2063*c165b184SJames Collins		if ($this->copy_until_char('>') === '/') {
2064*c165b184SJames Collins			$node->_[HDOM_INFO_ENDSPACE] .= '/';
2065*c165b184SJames Collins			$node->_[HDOM_INFO_END] = 0;
2066*c165b184SJames Collins		} else {
2067*c165b184SJames Collins			// reset parent
2068*c165b184SJames Collins			if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
2069*c165b184SJames Collins				$this->parent = $node;
2070*c165b184SJames Collins			}
2071*c165b184SJames Collins		}
2072*c165b184SJames Collins
2073*c165b184SJames Collins		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2074*c165b184SJames Collins
2075*c165b184SJames Collins		// If it's a BR tag, we need to set it's text to the default text.
2076*c165b184SJames Collins		// This way when we see it in plaintext, we can generate formatting that the user wants.
2077*c165b184SJames Collins		// since a br tag never has sub nodes, this works well.
2078*c165b184SJames Collins		if ($node->tag === 'br') {
2079*c165b184SJames Collins			$node->_[HDOM_INFO_INNER] = $this->default_br_text;
2080*c165b184SJames Collins		}
2081*c165b184SJames Collins
2082*c165b184SJames Collins		return true;
2083*c165b184SJames Collins	}
2084*c165b184SJames Collins
/**
 * Read the value of attribute $name at the current position and store it
 * on $node.
 *
 * The first occurrence of an attribute wins: when $node->attr[$name] is
 * already set, the value is still consumed from the input but neither the
 * attribute nor its quote type is recorded.
 *
 * @param object $node  Node under construction; receives attr[$name] and
 *                      one HDOM_INFO_QUOTE entry for a new attribute.
 * @param string $name  Attribute name.
 * @param array  $space Whitespace triple of the attribute (by reference);
 *                      index 2 receives the blank between "=" and the value.
 * @return void
 */
protected function parse_attr($node, $name, &$space)
{
	$is_duplicate = isset($node->attr[$name]);

	// Always consume the blank between "=" and the value. Skipping this for
	// duplicates left the parser sitting on the whitespace, so the value of
	// a repeated attribute written as `name = "v"` was read as empty and
	// the quoted text was left in the input to be parsed as something else.
	$blank = $this->copy_skip($this->token_blank);

	if (!$is_duplicate) {
		// Only the first occurrence is stored, so only its spacing is kept
		$space[2] = $blank;
	}

	switch ($this->char) {
		case '"':
			$quote_type = HDOM_QUOTE_DOUBLE;
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
			$value = $this->copy_until_char('"');
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
			break;
		case '\'':
			$quote_type = HDOM_QUOTE_SINGLE;
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
			$value = $this->copy_until_char('\'');
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
			break;
		default:
			// Unquoted value: runs until one of the $this->token_attr characters
			$quote_type = HDOM_QUOTE_NO;
			$value = $this->copy_until($this->token_attr);
	}

	// The value may contain placeholders inserted by remove_noise()
	$value = $this->restore_noise($value);

	// PaperG: Attributes should not have \r or \n in them, that counts as
	// html whitespace.
	$value = str_replace("\r", '', $value);
	$value = str_replace("\n", '', $value);

	// PaperG: If this is a "class" selector, get rid of the preceding and
	// trailing space since some people leave it in the multi class case.
	if ($name === 'class') {
		$value = trim($value);
	}

	if (!$is_duplicate) {
		$node->_[HDOM_INFO_QUOTE][] = $quote_type;
		$node->attr[$name] = $value;
	}
}
2128*c165b184SJames Collins
/**
 * Attach $node to the parser's current parent node.
 *
 * The node is always appended to the parent's "nodes" list; it is also
 * appended to the parent's "children" list when $is_child is truthy.
 *
 * @param object $node     Node to attach; its "parent" property is set.
 * @param bool   $is_child Whether to register the node in "children" too.
 * @return void
 */
protected function link_nodes(&$node, $is_child)
{
	$owner = $this->parent;

	$node->parent = $owner;
	$owner->nodes[] = $node;

	if (!$is_child) {
		return;
	}

	$owner->children[] = $node;
}
2137*c165b184SJames Collins
/**
 * Store a closing tag as literal text instead of treating it as markup.
 *
 * Creates a new node whose text is "</$tag>", links it to the current
 * parent (not as a child), increments the cursor and moves the parser to
 * the next character.
 *
 * @param string $tag Tag name, written between "</" and ">".
 * @return bool Always true.
 */
protected function as_text_node($tag)
{
	$text_node = new simple_html_dom_node($this);
	++$this->cursor;
	$text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
	$this->link_nodes($text_node, false);

	// Step to the next character; null once the end of the document is hit
	if (++$this->pos < $this->size) {
		$this->char = $this->doc[$this->pos];
	} else {
		$this->char = null;
	}

	return true;
}
2147*c165b184SJames Collins
/**
 * Move the position past a run of characters taken from $chars.
 *
 * Afterwards $this->char is the character at the new position, or null
 * when the position is at or beyond the end of the document.
 *
 * @param string $chars Set of characters to skip over.
 * @return void
 */
protected function skip($chars)
{
	$run = strspn($this->doc, $chars, $this->pos);
	$this->pos = $this->pos + $run;

	if ($this->pos < $this->size) {
		$this->char = $this->doc[$this->pos];
	} else {
		$this->char = null;
	}
}
2153*c165b184SJames Collins
/**
 * Move the position past a run of characters taken from $chars and return
 * the characters that were skipped.
 *
 * Afterwards $this->char is the character at the new position, or null
 * when the position is at or beyond the end of the document.
 *
 * @param string $chars Set of characters to skip over.
 * @return string The skipped run; '' when nothing matched.
 */
protected function copy_skip($chars)
{
	$start = $this->pos;
	$length = strspn($this->doc, $chars, $start);

	$this->pos = $this->pos + $length;
	$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;

	return ($length === 0) ? '' : substr($this->doc, $start, $length);
}
2163*c165b184SJames Collins
/**
 * Move the position forward to the first character that is contained in
 * $chars and return everything that was passed on the way.
 *
 * Afterwards $this->char is the character at the new position, or null
 * when the position is at or beyond the end of the document.
 *
 * @param string $chars Set of characters that stop the scan.
 * @return string The text between the old and the new position.
 */
protected function copy_until($chars)
{
	$start = $this->pos;
	$length = strcspn($this->doc, $chars, $start);

	$this->pos = $this->pos + $length;

	if ($this->pos < $this->size) {
		$this->char = $this->doc[$this->pos];
	} else {
		$this->char = null;
	}

	return substr($this->doc, $start, $length);
}
2172*c165b184SJames Collins
/**
 * Move the position forward to the next occurrence of $char and return
 * the text in between.
 *
 * - Returns '' without moving when the current character is null or when
 *   $char is found at the current position.
 * - When $char does not occur any more, the rest of the document is
 *   returned, the position is set to the document size and the current
 *   character becomes null.
 *
 * @param string $char Text to search for (passed to strpos()).
 * @return string Text between the old position and the match.
 */
protected function copy_until_char($char)
{
	if ($this->char === null) {
		return '';
	}

	$start = $this->pos;
	$hit = strpos($this->doc, $char, $start);

	if ($hit === false) {
		// No further occurrence: hand back the remainder and park at the end
		$rest = substr($this->doc, $start, $this->size - $start);
		$this->pos = $this->size;
		$this->char = null;
		return $rest;
	}

	if ($hit === $start) {
		return '';
	}

	$this->pos = $hit;
	$this->char = $this->doc[$hit];

	return substr($this->doc, $start, $hit - $start);
}
2191*c165b184SJames Collins
/**
 * Cut every match of $pattern out of the document and leave a placeholder
 * in its place.
 *
 * Each removed piece is stored in $this->noise under a key of the form
 * "___noise___" followed by a number padded to five characters; the same
 * key is written into the document where the piece used to be. Matches
 * are processed from last to first so the offsets reported by
 * preg_match_all() remain valid while the document is being rewritten.
 * Finally the cached document size and, for a non-empty document, the
 * current character are refreshed.
 *
 * @param string $pattern    Regular expression including delimiters.
 * @param bool   $remove_tag True to replace the entire match, false to
 *                           replace only the first sub-match.
 * @return void
 */
protected function remove_noise($pattern, $remove_tag = false)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	$found = preg_match_all(
		$pattern,
		$this->doc,
		$hits,
		PREG_SET_ORDER | PREG_OFFSET_CAPTURE
	);

	// 0 = entire match, 1 = submatch
	$group = ($remove_tag) ? 0 : 1;

	$n = $found - 1;

	while ($n > -1) {
		$placeholder = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'key is: ' . $placeholder);
		}

		$this->noise[$placeholder] = $hits[$n][$group][0];
		$this->doc = substr_replace(
			$this->doc,
			$placeholder,
			$hits[$n][$group][1],
			strlen($hits[$n][$group][0])
		);

		--$n;
	}

	// reset the length of content
	$this->size = strlen($this->doc);

	if ($this->size > 0) {
		$this->char = $this->doc[0];
	}
}
2223*c165b184SJames Collins
/**
 * Replace every noise placeholder in $text with the content it stands for.
 *
 * A placeholder is the literal "___noise___" followed by five characters;
 * those sixteen characters form the key into $this->noise.
 *
 * - Known key: replaced by the stored content, which is itself scanned for
 *   further placeholders.
 * - Unknown key: replaced by "UNDEFINED NOISE FOR KEY: <key>".
 * - Marker with fewer than five characters after it: the marker is
 *   replaced by "NO NUMERIC NOISE KEY".
 *
 * @param string $text Text that may contain placeholders.
 * @return string Text with the placeholders resolved.
 */
function restore_noise($text)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	return $this->expand_noise($text, array());
}

/**
 * Worker for restore_noise().
 *
 * Scans $text left to right and never revisits text it has already
 * emitted. Rescanning from the start made the "undefined key" message,
 * which contains the marker itself, match again on every pass and loop
 * forever. Stored content is expanded recursively; $active holds the keys
 * currently being expanded so that content which embeds its own key is
 * left as literal text instead of recursing without end.
 *
 * @param string $text   Text that may contain placeholders.
 * @param array  $active Keys being expanded further up the call chain
 *                       (key => true).
 * @return string Text with the placeholders resolved.
 */
protected function expand_noise($text, $active)
{
	global $debug_object;

	$offset = 0;

	while (($pos = strpos($text, '___noise___', $offset)) !== false) {
		// Sometimes there is a broken piece of markup, and the five
		// characters after the marker are missing.
		if (strlen($text) > $pos + 15) {
			// Marker (11 characters) plus the five characters after it
			$key = substr($text, $pos, 16);

			if (is_object($debug_object)) {
				$debug_object->debug_log(2, 'located key of: ' . $key);
			}

			if (!isset($this->noise[$key])) {
				$replacement = 'UNDEFINED NOISE FOR KEY: ' . $key;
			} elseif (isset($active[$key])) {
				// The content being expanded contains its own key: keep
				// the text as it is and move on.
				$replacement = $key;
			} else {
				$nested = $active;
				$nested[$key] = true;
				$replacement = $this->expand_noise($this->noise[$key], $nested);
			}

			$consumed = 16;
		} else {
			// There is no valid key being given back to us... We must get
			// rid of the ___noise___ marker.
			$replacement = 'NO NUMERIC NOISE KEY';
			$consumed = 11;
		}

		$text = substr($text, 0, $pos)
		. $replacement
		. substr($text, $pos + $consumed);

		// Continue behind the inserted text so it is never matched again
		$offset = $pos + strlen($replacement);
	}

	return $text;
}
2269*c165b184SJames Collins
/**
 * Find the first stored noise element that contains $text.
 *
 * @param string $text Text to look for inside the stored noise.
 * @return string|null The first matching noise element, or null when no
 *                     element contains $text.
 */
function search_noise($text)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	foreach ($this->noise as $candidate) {
		if (strpos($candidate, $text) === false) {
			continue;
		}

		return $candidate;
	}

	return null;
}
2281*c165b184SJames Collins
/**
 * String conversion of the document.
 *
 * @return string Whatever innertext() of the root node returns.
 */
function __toString()
{
	$markup = $this->root->innertext();

	return $markup;
}
2286*c165b184SJames Collins
/**
 * Magic getter for the virtual properties of the document.
 *
 * Supported names:
 * - "outertext" and "innertext": innertext() of the root node
 * - "plaintext": text() of the root node
 * - "charset": $this->_charset
 * - "target_charset": $this->_target_charset
 *
 * @param string $name Property name.
 * @return mixed The value listed above, or null for any other name.
 */
function __get($name)
{
	if ($name === 'outertext' || $name === 'innertext') {
		return $this->root->innertext();
	}

	if ($name === 'plaintext') {
		return $this->root->text();
	}

	if ($name === 'charset') {
		return $this->_charset;
	}

	if ($name === 'target_charset') {
		return $this->_target_charset;
	}

	return null;
}
2302*c165b184SJames Collins
/**
 * camelCase alias that forwards to childNodes() of the root node.
 *
 * @param int $idx Passed through unchanged (default -1).
 * @return mixed Whatever the root node's childNodes() returns.
 */
function childNodes($idx = -1)
{
	$root = $this->root;

	return $root->childNodes($idx);
}
2307*c165b184SJames Collins
/**
 * camelCase alias that forwards to first_child() of the root node.
 *
 * @return mixed Whatever the root node's first_child() returns.
 */
function firstChild()
{
	$root = $this->root;

	return $root->first_child();
}
2312*c165b184SJames Collins
/**
 * camelCase alias that forwards to last_child() of the root node.
 *
 * @return mixed Whatever the root node's last_child() returns.
 */
function lastChild()
{
	$root = $this->root;

	return $root->last_child();
}
2317*c165b184SJames Collins
/**
 * Build a detached element by parsing "<$name>$value</$name>" with
 * str_get_html() and returning firstChild() of the result.
 *
 * The whole expression runs under the "@" operator, so warnings and
 * notices raised while building or parsing the markup are suppressed.
 *
 * @param string      $name  Tag name.
 * @param string|null $value Content placed between the tags.
 * @return mixed Whatever firstChild() of the parsed fragment returns.
 */
function createElement($name, $value = null)
{
	// Keep everything inside the single suppressed expression
	return @str_get_html(sprintf('<%1$s>%2$s</%1$s>', $name, $value))->firstChild();
}
2322*c165b184SJames Collins
/**
 * Build a detached node from $value by parsing it with str_get_html() and
 * taking the last entry of the "nodes" list of the result.
 *
 * The whole expression runs under the "@" operator, so warnings and
 * notices raised during the call are suppressed.
 *
 * @param string $value Text to parse.
 * @return mixed Whatever end() yields for the "nodes" list.
 */
function createTextNode($value)
{
	// Deliberately kept as one suppressed expression
	$last_node = @end(str_get_html($value)->nodes);

	return $last_node;
}
2327*c165b184SJames Collins
/**
 * Look up an element by id; forwards to find() with the selector "#$id"
 * and index 0.
 *
 * @param string $id Element id, without the leading "#".
 * @return mixed Whatever find() returns for index 0.
 */
function getElementById($id)
{
	$selector = '#' . $id;

	return $this->find($selector, 0);
}
2332*c165b184SJames Collins
/**
 * Look up elements by id; forwards to find() with the selector "#$id".
 *
 * @param string   $id  Element id, without the leading "#".
 * @param int|null $idx Passed through to find() unchanged (default null).
 * @return mixed Whatever find() returns.
 */
function getElementsById($id, $idx = null)
{
	$selector = '#' . $id;

	return $this->find($selector, $idx);
}
2337*c165b184SJames Collins
/**
 * Look up an element by tag name; forwards to find() with $name as the
 * selector and index 0.
 *
 * @param string $name Tag name, used as the selector as is.
 * @return mixed Whatever find() returns for index 0.
 */
function getElementByTagName($name)
{
	$match = $this->find($name, 0);

	return $match;
}
2342*c165b184SJames Collins
/**
 * Look up elements by tag name; forwards to find() with $name as the
 * selector.
 *
 * @param string $name Tag name, used as the selector as is.
 * @param int    $idx  Passed through to find() unchanged (default -1).
 * @return mixed Whatever find() returns.
 */
function getElementsByTagName($name, $idx = -1)
{
	$matches = $this->find($name, $idx);

	return $matches;
}
2347*c165b184SJames Collins
/**
 * camelCase alias of load_file().
 *
 * Every argument is forwarded as its own positional argument. Handing the
 * collected argument list over as one array made load_file() receive a
 * single array parameter instead of the path and options the caller gave.
 *
 * NOTE(review): load_file() is defined outside this excerpt; upstream
 * Simple HTML DOM implements it as a variadic wrapper around
 * file_get_contents() - confirm against the local copy.
 *
 * @return mixed Whatever load_file() returns.
 */
function loadFile()
{
	$args = func_get_args();

	return call_user_func_array(array($this, 'load_file'), $args);
}
2353*c165b184SJames Collins}
2354