xref: /template/mikio/inc/simple_html_dom.php (revision bc1032d943e70c9d150b2375967767709cd8fc08)
<?php
/**
 * Website: http://sourceforge.net/projects/simplehtmldom/
 * Additional projects: http://sourceforge.net/projects/debugobject/
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
 *
 * Licensed under The MIT License
 * See the LICENSE file in the project root for more information.
 *
 * Authors:
 *   S.C. Chen
 *   John Schlick
 *   Rus Carroll
 *   logmanoriginal
 *
 * Contributors:
 *   Yousuke Kumakura
 *   Vadim Voituk
 *   Antcs
 *   James Collins (nomadjimbob)
 *
 * Based on Version Rev. 1.9.1 (291)
 * Version 1.9.1.1
 */
25c165b184SJames Collins
// Node types, stored in simple_html_dom_node::$nodetype.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles, consulted when an opening tag is rebuilt.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node bookkeeping array simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Overridable defaults: a value defined before this file is loaded wins.
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);

// NOTE(review): not referenced in this part of the file; presumably a parser
// flag consumed further down - confirm.
define('HDOM_SMARTY_AS_TEXT', 1);
49c165b184SJames Collins
/**
 * Read a file or URL and parse its contents into a DOM.
 *
 * The first five parameters are handed to file_get_contents(); the remaining
 * ones are handed to the simple_html_dom constructor and to load().
 *
 * @param string $url Location passed to file_get_contents()
 * @param bool $use_include_path Passed to file_get_contents()
 * @param resource|null $context Passed to file_get_contents()
 * @param int $offset Passed to file_get_contents()
 * @param int $maxLen Size limit in bytes; values <= 0 select MAX_FILE_SIZE
 * @param bool $lowercase Passed to the constructor and to load()
 * @param bool $forceTagsClosed Passed to the constructor
 * @param string $target_charset Passed to the constructor
 * @param bool $stripRN Passed to the constructor and to load()
 * @param string $defaultBRText Passed to the constructor
 * @param string $defaultSpanText Passed to the constructor
 * @return mixed The result of simple_html_dom::load(), or false when nothing
 * could be read or the content is larger than $maxLen bytes
 */
function file_get_html(
	$url,
	$use_include_path = false,
	$context = null,
	$offset = 0,
	$maxLen = -1,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	// Request one byte more than the limit. Reading exactly $maxLen bytes
	// would silently truncate oversized input and make the size check below
	// impossible to trigger.
	$contents = file_get_contents(
		$url,
		$use_include_path,
		$context,
		$offset,
		$maxLen + 1
	);

	if (empty($contents) || strlen($contents) > $maxLen) {
		$dom->clear();
		return false;
	}

	return $dom->load($contents, $lowercase, $stripRN);
}
95c165b184SJames Collins
/**
 * Parse an HTML string into a DOM.
 *
 * @param string $str Markup to parse
 * @param bool $lowercase Passed to the constructor and to load()
 * @param bool $forceTagsClosed Passed to the constructor
 * @param string $target_charset Passed to the constructor
 * @param bool $stripRN Passed to the constructor and to load()
 * @param string $defaultBRText Passed to the constructor
 * @param string $defaultSpanText Passed to the constructor
 * @return mixed The result of simple_html_dom::load(), or false when $str is
 * empty (as judged by empty()) or longer than MAX_FILE_SIZE bytes
 */
function str_get_html(
	$str,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
		$dom->clear();
		return false;
	}

	return $dom->load($str, $lowercase, $stripRN);
}
122c165b184SJames Collins
/**
 * Print the tree below $node by delegating to its dump() method.
 *
 * @param object $node Node exposing dump($show_attr, $depth)
 * @param bool $show_attr Forwarded to dump()
 * @param int $deep Starting depth forwarded to dump()
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
	// Forward the caller's options; the node itself is not a dump() argument.
	$node->dump($show_attr, $deep);
}
127c165b184SJames Collins
128c165b184SJames Collinsclass simple_html_dom_node
129c165b184SJames Collins{
130c165b184SJames Collins	public $nodetype = HDOM_TYPE_TEXT;
131c165b184SJames Collins	public $tag = 'text';
132c165b184SJames Collins	public $attr = array();
133c165b184SJames Collins	public $children = array();
134c165b184SJames Collins	public $nodes = array();
135c165b184SJames Collins	public $parent = null;
136c165b184SJames Collins	public $_ = array();
137c165b184SJames Collins	public $tag_start = 0;
138c165b184SJames Collins	private $dom = null;
139c165b184SJames Collins
/**
 * Attach the node to its owning DOM.
 *
 * Stores the DOM reference and appends this node to the DOM's node list.
 *
 * @param object $dom Owning DOM; must expose a public $nodes array
 */
function __construct($dom)
{
	$this->dom = $dom;
	$dom->nodes[] = $this;
}
145c165b184SJames Collins
/**
 * Drop all object references held by this node on destruction.
 */
function __destruct()
{
	$this->clear();
}
150c165b184SJames Collins
/**
 * String conversion yields the node's outer markup.
 *
 * @return string Result of outertext()
 */
function __toString()
{
	return $this->outertext();
}
155c165b184SJames Collins
/**
 * Release the references to the DOM, the parent and all child nodes.
 *
 * Note that $nodes and $children are null (not empty arrays) afterwards.
 *
 * @return void
 */
function clear()
{
	$this->dom = null;
	$this->nodes = null;
	$this->parent = null;
	$this->children = null;
}
163c165b184SJames Collins
/**
 * Echo this node and, recursively, every node below it.
 *
 * Each node is printed on its own line, indented by one tab per level.
 *
 * @param bool $show_attr Whether to list the attributes after the tag name
 * @param int $depth Indentation level of this node
 * @return void
 */
function dump($show_attr = true, $depth = 0)
{
	echo str_repeat("\t", $depth) . $this->tag;

	if ($show_attr && is_array($this->attr) && count($this->attr) > 0) {
		echo '(';
		foreach ($this->attr as $k => $v) {
			echo "[$k]=>\"$v\", ";
		}
		echo ')';
	}

	echo "\n";

	// $nodes is null after clear(), hence the truthiness check.
	if ($this->nodes) {
		foreach ($this->nodes as $node) {
			$node->dump($show_attr, $depth + 1);
		}
	}
}
184c165b184SJames Collins
/**
 * Build a one-line debug description of this node.
 *
 * The line contains the tag, the attributes, the bookkeeping array $_, the
 * text (when available), the stored inner text and the child/node counts.
 *
 * @param bool $echo True to print the description, false to return it
 * @return string|void The description when $echo is false
 */
function dump_node($echo = true)
{
	$string = $this->tag;

	if (count($this->attr) > 0) {
		$string .= '(';
		foreach ($this->attr as $k => $v) {
			$string .= "[$k]=>\"$v\", ";
		}
		$string .= ')';
	}

	if (count($this->_) > 0) {
		$string .= ' $_ (';
		foreach ($this->_ as $k => $v) {
			if (is_array($v)) {
				$string .= "[$k]=>(";
				foreach ($v as $k2 => $v2) {
					// Entries such as HDOM_INFO_SPACE hold arrays themselves;
					// flatten them instead of converting an array to string.
					$shown = is_array($v2) ? implode(',', $v2) : $v2;
					$string .= "[$k2]=>\"$shown\", ";
				}
				$string .= ')';
			} else {
				$string .= "[$k]=>\"$v\", ";
			}
		}
		$string .= ')';
	}

	if (isset($this->text)) {
		$string .= " text: ({$this->text})";
	}

	$string .= ' HDOM_INNER_INFO: ';

	// Inspect this node; the previous code read an undefined local variable
	// and therefore always reported NULL.
	if (isset($this->_[HDOM_INFO_INNER])) {
		$string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
	} else {
		$string .= ' NULL ';
	}

	// Both lists are null after clear(); report them as empty.
	$string .= ' children: ' . (is_array($this->children) ? count($this->children) : 0);
	$string .= ' nodes: ' . (is_array($this->nodes) ? count($this->nodes) : 0);
	$string .= ' tag_start: ' . $this->tag_start;
	$string .= "\n";

	if ($echo) {
		echo $string;
		return;
	}

	return $string;
}
237c165b184SJames Collins
/**
 * Get the parent node, or move this node below a new parent.
 *
 * When $parent is given, the node is first removed from the node and child
 * lists of its previous parent (if any) and then appended to both lists of
 * the new parent.
 *
 * @param object|null $parent New parent node, or null to only read
 * @return object|null The (possibly new) parent node
 */
function parent($parent = null)
{
	if ($parent === null) {
		return $this->parent;
	}

	// Detach from the previous parent so the node is not listed twice.
	$previous = $this->parent;

	if ($previous !== null) {
		foreach (array('nodes', 'children') as $list) {
			if (!is_array($previous->$list)) { continue; }

			$pos = array_search($this, $previous->$list, true);

			if ($pos !== false) {
				array_splice($previous->$list, $pos, 1);
			}
		}
	}

	$this->parent = $parent;
	$this->parent->nodes[] = $this;
	$this->parent->children[] = $this;

	return $this->parent;
}
251c165b184SJames Collins
/**
 * Tell whether this node has at least one child.
 *
 * @return bool True when the child list is non-empty
 */
function has_child()
{
	return !empty($this->children);
}
256c165b184SJames Collins
/**
 * Get all children, or the child at a given position.
 *
 * @param int $idx Position of the wanted child; -1 returns the whole list
 * @return mixed The child list for -1, otherwise the child at $idx or null
 * when no such child exists
 */
function children($idx = -1)
{
	if ($idx === -1) {
		return $this->children;
	}

	return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
269c165b184SJames Collins
/**
 * Get the first child of this node.
 *
 * @return object|null The child at index 0, or null when there is none
 */
function first_child()
{
	// isset() also covers a null child list (the state after clear()),
	// where count() would fail.
	return isset($this->children[0]) ? $this->children[0] : null;
}
277c165b184SJames Collins
/**
 * Get the last child of this node.
 *
 * @return object|null The last entry of the child list, or null when the
 * list is empty
 */
function last_child()
{
	// empty() also covers a null child list (the state after clear()),
	// where count() would fail.
	if (empty($this->children)) {
		return null;
	}

	return end($this->children);
}
285c165b184SJames Collins
/**
 * Get the child that follows this node in its parent's child list.
 *
 * @return object|null The next sibling, or null when this node has no
 * parent, is not among the parent's children, or is the last child
 */
function next_sibling()
{
	// A cleared parent has a null child list; treat it like "no siblings".
	if ($this->parent === null || !is_array($this->parent->children)) {
		return null;
	}

	$idx = array_search($this, $this->parent->children, true);

	if ($idx !== false && isset($this->parent->children[$idx + 1])) {
		return $this->parent->children[$idx + 1];
	}

	return null;
}
300c165b184SJames Collins
/**
 * Get the child that precedes this node in its parent's child list.
 *
 * @return object|null The previous sibling, or null when this node has no
 * parent, is not among the parent's children, or is the first child
 */
function prev_sibling()
{
	// A cleared parent has a null child list; treat it like "no siblings".
	if ($this->parent === null || !is_array($this->parent->children)) {
		return null;
	}

	$idx = array_search($this, $this->parent->children, true);

	if ($idx !== false && $idx > 0) {
		return $this->parent->children[$idx - 1];
	}

	return null;
}
315c165b184SJames Collins
/**
 * Walk up the parent chain until a node with the given tag is found.
 *
 * The comparison is strict and case-sensitive. The node itself is not
 * considered, only its ancestors.
 *
 * @param string $tag Tag name to look for
 * @return object|null The closest matching ancestor, or null when none
 */
function find_ancestor_tag($tag)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	$ancestor = $this->parent;

	// A node without a parent skips the loop and yields null.
	while ($ancestor !== null) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
		}

		if ($ancestor->tag === $tag) {
			break;
		}

		$ancestor = $ancestor->parent;
	}

	return $ancestor;
}
341c165b184SJames Collins
/**
 * Get the markup between the node's opening and closing tag.
 *
 * A stored HDOM_INFO_INNER value takes precedence, followed by a stored
 * HDOM_INFO_TEXT value (with noise restored by the DOM). Otherwise the outer
 * text of all child nodes is concatenated.
 *
 * @return string Inner markup of this node
 */
function innertext()
{
	if (isset($this->_[HDOM_INFO_INNER])) {
		return $this->_[HDOM_INFO_INNER];
	}

	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$ret = '';

	// $nodes is null after clear(); treat that like "no child nodes".
	if ($this->nodes) {
		foreach ($this->nodes as $n) {
			$ret .= $n->outertext();
		}
	}

	return $ret;
}
360c165b184SJames Collins
/**
 * Get the markup of this node including its own tags.
 *
 * Order of evaluation: the root node returns its inner text; the DOM
 * callback (if any) is invoked with this node; a stored HDOM_INFO_OUTER or
 * HDOM_INFO_TEXT value is returned as is; otherwise the markup is assembled
 * from the opening tag, the inner content and the closing tag.
 *
 * @return string Outer markup of this node
 */
function outertext()
{
	global $debug_object;

	if (is_object($debug_object)) {
		$text = '';

		if ($this->tag === 'text' && !empty($this->text)) {
			$text = ' with text: ' . $this->text;
		}

		$debug_object->debug_log(1, 'Outertext of tag: ' . $this->tag . $text);
	}

	if ($this->tag === 'root') {
		return $this->innertext();
	}

	// todo: What is the use of this callback? Remove?
	if ($this->dom && $this->dom->callback !== null) {
		call_user_func_array($this->dom->callback, array($this));
	}

	if (isset($this->_[HDOM_INFO_OUTER])) {
		return $this->_[HDOM_INFO_OUTER];
	}

	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$ret = '';

	// Opening tag: rendered by the node registered at this node's begin
	// index. isset() avoids notices when either lookup is missing.
	if ($this->dom
		&& isset($this->_[HDOM_INFO_BEGIN])
		&& isset($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])) {
		$ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
	}

	if (isset($this->_[HDOM_INFO_INNER])) {
		// todo: <br> should either never have HDOM_INFO_INNER or always
		if ($this->tag !== 'br') {
			$ret .= $this->_[HDOM_INFO_INNER];
		}
	} elseif ($this->nodes) {
		foreach ($this->nodes as $n) {
			$ret .= $this->convert_text($n->outertext());
		}
	}

	// Closing tag: only written when an end index other than 0 is recorded.
	if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
		$ret .= '</' . $this->tag . '>';
	}

	return $ret;
}
417c165b184SJames Collins
/**
 * Get the plain-text content of this node.
 *
 * A stored HDOM_INFO_INNER value is returned as is. Text nodes return their
 * stored text with noise restored; comment and unknown nodes, as well as
 * script and style elements, yield an empty string. For every other node the
 * text of all child nodes is concatenated, with a blank line in front of
 * each paragraph and the DOM's default span text after each span.
 *
 * @return string Plain text of this node
 */
function text()
{
	if (isset($this->_[HDOM_INFO_INNER])) {
		return $this->_[HDOM_INFO_INNER];
	}

	switch ($this->nodetype) {
		case HDOM_TYPE_TEXT:
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		case HDOM_TYPE_COMMENT:
		case HDOM_TYPE_UNKNOWN:
			return '';
	}

	if (strcasecmp($this->tag, 'script') === 0) { return ''; }
	if (strcasecmp($this->tag, 'style') === 0) { return ''; }

	$ret = '';

	// $nodes can be null (clear() sets it, and the original authors also
	// observed it on some span and p elements), so it must be checked.
	if (is_null($this->nodes)) {
		return $ret;
	}

	foreach ($this->nodes as $n) {
		// Start paragraph after a blank line
		if ($n->tag === 'p') {
			$ret = trim($ret) . "\n\n";
		}

		$ret .= $this->convert_text($n->text());

		// Separate consecutive spans so their text does not run together
		// in the plain-text output.
		if ($n->tag === 'span') {
			$ret .= $this->dom->default_span_text;
		}
	}

	return $ret;
}
459c165b184SJames Collins
/**
 * Get the inner text with CDATA section markers removed.
 *
 * The opening marker is removed case-insensitively, the closing marker
 * case-sensitively.
 *
 * @return string Inner text without "<![CDATA[" and "]]>"
 */
function xmltext()
{
	$ret = $this->innertext();
	$ret = str_ireplace('<![CDATA[', '', $ret);

	return str_replace(']]>', '', $ret);
}
467c165b184SJames Collins
/**
 * Build the opening tag of this node from its attributes.
 *
 * Text, comment and unknown nodes return their stored text instead. For
 * elements, every attribute is written with the whitespace and quote style
 * recorded in $_ at the attribute's position. Attributes whose value is null
 * or false are skipped; attributes whose value is true are written without
 * a value.
 *
 * @return string The opening tag, or the stored text for non-elements
 */
function makeup()
{
	// text, comment, unknown
	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$ret = '<' . $this->tag;
	$i = -1;

	foreach ($this->attr as $key => $val) {
		// $i is the attribute's position; the bookkeeping arrays in $_
		// are indexed by it, so skipped attributes still advance it.
		++$i;

		// skip removed attribute
		if ($val === null || $val === false) { continue; }

		// Fall back to a single leading space when no whitespace was
		// recorded, so attributes are never glued to the tag name.
		$space = isset($this->_[HDOM_INFO_SPACE][$i])
			? $this->_[HDOM_INFO_SPACE][$i]
			: array(' ', '', '');

		$ret .= $space[0];

		// no value attr: nowrap, checked selected...
		if ($val === true) {
			$ret .= $key;
			continue;
		}

		$quote_type = isset($this->_[HDOM_INFO_QUOTE][$i])
			? $this->_[HDOM_INFO_QUOTE][$i]
			: HDOM_QUOTE_DOUBLE;

		switch ($quote_type) {
			case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
			case HDOM_QUOTE_SINGLE: $quote = '\''; break;
			default: $quote = '';
		}

		$ret .= $key
		. $space[1]
		. '='
		. $space[2]
		. $quote
		. $val
		. $quote;
	}

	$ret = $this->dom->restore_noise($ret);

	$endspace = isset($this->_[HDOM_INFO_ENDSPACE])
		? $this->_[HDOM_INFO_ENDSPACE]
		: '';

	return $ret . $endspace . '>';
}
510c165b184SJames Collins
/**
 * Find nodes below this node that match a selector.
 *
 * The selector is parsed into groups of levels. Each group is resolved
 * level by level with seek(), using the combinator stored with the previous
 * level. The matches of all groups are merged and ordered by their index in
 * the DOM's node list.
 *
 * @param string $selector Selector understood by parse_selector()
 * @param int|null $idx Null returns all matches; otherwise the match at
 * that position, where negative values count from the end
 * @param bool $lowercase Forwarded to seek()
 * @return mixed Array of matches when $idx is null; otherwise the selected
 * node or null. An empty array is returned when nothing can be searched.
 */
function find($selector, $idx = null, $lowercase = false)
{
	$selectors = $this->parse_selector($selector);
	$count = count($selectors);

	if ($count === 0) { return array(); }

	// A node without a begin index cannot anchor a search.
	if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

	$found_keys = array();

	// find each selector
	for ($c = 0; $c < $count; ++$c) {
		$levels = count($selectors[$c]);

		if ($levels === 0) { return array(); }

		$head = array($this->_[HDOM_INFO_BEGIN] => 1);
		$cmd = ' '; // Combinator

		// handle descendant selectors, no recursive!
		for ($l = 0; $l < $levels; ++$l) {
			$ret = array();

			foreach ($head as $k => $v) {
				$n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
				$n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
			}

			$head = $ret;
			$cmd = $selectors[$c][$l][4]; // Next Combinator
		}

		// Keys are node indexes, so duplicate matches collapse here.
		foreach ($head as $k => $v) {
			$found_keys[$k] = 1;
		}
	}

	// sort keys
	ksort($found_keys);

	$found = array();
	foreach ($found_keys as $k => $v) {
		$found[] = $this->dom->nodes[$k];
	}

	// return nth-element or array
	if (is_null($idx)) { return $found; }

	if ($idx < 0) { $idx = count($found) + $idx; }

	return isset($found[$idx]) ? $found[$idx] : null;
}
562c165b184SJames Collins
563c165b184SJames Collins	protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
564c165b184SJames Collins	{
565c165b184SJames Collins		global $debug_object;
566c165b184SJames Collins		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
567c165b184SJames Collins
568c165b184SJames Collins		list($tag, $id, $class, $attributes, $cmb) = $selector;
569c165b184SJames Collins		$nodes = array();
570c165b184SJames Collins
571c165b184SJames Collins		if ($parent_cmd === ' ') { // Descendant Combinator
572c165b184SJames Collins			// Find parent closing tag if the current element doesn't have a closing
573c165b184SJames Collins			// tag (i.e. void element)
574c165b184SJames Collins			$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
575c165b184SJames Collins			if ($end == 0) {
576c165b184SJames Collins				$parent = $this->parent;
577c165b184SJames Collins				while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) {
578c165b184SJames Collins					$end -= 1;
579c165b184SJames Collins					$parent = $parent->parent;
580c165b184SJames Collins				}
581c165b184SJames Collins				$end += $parent->_[HDOM_INFO_END];
582c165b184SJames Collins			}
583c165b184SJames Collins
584c165b184SJames Collins			// Get list of target nodes
585c165b184SJames Collins			$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
586c165b184SJames Collins			$nodes_count = $end - $nodes_start;
587c165b184SJames Collins			$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
588c165b184SJames Collins		} elseif ($parent_cmd === '>') { // Child Combinator
589c165b184SJames Collins			$nodes = $this->children;
590c165b184SJames Collins		} elseif ($parent_cmd === '+'
591c165b184SJames Collins			&& $this->parent
592c165b184SJames Collins			&& in_array($this, $this->parent->children)) { // Next-Sibling Combinator
593c165b184SJames Collins				$index = array_search($this, $this->parent->children, true) + 1;
594c165b184SJames Collins				if ($index < count($this->parent->children))
595c165b184SJames Collins					$nodes[] = $this->parent->children[$index];
596c165b184SJames Collins		} elseif ($parent_cmd === '~'
597c165b184SJames Collins			&& $this->parent
598c165b184SJames Collins			&& in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator
599c165b184SJames Collins				$index = array_search($this, $this->parent->children, true);
600c165b184SJames Collins				$nodes = array_slice($this->parent->children, $index);
601c165b184SJames Collins		}
602c165b184SJames Collins
603c165b184SJames Collins		// Go throgh each element starting at this element until the end tag
604c165b184SJames Collins		// Note: If this element is a void tag, any previous void element is
605c165b184SJames Collins		// skipped.
606c165b184SJames Collins		foreach($nodes as $node) {
607c165b184SJames Collins			$pass = true;
608c165b184SJames Collins
609c165b184SJames Collins			// Skip root nodes
610c165b184SJames Collins			if(!$node->parent) {
611c165b184SJames Collins				$pass = false;
612c165b184SJames Collins			}
613c165b184SJames Collins
614c165b184SJames Collins			// Handle 'text' selector
615c165b184SJames Collins			if($pass && $tag === 'text' && $node->tag === 'text') {
616c165b184SJames Collins				$ret[array_search($node, $this->dom->nodes, true)] = 1;
617c165b184SJames Collins				unset($node);
618c165b184SJames Collins				continue;
619c165b184SJames Collins			}
620c165b184SJames Collins
621c165b184SJames Collins			// Skip if node isn't a child node (i.e. text nodes)
622c165b184SJames Collins			if($pass && !in_array($node, $node->parent->children, true)) {
623c165b184SJames Collins				$pass = false;
624c165b184SJames Collins			}
625c165b184SJames Collins
626c165b184SJames Collins			// Skip if tag doesn't match
627c165b184SJames Collins			if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
628c165b184SJames Collins				$pass = false;
629c165b184SJames Collins			}
630c165b184SJames Collins
631c165b184SJames Collins			// Skip if ID doesn't exist
632c165b184SJames Collins			if ($pass && $id !== '' && !isset($node->attr['id'])) {
633c165b184SJames Collins				$pass = false;
634c165b184SJames Collins			}
635c165b184SJames Collins
636c165b184SJames Collins			// Check if ID matches
637c165b184SJames Collins			if ($pass && $id !== '' && isset($node->attr['id'])) {
638c165b184SJames Collins				// Note: Only consider the first ID (as browsers do)
639c165b184SJames Collins				$node_id = explode(' ', trim($node->attr['id']))[0];
640c165b184SJames Collins
641c165b184SJames Collins				if($id !== $node_id) { $pass = false; }
642c165b184SJames Collins			}
643c165b184SJames Collins
644c165b184SJames Collins			// Check if all class(es) exist
645c165b184SJames Collins			if ($pass && $class !== '' && is_array($class) && !empty($class)) {
646c165b184SJames Collins				if (isset($node->attr['class'])) {
647c165b184SJames Collins					$node_classes = explode(' ', $node->attr['class']);
648c165b184SJames Collins
649c165b184SJames Collins					if ($lowercase) {
650c165b184SJames Collins						$node_classes = array_map('strtolower', $node_classes);
651c165b184SJames Collins					}
652c165b184SJames Collins
653c165b184SJames Collins					foreach($class as $c) {
654c165b184SJames Collins						if(!in_array($c, $node_classes)) {
655c165b184SJames Collins							$pass = false;
656c165b184SJames Collins							break;
657c165b184SJames Collins						}
658c165b184SJames Collins					}
659c165b184SJames Collins				} else {
660c165b184SJames Collins					$pass = false;
661c165b184SJames Collins				}
662c165b184SJames Collins			}
663c165b184SJames Collins
664c165b184SJames Collins			// Check attributes
665c165b184SJames Collins			if ($pass
666c165b184SJames Collins				&& $attributes !== ''
667c165b184SJames Collins				&& is_array($attributes)
668c165b184SJames Collins				&& !empty($attributes)) {
669c165b184SJames Collins					foreach($attributes as $a) {
670c165b184SJames Collins						list (
671c165b184SJames Collins							$att_name,
672c165b184SJames Collins							$att_expr,
673c165b184SJames Collins							$att_val,
674c165b184SJames Collins							$att_inv,
675c165b184SJames Collins							$att_case_sensitivity
676c165b184SJames Collins						) = $a;
677c165b184SJames Collins
678c165b184SJames Collins						// Handle indexing attributes (i.e. "[2]")
679c165b184SJames Collins						/**
680c165b184SJames Collins						 * Note: This is not supported by the CSS Standard but adds
681c165b184SJames Collins						 * the ability to select items compatible to XPath (i.e.
682c165b184SJames Collins						 * the 3rd element within it's parent).
683c165b184SJames Collins						 *
684c165b184SJames Collins						 * Note: This doesn't conflict with the CSS Standard which
685c165b184SJames Collins						 * doesn't work on numeric attributes anyway.
686c165b184SJames Collins						 */
687c165b184SJames Collins						if (is_numeric($att_name)
688c165b184SJames Collins							&& $att_expr === ''
689c165b184SJames Collins							&& $att_val === '') {
690c165b184SJames Collins								$count = 0;
691c165b184SJames Collins
692c165b184SJames Collins								// Find index of current element in parent
693c165b184SJames Collins								foreach ($node->parent->children as $c) {
694c165b184SJames Collins									if ($c->tag === $node->tag) ++$count;
695c165b184SJames Collins									if ($c === $node) break;
696c165b184SJames Collins								}
697c165b184SJames Collins
698c165b184SJames Collins								// If this is the correct node, continue with next
699c165b184SJames Collins								// attribute
700c165b184SJames Collins								if ($count === (int)$att_name) continue;
701c165b184SJames Collins						}
702c165b184SJames Collins
703c165b184SJames Collins						// Check attribute availability
704c165b184SJames Collins						if ($att_inv) { // Attribute should NOT be set
705c165b184SJames Collins							if (isset($node->attr[$att_name])) {
706c165b184SJames Collins								$pass = false;
707c165b184SJames Collins								break;
708c165b184SJames Collins							}
709c165b184SJames Collins						} else { // Attribute should be set
710c165b184SJames Collins							// todo: "plaintext" is not a valid CSS selector!
711c165b184SJames Collins							if ($att_name !== 'plaintext'
712c165b184SJames Collins								&& !isset($node->attr[$att_name])) {
713c165b184SJames Collins									$pass = false;
714c165b184SJames Collins									break;
715c165b184SJames Collins							}
716c165b184SJames Collins						}
717c165b184SJames Collins
718c165b184SJames Collins						// Continue with next attribute if expression isn't defined
719c165b184SJames Collins						if ($att_expr === '') continue;
720c165b184SJames Collins
721c165b184SJames Collins						// If they have told us that this is a "plaintext"
722c165b184SJames Collins						// search then we want the plaintext of the node - right?
723c165b184SJames Collins						// todo "plaintext" is not a valid CSS selector!
724c165b184SJames Collins						if ($att_name === 'plaintext') {
725c165b184SJames Collins							$nodeKeyValue = $node->text();
726c165b184SJames Collins						} else {
727c165b184SJames Collins							$nodeKeyValue = $node->attr[$att_name];
728c165b184SJames Collins						}
729c165b184SJames Collins
730c165b184SJames Collins						if (is_object($debug_object)) {
731c165b184SJames Collins							$debug_object->debug_log(2,
732c165b184SJames Collins								'testing node: '
733c165b184SJames Collins								. $node->tag
734c165b184SJames Collins								. ' for attribute: '
735c165b184SJames Collins								. $att_name
736c165b184SJames Collins								. $att_expr
737c165b184SJames Collins								. $att_val
738c165b184SJames Collins								. ' where nodes value is: '
739c165b184SJames Collins								. $nodeKeyValue
740c165b184SJames Collins							);
741c165b184SJames Collins						}
742c165b184SJames Collins
743c165b184SJames Collins						// If lowercase is set, do a case insensitive test of
744c165b184SJames Collins						// the value of the selector.
745c165b184SJames Collins						if ($lowercase) {
746c165b184SJames Collins							$check = $this->match(
747c165b184SJames Collins								$att_expr,
748c165b184SJames Collins								strtolower($att_val),
749c165b184SJames Collins								strtolower($nodeKeyValue),
750c165b184SJames Collins								$att_case_sensitivity
751c165b184SJames Collins							);
752c165b184SJames Collins						} else {
753c165b184SJames Collins							$check = $this->match(
754c165b184SJames Collins								$att_expr,
755c165b184SJames Collins								$att_val,
756c165b184SJames Collins								$nodeKeyValue,
757c165b184SJames Collins								$att_case_sensitivity
758c165b184SJames Collins							);
759c165b184SJames Collins						}
760c165b184SJames Collins
761c165b184SJames Collins						if (is_object($debug_object)) {
762c165b184SJames Collins							$debug_object->debug_log(2,
763c165b184SJames Collins								'after match: '
764c165b184SJames Collins								. ($check ? 'true' : 'false')
765c165b184SJames Collins							);
766c165b184SJames Collins						}
767c165b184SJames Collins
768c165b184SJames Collins						if (!$check) {
769c165b184SJames Collins							$pass = false;
770c165b184SJames Collins							break;
771c165b184SJames Collins						}
772c165b184SJames Collins					}
773c165b184SJames Collins			}
774c165b184SJames Collins
775c165b184SJames Collins			// Found a match. Add to list and clear node
776c165b184SJames Collins			if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
777c165b184SJames Collins			unset($node);
778c165b184SJames Collins		}
779c165b184SJames Collins		// It's passed by reference so this is actually what this function returns.
780c165b184SJames Collins		if (is_object($debug_object)) {
781c165b184SJames Collins			$debug_object->debug_log(1, 'EXIT - ret: ', $ret);
782c165b184SJames Collins		}
783c165b184SJames Collins	}
784c165b184SJames Collins
/**
 * Test a node's attribute value against the value given in a selector.
 *
 * @param string $exp Attribute operator: "=", "!=", "^=", "$=", "*=",
 * "|=" or "~=". Any other operator never matches.
 * @param string $pattern Value written in the selector.
 * @param string $value Value found on the node.
 * @param string $case_sensitivity "i" lowercases both sides before
 * comparing; any other value compares them as given.
 * @return bool|int Boolean for "=", "!=", "|=", "~=" and unknown
 * operators; the preg_match() result for "^=", "$=" and "*=".
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
	global $debug_object;
	if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

	if ($case_sensitivity === 'i') {
		$pattern = strtolower($pattern);
		$value = strtolower($value);
	}

	switch ($exp) {
		case '=':
			return ($value === $pattern);
		case '!=':
			return ($value !== $pattern);
		case '^=':
			return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
		case '$=':
			// The "D" modifier stops "$" from also matching in front of a
			// trailing newline, so "foo\n" no longer "ends with" "foo".
			return preg_match('/' . preg_quote($pattern, '/') . '$/D', $value);
		case '*=':
			return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
		case '|=':
			/**
			 * [att|=val]
			 *
			 * Represents an element with the att attribute, its value
			 * either being exactly "val" or beginning with "val"
			 * immediately followed by "-" (U+002D).
			 *
			 * A plain prefix test is not enough: "en" must match "en" and
			 * "en-US" but not "english".
			 */
			return $value === $pattern
				|| strpos($value, $pattern . '-') === 0;
		case '~=':
			/**
			 * [att~=val]
			 *
			 * Represents an element with the att attribute whose value is a
			 * whitespace-separated list of words, one of which is exactly
			 * "val". If "val" contains whitespace, it will never represent
			 * anything (since the words are separated by spaces). Also if
			 * "val" is the empty string, it will never represent anything.
			 */
			if ($pattern === '') {
				return false;
			}

			// Split on any run of whitespace and drop empty pieces so that
			// repeated spaces or tabs do not produce phantom "" words.
			$words = preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY);

			return in_array($pattern, $words, true);
	}
	return false;
}
829c165b184SJames Collins
/**
 * Split a CSS-style selector string into parsed selector chains.
 *
 * A single space is appended to the trimmed input so that the last simple
 * selector is terminated by a (pseudo) separator. Every non-empty regex
 * match becomes one entry of five elements:
 *
 * [0] - tag name (lowercased when the DOM's lowercase flag is set)
 * [1] - id name
 * [2] - list of class names, or '' when none were given
 * [3] - list of attribute tests, or '' when none were given; each test is
 *       array(name, expression, value, inverted flag, case-sensitivity)
 * [4] - separator that follows the entry: ' ' for a descendant separator,
 *       '' when the entry closes a comma-separated alternative, otherwise
 *       the trimmed separator text
 *
 * Attribute names may be negated with a leading "!" (reported through the
 * inverted flag) and may be preceded by an "@" that is matched but not
 * captured. Names may contain a colon; such attributes must be read with
 * getAttribute() because $node->x:y is not valid PHP.
 *
 * @param string $selector_string Selector text to parse.
 * @return array One chain (list of entries) per comma-separated selector.
 */
protected function parse_selector($selector_string)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	// Pattern of CSS selectors, modified from mootools (https://mootools.net/)
	// Groups: 1 tag, 2 id, 3 dotted class list, 4 attribute list, 5 separator
	// phpcs:ignore Generic.Files.LineLength
	$pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

	// Same attribute syntax as above, but with capturing groups:
	// 1 name (with optional "!"), 2 expression, 3 value, 4 case flag
	$attribute_pattern = "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is";

	// The extra ' ' acts as the separator closing the final selector
	$subject = trim($selector_string) . ' ';
	preg_match_all($pattern, $subject, $matches, PREG_SET_ORDER);

	if (is_object($debug_object)) {
		$debug_object->debug_log(2, 'Matches Array: ', $matches);
	}

	$chains = array();
	$current = array();

	foreach ($matches as $m) {
		// Matches consisting only of whitespace or slashes carry nothing
		$full = trim($m[0]);
		if ($full === '' || $full === '/' || $full === '//') { continue; }

		$tag = $this->dom->lowercase ? strtolower($m[1]) : $m[1];
		$classes = ($m[3] === '') ? '' : explode('.', $m[3]);

		$tests = $m[4];
		if ($tests !== '') {
			preg_match_all(
				$attribute_pattern,
				trim($m[4]),
				$found,
				PREG_SET_ORDER
			);

			$tests = array();

			foreach ($found as $att) {
				if (trim($att[0]) === '') { continue; }

				// Trailing groups that did not take part are absent
				$att += array(2 => '', 3 => '', 4 => '');

				$negated = (substr($att[1], 0, 1) === '!');
				$tests[] = array(
					$negated ? substr($att[1], 1) : $att[1],
					$att[2],
					$att[3],
					$negated,
					strtolower($att[4]),
				);
			}
		}

		// A separator made only of whitespace means "descendant"
		$separator = trim($m[5]);
		if ($separator === '' && $m[5] !== '') { $separator = ' '; }

		// A comma ends the current chain instead of linking two entries
		$ends_alternative = ($separator === ',');
		if ($ends_alternative) { $separator = ''; }

		$current[] = array($tag, $m[2], $classes, $tests, $separator);

		if ($ends_alternative) {
			$chains[] = $current;
			$current = array();
		}
	}

	if (count($current) > 0) { $chains[] = $current; }
	return $chains;
}
968c165b184SJames Collins
/**
 * Magic read access for attributes and the virtual text properties.
 *
 * A set attribute wins over everything else and is returned after passing
 * through convert_text(). Otherwise the names "outertext", "innertext",
 * "plaintext" and "xmltext" map to the matching accessor method. For any
 * other name the result is a boolean telling whether the attribute key
 * exists at all.
 *
 * @param string $name Property or attribute name.
 * @return mixed
 */
function __get($name)
{
	if (isset($this->attr[$name])) {
		return $this->convert_text($this->attr[$name]);
	}

	// Loose comparison on purpose: it mirrors switch/case matching.
	if ($name == 'outertext') { return $this->outertext(); }
	if ($name == 'innertext') { return $this->innertext(); }
	if ($name == 'plaintext') { return $this->text(); }
	if ($name == 'xmltext') { return $this->xmltext(); }

	return array_key_exists($name, $this->attr);
}
982c165b184SJames Collins
/**
 * Magic write access for attributes and the virtual text properties.
 *
 * "outertext" and "innertext" are stored in the node's info array and the
 * assigned value is returned. Every other name is stored as an attribute;
 * when that attribute is not set yet, default spacing and a double-quote
 * marker are appended to the info array for it first.
 *
 * @param string $name Property or attribute name.
 * @param mixed $value Value to store.
 * @return mixed The stored value for "outertext"/"innertext", otherwise
 * nothing.
 */
function __set($name, $value)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	// Loose comparisons on purpose: they mirror switch/case matching.
	if ($name == 'outertext') {
		return $this->_[HDOM_INFO_OUTER] = $value;
	}

	if ($name == 'innertext') {
		// Write to the text slot when the node already has one
		$slot = isset($this->_[HDOM_INFO_TEXT])
			? HDOM_INFO_TEXT
			: HDOM_INFO_INNER;

		return $this->_[$slot] = $value;
	}

	$is_new_attribute = !isset($this->attr[$name]);

	if ($is_new_attribute) {
		$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
		$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
	}

	$this->attr[$name] = $value;
}
1004c165b184SJames Collins
/**
 * Magic isset() support.
 *
 * "outertext", "innertext" and "plaintext" always count as set. Any other
 * name counts as set when the attribute key exists, even if its value is
 * null (attributes without a value: nowrap, checked, selected...).
 *
 * @param string $name Property or attribute name.
 * @return bool
 */
function __isset($name)
{
	// Non-strict in_array() matches the same way switch/case does
	$virtual = array('outertext', 'innertext', 'plaintext');

	if (in_array($name, $virtual)) {
		return true;
	}

	return array_key_exists($name, $this->attr);
}
1015c165b184SJames Collins
/**
 * Magic unset() support: drop the attribute with the given name.
 *
 * The key is removed unconditionally. unset() on a missing key is a no-op,
 * whereas guarding with isset() would leave null-valued attributes in
 * place, so __isset() (which checks key existence) would keep reporting
 * them after they were unset.
 *
 * @param string $name Attribute name.
 * @return void
 */
function __unset($name)
{
	unset($this->attr[$name]);
}
1020c165b184SJames Collins
/**
 * Convert text from the DOM's source charset to its target charset.
 *
 * Both charsets are read from the owning DOM (when there is one). The text
 * is passed through iconv() only if both are non-empty and differ. When
 * the target is UTF-8 and the text already validates as UTF-8, the
 * reported source charset is assumed to be wrong and the text is kept
 * as-is. For a UTF-8 target a byte order mark is stripped from the start
 * and from the end of the result.
 *
 * @param string $text Text to convert.
 * @return string|false Converted text; whatever iconv() returned when a
 * conversion was attempted.
 */
function convert_text($text)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	$from = '';
	$to = '';

	if ($this->dom) {
		$from = strtoupper($this->dom->_charset);
		$to = strtoupper($this->dom->_target_charset);
	}

	if (is_object($debug_object)) {
		$debug_object->debug_log(3,
			'source charset: '
			. $from
			. ' target charaset: '
			. $to
		);
	}

	$result = $text;

	$charsets_differ = !empty($from)
		&& !empty($to)
		&& (strcasecmp($from, $to) != 0);

	if ($charsets_differ) {
		// The reported encoding may be wrong: text that is already valid
		// UTF-8 must not be converted to UTF-8 a second time.
		$already_utf8 = (strcasecmp($to, 'UTF-8') == 0)
			&& ($this->is_utf8($text));

		if (!$already_utf8) {
			$result = iconv($from, $to, $text);
		}
	}

	// Never hand out UTF-8 text that still carries a BOM.
	if ($to === 'UTF-8') {
		$bom = "\xef\xbb\xbf";

		if (substr($result, 0, 3) === $bom) {
			$result = substr($result, 3);
		}

		if (substr($result, -3) === $bom) {
			$result = substr($result, 0, -3);
		}
	}

	return $result;
}
1070c165b184SJames Collins
/**
 * Check whether a byte string is structurally valid UTF-8.
 *
 * Every lead byte must be followed by the number of continuation bytes
 * (0x80-0xBF) that it announces. Legacy 5- and 6-byte sequences are
 * accepted and overlong encodings are not rejected; only the byte
 * structure is verified.
 *
 * @param string $str Bytes to inspect.
 * @return bool True when the whole string has a valid structure.
 */
static function is_utf8($str)
{
	$len = strlen($str);

	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);

		// 0x00-0x7F is plain ASCII. The boundary matters: 0x80 is a
		// continuation byte and must not be waved through as ASCII.
		if ($c < 128) { continue; }

		if ($c >= 254) { return false; }
		elseif ($c >= 252) { $bits = 6; }
		elseif ($c >= 248) { $bits = 5; }
		elseif ($c >= 240) { $bits = 4; }
		elseif ($c >= 224) { $bits = 3; }
		elseif ($c >= 192) { $bits = 2; }
		else { return false; } // continuation byte without a lead byte

		// The announced sequence must fit into the remaining bytes
		if (($i + $bits) > $len) { return false; }

		while ($bits > 1) {
			$i++;
			$b = ord($str[$i]);
			if ($b < 128 || $b > 191) { return false; }
			$bits--;
		}
	}

	return true;
}
1097c165b184SJames Collins
/**
 * Determine the display size of an image node.
 *
 * The "width" and "height" attributes take precedence. A dimension that is
 * still unknown (-1) afterwards is looked up in the inline "style"
 * attribute, where only integer pixel values ("120px") are accepted.
 * Style property names are matched case-insensitively and whitespace
 * around the value is ignored.
 *
 * @return array|false False when the node is not an "img" tag, otherwise
 * array('height' => ..., 'width' => ...) where each entry is the value
 * found or -1 when unknown.
 */
function get_display_size()
{
	if ($this->tag !== 'img') {
		return false;
	}

	$size = array(
		'height' => -1,
		'width' => -1
	);

	// See if there is a height or width attribute in the tag itself.
	foreach (array_keys($size) as $dimension) {
		if (isset($this->attr[$dimension])) {
			$size[$dimension] = $this->attr[$dimension];
		}
	}

	// Now look for an inline style.
	if (isset($this->attr['style'])) {
		// Thanks to user gnarf from stackoverflow for this regular expression.
		$declarations = array();

		preg_match_all(
			'/([\w-]+)\s*:\s*([^;]+)\s*;?/',
			$this->attr['style'],
			$matches,
			PREG_SET_ORDER
		);

		foreach ($matches as $match) {
			// The value group is greedy and swallows whitespace in front
			// of the ";", so trim it before looking at the unit suffix.
			$declarations[strtolower($match[1])] = trim($match[2]);
		}

		foreach ($size as $dimension => $current) {
			// Only fill in dimensions the attributes did not provide
			if ($current != -1 || !isset($declarations[$dimension])) {
				continue;
			}

			$declared = $declarations[$dimension];

			// check that the last two characters are px (pixels)
			if (strtolower(substr($declared, -2)) !== 'px') {
				continue;
			}

			$pixels = substr($declared, 0, -2);

			// Make sure it's an integer. Compare against false explicitly:
			// filter_var() returns int 0 for "0", which is a valid size.
			if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
				$size[$dimension] = $pixels;
			}
		}
	}

	// Future enhancement:
	// Look in the tag to see if there is a class or id specified that has
	// a height or width attribute to it.

	// Far future enhancement
	// Look at all the parent tags of this image to see if they specify a
	// class or id that has an img selector that specifies a height or width
	// Note that in this case, the class or id will have the img subselector
	// for it to apply to the image.

	// ridiculously far future development
	// If the class or id is specified in a SEPARATE css file thats not on
	// the page, go get it and do what we were just doing for the ones on
	// the page.

	return $size;
}
1182c165b184SJames Collins
/**
 * Render the node as markup and optionally write it to a file.
 *
 * @param string $filepath Target file; with the default '' nothing is
 * written. The file is written under an exclusive lock.
 * @return string The node's outer text.
 */
function save($filepath = '')
{
	$markup = $this->outertext();

	if ($filepath === '') {
		return $markup;
	}

	file_put_contents($filepath, $markup, LOCK_EX);

	return $markup;
}
1193c165b184SJames Collins
/**
 * Add one or more class names to the node's "class" attribute.
 *
 * Names the node already has are skipped. Empty names, which appear when
 * a string contains repeated spaces, are ignored so they cannot add stray
 * spaces to the attribute.
 *
 * @param string|array $class Space-separated class names or a list of
 * names. Any other type is reported to the debug log (when one is
 * installed) and otherwise ignored.
 * @return void
 */
function addClass($class)
{
	// Without this declaration $debug_object is an undefined local and the
	// "Invalid type" message below can never be logged.
	global $debug_object;

	if (is_string($class)) {
		$class = explode(' ', $class);
	}

	if (!is_array($class)) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
		}

		return;
	}

	foreach ($class as $c) {
		if ($c === '') {
			continue;
		}

		if (!isset($this->class)) {
			$this->class = $c;
			continue;
		}

		if (!$this->hasClass($c)) {
			$this->class .= ' ' . $c;
		}
	}
}
1218c165b184SJames Collins
/**
 * Tell whether the node's "class" attribute contains a class name.
 *
 * @param string $class Single class name to look for. Any other type is
 * reported to the debug log (when one is installed).
 * @return bool True when the name is one of the space-separated entries
 * of the "class" attribute.
 */
function hasClass($class)
{
	// Without this declaration $debug_object is an undefined local and the
	// "Invalid type" message below can never be logged.
	global $debug_object;

	if (!is_string($class)) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
		}

		return false;
	}

	if (!isset($this->class)) {
		return false;
	}

	return in_array($class, explode(' ', $this->class), true);
}
1233c165b184SJames Collins
/**
 * Remove class names from the node's "class" attribute.
 *
 * Nothing happens when the node has no "class" attribute. Without an
 * argument the whole attribute is removed; the same happens when no class
 * name is left after the removal.
 *
 * @param string|array|null $class Space-separated class names, a list of
 * names, or null to remove every class. Other types are ignored.
 * @return void
 */
function removeClass($class = null)
{
	if (!isset($this->class)) {
		return;
	}

	if (is_null($class)) {
		$this->removeAttribute('class');
		return;
	}

	$unwanted = is_string($class) ? explode(' ', $class) : $class;

	if (!is_array($unwanted)) {
		return;
	}

	$remaining = array_diff(explode(' ', $this->class), $unwanted);

	if (empty($remaining)) {
		$this->removeAttribute('class');
		return;
	}

	$this->class = implode(' ', $remaining);
}
1258c165b184SJames Collins
/**
 * Return the node's attributes.
 *
 * @return array Attribute values keyed by attribute name, exactly as
 * stored on the node.
 */
function getAllAttributes()
{
	$attributes = $this->attr;

	return $attributes;
}
1263c165b184SJames Collins
/**
 * Read an attribute by name; method form of the magic getter.
 *
 * Use this for names that are not valid PHP property names.
 *
 * @param string $name Attribute name.
 * @return mixed Whatever __get() returns for that name.
 */
function getAttribute($name)
{
	$value = $this->__get($name);

	return $value;
}
1268c165b184SJames Collins
/**
 * Write an attribute by name; method form of the magic setter.
 *
 * @param string $name Attribute name.
 * @param mixed $value Value to store.
 * @return void
 */
function setAttribute($name, $value)
{
	$this->__set(
		$name,
		$value
	);
}
1273c165b184SJames Collins
/**
 * Tell whether an attribute exists; method form of the magic isset().
 *
 * @param string $name Attribute name.
 * @return bool Whatever __isset() reports for that name.
 */
function hasAttribute($name)
{
	$exists = $this->__isset($name);

	return $exists;
}
1278c165b184SJames Collins
/**
 * Clear an attribute by assigning null to it through the magic setter.
 *
 * The attribute key itself stays in the attribute list with a null value.
 *
 * @param string $name Attribute name.
 * @return void
 */
function removeAttribute($name)
{
	$cleared = null;

	$this->__set($name, $cleared);
}
1283c165b184SJames Collins
/**
 * Detach this node from its parent.
 *
 * Delegates to the parent's removeChild(); a node without a parent is
 * left untouched.
 *
 * @return void
 */
function remove()
{
	$parent = $this->parent;

	if (!$parent) {
		return;
	}

	$parent->removeChild($this);
}
1290c165b184SJames Collins
/**
 * Remove a child node, together with everything below it, from this node
 * and from the owning DOM's node list.
 *
 * Nothing happens unless the node is found in this node's node list, in
 * its child list and in the DOM's node list.
 *
 * @param object $node Child node to remove.
 * @return void
 */
function removeChild($node)
{
	$node_index = array_search($node, $this->nodes, true);
	$child_index = array_search($node, $this->children, true);
	$dom_index = array_search($node, $this->dom->nodes, true);

	$is_known = $node_index !== false
		&& $child_index !== false
		&& $dom_index !== false;

	if (!$is_known) {
		return;
	}

	// Remove the element children of the node first (recursively)
	foreach ($node->children as $grandchild) {
		$node->removeChild($grandchild);
	}

	// Then drop whatever the node still holds in its own node list
	foreach ($node->nodes as $entity) {
		$in_node = array_search($entity, $node->nodes, true);
		$in_dom = array_search($entity, $node->dom->nodes, true);

		if ($in_node === false || $in_dom === false) {
			continue;
		}

		unset($node->nodes[$in_node]);
		unset($node->dom->nodes[$in_dom]);
	}

	// Finally forget the node itself
	unset($this->nodes[$node_index]);
	unset($this->children[$child_index]);
	unset($this->dom->nodes[$dom_index]);

	$node->clear();
}
1321c165b184SJames Collins
1322c165b184SJames Collins	function getElementById($id)
1323c165b184SJames Collins	{
1324c165b184SJames Collins		return $this->find("#$id", 0);
1325c165b184SJames Collins	}
1326c165b184SJames Collins
1327c165b184SJames Collins	function getElementsById($id, $idx = null)
1328c165b184SJames Collins	{
1329c165b184SJames Collins		return $this->find("#$id", $idx);
1330c165b184SJames Collins	}
1331c165b184SJames Collins
1332c165b184SJames Collins	function getElementByTagName($name)
1333c165b184SJames Collins	{
1334c165b184SJames Collins		return $this->find($name, 0);
1335c165b184SJames Collins	}
1336c165b184SJames Collins
1337c165b184SJames Collins	function getElementsByTagName($name, $idx = null)
1338c165b184SJames Collins	{
1339c165b184SJames Collins		return $this->find($name, $idx);
1340c165b184SJames Collins	}
1341c165b184SJames Collins
1342c165b184SJames Collins	function parentNode()
1343c165b184SJames Collins	{
1344c165b184SJames Collins		return $this->parent();
1345c165b184SJames Collins	}
1346c165b184SJames Collins
1347c165b184SJames Collins	function childNodes($idx = -1)
1348c165b184SJames Collins	{
1349c165b184SJames Collins		return $this->children($idx);
1350c165b184SJames Collins	}
1351c165b184SJames Collins
1352c165b184SJames Collins	function firstChild()
1353c165b184SJames Collins	{
1354c165b184SJames Collins		return $this->first_child();
1355c165b184SJames Collins	}
1356c165b184SJames Collins
1357c165b184SJames Collins	function lastChild()
1358c165b184SJames Collins	{
1359c165b184SJames Collins		return $this->last_child();
1360c165b184SJames Collins	}
1361c165b184SJames Collins
1362c165b184SJames Collins	function nextSibling()
1363c165b184SJames Collins	{
1364c165b184SJames Collins		return $this->next_sibling();
1365c165b184SJames Collins	}
1366c165b184SJames Collins
1367c165b184SJames Collins	function previousSibling()
1368c165b184SJames Collins	{
1369c165b184SJames Collins		return $this->prev_sibling();
1370c165b184SJames Collins	}
1371c165b184SJames Collins
1372c165b184SJames Collins	function hasChildNodes()
1373c165b184SJames Collins	{
1374c165b184SJames Collins		return $this->has_child();
1375c165b184SJames Collins	}
1376c165b184SJames Collins
1377c165b184SJames Collins	function nodeName()
1378c165b184SJames Collins	{
1379c165b184SJames Collins		return $this->tag;
1380c165b184SJames Collins	}
1381c165b184SJames Collins
1382c165b184SJames Collins	function appendChild($node)
1383c165b184SJames Collins	{
1384c165b184SJames Collins		$node->parent($this);
1385c165b184SJames Collins		return $node;
1386c165b184SJames Collins	}
1387c165b184SJames Collins
1388c165b184SJames Collins}
1389c165b184SJames Collins
1390c165b184SJames Collinsclass simple_html_dom
1391c165b184SJames Collins{
	// Root node of the document; created in prepare(), released in clear().
	public $root = null;
	// Node list of this document; reset in prepare(), each entry is cleared in clear().
	public $nodes = array();
	// Callback set via set_callback() and reset to null via remove_callback().
	public $callback = null;
	// Lowercase flag as passed to prepare().
	public $lowercase = false;
	// Byte length of the trimmed input as handed to prepare().
	public $original_size;
	// Current byte length of $doc (recomputed in load() after \r/\n stripping).
	public $size;

	public $stripRNAttrValues = true;       // added option to ignore RN in attr values - nomadjimbob

	// Current byte offset into $doc while parsing.
	protected $pos;
	// The document text being parsed.
	protected $doc;
	// Character at $pos, or null once $pos has moved past the end of $doc.
	protected $char;

	// Counter that starts at 1 in prepare() and is incremented for every node created while parsing.
	protected $cursor;
	// Node currently being filled by the parser; starts as the root node.
	protected $parent;
	// Reset to an empty array in prepare(); presumably holds the text removed by remove_noise() (verify there).
	protected $noise = array();
	// Characters treated as blanks (skipped after "</" in read_tag()).
	protected $token_blank = " \t\r\n";
	// Delimiter set; not used in the visible part of the parser - presumably ends an attribute name (verify).
	protected $token_equal = ' =/>';
	// Delimiter set that ends a tag name in read_tag().
	protected $token_slash = " />\r\n\t";
	// Delimiter set; not used in the visible part of the parser - presumably ends an unquoted attribute value (verify).
	protected $token_attr = ' >';

	// Charset detected by parse_charset().
	public $_charset = '';
	// Target charset as passed to the constructor.
	public $_target_charset = '';

	// Default BR text as passed to prepare().
	protected $default_br_text = '';

	// Default SPAN text as passed to prepare().
	public $default_span_text = '';

	// Lookup table (tag name => 1) of tags that never have an end tag.
	protected $self_closing_tags = array(
		'area' => 1,
		'base' => 1,
		'br' => 1,
		'col' => 1,
		'embed' => 1,
		'hr' => 1,
		'img' => 1,
		'input' => 1,
		'link' => 1,
		'meta' => 1,
		'param' => 1,
		'source' => 1,
		'track' => 1,
		'wbr' => 1
	);
	// Lookup table (tag name => 1) of block tags; in read_tag() an end tag
	// listed here may close an ancestor instead of the current parent.
	protected $block_tags = array(
		'body' => 1,
		'div' => 1,
		'form' => 1,
		'root' => 1,
		'span' => 1,
		'table' => 1
	);
	// Tags whose end tag may be omitted, keyed by tag name. read_tag() checks
	// whether the current parent's tag is a key of this table; the inner
	// arrays presumably list the start tags that implicitly close the key tag
	// (verify against the start-tag handling).
	protected $optional_closing_tags = array(
		// Not optional, see
		// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
		'b' => array('b' => 1),
		'dd' => array('dd' => 1, 'dt' => 1),
		// Not optional, see
		// https://www.w3.org/TR/html/grouping-content.html#the-dl-element
		'dl' => array('dd' => 1, 'dt' => 1),
		'dt' => array('dd' => 1, 'dt' => 1),
		'li' => array('li' => 1),
		'optgroup' => array('optgroup' => 1, 'option' => 1),
		'option' => array('optgroup' => 1, 'option' => 1),
		'p' => array('p' => 1),
		'rp' => array('rp' => 1, 'rt' => 1),
		'rt' => array('rp' => 1, 'rt' => 1),
		'td' => array('td' => 1, 'th' => 1),
		'th' => array('td' => 1, 'th' => 1),
		'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
	);
1463c165b184SJames Collins
1464c165b184SJames Collins	function __construct(
1465c165b184SJames Collins		$str = null,
1466c165b184SJames Collins		$lowercase = true,
1467c165b184SJames Collins		$forceTagsClosed = true,
1468c165b184SJames Collins		$target_charset = DEFAULT_TARGET_CHARSET,
1469c165b184SJames Collins		$stripRN = true,
1470c165b184SJames Collins		$defaultBRText = DEFAULT_BR_TEXT,
1471c165b184SJames Collins		$defaultSpanText = DEFAULT_SPAN_TEXT,
1472c165b184SJames Collins		$options = 0)
1473c165b184SJames Collins	{
1474c165b184SJames Collins		if ($str) {
1475c165b184SJames Collins			if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
1476c165b184SJames Collins				$this->load_file($str);
1477c165b184SJames Collins			} else {
1478c165b184SJames Collins				$this->load(
1479c165b184SJames Collins					$str,
1480c165b184SJames Collins					$lowercase,
1481c165b184SJames Collins					$stripRN,
1482c165b184SJames Collins					$defaultBRText,
1483c165b184SJames Collins					$defaultSpanText,
1484c165b184SJames Collins					$options
1485c165b184SJames Collins				);
1486c165b184SJames Collins			}
1487c165b184SJames Collins		}
1488c165b184SJames Collins		// Forcing tags to be closed implies that we don't trust the html, but
1489c165b184SJames Collins		// it can lead to parsing errors if we SHOULD trust the html.
1490c165b184SJames Collins		if (!$forceTagsClosed) {
1491c165b184SJames Collins			$this->optional_closing_array = array();
1492c165b184SJames Collins		}
1493c165b184SJames Collins
1494c165b184SJames Collins		$this->_target_charset = $target_charset;
1495c165b184SJames Collins	}
1496c165b184SJames Collins
1497c165b184SJames Collins	function __destruct()
1498c165b184SJames Collins	{
1499c165b184SJames Collins		$this->clear();
1500c165b184SJames Collins	}
1501c165b184SJames Collins
1502c165b184SJames Collins	function load(
1503c165b184SJames Collins		$str,
1504c165b184SJames Collins		$lowercase = true,
1505c165b184SJames Collins		$stripRN = true,
1506c165b184SJames Collins		$defaultBRText = DEFAULT_BR_TEXT,
1507c165b184SJames Collins		$defaultSpanText = DEFAULT_SPAN_TEXT,
1508c165b184SJames Collins		$options = 0)
1509c165b184SJames Collins	{
1510c165b184SJames Collins		global $debug_object;
1511c165b184SJames Collins
1512c165b184SJames Collins		// prepare
1513c165b184SJames Collins		$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
1514c165b184SJames Collins
1515c165b184SJames Collins		// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1516c165b184SJames Collins		// Script tags removal now preceeds style tag removal.
1517c165b184SJames Collins		// strip out <script> tags
1518c165b184SJames Collins		$this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
1519c165b184SJames Collins		$this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
1520c165b184SJames Collins
1521c165b184SJames Collins		// strip out the \r \n's if we are told to.
1522c165b184SJames Collins		if ($stripRN) {
1523c165b184SJames Collins			$this->doc = str_replace("\r", ' ', $this->doc);
1524c165b184SJames Collins			$this->doc = str_replace("\n", ' ', $this->doc);
1525c165b184SJames Collins
1526c165b184SJames Collins			// set the length of content since we have changed it.
1527c165b184SJames Collins			$this->size = strlen($this->doc);
1528c165b184SJames Collins		}
1529c165b184SJames Collins
1530c165b184SJames Collins		// strip out cdata
1531c165b184SJames Collins		$this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
1532c165b184SJames Collins		// strip out comments
1533c165b184SJames Collins		$this->remove_noise("'<!--(.*?)-->'is");
1534c165b184SJames Collins		// strip out <style> tags
1535c165b184SJames Collins		$this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
1536c165b184SJames Collins		$this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
1537c165b184SJames Collins		// strip out preformatted tags
1538c165b184SJames Collins		$this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
1539c165b184SJames Collins		// strip out server side scripts
1540c165b184SJames Collins		$this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
1541c165b184SJames Collins
1542c165b184SJames Collins		if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
1543c165b184SJames Collins			$this->remove_noise("'(\{\w)(.*?)(\})'s", true);
1544c165b184SJames Collins		}
1545c165b184SJames Collins
1546c165b184SJames Collins		// parsing
1547c165b184SJames Collins		$this->parse();
1548c165b184SJames Collins		// end
1549c165b184SJames Collins		$this->root->_[HDOM_INFO_END] = $this->cursor;
1550c165b184SJames Collins		$this->parse_charset();
1551c165b184SJames Collins
1552c165b184SJames Collins		// make load function chainable
1553c165b184SJames Collins		return $this;
1554c165b184SJames Collins	}
1555c165b184SJames Collins
1556c165b184SJames Collins	function load_file()
1557c165b184SJames Collins	{
1558c165b184SJames Collins		$args = func_get_args();
1559c165b184SJames Collins
1560c165b184SJames Collins		if(($doc = call_user_func_array('file_get_contents', $args)) !== false) {
1561c165b184SJames Collins			$this->load($doc, true);
1562c165b184SJames Collins		} else {
1563c165b184SJames Collins			return false;
1564c165b184SJames Collins		}
1565c165b184SJames Collins	}
1566c165b184SJames Collins
1567c165b184SJames Collins	function set_callback($function_name)
1568c165b184SJames Collins	{
1569c165b184SJames Collins		$this->callback = $function_name;
1570c165b184SJames Collins	}
1571c165b184SJames Collins
1572c165b184SJames Collins	function remove_callback()
1573c165b184SJames Collins	{
1574c165b184SJames Collins		$this->callback = null;
1575c165b184SJames Collins	}
1576c165b184SJames Collins
1577c165b184SJames Collins	function save($filepath = '')
1578c165b184SJames Collins	{
1579c165b184SJames Collins		$ret = $this->root->innertext();
1580c165b184SJames Collins		if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); }
1581c165b184SJames Collins		return $ret;
1582c165b184SJames Collins	}
1583c165b184SJames Collins
1584c165b184SJames Collins	function find($selector, $idx = null, $lowercase = false)
1585c165b184SJames Collins	{
1586c165b184SJames Collins		return $this->root->find($selector, $idx, $lowercase);
1587c165b184SJames Collins	}
1588c165b184SJames Collins
1589c165b184SJames Collins	function clear()
1590c165b184SJames Collins	{
1591c165b184SJames Collins		if (isset($this->nodes)) {
1592c165b184SJames Collins			foreach ($this->nodes as $n) {
1593c165b184SJames Collins				$n->clear();
1594c165b184SJames Collins				$n = null;
1595c165b184SJames Collins			}
1596c165b184SJames Collins		}
1597c165b184SJames Collins
1598c165b184SJames Collins		// This add next line is documented in the sourceforge repository.
1599c165b184SJames Collins		// 2977248 as a fix for ongoing memory leaks that occur even with the
1600c165b184SJames Collins		// use of clear.
1601c165b184SJames Collins		if (isset($this->children)) {
1602c165b184SJames Collins			foreach ($this->children as $n) {
1603c165b184SJames Collins				$n->clear();
1604c165b184SJames Collins				$n = null;
1605c165b184SJames Collins			}
1606c165b184SJames Collins		}
1607c165b184SJames Collins
1608c165b184SJames Collins		if (isset($this->parent)) {
1609c165b184SJames Collins			$this->parent->clear();
1610c165b184SJames Collins			unset($this->parent);
1611c165b184SJames Collins		}
1612c165b184SJames Collins
1613c165b184SJames Collins		if (isset($this->root)) {
1614c165b184SJames Collins			$this->root->clear();
1615c165b184SJames Collins			unset($this->root);
1616c165b184SJames Collins		}
1617c165b184SJames Collins
1618c165b184SJames Collins		unset($this->doc);
1619c165b184SJames Collins		unset($this->noise);
1620c165b184SJames Collins	}
1621c165b184SJames Collins
1622c165b184SJames Collins	function dump($show_attr = true)
1623c165b184SJames Collins	{
1624c165b184SJames Collins		$this->root->dump($show_attr);
1625c165b184SJames Collins	}
1626c165b184SJames Collins
1627c165b184SJames Collins	protected function prepare(
1628c165b184SJames Collins		$str, $lowercase = true,
1629c165b184SJames Collins		$defaultBRText = DEFAULT_BR_TEXT,
1630c165b184SJames Collins		$defaultSpanText = DEFAULT_SPAN_TEXT)
1631c165b184SJames Collins	{
1632c165b184SJames Collins		$this->clear();
1633c165b184SJames Collins
1634c165b184SJames Collins		$this->doc = trim($str);
1635c165b184SJames Collins		$this->size = strlen($this->doc);
1636c165b184SJames Collins		$this->original_size = $this->size; // original size of the html
1637c165b184SJames Collins		$this->pos = 0;
1638c165b184SJames Collins		$this->cursor = 1;
1639c165b184SJames Collins		$this->noise = array();
1640c165b184SJames Collins		$this->nodes = array();
1641c165b184SJames Collins		$this->lowercase = $lowercase;
1642c165b184SJames Collins		$this->default_br_text = $defaultBRText;
1643c165b184SJames Collins		$this->default_span_text = $defaultSpanText;
1644c165b184SJames Collins		$this->root = new simple_html_dom_node($this);
1645c165b184SJames Collins		$this->root->tag = 'root';
1646c165b184SJames Collins		$this->root->_[HDOM_INFO_BEGIN] = -1;
1647c165b184SJames Collins		$this->root->nodetype = HDOM_TYPE_ROOT;
1648c165b184SJames Collins		$this->parent = $this->root;
1649c165b184SJames Collins		if ($this->size > 0) { $this->char = $this->doc[0]; }
1650c165b184SJames Collins	}
1651c165b184SJames Collins
1652c165b184SJames Collins	protected function parse()
1653c165b184SJames Collins	{
1654c165b184SJames Collins		while (true) {
1655c165b184SJames Collins			// Read next tag if there is no text between current position and the
1656c165b184SJames Collins			// next opening tag.
1657c165b184SJames Collins			if (($s = $this->copy_until_char('<')) === '') {
1658c165b184SJames Collins				if($this->read_tag()) {
1659c165b184SJames Collins					continue;
1660c165b184SJames Collins				} else {
1661c165b184SJames Collins					return true;
1662c165b184SJames Collins				}
1663c165b184SJames Collins			}
1664c165b184SJames Collins
1665c165b184SJames Collins			// Add a text node for text between tags
1666c165b184SJames Collins			$node = new simple_html_dom_node($this);
1667c165b184SJames Collins			++$this->cursor;
1668c165b184SJames Collins			$node->_[HDOM_INFO_TEXT] = $s;
1669c165b184SJames Collins			$this->link_nodes($node, false);
1670c165b184SJames Collins		}
1671c165b184SJames Collins	}
1672c165b184SJames Collins
1673c165b184SJames Collins	protected function parse_charset()
1674c165b184SJames Collins	{
1675c165b184SJames Collins		global $debug_object;
1676c165b184SJames Collins
1677c165b184SJames Collins		$charset = null;
1678c165b184SJames Collins
1679c165b184SJames Collins		if (function_exists('get_last_retrieve_url_contents_content_type')) {
1680c165b184SJames Collins			$contentTypeHeader = get_last_retrieve_url_contents_content_type();
1681c165b184SJames Collins			$success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
1682c165b184SJames Collins			if ($success) {
1683c165b184SJames Collins				$charset = $matches[1];
1684c165b184SJames Collins				if (is_object($debug_object)) {
1685c165b184SJames Collins					$debug_object->debug_log(2,
1686c165b184SJames Collins						'header content-type found charset of: '
1687c165b184SJames Collins						. $charset
1688c165b184SJames Collins					);
1689c165b184SJames Collins				}
1690c165b184SJames Collins			}
1691c165b184SJames Collins		}
1692c165b184SJames Collins
1693c165b184SJames Collins		if (empty($charset)) {
1694c165b184SJames Collins			// https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
1695c165b184SJames Collins			$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
1696c165b184SJames Collins
1697c165b184SJames Collins			if (!empty($el)) {
1698c165b184SJames Collins				$fullvalue = $el->content;
1699c165b184SJames Collins				if (is_object($debug_object)) {
1700c165b184SJames Collins					$debug_object->debug_log(2,
1701c165b184SJames Collins						'meta content-type tag found'
1702c165b184SJames Collins						. $fullvalue
1703c165b184SJames Collins					);
1704c165b184SJames Collins				}
1705c165b184SJames Collins
1706c165b184SJames Collins				if (!empty($fullvalue)) {
1707c165b184SJames Collins					$success = preg_match(
1708c165b184SJames Collins						'/charset=(.+)/i',
1709c165b184SJames Collins						$fullvalue,
1710c165b184SJames Collins						$matches
1711c165b184SJames Collins					);
1712c165b184SJames Collins
1713c165b184SJames Collins					if ($success) {
1714c165b184SJames Collins						$charset = $matches[1];
1715c165b184SJames Collins					} else {
1716c165b184SJames Collins						// If there is a meta tag, and they don't specify the
1717c165b184SJames Collins						// character set, research says that it's typically
1718c165b184SJames Collins						// ISO-8859-1
1719c165b184SJames Collins						if (is_object($debug_object)) {
1720c165b184SJames Collins							$debug_object->debug_log(2,
1721c165b184SJames Collins								'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
1722c165b184SJames Collins							);
1723c165b184SJames Collins						}
1724c165b184SJames Collins
1725c165b184SJames Collins						$charset = 'ISO-8859-1';
1726c165b184SJames Collins					}
1727c165b184SJames Collins				}
1728c165b184SJames Collins			}
1729c165b184SJames Collins		}
1730c165b184SJames Collins
1731c165b184SJames Collins		if (empty($charset)) {
1732c165b184SJames Collins			// https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
1733c165b184SJames Collins			if ($meta = $this->root->find('meta[charset]', 0)) {
1734c165b184SJames Collins				$charset = $meta->charset;
1735c165b184SJames Collins				if (is_object($debug_object)) {
1736c165b184SJames Collins					$debug_object->debug_log(2, 'meta charset: ' . $charset);
1737c165b184SJames Collins				}
1738c165b184SJames Collins			}
1739c165b184SJames Collins		}
1740c165b184SJames Collins
1741c165b184SJames Collins		if (empty($charset)) {
1742c165b184SJames Collins			// Try to guess the charset based on the content
1743c165b184SJames Collins			// Requires Multibyte String (mbstring) support (optional)
1744c165b184SJames Collins			if (function_exists('mb_detect_encoding')) {
1745c165b184SJames Collins				/**
1746c165b184SJames Collins				 * mb_detect_encoding() is not intended to distinguish between
1747c165b184SJames Collins				 * charsets, especially single-byte charsets. Its primary
1748c165b184SJames Collins				 * purpose is to detect which multibyte encoding is in use,
1749c165b184SJames Collins				 * i.e. UTF-8, UTF-16, shift-JIS, etc.
1750c165b184SJames Collins				 *
1751c165b184SJames Collins				 * -- https://bugs.php.net/bug.php?id=38138
1752c165b184SJames Collins				 *
1753c165b184SJames Collins				 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
1754c165b184SJames Collins				 * always result in CP1251/ISO-8859-5 and vice versa.
1755c165b184SJames Collins				 *
1756c165b184SJames Collins				 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
1757c165b184SJames Collins				 * to stay compatible.
1758c165b184SJames Collins				 */
1759c165b184SJames Collins				$encoding = mb_detect_encoding(
1760c165b184SJames Collins					$this->doc,
1761c165b184SJames Collins					array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
1762c165b184SJames Collins				);
1763c165b184SJames Collins
1764c165b184SJames Collins				if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
1765c165b184SJames Collins					// Due to a limitation of mb_detect_encoding
1766c165b184SJames Collins					// 'CP1251'/'ISO-8859-5' will be detected as
1767c165b184SJames Collins					// 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
1768c165b184SJames Collins					// which case we can simply assume it is the other charset.
1769c165b184SJames Collins					if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
1770c165b184SJames Collins						$encoding = 'CP1251';
1771c165b184SJames Collins					}
1772c165b184SJames Collins				}
1773c165b184SJames Collins
1774c165b184SJames Collins				if ($encoding !== false) {
1775c165b184SJames Collins					$charset = $encoding;
1776c165b184SJames Collins					if (is_object($debug_object)) {
1777c165b184SJames Collins						$debug_object->debug_log(2, 'mb_detect: ' . $charset);
1778c165b184SJames Collins					}
1779c165b184SJames Collins				}
1780c165b184SJames Collins			}
1781c165b184SJames Collins		}
1782c165b184SJames Collins
1783c165b184SJames Collins		if (empty($charset)) {
1784c165b184SJames Collins			// Assume it's UTF-8 as it is the most likely charset to be used
1785c165b184SJames Collins			$charset = 'UTF-8';
1786c165b184SJames Collins			if (is_object($debug_object)) {
1787c165b184SJames Collins				$debug_object->debug_log(2, 'No match found, assume ' . $charset);
1788c165b184SJames Collins			}
1789c165b184SJames Collins		}
1790c165b184SJames Collins
1791c165b184SJames Collins		// Since CP1252 is a superset, if we get one of it's subsets, we want
1792c165b184SJames Collins		// it instead.
1793c165b184SJames Collins		if ((strtolower($charset) == 'iso-8859-1')
1794c165b184SJames Collins			|| (strtolower($charset) == 'latin1')
1795c165b184SJames Collins			|| (strtolower($charset) == 'latin-1')) {
1796c165b184SJames Collins			$charset = 'CP1252';
1797c165b184SJames Collins			if (is_object($debug_object)) {
1798c165b184SJames Collins				$debug_object->debug_log(2,
1799c165b184SJames Collins					'replacing ' . $charset . ' with CP1252 as its a superset'
1800c165b184SJames Collins				);
1801c165b184SJames Collins			}
1802c165b184SJames Collins		}
1803c165b184SJames Collins
1804c165b184SJames Collins		if (is_object($debug_object)) {
1805c165b184SJames Collins			$debug_object->debug_log(1, 'EXIT - ' . $charset);
1806c165b184SJames Collins		}
1807c165b184SJames Collins
1808c165b184SJames Collins		return $this->_charset = $charset;
1809c165b184SJames Collins	}
1810c165b184SJames Collins
1811c165b184SJames Collins	protected function read_tag()
1812c165b184SJames Collins	{
1813c165b184SJames Collins		// Set end position if no further tags found
1814c165b184SJames Collins		if ($this->char !== '<') {
1815c165b184SJames Collins			$this->root->_[HDOM_INFO_END] = $this->cursor;
1816c165b184SJames Collins			return false;
1817c165b184SJames Collins		}
1818c165b184SJames Collins
1819c165b184SJames Collins		$begin_tag_pos = $this->pos;
1820c165b184SJames Collins		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1821c165b184SJames Collins
1822c165b184SJames Collins		// end tag
1823c165b184SJames Collins		if ($this->char === '/') {
1824c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1825c165b184SJames Collins
1826c165b184SJames Collins			// Skip whitespace in end tags (i.e. in "</   html>")
1827c165b184SJames Collins			$this->skip($this->token_blank);
1828c165b184SJames Collins			$tag = $this->copy_until_char('>');
1829c165b184SJames Collins
1830c165b184SJames Collins			// Skip attributes in end tags
1831c165b184SJames Collins			if (($pos = strpos($tag, ' ')) !== false) {
1832c165b184SJames Collins				$tag = substr($tag, 0, $pos);
1833c165b184SJames Collins			}
1834c165b184SJames Collins
1835c165b184SJames Collins			$parent_lower = strtolower($this->parent->tag);
1836c165b184SJames Collins			$tag_lower = strtolower($tag);
1837c165b184SJames Collins
1838c165b184SJames Collins			// The end tag is supposed to close the parent tag. Handle situations
1839c165b184SJames Collins			// when it doesn't
1840c165b184SJames Collins			if ($parent_lower !== $tag_lower) {
1841c165b184SJames Collins				// Parent tag does not have to be closed necessarily (optional closing tag)
1842c165b184SJames Collins				// Current tag is a block tag, so it may close an ancestor
1843c165b184SJames Collins				if (isset($this->optional_closing_tags[$parent_lower])
1844c165b184SJames Collins					&& isset($this->block_tags[$tag_lower])) {
1845c165b184SJames Collins
1846c165b184SJames Collins					$this->parent->_[HDOM_INFO_END] = 0;
1847c165b184SJames Collins					$org_parent = $this->parent;
1848c165b184SJames Collins
1849c165b184SJames Collins					// Traverse ancestors to find a matching opening tag
1850c165b184SJames Collins					// Stop at root node
1851c165b184SJames Collins					while (($this->parent->parent)
1852c165b184SJames Collins						&& strtolower($this->parent->tag) !== $tag_lower
1853c165b184SJames Collins					){
1854c165b184SJames Collins						$this->parent = $this->parent->parent;
1855c165b184SJames Collins					}
1856c165b184SJames Collins
1857c165b184SJames Collins					// If we don't have a match add current tag as text node
1858c165b184SJames Collins					if (strtolower($this->parent->tag) !== $tag_lower) {
1859c165b184SJames Collins						$this->parent = $org_parent; // restore origonal parent
1860c165b184SJames Collins
1861c165b184SJames Collins						if ($this->parent->parent) {
1862c165b184SJames Collins							$this->parent = $this->parent->parent;
1863c165b184SJames Collins						}
1864c165b184SJames Collins
1865c165b184SJames Collins						$this->parent->_[HDOM_INFO_END] = $this->cursor;
1866c165b184SJames Collins						return $this->as_text_node($tag);
1867c165b184SJames Collins					}
1868c165b184SJames Collins				} elseif (($this->parent->parent)
1869c165b184SJames Collins					&& isset($this->block_tags[$tag_lower])
1870c165b184SJames Collins				) {
1871c165b184SJames Collins					// Grandparent exists and current tag is a block tag, so our
1872c165b184SJames Collins					// parent doesn't have an end tag
1873c165b184SJames Collins					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
1874c165b184SJames Collins					$org_parent = $this->parent;
1875c165b184SJames Collins
1876c165b184SJames Collins					// Traverse ancestors to find a matching opening tag
1877c165b184SJames Collins					// Stop at root node
1878c165b184SJames Collins					while (($this->parent->parent)
1879c165b184SJames Collins						&& strtolower($this->parent->tag) !== $tag_lower
1880c165b184SJames Collins					) {
1881c165b184SJames Collins						$this->parent = $this->parent->parent;
1882c165b184SJames Collins					}
1883c165b184SJames Collins
1884c165b184SJames Collins					// If we don't have a match add current tag as text node
1885c165b184SJames Collins					if (strtolower($this->parent->tag) !== $tag_lower) {
1886c165b184SJames Collins						$this->parent = $org_parent; // restore origonal parent
1887c165b184SJames Collins						$this->parent->_[HDOM_INFO_END] = $this->cursor;
1888c165b184SJames Collins						return $this->as_text_node($tag);
1889c165b184SJames Collins					}
1890c165b184SJames Collins				} elseif (($this->parent->parent)
1891c165b184SJames Collins					&& strtolower($this->parent->parent->tag) === $tag_lower
1892c165b184SJames Collins				) { // Grandparent exists and current tag closes it
1893c165b184SJames Collins					$this->parent->_[HDOM_INFO_END] = 0;
1894c165b184SJames Collins					$this->parent = $this->parent->parent;
1895c165b184SJames Collins				} else { // Random tag, add as text node
1896c165b184SJames Collins					return $this->as_text_node($tag);
1897c165b184SJames Collins				}
1898c165b184SJames Collins			}
1899c165b184SJames Collins
1900c165b184SJames Collins			// Set end position of parent tag to current cursor position
1901c165b184SJames Collins			$this->parent->_[HDOM_INFO_END] = $this->cursor;
1902c165b184SJames Collins
1903c165b184SJames Collins			if ($this->parent->parent) {
1904c165b184SJames Collins				$this->parent = $this->parent->parent;
1905c165b184SJames Collins			}
1906c165b184SJames Collins
1907c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1908c165b184SJames Collins			return true;
1909c165b184SJames Collins		}
1910c165b184SJames Collins
1911c165b184SJames Collins		// start tag
1912c165b184SJames Collins		$node = new simple_html_dom_node($this);
1913c165b184SJames Collins		$node->_[HDOM_INFO_BEGIN] = $this->cursor;
1914c165b184SJames Collins		++$this->cursor;
1915c165b184SJames Collins		$tag = $this->copy_until($this->token_slash); // Get tag name
1916c165b184SJames Collins		$node->tag_start = $begin_tag_pos;
1917c165b184SJames Collins
1918c165b184SJames Collins		// doctype, cdata & comments...
1919c165b184SJames Collins		// <!DOCTYPE html>
1920c165b184SJames Collins		// <![CDATA[ ... ]]>
1921c165b184SJames Collins		// <!-- Comment -->
1922c165b184SJames Collins		if (isset($tag[0]) && $tag[0] === '!') {
1923c165b184SJames Collins			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
1924c165b184SJames Collins
1925c165b184SJames Collins			if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
1926c165b184SJames Collins				$node->nodetype = HDOM_TYPE_COMMENT;
1927c165b184SJames Collins				$node->tag = 'comment';
1928c165b184SJames Collins			} else { // Could be doctype or CDATA but we don't care
1929c165b184SJames Collins				$node->nodetype = HDOM_TYPE_UNKNOWN;
1930c165b184SJames Collins				$node->tag = 'unknown';
1931c165b184SJames Collins			}
1932c165b184SJames Collins
1933c165b184SJames Collins			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
1934c165b184SJames Collins
1935c165b184SJames Collins			$this->link_nodes($node, true);
1936c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1937c165b184SJames Collins			return true;
1938c165b184SJames Collins		}
1939c165b184SJames Collins
1940c165b184SJames Collins		// The start tag cannot contain another start tag, if so add as text
1941c165b184SJames Collins		// i.e. "<<html>"
1942c165b184SJames Collins		if ($pos = strpos($tag, '<') !== false) {
1943c165b184SJames Collins			$tag = '<' . substr($tag, 0, -1);
1944c165b184SJames Collins			$node->_[HDOM_INFO_TEXT] = $tag;
1945c165b184SJames Collins			$this->link_nodes($node, false);
1946c165b184SJames Collins			$this->char = $this->doc[--$this->pos]; // prev
1947c165b184SJames Collins			return true;
1948c165b184SJames Collins		}
1949c165b184SJames Collins
1950c165b184SJames Collins		// Handle invalid tag names (i.e. "<html#doc>")
1951c165b184SJames Collins		if (!preg_match('/^\w[\w:-]*$/', $tag)) {
1952c165b184SJames Collins			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
1953c165b184SJames Collins
1954c165b184SJames Collins			// Next char is the beginning of a new tag, don't touch it.
1955c165b184SJames Collins			if ($this->char === '<') {
1956c165b184SJames Collins				$this->link_nodes($node, false);
1957c165b184SJames Collins				return true;
1958c165b184SJames Collins			}
1959c165b184SJames Collins
1960c165b184SJames Collins			// Next char closes current tag, add and be done with it.
1961c165b184SJames Collins			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
1962c165b184SJames Collins			$this->link_nodes($node, false);
1963c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1964c165b184SJames Collins			return true;
1965c165b184SJames Collins		}
1966c165b184SJames Collins
1967c165b184SJames Collins		// begin tag, add new node
1968c165b184SJames Collins		$node->nodetype = HDOM_TYPE_ELEMENT;
1969c165b184SJames Collins		$tag_lower = strtolower($tag);
1970c165b184SJames Collins		$node->tag = ($this->lowercase) ? $tag_lower : $tag;
1971c165b184SJames Collins
1972c165b184SJames Collins		// handle optional closing tags
1973c165b184SJames Collins		if (isset($this->optional_closing_tags[$tag_lower])) {
1974c165b184SJames Collins			// Traverse ancestors to close all optional closing tags
1975c165b184SJames Collins			while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
1976c165b184SJames Collins				$this->parent->_[HDOM_INFO_END] = 0;
1977c165b184SJames Collins				$this->parent = $this->parent->parent;
1978c165b184SJames Collins			}
1979c165b184SJames Collins			$node->parent = $this->parent;
1980c165b184SJames Collins		}
1981c165b184SJames Collins
1982c165b184SJames Collins		$guard = 0; // prevent infinity loop
1983c165b184SJames Collins
1984c165b184SJames Collins		// [0] Space between tag and first attribute
1985c165b184SJames Collins		$space = array($this->copy_skip($this->token_blank), '', '');
1986c165b184SJames Collins
1987c165b184SJames Collins		// attributes
1988c165b184SJames Collins		do {
1989c165b184SJames Collins			// Everything until the first equal sign should be the attribute name
1990c165b184SJames Collins			$name = $this->copy_until($this->token_equal);
1991c165b184SJames Collins
1992c165b184SJames Collins			if ($name === '' && $this->char !== null && $space[0] === '') {
1993c165b184SJames Collins				break;
1994c165b184SJames Collins			}
1995c165b184SJames Collins
1996c165b184SJames Collins			if ($guard === $this->pos) { // Escape infinite loop
1997c165b184SJames Collins				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1998c165b184SJames Collins				continue;
1999c165b184SJames Collins			}
2000c165b184SJames Collins
2001c165b184SJames Collins			$guard = $this->pos;
2002c165b184SJames Collins
2003c165b184SJames Collins			// handle endless '<'
2004c165b184SJames Collins			// Out of bounds before the tag ended
2005c165b184SJames Collins			if ($this->pos >= $this->size - 1 && $this->char !== '>') {
2006c165b184SJames Collins				$node->nodetype = HDOM_TYPE_TEXT;
2007c165b184SJames Collins				$node->_[HDOM_INFO_END] = 0;
2008c165b184SJames Collins				$node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
2009c165b184SJames Collins				$node->tag = 'text';
2010c165b184SJames Collins				$this->link_nodes($node, false);
2011c165b184SJames Collins				return true;
2012c165b184SJames Collins			}
2013c165b184SJames Collins
2014c165b184SJames Collins			// handle mismatch '<'
2015c165b184SJames Collins			// Attributes cannot start after opening tag
2016c165b184SJames Collins			if ($this->doc[$this->pos - 1] == '<') {
2017c165b184SJames Collins				$node->nodetype = HDOM_TYPE_TEXT;
2018c165b184SJames Collins				$node->tag = 'text';
2019c165b184SJames Collins				$node->attr = array();
2020c165b184SJames Collins				$node->_[HDOM_INFO_END] = 0;
2021c165b184SJames Collins				$node->_[HDOM_INFO_TEXT] = substr(
2022c165b184SJames Collins					$this->doc,
2023c165b184SJames Collins					$begin_tag_pos,
2024c165b184SJames Collins					$this->pos - $begin_tag_pos - 1
2025c165b184SJames Collins				);
2026c165b184SJames Collins				$this->pos -= 2;
2027c165b184SJames Collins				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2028c165b184SJames Collins				$this->link_nodes($node, false);
2029c165b184SJames Collins				return true;
2030c165b184SJames Collins			}
2031c165b184SJames Collins
2032c165b184SJames Collins			if ($name !== '/' && $name !== '') { // this is a attribute name
2033c165b184SJames Collins				// [1] Whitespace after attribute name
2034c165b184SJames Collins				$space[1] = $this->copy_skip($this->token_blank);
2035c165b184SJames Collins
2036c165b184SJames Collins				$name = $this->restore_noise($name); // might be a noisy name
2037c165b184SJames Collins
2038c165b184SJames Collins				if ($this->lowercase) { $name = strtolower($name); }
2039c165b184SJames Collins
2040c165b184SJames Collins				if ($this->char === '=') { // attribute with value
2041c165b184SJames Collins					$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2042c165b184SJames Collins					$this->parse_attr($node, $name, $space); // get attribute value
2043c165b184SJames Collins				} else {
2044c165b184SJames Collins					//no value attr: nowrap, checked selected...
2045c165b184SJames Collins					$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2046c165b184SJames Collins					$node->attr[$name] = true;
2047c165b184SJames Collins					if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
2048c165b184SJames Collins				}
2049c165b184SJames Collins
2050c165b184SJames Collins				$node->_[HDOM_INFO_SPACE][] = $space;
2051c165b184SJames Collins
2052c165b184SJames Collins				// prepare for next attribute
2053c165b184SJames Collins				$space = array(
2054c165b184SJames Collins					$this->copy_skip($this->token_blank),
2055c165b184SJames Collins					'',
2056c165b184SJames Collins					''
2057c165b184SJames Collins				);
2058c165b184SJames Collins			} else { // no more attributes
2059c165b184SJames Collins				break;
2060c165b184SJames Collins			}
2061c165b184SJames Collins		} while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
2062c165b184SJames Collins
2063c165b184SJames Collins		$this->link_nodes($node, true);
2064c165b184SJames Collins		$node->_[HDOM_INFO_ENDSPACE] = $space[0];
2065c165b184SJames Collins
2066c165b184SJames Collins		// handle empty tags (i.e. "<div/>")
2067c165b184SJames Collins		if ($this->copy_until_char('>') === '/') {
2068c165b184SJames Collins			$node->_[HDOM_INFO_ENDSPACE] .= '/';
2069c165b184SJames Collins			$node->_[HDOM_INFO_END] = 0;
2070c165b184SJames Collins		} else {
2071c165b184SJames Collins			// reset parent
2072c165b184SJames Collins			if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
2073c165b184SJames Collins				$this->parent = $node;
2074c165b184SJames Collins			}
2075c165b184SJames Collins		}
2076c165b184SJames Collins
2077c165b184SJames Collins		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2078c165b184SJames Collins
2079c165b184SJames Collins		// If it's a BR tag, we need to set it's text to the default text.
2080c165b184SJames Collins		// This way when we see it in plaintext, we can generate formatting that the user wants.
2081c165b184SJames Collins		// since a br tag never has sub nodes, this works well.
2082c165b184SJames Collins		if ($node->tag === 'br') {
2083c165b184SJames Collins			$node->_[HDOM_INFO_INNER] = $this->default_br_text;
2084c165b184SJames Collins		}
2085c165b184SJames Collins
2086c165b184SJames Collins		return true;
2087c165b184SJames Collins	}
2088c165b184SJames Collins
2089c165b184SJames Collins	protected function parse_attr($node, $name, &$space)
2090c165b184SJames Collins	{
2091c165b184SJames Collins		$is_duplicate = isset($node->attr[$name]);
2092c165b184SJames Collins
2093c165b184SJames Collins		if (!$is_duplicate) // Copy whitespace between "=" and value
2094c165b184SJames Collins			$space[2] = $this->copy_skip($this->token_blank);
2095c165b184SJames Collins
2096c165b184SJames Collins		switch ($this->char) {
2097c165b184SJames Collins			case '"':
2098c165b184SJames Collins				$quote_type = HDOM_QUOTE_DOUBLE;
2099c165b184SJames Collins				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2100c165b184SJames Collins				$value = $this->copy_until_char('"');
2101c165b184SJames Collins				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2102c165b184SJames Collins				break;
2103c165b184SJames Collins			case '\'':
2104c165b184SJames Collins				$quote_type = HDOM_QUOTE_SINGLE;
2105c165b184SJames Collins				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2106c165b184SJames Collins				$value = $this->copy_until_char('\'');
2107c165b184SJames Collins				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2108c165b184SJames Collins				break;
2109c165b184SJames Collins			default:
2110c165b184SJames Collins				$quote_type = HDOM_QUOTE_NO;
2111c165b184SJames Collins				$value = $this->copy_until($this->token_attr);
2112c165b184SJames Collins		}
2113c165b184SJames Collins
2114c165b184SJames Collins		$value = $this->restore_noise($value);
2115c165b184SJames Collins
2116c165b184SJames Collins		// PaperG: Attributes should not have \r or \n in them, that counts as
2117c165b184SJames Collins        // html whitespace.
2118cdddb6f0SJames Collins
2119*bc1032d9SJames Collins        // Added $stripRNAttrValues option for DokuWiki - nomadjimbob
2120*bc1032d9SJames Collins        if($this->stripRNAttrValues) {
2121*bc1032d9SJames Collins            $value = str_replace("\r", '', $value);
2122*bc1032d9SJames Collins            $value = str_replace("\n", '', $value);
2123*bc1032d9SJames Collins        }
2124c165b184SJames Collins
2125c165b184SJames Collins		// PaperG: If this is a "class" selector, lets get rid of the preceeding
2126c165b184SJames Collins		// and trailing space since some people leave it in the multi class case.
2127c165b184SJames Collins		if ($name === 'class') {
2128c165b184SJames Collins			$value = trim($value);
2129c165b184SJames Collins		}
2130c165b184SJames Collins
2131c165b184SJames Collins		if (!$is_duplicate) {
2132c165b184SJames Collins			$node->_[HDOM_INFO_QUOTE][] = $quote_type;
2133c165b184SJames Collins			$node->attr[$name] = $value;
2134c165b184SJames Collins		}
2135c165b184SJames Collins	}
2136c165b184SJames Collins
2137c165b184SJames Collins	protected function link_nodes(&$node, $is_child)
2138c165b184SJames Collins	{
2139c165b184SJames Collins		$node->parent = $this->parent;
2140c165b184SJames Collins		$this->parent->nodes[] = $node;
2141c165b184SJames Collins		if ($is_child) {
2142c165b184SJames Collins			$this->parent->children[] = $node;
2143c165b184SJames Collins		}
2144c165b184SJames Collins	}
2145c165b184SJames Collins
2146c165b184SJames Collins	protected function as_text_node($tag)
2147c165b184SJames Collins	{
2148c165b184SJames Collins		$node = new simple_html_dom_node($this);
2149c165b184SJames Collins		++$this->cursor;
2150c165b184SJames Collins		$node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
2151c165b184SJames Collins		$this->link_nodes($node, false);
2152c165b184SJames Collins		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2153c165b184SJames Collins		return true;
2154c165b184SJames Collins	}
2155c165b184SJames Collins
2156c165b184SJames Collins	protected function skip($chars)
2157c165b184SJames Collins	{
2158c165b184SJames Collins		$this->pos += strspn($this->doc, $chars, $this->pos);
2159c165b184SJames Collins		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2160c165b184SJames Collins	}
2161c165b184SJames Collins
2162c165b184SJames Collins	protected function copy_skip($chars)
2163c165b184SJames Collins	{
2164c165b184SJames Collins		$pos = $this->pos;
2165c165b184SJames Collins		$len = strspn($this->doc, $chars, $pos);
2166c165b184SJames Collins		$this->pos += $len;
2167c165b184SJames Collins		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2168c165b184SJames Collins		if ($len === 0) { return ''; }
2169c165b184SJames Collins		return substr($this->doc, $pos, $len);
2170c165b184SJames Collins	}
2171c165b184SJames Collins
2172c165b184SJames Collins	protected function copy_until($chars)
2173c165b184SJames Collins	{
2174c165b184SJames Collins		$pos = $this->pos;
2175c165b184SJames Collins		$len = strcspn($this->doc, $chars, $pos);
2176c165b184SJames Collins		$this->pos += $len;
2177c165b184SJames Collins		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2178c165b184SJames Collins		return substr($this->doc, $pos, $len);
2179c165b184SJames Collins	}
2180c165b184SJames Collins
2181c165b184SJames Collins	protected function copy_until_char($char)
2182c165b184SJames Collins	{
2183c165b184SJames Collins		if ($this->char === null) { return ''; }
2184c165b184SJames Collins
2185c165b184SJames Collins		if (($pos = strpos($this->doc, $char, $this->pos)) === false) {
2186c165b184SJames Collins			$ret = substr($this->doc, $this->pos, $this->size - $this->pos);
2187c165b184SJames Collins			$this->char = null;
2188c165b184SJames Collins			$this->pos = $this->size;
2189c165b184SJames Collins			return $ret;
2190c165b184SJames Collins		}
2191c165b184SJames Collins
2192c165b184SJames Collins		if ($pos === $this->pos) { return ''; }
2193c165b184SJames Collins
2194c165b184SJames Collins		$pos_old = $this->pos;
2195c165b184SJames Collins		$this->char = $this->doc[$pos];
2196c165b184SJames Collins		$this->pos = $pos;
2197c165b184SJames Collins		return substr($this->doc, $pos_old, $pos - $pos_old);
2198c165b184SJames Collins	}
2199c165b184SJames Collins
2200c165b184SJames Collins	protected function remove_noise($pattern, $remove_tag = false)
2201c165b184SJames Collins	{
2202c165b184SJames Collins		global $debug_object;
2203c165b184SJames Collins		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2204c165b184SJames Collins
2205c165b184SJames Collins		$count = preg_match_all(
2206c165b184SJames Collins			$pattern,
2207c165b184SJames Collins			$this->doc,
2208c165b184SJames Collins			$matches,
2209c165b184SJames Collins			PREG_SET_ORDER | PREG_OFFSET_CAPTURE
2210c165b184SJames Collins		);
2211c165b184SJames Collins
2212c165b184SJames Collins		for ($i = $count - 1; $i > -1; --$i) {
2213c165b184SJames Collins			$key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);
2214c165b184SJames Collins
2215c165b184SJames Collins			if (is_object($debug_object)) {
2216c165b184SJames Collins				$debug_object->debug_log(2, 'key is: ' . $key);
2217c165b184SJames Collins			}
2218c165b184SJames Collins
2219c165b184SJames Collins			$idx = ($remove_tag) ? 0 : 1; // 0 = entire match, 1 = submatch
2220c165b184SJames Collins			$this->noise[$key] = $matches[$i][$idx][0];
2221c165b184SJames Collins			$this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
2222c165b184SJames Collins		}
2223c165b184SJames Collins
2224c165b184SJames Collins		// reset the length of content
2225c165b184SJames Collins		$this->size = strlen($this->doc);
2226c165b184SJames Collins
2227c165b184SJames Collins		if ($this->size > 0) {
2228c165b184SJames Collins			$this->char = $this->doc[0];
2229c165b184SJames Collins		}
2230c165b184SJames Collins	}
2231c165b184SJames Collins
2232c165b184SJames Collins	function restore_noise($text)
2233c165b184SJames Collins	{
2234c165b184SJames Collins		global $debug_object;
2235c165b184SJames Collins		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2236c165b184SJames Collins
2237c165b184SJames Collins		while (($pos = strpos($text, '___noise___')) !== false) {
2238c165b184SJames Collins			// Sometimes there is a broken piece of markup, and we don't GET the
2239c165b184SJames Collins			// pos+11 etc... token which indicates a problem outside of us...
2240c165b184SJames Collins
2241c165b184SJames Collins			// todo: "___noise___1000" (or any number with four or more digits)
2242c165b184SJames Collins			// in the DOM causes an infinite loop which could be utilized by
2243c165b184SJames Collins			// malicious software
2244c165b184SJames Collins			if (strlen($text) > $pos + 15) {
2245c165b184SJames Collins				$key = '___noise___'
2246c165b184SJames Collins				. $text[$pos + 11]
2247c165b184SJames Collins				. $text[$pos + 12]
2248c165b184SJames Collins				. $text[$pos + 13]
2249c165b184SJames Collins				. $text[$pos + 14]
2250c165b184SJames Collins				. $text[$pos + 15];
2251c165b184SJames Collins
2252c165b184SJames Collins				if (is_object($debug_object)) {
2253c165b184SJames Collins					$debug_object->debug_log(2, 'located key of: ' . $key);
2254c165b184SJames Collins				}
2255c165b184SJames Collins
2256c165b184SJames Collins				if (isset($this->noise[$key])) {
2257c165b184SJames Collins					$text = substr($text, 0, $pos)
2258c165b184SJames Collins					. $this->noise[$key]
2259c165b184SJames Collins					. substr($text, $pos + 16);
2260c165b184SJames Collins				} else {
2261c165b184SJames Collins					// do this to prevent an infinite loop.
2262c165b184SJames Collins					$text = substr($text, 0, $pos)
2263c165b184SJames Collins					. 'UNDEFINED NOISE FOR KEY: '
2264c165b184SJames Collins					. $key
2265c165b184SJames Collins					. substr($text, $pos + 16);
2266c165b184SJames Collins				}
2267c165b184SJames Collins			} else {
2268c165b184SJames Collins				// There is no valid key being given back to us... We must get
2269c165b184SJames Collins				// rid of the ___noise___ or we will have a problem.
2270c165b184SJames Collins				$text = substr($text, 0, $pos)
2271c165b184SJames Collins				. 'NO NUMERIC NOISE KEY'
2272c165b184SJames Collins				. substr($text, $pos + 11);
2273c165b184SJames Collins			}
2274c165b184SJames Collins		}
2275c165b184SJames Collins		return $text;
2276c165b184SJames Collins	}
2277c165b184SJames Collins
2278c165b184SJames Collins	function search_noise($text)
2279c165b184SJames Collins	{
2280c165b184SJames Collins		global $debug_object;
2281c165b184SJames Collins		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2282c165b184SJames Collins
2283c165b184SJames Collins		foreach($this->noise as $noiseElement) {
2284c165b184SJames Collins			if (strpos($noiseElement, $text) !== false) {
2285c165b184SJames Collins				return $noiseElement;
2286c165b184SJames Collins			}
2287c165b184SJames Collins		}
2288c165b184SJames Collins	}
2289c165b184SJames Collins
2290c165b184SJames Collins	function __toString()
2291c165b184SJames Collins	{
2292c165b184SJames Collins		return $this->root->innertext();
2293c165b184SJames Collins	}
2294c165b184SJames Collins
2295c165b184SJames Collins	function __get($name)
2296c165b184SJames Collins	{
2297c165b184SJames Collins		switch ($name) {
2298c165b184SJames Collins			case 'outertext':
2299c165b184SJames Collins				return $this->root->innertext();
2300c165b184SJames Collins			case 'innertext':
2301c165b184SJames Collins				return $this->root->innertext();
2302c165b184SJames Collins			case 'plaintext':
2303c165b184SJames Collins				return $this->root->text();
2304c165b184SJames Collins			case 'charset':
2305c165b184SJames Collins				return $this->_charset;
2306c165b184SJames Collins			case 'target_charset':
2307c165b184SJames Collins				return $this->_target_charset;
2308c165b184SJames Collins		}
2309c165b184SJames Collins	}
2310c165b184SJames Collins
2311c165b184SJames Collins	function childNodes($idx = -1)
2312c165b184SJames Collins	{
2313c165b184SJames Collins		return $this->root->childNodes($idx);
2314c165b184SJames Collins	}
2315c165b184SJames Collins
2316c165b184SJames Collins	function firstChild()
2317c165b184SJames Collins	{
2318c165b184SJames Collins		return $this->root->first_child();
2319c165b184SJames Collins	}
2320c165b184SJames Collins
2321c165b184SJames Collins	function lastChild()
2322c165b184SJames Collins	{
2323c165b184SJames Collins		return $this->root->last_child();
2324c165b184SJames Collins	}
2325c165b184SJames Collins
2326c165b184SJames Collins	function createElement($name, $value = null)
2327c165b184SJames Collins	{
2328c165b184SJames Collins		return @str_get_html("<$name>$value</$name>")->firstChild();
2329c165b184SJames Collins	}
2330c165b184SJames Collins
2331c165b184SJames Collins	function createTextNode($value)
2332c165b184SJames Collins	{
2333c165b184SJames Collins		return @end(str_get_html($value)->nodes);
2334c165b184SJames Collins	}
2335c165b184SJames Collins
2336c165b184SJames Collins	function getElementById($id)
2337c165b184SJames Collins	{
2338c165b184SJames Collins		return $this->find("#$id", 0);
2339c165b184SJames Collins	}
2340c165b184SJames Collins
2341c165b184SJames Collins	function getElementsById($id, $idx = null)
2342c165b184SJames Collins	{
2343c165b184SJames Collins		return $this->find("#$id", $idx);
2344c165b184SJames Collins	}
2345c165b184SJames Collins
2346c165b184SJames Collins	function getElementByTagName($name)
2347c165b184SJames Collins	{
2348c165b184SJames Collins		return $this->find($name, 0);
2349c165b184SJames Collins	}
2350c165b184SJames Collins
2351c165b184SJames Collins	function getElementsByTagName($name, $idx = -1)
2352c165b184SJames Collins	{
2353c165b184SJames Collins		return $this->find($name, $idx);
2354c165b184SJames Collins	}
2355c165b184SJames Collins
2356c165b184SJames Collins	function loadFile()
2357c165b184SJames Collins	{
2358c165b184SJames Collins		$args = func_get_args();
2359c165b184SJames Collins		$this->load_file($args);
2360c165b184SJames Collins	}
2361c165b184SJames Collins}
2362