xref: /template/mikio/inc/simple_html_dom.php (revision cdddb6f08ed0bb068c0ed1d82050d7a241f4438f)
1c165b184SJames Collins<?php
/**
 * Website: http://sourceforge.net/projects/simplehtmldom/
 * Additional projects: http://sourceforge.net/projects/debugobject/
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
 *
 * Licensed under The MIT License
 * See the LICENSE file in the project root for more information.
 *
 * Authors:
 *   S.C. Chen
 *   John Schlick
 *   Rus Carroll
 *   logmanoriginal
 *
 * Contributors:
 *   Yousuke Kumakura
 *   Vadim Voituk
 *   Antcs
 *
 * Version Rev. 1.9.1 (291)
 *
 * THIS LIBRARY HAS BEEN MODIFIED BY NOMADJIMBOB - james.collins@outlook.com.au
 * Line 2116: stripping of \r\n from attributes has been disabled.
 */
26c165b184SJames Collins
// Node types, stored in simple_html_dom_node::$nodetype.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Quoting style of an attribute value, stored per attribute under
// HDOM_INFO_QUOTE and consulted when the tag is rebuilt.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node info array simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Overridable defaults: each is only defined here when the including script
// has not already defined it.
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);

// NOTE(review): not referenced in this part of the file - presumably a
// parser option flag; confirm against its consumer.
define('HDOM_SMARTY_AS_TEXT', 1);
50c165b184SJames Collins
/**
 * Read a file or URL with file_get_contents() and parse it into a DOM.
 *
 * @param string   $url              Path or URL handed to file_get_contents().
 * @param bool     $use_include_path Passed through to file_get_contents().
 * @param resource $context          Stream context passed through to file_get_contents().
 * @param int      $offset           Read offset passed through to file_get_contents().
 * @param int      $maxLen           Maximum bytes to read; values <= 0 select MAX_FILE_SIZE.
 * @param bool     $lowercase        Forwarded to the simple_html_dom constructor and load().
 * @param bool     $forceTagsClosed  Forwarded to the simple_html_dom constructor.
 * @param string   $target_charset   Forwarded to the simple_html_dom constructor.
 * @param bool     $stripRN          Forwarded to the simple_html_dom constructor and load().
 * @param string   $defaultBRText    Forwarded to the simple_html_dom constructor.
 * @param string   $defaultSpanText  Forwarded to the simple_html_dom constructor.
 * @return mixed The result of simple_html_dom::load(), or false when nothing
 *               usable was read.
 */
function file_get_html(
	$url,
	$use_include_path = false,
	$context = null,
	$offset = 0,
	$maxLen = -1,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	if ($maxLen <= 0) {
		$maxLen = MAX_FILE_SIZE;
	}

	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	$contents = file_get_contents(
		$url,
		$use_include_path,
		$context,
		$offset,
		$maxLen
	);

	// empty() covers a failed read (false) and an empty document; note that
	// it also rejects a document consisting of the single character "0".
	if (empty($contents) || strlen($contents) > $maxLen) {
		$dom->clear();
		return false;
	}

	return $dom->load($contents, $lowercase, $stripRN);
}
96c165b184SJames Collins
/**
 * Parse an HTML string into a DOM.
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase       Forwarded to the simple_html_dom constructor and load().
 * @param bool   $forceTagsClosed Forwarded to the simple_html_dom constructor.
 * @param string $target_charset  Forwarded to the simple_html_dom constructor.
 * @param bool   $stripRN         Forwarded to the simple_html_dom constructor and load().
 * @param string $defaultBRText   Forwarded to the simple_html_dom constructor.
 * @param string $defaultSpanText Forwarded to the simple_html_dom constructor.
 * @return mixed The result of simple_html_dom::load(), or false when $str is
 *               empty or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
	$str,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	// empty() also rejects the one-character string "0".
	if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
		$dom->clear();
		return false;
	}

	return $dom->load($str, $lowercase, $stripRN);
}
123c165b184SJames Collins
/**
 * Print the tree below $node by delegating to its dump() method.
 *
 * @param object $node      Node exposing dump($show_attr, $depth).
 * @param bool   $show_attr Whether attributes are printed next to each tag.
 * @param int    $deep      Indentation depth of the first printed line.
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
	// The node used to be passed as the first argument, which turned it into
	// the $show_attr flag and silently discarded both options.
	$node->dump($show_attr, $deep);
}
128c165b184SJames Collins
129c165b184SJames Collinsclass simple_html_dom_node
130c165b184SJames Collins{
/** @var int One of the HDOM_TYPE_* constants; a fresh node is a text node. */
public $nodetype = HDOM_TYPE_TEXT;
/** @var string Tag name; 'text' for text nodes, 'root' for the document root. */
public $tag = 'text';
/** @var array Attribute name => value. true marks a value-less attribute, null/false a removed one. */
public $attr = array();
/** @var array|null Child nodes reachable through children()/first_child()/...; null after clear(). */
public $children = array();
/** @var array|null All direct sub-nodes, walked when text is rebuilt; null after clear(). */
public $nodes = array();
/** @var simple_html_dom_node|null Parent node, or null when there is none. */
public $parent = null;
/** @var array Per-node parser info keyed by the HDOM_INFO_* constants. */
public $_ = array();
/** @var int NOTE(review): only printed by dump_node() in this part of the file; presumably the tag's source offset - confirm. */
public $tag_start = 0;
/** @var object|null DOM object this node was created for; null after clear(). */
private $dom = null;
140c165b184SJames Collins
/**
 * Create a node and register it with its DOM.
 *
 * @param object $dom DOM object; the new node is appended to its $nodes list.
 */
function __construct($dom)
{
	$this->dom = $dom;
	$dom->nodes[] = $this;
}
146c165b184SJames Collins
/**
 * Release the node's references (see clear()) when it is destroyed.
 */
function __destruct()
{
	$this->clear();
}
151c165b184SJames Collins
/**
 * String conversion yields the node's outer markup.
 *
 * @return string Same value as outertext().
 */
function __toString()
{
	return $this->outertext();
}
156c165b184SJames Collins
/**
 * Drop the references this node holds to its DOM, parent and sub-nodes.
 *
 * The properties are set to null rather than to empty arrays, so code that
 * touches $nodes or $children afterwards has to cope with null.
 *
 * @return void
 */
function clear()
{
	$this->dom = null;
	$this->nodes = null;
	$this->parent = null;
	$this->children = null;
}
164c165b184SJames Collins
/**
 * Echo this node and, recursively, its sub-nodes as a tab-indented tree.
 *
 * @param bool $show_attr Whether to print the attribute list after the tag.
 * @param int  $depth     Number of tabs to print before the tag name.
 * @return void
 */
function dump($show_attr = true, $depth = 0)
{
	$line = str_repeat("\t", $depth) . $this->tag;

	if ($show_attr && count($this->attr) > 0) {
		$line .= '(';
		foreach ($this->attr as $name => $value) {
			$line .= "[$name]=>\"$value\", ";
		}
		$line .= ')';
	}

	echo $line . "\n";

	// $nodes is null after clear(); the truthiness test covers that too.
	if ($this->nodes) {
		foreach ($this->nodes as $sub) {
			$sub->dump($show_attr, $depth + 1);
		}
	}
}
185c165b184SJames Collins
/**
 * Build a one-line debug description of this node.
 *
 * The line lists the tag, attributes, the raw $_ info array, the text (if
 * any), the stored inner text, child/node counts and $tag_start.
 *
 * @param bool $echo true to echo the line, false to return it.
 * @return string|null The description when $echo is false, otherwise null.
 */
function dump_node($echo = true)
{
	$string = $this->tag;

	if (count($this->attr) > 0) {
		$string .= '(';
		foreach ($this->attr as $k => $v) {
			$string .= "[$k]=>\"$v\", ";
		}
		$string .= ')';
	}

	if (count($this->_) > 0) {
		$string .= ' $_ (';
		foreach ($this->_ as $k => $v) {
			if (is_array($v)) {
				$string .= "[$k]=>(";
				foreach ($v as $k2 => $v2) {
					$string .= "[$k2]=>\"$v2\", ";
				}
				$string .= ')';
			} else {
				$string .= "[$k]=>\"$v\", ";
			}
		}
		$string .= ')';
	}

	// NOTE(review): `text` is not a declared property in this part of the
	// class - presumably served by magic accessors defined further down.
	if (isset($this->text)) {
		$string .= " text: ({$this->text})";
	}

	$string .= ' HDOM_INNER_INFO: ';

	// Fixed: this used to read an undefined $node, so the stored inner text
	// was never shown and ' NULL ' was always printed.
	if (isset($this->_[HDOM_INFO_INNER])) {
		$string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
	} else {
		$string .= ' NULL ';
	}

	// clear() sets both lists to null; count(null) throws on PHP 8.
	$child_total = is_array($this->children) ? count($this->children) : 0;
	$node_total = is_array($this->nodes) ? count($this->nodes) : 0;

	$string .= ' children: ' . $child_total;
	$string .= ' nodes: ' . $node_total;
	$string .= ' tag_start: ' . $this->tag_start;
	$string .= "\n";

	if ($echo) {
		echo $string;
		return;
	}

	return $string;
}
238c165b184SJames Collins
/**
 * Get the parent node, or move this node under a new parent.
 *
 * When $parent is given, the node is first removed from the $nodes and
 * $children lists of its current parent (if any) and then appended to both
 * lists of the new parent. Previously the old parent kept a stale entry.
 *
 * @param simple_html_dom_node|null $parent New parent, or null to only read.
 * @return simple_html_dom_node|null The (possibly new) parent.
 */
function parent($parent = null)
{
	if ($parent !== null) {
		$old = $this->parent;

		if ($old !== null) {
			foreach (array('nodes', 'children') as $list) {
				// Either list is null once the old parent has been cleared.
				if (!is_array($old->$list)) {
					continue;
				}

				$pos = array_search($this, $old->$list, true);

				if ($pos !== false) {
					unset($old->{$list}[$pos]);
					// Keep the list contiguous: the sibling helpers address
					// neighbours by index +/- 1.
					$old->$list = array_values($old->$list);
				}
			}
		}

		$this->parent = $parent;
		$this->parent->nodes[] = $this;
		$this->parent->children[] = $this;
	}

	return $this->parent;
}
252c165b184SJames Collins
/**
 * Tell whether this node has at least one entry in $children.
 *
 * @return bool false for an empty list and after clear() (null).
 */
function has_child()
{
	return !empty($this->children);
}
257c165b184SJames Collins
/**
 * Return all children, or the child at a given position.
 *
 * @param int $idx Position in $children; exactly -1 (the default) returns
 *                 the whole list.
 * @return array|simple_html_dom_node|null The list, the child at $idx, or
 *                                         null when there is no such child.
 */
function children($idx = -1)
{
	if ($idx === -1) {
		return $this->children;
	}

	return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
270c165b184SJames Collins
/**
 * Return the first entry of $children.
 *
 * @return simple_html_dom_node|null null when there are no children.
 */
function first_child()
{
	// empty() instead of count(): $children is null after clear(), and
	// count(null) throws a TypeError on PHP 8.
	if (empty($this->children)) {
		return null;
	}

	return $this->children[0];
}
278c165b184SJames Collins
/**
 * Return the last entry of $children.
 *
 * @return simple_html_dom_node|null null when there are no children.
 */
function last_child()
{
	// empty() instead of count(): $children is null after clear(), and
	// count(null) throws a TypeError on PHP 8.
	if (empty($this->children)) {
		return null;
	}

	return end($this->children);
}
286c165b184SJames Collins
/**
 * Return the node that follows this one in the parent's $children list.
 *
 * @return simple_html_dom_node|null null when there is no parent, this node
 *                                   is not among the parent's children, or
 *                                   it is the last one.
 */
function next_sibling()
{
	if ($this->parent === null) {
		return null;
	}

	$siblings = $this->parent->children;
	$pos = array_search($this, $siblings, true);

	if ($pos === false || !isset($siblings[$pos + 1])) {
		return null;
	}

	return $siblings[$pos + 1];
}
301c165b184SJames Collins
/**
 * Return the node that precedes this one in the parent's $children list.
 *
 * @return simple_html_dom_node|null null when there is no parent, this node
 *                                   is not among the parent's children, or
 *                                   it is the first one.
 */
function prev_sibling()
{
	if ($this->parent === null) {
		return null;
	}

	$siblings = $this->parent->children;
	$pos = array_search($this, $siblings, true);

	if ($pos === false || $pos <= 0) {
		return null;
	}

	return $siblings[$pos - 1];
}
316c165b184SJames Collins
/**
 * Walk up the parent chain and return the nearest ancestor with a given tag.
 *
 * The comparison is a strict string match against each ancestor's $tag.
 * Progress is reported to the global $debug_object when one is installed.
 *
 * @param string $tag Tag name to look for.
 * @return simple_html_dom_node|null The matching ancestor, or null if the
 *                                   chain ends without a match.
 */
function find_ancestor_tag($tag)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	// A node without a parent never enters the loop and yields null.
	for ($ancestor = $this->parent; $ancestor !== null; $ancestor = $ancestor->parent) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
		}

		if ($ancestor->tag === $tag) {
			return $ancestor;
		}
	}

	return null;
}
342c165b184SJames Collins
/**
 * Return the markup inside this node.
 *
 * Order of precedence: a stored HDOM_INFO_INNER value, then the stored
 * HDOM_INFO_TEXT passed through the DOM's restore_noise(), and finally the
 * concatenated outertext() of every sub-node.
 *
 * @return string
 */
function innertext()
{
	if (isset($this->_[HDOM_INFO_INNER])) {
		return $this->_[HDOM_INFO_INNER];
	}

	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$ret = '';

	// $nodes is null after clear(); skip the loop rather than iterate null,
	// matching the guards in text() and outertext().
	if ($this->nodes) {
		foreach ($this->nodes as $sub) {
			$ret .= $sub->outertext();
		}
	}

	return $ret;
}
361c165b184SJames Collins
/**
 * Return the markup of this node including its own tags.
 *
 * The root node returns its innertext(). Otherwise the DOM callback (if
 * set) is invoked with this node, and the result is, in order of
 * precedence: a stored HDOM_INFO_OUTER value, the stored HDOM_INFO_TEXT run
 * through restore_noise(), or the rebuilt opening tag + inner content +
 * closing tag.
 *
 * @return string
 */
function outertext()
{
	global $debug_object;

	if (is_object($debug_object)) {
		$text = '';

		if ($this->tag === 'text' && !empty($this->text)) {
			$text = ' with text: ' . $this->text;
		}

		$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
	}

	if ($this->tag === 'root') {
		return $this->innertext();
	}

	// todo: What is the use of this callback? Remove?
	if ($this->dom && $this->dom->callback !== null) {
		call_user_func_array($this->dom->callback, array($this));
	}

	if (isset($this->_[HDOM_INFO_OUTER])) {
		return $this->_[HDOM_INFO_OUTER];
	}

	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$ret = '';

	// Opening tag: rebuilt from the DOM slot recorded under HDOM_INFO_BEGIN.
	// Both lookups are guarded so a node without that info (or a missing
	// slot) yields no opening tag instead of an undefined-index notice.
	if ($this->dom
		&& isset($this->_[HDOM_INFO_BEGIN])
		&& !empty($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])) {
		$ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
	}

	if (isset($this->_[HDOM_INFO_INNER])) {
		// todo: <br> should either never have HDOM_INFO_INNER or always
		if ($this->tag !== 'br') {
			$ret .= $this->_[HDOM_INFO_INNER];
		}
	} elseif ($this->nodes) {
		foreach ($this->nodes as $sub) {
			$ret .= $this->convert_text($sub->outertext());
		}
	}

	// A closing tag is only written when an end position other than 0 was
	// recorded for this node.
	if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
		$ret .= '</' . $this->tag . '>';
	}

	return $ret;
}
418c165b184SJames Collins
/**
 * Return the plain-text content of this node.
 *
 * A stored HDOM_INFO_INNER value wins. Text nodes return their stored text
 * through restore_noise(); comment and unknown nodes, as well as <script>
 * and <style> elements, return ''. For every other node the text of all
 * sub-nodes is concatenated, with a blank line started before each <p> and
 * the DOM's default_span_text appended after each <span>.
 *
 * @return string
 */
function text()
{
	if (isset($this->_[HDOM_INFO_INNER])) {
		return $this->_[HDOM_INFO_INNER];
	}

	switch ($this->nodetype) {
		case HDOM_TYPE_TEXT:
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		case HDOM_TYPE_COMMENT:
		case HDOM_TYPE_UNKNOWN:
			return '';
	}

	if (strcasecmp($this->tag, 'script') === 0) { return ''; }
	if (strcasecmp($this->tag, 'style') === 0) { return ''; }

	$ret = '';

	// $nodes can be null here (for instance after clear()); the original
	// authors also observed it on some span and p elements without a known
	// cause, hence the explicit guard.
	if (is_null($this->nodes)) {
		return $ret;
	}

	foreach ($this->nodes as $sub) {
		// Start paragraph after a blank line
		if ($sub->tag === 'p') {
			$ret = trim($ret) . "\n\n";
		}

		$ret .= $this->convert_text($sub->text());

		// Separate consecutive spans so their plain text does not run
		// together.
		if ($sub->tag === 'span') {
			$ret .= $this->dom->default_span_text;
		}
	}

	return $ret;
}
460c165b184SJames Collins
/**
 * Return innertext() with CDATA section markers removed.
 *
 * The opening marker '<![CDATA[' is removed case-insensitively, the closing
 * marker ']]>' case-sensitively; the enclosed content is kept.
 *
 * @return string
 */
function xmltext()
{
	$inner = $this->innertext();
	$inner = str_ireplace('<![CDATA[', '', $inner);

	return str_replace(']]>', '', $inner);
}
468c165b184SJames Collins
/**
 * Rebuild the opening tag of this node from its tag name and attributes.
 *
 * Nodes carrying HDOM_INFO_TEXT (text, comment, unknown) return that text
 * through restore_noise() instead. For elements, the whitespace and quoting
 * recorded under HDOM_INFO_SPACE / HDOM_INFO_QUOTE are reused per attribute
 * position, so the counter advances even for attributes that are skipped.
 *
 * @return string
 */
function makeup()
{
	// text, comment, unknown
	if (isset($this->_[HDOM_INFO_TEXT])) {
		return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
	}

	$ret = '<' . $this->tag;
	$pos = -1;

	foreach ($this->attr as $key => $val) {
		++$pos;

		// skip removed attribute
		if ($val === null || $val === false) { continue; }

		$space = $this->_[HDOM_INFO_SPACE][$pos];
		$ret .= $space[0];

		// no value attr: nowrap, checked selected...
		if ($val === true) {
			$ret .= $key;
			continue;
		}

		switch ($this->_[HDOM_INFO_QUOTE][$pos]) {
			case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
			case HDOM_QUOTE_SINGLE: $quote = '\''; break;
			default: $quote = '';
		}

		$ret .= $key . $space[1] . '=' . $space[2] . $quote . $val . $quote;
	}

	$ret = $this->dom->restore_noise($ret);

	return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
511c165b184SJames Collins
/**
 * Find descendant nodes matching a selector string.
 *
 * The selector is split by parse_selector() into groups; each group is a
 * list of levels whose fifth element (index 4) is the combinator leading to
 * the next level. Matching is done iteratively per level via seek(), and
 * the results of all groups are merged and ordered by their index in the
 * DOM's node list.
 *
 * @param string   $selector  Selector expression.
 * @param int|null $idx       null to get every match; otherwise the position
 *                            of the wanted match (negative counts from the end).
 * @param bool     $lowercase Passed through to seek().
 * @return array|simple_html_dom_node|null Array of matches when $idx is null
 *                                         (empty if nothing matched), else the
 *                                         match at $idx or null.
 */
function find($selector, $idx = null, $lowercase = false)
{
	$selectors = $this->parse_selector($selector);
	if (($count = count($selectors)) === 0) { return array(); }
	$found_keys = array();

	// find each selector
	for ($c = 0; $c < $count; ++$c) {
		// The check below inspects the current group ($c); it used to look
		// at group 0 only (sourceforge tracker id 2788009).
		if (($depth = count($selectors[$c])) === 0) { return array(); }
		if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

		// Keys of $head are indices into the DOM's node list.
		$head = array($this->_[HDOM_INFO_BEGIN] => 1);
		$cmd = ' '; // Combinator

		// handle descendant selectors, no recursive!
		for ($l = 0; $l < $depth; ++$l) {
			$ret = array();

			foreach ($head as $k => $v) {
				$n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
				// PaperG - Pass this optional parameter on to the seek function.
				$n->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
			}

			$head = $ret;
			$cmd = $selectors[$c][$l][4]; // Next Combinator
		}

		foreach ($head as $k => $v) {
			if (!isset($found_keys[$k])) {
				$found_keys[$k] = 1;
			}
		}
	}

	// sort keys
	ksort($found_keys);

	$found = array();
	foreach ($found_keys as $k => $v) {
		$found[] = $this->dom->nodes[$k];
	}

	// return nth-element or array
	if (is_null($idx)) {
		return $found;
	}

	if ($idx < 0) {
		$idx = count($found) + $idx;
	}

	return isset($found[$idx]) ? $found[$idx] : null;
}
563c165b184SJames Collins
564c165b184SJames Collins	protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
565c165b184SJames Collins	{
566c165b184SJames Collins		global $debug_object;
567c165b184SJames Collins		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
568c165b184SJames Collins
569c165b184SJames Collins		list($tag, $id, $class, $attributes, $cmb) = $selector;
570c165b184SJames Collins		$nodes = array();
571c165b184SJames Collins
572c165b184SJames Collins		if ($parent_cmd === ' ') { // Descendant Combinator
573c165b184SJames Collins			// Find parent closing tag if the current element doesn't have a closing
574c165b184SJames Collins			// tag (i.e. void element)
575c165b184SJames Collins			$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
576c165b184SJames Collins			if ($end == 0) {
577c165b184SJames Collins				$parent = $this->parent;
578c165b184SJames Collins				while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) {
579c165b184SJames Collins					$end -= 1;
580c165b184SJames Collins					$parent = $parent->parent;
581c165b184SJames Collins				}
582c165b184SJames Collins				$end += $parent->_[HDOM_INFO_END];
583c165b184SJames Collins			}
584c165b184SJames Collins
585c165b184SJames Collins			// Get list of target nodes
586c165b184SJames Collins			$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
587c165b184SJames Collins			$nodes_count = $end - $nodes_start;
588c165b184SJames Collins			$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
589c165b184SJames Collins		} elseif ($parent_cmd === '>') { // Child Combinator
590c165b184SJames Collins			$nodes = $this->children;
591c165b184SJames Collins		} elseif ($parent_cmd === '+'
592c165b184SJames Collins			&& $this->parent
593c165b184SJames Collins			&& in_array($this, $this->parent->children)) { // Next-Sibling Combinator
594c165b184SJames Collins				$index = array_search($this, $this->parent->children, true) + 1;
595c165b184SJames Collins				if ($index < count($this->parent->children))
596c165b184SJames Collins					$nodes[] = $this->parent->children[$index];
597c165b184SJames Collins		} elseif ($parent_cmd === '~'
598c165b184SJames Collins			&& $this->parent
599c165b184SJames Collins			&& in_array($this, $this->parent->children)) { // Subsequent Sibling Combinator
600c165b184SJames Collins				$index = array_search($this, $this->parent->children, true);
601c165b184SJames Collins				$nodes = array_slice($this->parent->children, $index);
602c165b184SJames Collins		}
603c165b184SJames Collins
604c165b184SJames Collins		// Go throgh each element starting at this element until the end tag
605c165b184SJames Collins		// Note: If this element is a void tag, any previous void element is
606c165b184SJames Collins		// skipped.
607c165b184SJames Collins		foreach($nodes as $node) {
608c165b184SJames Collins			$pass = true;
609c165b184SJames Collins
610c165b184SJames Collins			// Skip root nodes
611c165b184SJames Collins			if(!$node->parent) {
612c165b184SJames Collins				$pass = false;
613c165b184SJames Collins			}
614c165b184SJames Collins
615c165b184SJames Collins			// Handle 'text' selector
616c165b184SJames Collins			if($pass && $tag === 'text' && $node->tag === 'text') {
617c165b184SJames Collins				$ret[array_search($node, $this->dom->nodes, true)] = 1;
618c165b184SJames Collins				unset($node);
619c165b184SJames Collins				continue;
620c165b184SJames Collins			}
621c165b184SJames Collins
622c165b184SJames Collins			// Skip if node isn't a child node (i.e. text nodes)
623c165b184SJames Collins			if($pass && !in_array($node, $node->parent->children, true)) {
624c165b184SJames Collins				$pass = false;
625c165b184SJames Collins			}
626c165b184SJames Collins
627c165b184SJames Collins			// Skip if tag doesn't match
628c165b184SJames Collins			if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
629c165b184SJames Collins				$pass = false;
630c165b184SJames Collins			}
631c165b184SJames Collins
632c165b184SJames Collins			// Skip if ID doesn't exist
633c165b184SJames Collins			if ($pass && $id !== '' && !isset($node->attr['id'])) {
634c165b184SJames Collins				$pass = false;
635c165b184SJames Collins			}
636c165b184SJames Collins
637c165b184SJames Collins			// Check if ID matches
638c165b184SJames Collins			if ($pass && $id !== '' && isset($node->attr['id'])) {
639c165b184SJames Collins				// Note: Only consider the first ID (as browsers do)
640c165b184SJames Collins				$node_id = explode(' ', trim($node->attr['id']))[0];
641c165b184SJames Collins
642c165b184SJames Collins				if($id !== $node_id) { $pass = false; }
643c165b184SJames Collins			}
644c165b184SJames Collins
645c165b184SJames Collins			// Check if all class(es) exist
646c165b184SJames Collins			if ($pass && $class !== '' && is_array($class) && !empty($class)) {
647c165b184SJames Collins				if (isset($node->attr['class'])) {
648c165b184SJames Collins					$node_classes = explode(' ', $node->attr['class']);
649c165b184SJames Collins
650c165b184SJames Collins					if ($lowercase) {
651c165b184SJames Collins						$node_classes = array_map('strtolower', $node_classes);
652c165b184SJames Collins					}
653c165b184SJames Collins
654c165b184SJames Collins					foreach($class as $c) {
655c165b184SJames Collins						if(!in_array($c, $node_classes)) {
656c165b184SJames Collins							$pass = false;
657c165b184SJames Collins							break;
658c165b184SJames Collins						}
659c165b184SJames Collins					}
660c165b184SJames Collins				} else {
661c165b184SJames Collins					$pass = false;
662c165b184SJames Collins				}
663c165b184SJames Collins			}
664c165b184SJames Collins
665c165b184SJames Collins			// Check attributes
666c165b184SJames Collins			if ($pass
667c165b184SJames Collins				&& $attributes !== ''
668c165b184SJames Collins				&& is_array($attributes)
669c165b184SJames Collins				&& !empty($attributes)) {
670c165b184SJames Collins					foreach($attributes as $a) {
671c165b184SJames Collins						list (
672c165b184SJames Collins							$att_name,
673c165b184SJames Collins							$att_expr,
674c165b184SJames Collins							$att_val,
675c165b184SJames Collins							$att_inv,
676c165b184SJames Collins							$att_case_sensitivity
677c165b184SJames Collins						) = $a;
678c165b184SJames Collins
679c165b184SJames Collins						// Handle indexing attributes (i.e. "[2]")
680c165b184SJames Collins						/**
681c165b184SJames Collins						 * Note: This is not supported by the CSS Standard but adds
682c165b184SJames Collins						 * the ability to select items compatible to XPath (i.e.
683c165b184SJames Collins						 * the 3rd element within it's parent).
684c165b184SJames Collins						 *
685c165b184SJames Collins						 * Note: This doesn't conflict with the CSS Standard which
686c165b184SJames Collins						 * doesn't work on numeric attributes anyway.
687c165b184SJames Collins						 */
688c165b184SJames Collins						if (is_numeric($att_name)
689c165b184SJames Collins							&& $att_expr === ''
690c165b184SJames Collins							&& $att_val === '') {
691c165b184SJames Collins								$count = 0;
692c165b184SJames Collins
693c165b184SJames Collins								// Find index of current element in parent
694c165b184SJames Collins								foreach ($node->parent->children as $c) {
695c165b184SJames Collins									if ($c->tag === $node->tag) ++$count;
696c165b184SJames Collins									if ($c === $node) break;
697c165b184SJames Collins								}
698c165b184SJames Collins
699c165b184SJames Collins								// If this is the correct node, continue with next
700c165b184SJames Collins								// attribute
701c165b184SJames Collins								if ($count === (int)$att_name) continue;
702c165b184SJames Collins						}
703c165b184SJames Collins
704c165b184SJames Collins						// Check attribute availability
705c165b184SJames Collins						if ($att_inv) { // Attribute should NOT be set
706c165b184SJames Collins							if (isset($node->attr[$att_name])) {
707c165b184SJames Collins								$pass = false;
708c165b184SJames Collins								break;
709c165b184SJames Collins							}
710c165b184SJames Collins						} else { // Attribute should be set
711c165b184SJames Collins							// todo: "plaintext" is not a valid CSS selector!
712c165b184SJames Collins							if ($att_name !== 'plaintext'
713c165b184SJames Collins								&& !isset($node->attr[$att_name])) {
714c165b184SJames Collins									$pass = false;
715c165b184SJames Collins									break;
716c165b184SJames Collins							}
717c165b184SJames Collins						}
718c165b184SJames Collins
719c165b184SJames Collins						// Continue with next attribute if expression isn't defined
720c165b184SJames Collins						if ($att_expr === '') continue;
721c165b184SJames Collins
722c165b184SJames Collins						// If they have told us that this is a "plaintext"
723c165b184SJames Collins						// search then we want the plaintext of the node - right?
724c165b184SJames Collins						// todo "plaintext" is not a valid CSS selector!
725c165b184SJames Collins						if ($att_name === 'plaintext') {
726c165b184SJames Collins							$nodeKeyValue = $node->text();
727c165b184SJames Collins						} else {
728c165b184SJames Collins							$nodeKeyValue = $node->attr[$att_name];
729c165b184SJames Collins						}
730c165b184SJames Collins
731c165b184SJames Collins						if (is_object($debug_object)) {
732c165b184SJames Collins							$debug_object->debug_log(2,
733c165b184SJames Collins								'testing node: '
734c165b184SJames Collins								. $node->tag
735c165b184SJames Collins								. ' for attribute: '
736c165b184SJames Collins								. $att_name
737c165b184SJames Collins								. $att_expr
738c165b184SJames Collins								. $att_val
739c165b184SJames Collins								. ' where nodes value is: '
740c165b184SJames Collins								. $nodeKeyValue
741c165b184SJames Collins							);
742c165b184SJames Collins						}
743c165b184SJames Collins
744c165b184SJames Collins						// If lowercase is set, do a case insensitive test of
745c165b184SJames Collins						// the value of the selector.
746c165b184SJames Collins						if ($lowercase) {
747c165b184SJames Collins							$check = $this->match(
748c165b184SJames Collins								$att_expr,
749c165b184SJames Collins								strtolower($att_val),
750c165b184SJames Collins								strtolower($nodeKeyValue),
751c165b184SJames Collins								$att_case_sensitivity
752c165b184SJames Collins							);
753c165b184SJames Collins						} else {
754c165b184SJames Collins							$check = $this->match(
755c165b184SJames Collins								$att_expr,
756c165b184SJames Collins								$att_val,
757c165b184SJames Collins								$nodeKeyValue,
758c165b184SJames Collins								$att_case_sensitivity
759c165b184SJames Collins							);
760c165b184SJames Collins						}
761c165b184SJames Collins
762c165b184SJames Collins						if (is_object($debug_object)) {
763c165b184SJames Collins							$debug_object->debug_log(2,
764c165b184SJames Collins								'after match: '
765c165b184SJames Collins								. ($check ? 'true' : 'false')
766c165b184SJames Collins							);
767c165b184SJames Collins						}
768c165b184SJames Collins
769c165b184SJames Collins						if (!$check) {
770c165b184SJames Collins							$pass = false;
771c165b184SJames Collins							break;
772c165b184SJames Collins						}
773c165b184SJames Collins					}
774c165b184SJames Collins			}
775c165b184SJames Collins
776c165b184SJames Collins			// Found a match. Add to list and clear node
777c165b184SJames Collins			if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
778c165b184SJames Collins			unset($node);
779c165b184SJames Collins		}
780c165b184SJames Collins		// It's passed by reference so this is actually what this function returns.
781c165b184SJames Collins		if (is_object($debug_object)) {
782c165b184SJames Collins			$debug_object->debug_log(1, 'EXIT - ret: ', $ret);
783c165b184SJames Collins		}
784c165b184SJames Collins	}
785c165b184SJames Collins
/**
 * Compare a node's attribute value with the value given in a selector.
 *
 * @param string $exp Attribute operator taken from the selector: "=",
 * "!=", "^=", "$=", "*=", "|=" or "~=". Any other operator never matches.
 * @param string $pattern Value written in the selector.
 * @param string $value Value found on the node.
 * @param string $case_sensitivity "i" lowercases both values before they
 * are compared; any other flag compares them as given.
 * @return bool|int Truthy on a match. "^=", "$=" and "*=" hand back the raw
 * preg_match() result, all other operators return a boolean.
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
	global $debug_object;
	if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

	if ($case_sensitivity === 'i') {
		$pattern = strtolower($pattern);
		$value = strtolower($value);
	}

	switch ($exp) {
		case '=':
			return ($value === $pattern);
		case '!=':
			return ($value !== $pattern);
		case '^=':
			return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
		case '$=':
			return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
		case '*=':
			return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
		case '|=':
			/**
			 * [att|=val]
			 *
			 * Represents an element with the att attribute, its value
			 * either being exactly "val" or beginning with "val"
			 * immediately followed by "-" (U+002D).
			 *
			 * A bare prefix test is not enough here: "en" must match
			 * "en" and "en-US", but not "english".
			 */
			return $value === $pattern
				|| strpos($value, $pattern . '-') === 0;
		case '~=':
			/**
			 * [att~=val]
			 *
			 * Represents an element with the att attribute whose value is a
			 * whitespace-separated list of words, one of which is exactly
			 * "val". If "val" contains whitespace, it will never represent
			 * anything (since the words are separated by spaces). Also if
			 * "val" is the empty string, it will never represent anything.
			 */
			return in_array($pattern, explode(' ', trim($value)), true);
	}

	// Unknown operator
	return false;
}
830c165b184SJames Collins
/**
 * Break a selector string down into its individual steps.
 *
 * @param string $selector_string One or more selectors, separated by commas.
 * @return array One entry per comma separated selector. Each entry is a
 * list of steps of the form array(tag, id, classes, attributes, separator):
 * - classes is '' or the list of class names,
 * - attributes is '' or a list of
 *   array(name, expression, value, inverted, case-sensitivity),
 * - separator is ' ' for whitespace only, '' when the step was followed
 *   by a comma, otherwise the trimmed separator characters.
 */
protected function parse_selector($selector_string)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	/**
	 * Selector pattern, derived from mootools (https://mootools.net/).
	 *
	 * Capture groups:
	 *
	 * [0] - full match
	 *
	 * [1] - tag name
	 *     ([\w:\*-]*)
	 *     Zero or more word characters, colons, asterisks and hyphens.
	 *     The colon is accepted so that names such as "attr:ibute" or
	 *     namespaced tags can be selected.
	 *
	 * [2] - id name
	 *     (?:\#([\w-]+))?
	 *     Optional "#" followed by word characters and hyphens.
	 *
	 * [3] - class names (without the leading dot)
	 *     (?:|\.([\w\.-]+))?
	 *     Optional "." followed by word characters, hyphens and further
	 *     dots, so chained classes (".foo.bar.baz") end up in one group.
	 *
	 * [4] - attributes
	 *     One or more "[...]" groups. The name may be prefixed by an "@"
	 *     that is not captured and by a "!" that is.
	 *
	 * [5] - separator
	 *     ([\/, >+~]+)
	 *     One or more separator characters.
	 */
	// phpcs:ignore Generic.Files.LineLength
	$pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

	// The appended blank acts as separator for the last step
	$subject = trim($selector_string) . ' ';
	preg_match_all($pattern, $subject, $matches, PREG_SET_ORDER);

	if (is_object($debug_object)) {
		$debug_object->debug_log(2, 'Matches Array: ', $matches);
	}

	$selectors = array();
	$steps = array();

	foreach ($matches as $m) {
		$full = trim($m[0]);

		// Nothing to do for empty matches and lone slashes
		if ($full === '' || $full === '/' || $full === '//') { continue; }

		// Tag name, lowercased when the document is lowercased
		$tag = $this->dom->lowercase ? strtolower($m[1]) : $m[1];

		$id = $m[2];

		// Class list: "foo.bar" becomes array('foo', 'bar')
		$classes = ($m[3] !== '') ? explode('.', $m[3]) : $m[3];

		/* Attribute list, split with a pattern mirroring group [4] above
		 *
		 * [0] - full match
		 * [1] - attribute name (a leading "!" negates the attribute)
		 * [2] - attribute expression
		 * [3] - attribute value
		 * [4] - case sensitivity
		 */
		$attrs = $m[4];

		if ($attrs !== '') {
			preg_match_all(
				"/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
				trim($attrs),
				$attributes,
				PREG_SET_ORDER
			);

			$attrs = array();

			foreach ($attributes as $att) {
				if (trim($att[0]) === '') { continue; }

				$name = $att[1];
				$inverted = (isset($name[0]) && $name[0] === '!');

				if ($inverted) {
					$name = substr($name, 1);
				}

				$expression = isset($att[2]) ? $att[2] : '';
				$value = isset($att[3]) ? $att[3] : '';
				$case = isset($att[4]) ? strtolower($att[4]) : '';

				$attrs[] = array($name, $expression, $value, $inverted, $case);
			}
		}

		// Whitespace only means "descendant"; anything else is trimmed
		$separator = trim($m[5]);

		if ($separator === '' && $m[5] !== '') {
			$separator = ' ';
		}

		// A comma ends the current selector and carries no separator
		$is_list = ($separator === ',');

		if ($is_list) {
			$separator = '';
		}

		$steps[] = array($tag, $id, $classes, $attrs, $separator);

		if ($is_list) {
			$selectors[] = $steps;
			$steps = array();
		}
	}

	if (count($steps) > 0) { $selectors[] = $steps; }

	return $selectors;
}
969c165b184SJames Collins
/**
 * Magic getter for attributes and the virtual text properties.
 *
 * @param string $name Attribute name, or one of "outertext", "innertext",
 * "plaintext", "xmltext".
 * @return mixed The attribute value passed through convert_text() when the
 * attribute is set to a non-null value; the matching text for a virtual
 * property; otherwise whether the key exists in the attribute list.
 */
function __get($name)
{
	// Real attributes win over the virtual properties
	if (isset($this->attr[$name])) {
		$raw = $this->attr[$name];
		return $this->convert_text($raw);
	}

	// Loose comparison on purpose, it mirrors what a switch would do
	if ($name == 'outertext') { return $this->outertext(); }
	if ($name == 'innertext') { return $this->innertext(); }
	if ($name == 'plaintext') { return $this->text(); }
	if ($name == 'xmltext') { return $this->xmltext(); }

	return array_key_exists($name, $this->attr);
}
983c165b184SJames Collins
/**
 * Magic setter for attributes and the virtual text properties.
 *
 * "outertext" and "innertext" are stored in the node's info array, every
 * other name is stored as an attribute.
 *
 * @param string $name Attribute name, "outertext" or "innertext".
 * @param mixed $value Value to store.
 * @return mixed The assigned value for "outertext" and "innertext", nothing
 * for attributes.
 */
function __set($name, $value)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	// Loose comparison on purpose, it mirrors what a switch would do
	if ($name == 'outertext') {
		return $this->_[HDOM_INFO_OUTER] = $value;
	}

	if ($name == 'innertext') {
		// Write to the text slot if the node has one, else to the inner slot
		$slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
		return $this->_[$slot] = $value;
	}

	// Attributes without a non-null value yet get spacing and quote entries
	$known = isset($this->attr[$name]);

	if (!$known) {
		$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
		$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
	}

	$this->attr[$name] = $value;
}
1005c165b184SJames Collins
/**
 * Magic isset() handler.
 *
 * @param string $name Attribute name or virtual property name.
 * @return bool True for "outertext", "innertext" and "plaintext", and for
 * every key present in the attribute list, even when its value is null.
 */
function __isset($name)
{
	// Loose lookup on purpose, it mirrors what a switch would do
	if (in_array($name, array('outertext', 'innertext', 'plaintext'))) {
		return true;
	}

	// Key presence also covers attributes whose stored value is null
	return array_key_exists($name, $this->attr) || isset($this->attr[$name]);
}
1016c165b184SJames Collins
/**
 * Magic unset() handler: drops an attribute from the attribute list.
 *
 * Attributes that are missing or hold null are left untouched.
 *
 * @param string $name Attribute name.
 * @return void
 */
function __unset($name)
{
	if (!isset($this->attr[$name])) {
		return;
	}

	unset($this->attr[$name]);
}
1021c165b184SJames Collins
/**
 * Convert text from the document's charset to its target charset.
 *
 * Both charsets are read from the owning DOM. Without a DOM, or when either
 * charset is empty or both are equal, the text is not converted. When the
 * target charset is UTF-8, a byte order mark at the start and at the end of
 * the result is removed.
 *
 * @param string $text Text to convert.
 * @return string|false The converted text, or whatever iconv() returned.
 */
function convert_text($text)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	$from = '';
	$to = '';

	if ($this->dom) {
		$from = strtoupper($this->dom->_charset);
		$to = strtoupper($this->dom->_target_charset);
	}

	if (is_object($debug_object)) {
		$debug_object->debug_log(3,
			'source charset: '
			. $from
			. ' target charaset: '
			. $to
		);
	}

	$output = $text;

	$charsetsDiffer = !empty($from)
		&& !empty($to)
		&& (strcasecmp($from, $to) != 0);

	if ($charsetsDiffer) {
		// The reported charset may be wrong, the text can be UTF-8 already
		$isUtf8Already = (strcasecmp($to, 'UTF-8') == 0)
			&& ($this->is_utf8($text));

		if (!$isUtf8Already) {
			$output = iconv($from, $to, $text);
		}
	}

	// Never hand out UTF-8 text that is wrapped in a byte order mark
	if ($to === 'UTF-8') {
		$bom = "\xef\xbb\xbf";

		if (substr($output, 0, 3) === $bom) {
			$output = substr($output, 3);
		}

		if (substr($output, -3) === $bom) {
			$output = substr($output, 0, -3);
		}
	}

	return $output;
}
1071c165b184SJames Collins
/**
 * Check whether a byte string is structurally valid UTF-8.
 *
 * Every lead byte must be followed by the number of continuation bytes
 * (0x80-0xBF) it announces. The legacy five and six byte forms are accepted
 * and overlong encodings are not rejected.
 *
 * @param string $str Bytes to check.
 * @return bool True when every byte sequence is well-formed, false otherwise.
 * The empty string is valid.
 */
static function is_utf8($str)
{
	$len = strlen($str);

	for ($i = 0; $i < $len; $i++) {
		$lead = ord($str[$i]);

		// ASCII. Note that 0x80 itself is NOT ASCII, hence "< 128"
		if ($lead < 128) { continue; }

		// Number of bytes in the sequence, including the lead byte
		if ($lead >= 254) { return false; }
		elseif ($lead >= 252) { $bits = 6; }
		elseif ($lead >= 248) { $bits = 5; }
		elseif ($lead >= 240) { $bits = 4; }
		elseif ($lead >= 224) { $bits = 3; }
		elseif ($lead >= 192) { $bits = 2; }
		else { return false; } // 0x80-0xBF: continuation byte without lead

		// Sequence would run past the end of the string
		if (($i + $bits) > $len) { return false; }

		while ($bits > 1) {
			$i++;
			$b = ord($str[$i]);
			if ($b < 128 || $b > 191) { return false; }
			$bits--;
		}
	}

	return true;
}
1098c165b184SJames Collins
/**
 * Determine the display size of an image from its own markup.
 *
 * The "width" and "height" attributes are used when present. A dimension
 * that is still unknown afterwards is taken from the inline style, but only
 * when it is given as a whole number of pixels (e.g. "width: 100px").
 * Stylesheets, classes and parent elements are not consulted.
 *
 * @return array|false False when the node is not an "img" element, otherwise
 * array('height' => ..., 'width' => ...). A dimension that could not be
 * determined is -1; known dimensions are returned as found in the markup.
 */
function get_display_size()
{
	if ($this->tag !== 'img') {
		return false;
	}

	// Key order (height first) is part of the returned shape
	$size = array(
		'height' => -1,
		'width' => -1
	);

	$dimensions = array('width', 'height');

	// Attributes on the tag itself take precedence over the inline style
	foreach ($dimensions as $dimension) {
		if (isset($this->attr[$dimension])) {
			$size[$dimension] = $this->attr[$dimension];
		}
	}

	if (isset($this->attr['style'])) {
		// Thanks to user gnarf from stackoverflow for this regular expression.
		preg_match_all(
			'/([\w-]+)\s*:\s*([^;]+)\s*;?/',
			$this->attr['style'],
			$matches,
			PREG_SET_ORDER
		);

		$declarations = array();

		foreach ($matches as $declaration) {
			// The value group is greedy and swallows whitespace in front of
			// the ";" which would hide the "px" suffix below
			$declarations[$declaration[1]] = trim($declaration[2]);
		}

		foreach ($dimensions as $dimension) {
			if ($size[$dimension] != -1 || !isset($declarations[$dimension])) {
				continue;
			}

			$declared = $declarations[$dimension];

			// Only pixel values are understood
			if (strtolower(substr($declared, -2)) !== 'px') {
				continue;
			}

			$pixels = substr($declared, 0, -2);

			// Compare against false explicitly, "0" is a valid integer
			if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
				$size[$dimension] = $pixels;
			}
		}
	}

	return $size;
}
1183c165b184SJames Collins
/**
 * Render the node and optionally write the result to a file.
 *
 * @param string $filepath Target file. Nothing is written when this is the
 * empty string.
 * @return string The node's outer text.
 */
function save($filepath = '')
{
	$markup = $this->outertext();

	if ($filepath === '') {
		return $markup;
	}

	// Written under an exclusive lock
	file_put_contents($filepath, $markup, LOCK_EX);

	return $markup;
}
1194c165b184SJames Collins
/**
 * Add one or more class names to the node.
 *
 * Names the node already carries are not added again. Blank names, as
 * produced by repeated spaces in the input, are skipped.
 *
 * @param string|array $class Space separated class names or a list of class
 * names. Any other type is ignored (and reported to the debug object, when
 * one is active).
 * @return void
 */
function addClass($class)
{
	// Without this declaration the debug check below reads an undefined local
	global $debug_object;

	if (is_string($class)) {
		$class = explode(' ', $class);
	}

	if (!is_array($class)) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
		}

		return;
	}

	foreach ($class as $c) {
		// "a  b" explodes into array('a', '', 'b')
		if (is_string($c) && trim($c) === '') {
			continue;
		}

		if (!isset($this->class)) {
			$this->class = $c;
		} elseif (!$this->hasClass($c)) {
			$this->class .= ' ' . $c;
		}
	}
}
1219c165b184SJames Collins
/**
 * Check whether the node carries a class name.
 *
 * @param string $class A single class name. Any other type is reported to
 * the debug object (when one is active) and never matches.
 * @return bool True when the name is one of the space separated entries of
 * the node's class attribute.
 */
function hasClass($class)
{
	// Without this declaration the debug check below reads an undefined local
	global $debug_object;

	if (!is_string($class)) {
		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
		}

		return false;
	}

	if (!isset($this->class)) {
		return false;
	}

	return in_array($class, explode(' ', $this->class), true);
}
1234c165b184SJames Collins
/**
 * Remove class names from the node.
 *
 * @param string|array|null $class Space separated class names or a list of
 * class names to remove. Null removes the whole class attribute. Any other
 * type leaves the node unchanged.
 * @return void
 */
function removeClass($class = null)
{
	// No class attribute, nothing to remove
	if (!isset($this->class)) {
		return;
	}

	if ($class === null) {
		$this->removeAttribute('class');
		return;
	}

	$unwanted = is_string($class) ? explode(' ', $class) : $class;

	if (!is_array($unwanted)) {
		return;
	}

	$current = explode(' ', $this->class);
	$remaining = array_diff($current, $unwanted);

	// Remove the attribute altogether once no class is left
	if (empty($remaining)) {
		$this->removeAttribute('class');
		return;
	}

	$this->class = implode(' ', $remaining);
}
1259c165b184SJames Collins
/**
 * Get the node's attribute list.
 *
 * @return array The attribute list exactly as stored on the node.
 */
function getAllAttributes()
{
	$attributes = $this->attr;

	return $attributes;
}
1264c165b184SJames Collins
/**
 * Read an attribute by name.
 *
 * Method form of the magic getter; use it for names that cannot be written
 * as a PHP property (e.g. names containing a colon).
 *
 * @param string $name Attribute name.
 * @return mixed Whatever __get() returns for that name.
 */
function getAttribute($name)
{
	$value = $this->__get($name);

	return $value;
}
1269c165b184SJames Collins
/**
 * Write an attribute by name.
 *
 * Method form of the magic setter. The setter's result is not passed on.
 *
 * @param string $name Attribute name.
 * @param mixed $value Value to store.
 * @return void
 */
function setAttribute($name, $value)
{
	$this->__set(
		$name,
		$value
	);
}
1274c165b184SJames Collins
/**
 * Check for an attribute by name.
 *
 * Method form of the magic isset() handler.
 *
 * @param string $name Attribute name.
 * @return bool Whatever __isset() returns for that name.
 */
function hasAttribute($name)
{
	$exists = $this->__isset($name);

	return $exists;
}
1279c165b184SJames Collins
/**
 * Remove an attribute by name.
 *
 * The attribute is not deleted from the attribute list; null is assigned to
 * it through the magic setter.
 *
 * @param string $name Attribute name.
 * @return void
 */
function removeAttribute($name)
{
	$removed = null;

	$this->__set($name, $removed);
}
1284c165b184SJames Collins
/**
 * Detach this node from its parent.
 *
 * Does nothing for nodes without a parent.
 *
 * @return void
 */
function remove()
{
	if (!$this->parent) {
		return;
	}

	$this->parent->removeChild($this);
}
1291c165b184SJames Collins
1292c165b184SJames Collins	function removeChild($node)
1293c165b184SJames Collins	{
1294c165b184SJames Collins		$nidx = array_search($node, $this->nodes, true);
1295c165b184SJames Collins		$cidx = array_search($node, $this->children, true);
1296c165b184SJames Collins		$didx = array_search($node, $this->dom->nodes, true);
1297c165b184SJames Collins
1298c165b184SJames Collins		if ($nidx !== false && $cidx !== false && $didx !== false) {
1299c165b184SJames Collins
1300c165b184SJames Collins			foreach($node->children as $child) {
1301c165b184SJames Collins				$node->removeChild($child);
1302c165b184SJames Collins			}
1303c165b184SJames Collins
1304c165b184SJames Collins			foreach($node->nodes as $entity) {
1305c165b184SJames Collins				$enidx = array_search($entity, $node->nodes, true);
1306c165b184SJames Collins				$edidx = array_search($entity, $node->dom->nodes, true);
1307c165b184SJames Collins
1308c165b184SJames Collins				if ($enidx !== false && $edidx !== false) {
1309c165b184SJames Collins					unset($node->nodes[$enidx]);
1310c165b184SJames Collins					unset($node->dom->nodes[$edidx]);
1311c165b184SJames Collins				}
1312c165b184SJames Collins			}
1313c165b184SJames Collins
1314c165b184SJames Collins			unset($this->nodes[$nidx]);
1315c165b184SJames Collins			unset($this->children[$cidx]);
1316c165b184SJames Collins			unset($this->dom->nodes[$didx]);
1317c165b184SJames Collins
1318c165b184SJames Collins			$node->clear();
1319c165b184SJames Collins
1320c165b184SJames Collins		}
1321c165b184SJames Collins	}
1322c165b184SJames Collins
1323c165b184SJames Collins	function getElementById($id)
1324c165b184SJames Collins	{
1325c165b184SJames Collins		return $this->find("#$id", 0);
1326c165b184SJames Collins	}
1327c165b184SJames Collins
1328c165b184SJames Collins	function getElementsById($id, $idx = null)
1329c165b184SJames Collins	{
1330c165b184SJames Collins		return $this->find("#$id", $idx);
1331c165b184SJames Collins	}
1332c165b184SJames Collins
1333c165b184SJames Collins	function getElementByTagName($name)
1334c165b184SJames Collins	{
1335c165b184SJames Collins		return $this->find($name, 0);
1336c165b184SJames Collins	}
1337c165b184SJames Collins
1338c165b184SJames Collins	function getElementsByTagName($name, $idx = null)
1339c165b184SJames Collins	{
1340c165b184SJames Collins		return $this->find($name, $idx);
1341c165b184SJames Collins	}
1342c165b184SJames Collins
1343c165b184SJames Collins	function parentNode()
1344c165b184SJames Collins	{
1345c165b184SJames Collins		return $this->parent();
1346c165b184SJames Collins	}
1347c165b184SJames Collins
1348c165b184SJames Collins	function childNodes($idx = -1)
1349c165b184SJames Collins	{
1350c165b184SJames Collins		return $this->children($idx);
1351c165b184SJames Collins	}
1352c165b184SJames Collins
1353c165b184SJames Collins	function firstChild()
1354c165b184SJames Collins	{
1355c165b184SJames Collins		return $this->first_child();
1356c165b184SJames Collins	}
1357c165b184SJames Collins
1358c165b184SJames Collins	function lastChild()
1359c165b184SJames Collins	{
1360c165b184SJames Collins		return $this->last_child();
1361c165b184SJames Collins	}
1362c165b184SJames Collins
1363c165b184SJames Collins	function nextSibling()
1364c165b184SJames Collins	{
1365c165b184SJames Collins		return $this->next_sibling();
1366c165b184SJames Collins	}
1367c165b184SJames Collins
1368c165b184SJames Collins	function previousSibling()
1369c165b184SJames Collins	{
1370c165b184SJames Collins		return $this->prev_sibling();
1371c165b184SJames Collins	}
1372c165b184SJames Collins
1373c165b184SJames Collins	function hasChildNodes()
1374c165b184SJames Collins	{
1375c165b184SJames Collins		return $this->has_child();
1376c165b184SJames Collins	}
1377c165b184SJames Collins
1378c165b184SJames Collins	function nodeName()
1379c165b184SJames Collins	{
1380c165b184SJames Collins		return $this->tag;
1381c165b184SJames Collins	}
1382c165b184SJames Collins
1383c165b184SJames Collins	function appendChild($node)
1384c165b184SJames Collins	{
1385c165b184SJames Collins		$node->parent($this);
1386c165b184SJames Collins		return $node;
1387c165b184SJames Collins	}
1388c165b184SJames Collins
1389c165b184SJames Collins}
1390c165b184SJames Collins
1391c165b184SJames Collinsclass simple_html_dom
1392c165b184SJames Collins{
	// Root node of the parsed document; created in prepare() with tag 'root'.
	public $root = null;
	// Node list of the document; reset to an empty array in prepare().
	public $nodes = array();
	// Callback registered via set_callback() / cleared via remove_callback().
	public $callback = null;
	// Lowercase flag as passed to prepare().
	public $lowercase = false;
	// Byte length of the trimmed document as first stored by prepare().
	public $original_size;
	// Current byte length of $doc (updated in load() after \r/\n replacement).
	public $size;

	// Current byte offset into $doc while parsing.
	protected $pos;
	// The document text being parsed.
	protected $doc;
	// Character at $pos, or null once the end of the document is reached.
	protected $char;

	// Running counter used for node begin/end bookkeeping (starts at 1).
	protected $cursor;
	// Node that currently acts as parent while parsing (starts as $root).
	protected $parent;
	// Noise store; reset to an empty array in prepare().
	protected $noise = array();
	// Whitespace characters skipped inside tags.
	protected $token_blank = " \t\r\n";
	// NOTE(review): presumably the delimiters ending an attribute name - usage not visible here.
	protected $token_equal = ' =/>';
	// Delimiters that end a tag name when reading a start tag.
	protected $token_slash = " />\r\n\t";
	// NOTE(review): presumably the delimiters ending an unquoted attribute value - usage not visible here.
	protected $token_attr = ' >';

	// Charset determined by parse_charset().
	public $_charset = '';
	// Target charset as passed to the constructor.
	public $_target_charset = '';

	// Default <br> text as passed to prepare().
	protected $default_br_text = '';

	// Default <span> text as passed to prepare().
	public $default_span_text = '';

	// Tag names treated as self-closing (looked up by key).
	protected $self_closing_tags = array(
		'area' => 1,
		'base' => 1,
		'br' => 1,
		'col' => 1,
		'embed' => 1,
		'hr' => 1,
		'img' => 1,
		'input' => 1,
		'link' => 1,
		'meta' => 1,
		'param' => 1,
		'source' => 1,
		'track' => 1,
		'wbr' => 1
	);
	// Tag names whose end tag may close an unclosed ancestor (looked up by
	// key in read_tag()).
	protected $block_tags = array(
		'body' => 1,
		'div' => 1,
		'form' => 1,
		'root' => 1,
		'span' => 1,
		'table' => 1
	);
	// Tags whose closing tag may be omitted, keyed by tag name. Only the
	// keys are consulted in read_tag()'s end-tag handling.
	// NOTE(review): the inner arrays presumably list the tags that implicitly
	// close the key tag - confirm against the start-tag handling.
	protected $optional_closing_tags = array(
		// Not optional, see
		// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
		'b' => array('b' => 1),
		'dd' => array('dd' => 1, 'dt' => 1),
		// Not optional, see
		// https://www.w3.org/TR/html/grouping-content.html#the-dl-element
		'dl' => array('dd' => 1, 'dt' => 1),
		'dt' => array('dd' => 1, 'dt' => 1),
		'li' => array('li' => 1),
		'optgroup' => array('optgroup' => 1, 'option' => 1),
		'option' => array('optgroup' => 1, 'option' => 1),
		'p' => array('p' => 1),
		'rp' => array('rp' => 1, 'rt' => 1),
		'rt' => array('rp' => 1, 'rt' => 1),
		'td' => array('td' => 1, 'th' => 1),
		'th' => array('td' => 1, 'th' => 1),
		'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
	);
1462c165b184SJames Collins
1463c165b184SJames Collins	function __construct(
1464c165b184SJames Collins		$str = null,
1465c165b184SJames Collins		$lowercase = true,
1466c165b184SJames Collins		$forceTagsClosed = true,
1467c165b184SJames Collins		$target_charset = DEFAULT_TARGET_CHARSET,
1468c165b184SJames Collins		$stripRN = true,
1469c165b184SJames Collins		$defaultBRText = DEFAULT_BR_TEXT,
1470c165b184SJames Collins		$defaultSpanText = DEFAULT_SPAN_TEXT,
1471c165b184SJames Collins		$options = 0)
1472c165b184SJames Collins	{
1473c165b184SJames Collins		if ($str) {
1474c165b184SJames Collins			if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
1475c165b184SJames Collins				$this->load_file($str);
1476c165b184SJames Collins			} else {
1477c165b184SJames Collins				$this->load(
1478c165b184SJames Collins					$str,
1479c165b184SJames Collins					$lowercase,
1480c165b184SJames Collins					$stripRN,
1481c165b184SJames Collins					$defaultBRText,
1482c165b184SJames Collins					$defaultSpanText,
1483c165b184SJames Collins					$options
1484c165b184SJames Collins				);
1485c165b184SJames Collins			}
1486c165b184SJames Collins		}
1487c165b184SJames Collins		// Forcing tags to be closed implies that we don't trust the html, but
1488c165b184SJames Collins		// it can lead to parsing errors if we SHOULD trust the html.
1489c165b184SJames Collins		if (!$forceTagsClosed) {
1490c165b184SJames Collins			$this->optional_closing_array = array();
1491c165b184SJames Collins		}
1492c165b184SJames Collins
1493c165b184SJames Collins		$this->_target_charset = $target_charset;
1494c165b184SJames Collins	}
1495c165b184SJames Collins
1496c165b184SJames Collins	function __destruct()
1497c165b184SJames Collins	{
1498c165b184SJames Collins		$this->clear();
1499c165b184SJames Collins	}
1500c165b184SJames Collins
1501c165b184SJames Collins	function load(
1502c165b184SJames Collins		$str,
1503c165b184SJames Collins		$lowercase = true,
1504c165b184SJames Collins		$stripRN = true,
1505c165b184SJames Collins		$defaultBRText = DEFAULT_BR_TEXT,
1506c165b184SJames Collins		$defaultSpanText = DEFAULT_SPAN_TEXT,
1507c165b184SJames Collins		$options = 0)
1508c165b184SJames Collins	{
1509c165b184SJames Collins		global $debug_object;
1510c165b184SJames Collins
1511c165b184SJames Collins		// prepare
1512c165b184SJames Collins		$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
1513c165b184SJames Collins
1514c165b184SJames Collins		// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1515c165b184SJames Collins		// Script tags removal now preceeds style tag removal.
1516c165b184SJames Collins		// strip out <script> tags
1517c165b184SJames Collins		$this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
1518c165b184SJames Collins		$this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
1519c165b184SJames Collins
1520c165b184SJames Collins		// strip out the \r \n's if we are told to.
1521c165b184SJames Collins		if ($stripRN) {
1522c165b184SJames Collins			$this->doc = str_replace("\r", ' ', $this->doc);
1523c165b184SJames Collins			$this->doc = str_replace("\n", ' ', $this->doc);
1524c165b184SJames Collins
1525c165b184SJames Collins			// set the length of content since we have changed it.
1526c165b184SJames Collins			$this->size = strlen($this->doc);
1527c165b184SJames Collins		}
1528c165b184SJames Collins
1529c165b184SJames Collins		// strip out cdata
1530c165b184SJames Collins		$this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
1531c165b184SJames Collins		// strip out comments
1532c165b184SJames Collins		$this->remove_noise("'<!--(.*?)-->'is");
1533c165b184SJames Collins		// strip out <style> tags
1534c165b184SJames Collins		$this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
1535c165b184SJames Collins		$this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
1536c165b184SJames Collins		// strip out preformatted tags
1537c165b184SJames Collins		$this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
1538c165b184SJames Collins		// strip out server side scripts
1539c165b184SJames Collins		$this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
1540c165b184SJames Collins
1541c165b184SJames Collins		if($options & HDOM_SMARTY_AS_TEXT) { // Strip Smarty scripts
1542c165b184SJames Collins			$this->remove_noise("'(\{\w)(.*?)(\})'s", true);
1543c165b184SJames Collins		}
1544c165b184SJames Collins
1545c165b184SJames Collins		// parsing
1546c165b184SJames Collins		$this->parse();
1547c165b184SJames Collins		// end
1548c165b184SJames Collins		$this->root->_[HDOM_INFO_END] = $this->cursor;
1549c165b184SJames Collins		$this->parse_charset();
1550c165b184SJames Collins
1551c165b184SJames Collins		// make load function chainable
1552c165b184SJames Collins		return $this;
1553c165b184SJames Collins	}
1554c165b184SJames Collins
1555c165b184SJames Collins	function load_file()
1556c165b184SJames Collins	{
1557c165b184SJames Collins		$args = func_get_args();
1558c165b184SJames Collins
1559c165b184SJames Collins		if(($doc = call_user_func_array('file_get_contents', $args)) !== false) {
1560c165b184SJames Collins			$this->load($doc, true);
1561c165b184SJames Collins		} else {
1562c165b184SJames Collins			return false;
1563c165b184SJames Collins		}
1564c165b184SJames Collins	}
1565c165b184SJames Collins
1566c165b184SJames Collins	function set_callback($function_name)
1567c165b184SJames Collins	{
1568c165b184SJames Collins		$this->callback = $function_name;
1569c165b184SJames Collins	}
1570c165b184SJames Collins
1571c165b184SJames Collins	function remove_callback()
1572c165b184SJames Collins	{
1573c165b184SJames Collins		$this->callback = null;
1574c165b184SJames Collins	}
1575c165b184SJames Collins
1576c165b184SJames Collins	function save($filepath = '')
1577c165b184SJames Collins	{
1578c165b184SJames Collins		$ret = $this->root->innertext();
1579c165b184SJames Collins		if ($filepath !== '') { file_put_contents($filepath, $ret, LOCK_EX); }
1580c165b184SJames Collins		return $ret;
1581c165b184SJames Collins	}
1582c165b184SJames Collins
1583c165b184SJames Collins	function find($selector, $idx = null, $lowercase = false)
1584c165b184SJames Collins	{
1585c165b184SJames Collins		return $this->root->find($selector, $idx, $lowercase);
1586c165b184SJames Collins	}
1587c165b184SJames Collins
1588c165b184SJames Collins	function clear()
1589c165b184SJames Collins	{
1590c165b184SJames Collins		if (isset($this->nodes)) {
1591c165b184SJames Collins			foreach ($this->nodes as $n) {
1592c165b184SJames Collins				$n->clear();
1593c165b184SJames Collins				$n = null;
1594c165b184SJames Collins			}
1595c165b184SJames Collins		}
1596c165b184SJames Collins
1597c165b184SJames Collins		// This add next line is documented in the sourceforge repository.
1598c165b184SJames Collins		// 2977248 as a fix for ongoing memory leaks that occur even with the
1599c165b184SJames Collins		// use of clear.
1600c165b184SJames Collins		if (isset($this->children)) {
1601c165b184SJames Collins			foreach ($this->children as $n) {
1602c165b184SJames Collins				$n->clear();
1603c165b184SJames Collins				$n = null;
1604c165b184SJames Collins			}
1605c165b184SJames Collins		}
1606c165b184SJames Collins
1607c165b184SJames Collins		if (isset($this->parent)) {
1608c165b184SJames Collins			$this->parent->clear();
1609c165b184SJames Collins			unset($this->parent);
1610c165b184SJames Collins		}
1611c165b184SJames Collins
1612c165b184SJames Collins		if (isset($this->root)) {
1613c165b184SJames Collins			$this->root->clear();
1614c165b184SJames Collins			unset($this->root);
1615c165b184SJames Collins		}
1616c165b184SJames Collins
1617c165b184SJames Collins		unset($this->doc);
1618c165b184SJames Collins		unset($this->noise);
1619c165b184SJames Collins	}
1620c165b184SJames Collins
1621c165b184SJames Collins	function dump($show_attr = true)
1622c165b184SJames Collins	{
1623c165b184SJames Collins		$this->root->dump($show_attr);
1624c165b184SJames Collins	}
1625c165b184SJames Collins
1626c165b184SJames Collins	protected function prepare(
1627c165b184SJames Collins		$str, $lowercase = true,
1628c165b184SJames Collins		$defaultBRText = DEFAULT_BR_TEXT,
1629c165b184SJames Collins		$defaultSpanText = DEFAULT_SPAN_TEXT)
1630c165b184SJames Collins	{
1631c165b184SJames Collins		$this->clear();
1632c165b184SJames Collins
1633c165b184SJames Collins		$this->doc = trim($str);
1634c165b184SJames Collins		$this->size = strlen($this->doc);
1635c165b184SJames Collins		$this->original_size = $this->size; // original size of the html
1636c165b184SJames Collins		$this->pos = 0;
1637c165b184SJames Collins		$this->cursor = 1;
1638c165b184SJames Collins		$this->noise = array();
1639c165b184SJames Collins		$this->nodes = array();
1640c165b184SJames Collins		$this->lowercase = $lowercase;
1641c165b184SJames Collins		$this->default_br_text = $defaultBRText;
1642c165b184SJames Collins		$this->default_span_text = $defaultSpanText;
1643c165b184SJames Collins		$this->root = new simple_html_dom_node($this);
1644c165b184SJames Collins		$this->root->tag = 'root';
1645c165b184SJames Collins		$this->root->_[HDOM_INFO_BEGIN] = -1;
1646c165b184SJames Collins		$this->root->nodetype = HDOM_TYPE_ROOT;
1647c165b184SJames Collins		$this->parent = $this->root;
1648c165b184SJames Collins		if ($this->size > 0) { $this->char = $this->doc[0]; }
1649c165b184SJames Collins	}
1650c165b184SJames Collins
1651c165b184SJames Collins	protected function parse()
1652c165b184SJames Collins	{
1653c165b184SJames Collins		while (true) {
1654c165b184SJames Collins			// Read next tag if there is no text between current position and the
1655c165b184SJames Collins			// next opening tag.
1656c165b184SJames Collins			if (($s = $this->copy_until_char('<')) === '') {
1657c165b184SJames Collins				if($this->read_tag()) {
1658c165b184SJames Collins					continue;
1659c165b184SJames Collins				} else {
1660c165b184SJames Collins					return true;
1661c165b184SJames Collins				}
1662c165b184SJames Collins			}
1663c165b184SJames Collins
1664c165b184SJames Collins			// Add a text node for text between tags
1665c165b184SJames Collins			$node = new simple_html_dom_node($this);
1666c165b184SJames Collins			++$this->cursor;
1667c165b184SJames Collins			$node->_[HDOM_INFO_TEXT] = $s;
1668c165b184SJames Collins			$this->link_nodes($node, false);
1669c165b184SJames Collins		}
1670c165b184SJames Collins	}
1671c165b184SJames Collins
1672c165b184SJames Collins	protected function parse_charset()
1673c165b184SJames Collins	{
1674c165b184SJames Collins		global $debug_object;
1675c165b184SJames Collins
1676c165b184SJames Collins		$charset = null;
1677c165b184SJames Collins
1678c165b184SJames Collins		if (function_exists('get_last_retrieve_url_contents_content_type')) {
1679c165b184SJames Collins			$contentTypeHeader = get_last_retrieve_url_contents_content_type();
1680c165b184SJames Collins			$success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
1681c165b184SJames Collins			if ($success) {
1682c165b184SJames Collins				$charset = $matches[1];
1683c165b184SJames Collins				if (is_object($debug_object)) {
1684c165b184SJames Collins					$debug_object->debug_log(2,
1685c165b184SJames Collins						'header content-type found charset of: '
1686c165b184SJames Collins						. $charset
1687c165b184SJames Collins					);
1688c165b184SJames Collins				}
1689c165b184SJames Collins			}
1690c165b184SJames Collins		}
1691c165b184SJames Collins
1692c165b184SJames Collins		if (empty($charset)) {
1693c165b184SJames Collins			// https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
1694c165b184SJames Collins			$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
1695c165b184SJames Collins
1696c165b184SJames Collins			if (!empty($el)) {
1697c165b184SJames Collins				$fullvalue = $el->content;
1698c165b184SJames Collins				if (is_object($debug_object)) {
1699c165b184SJames Collins					$debug_object->debug_log(2,
1700c165b184SJames Collins						'meta content-type tag found'
1701c165b184SJames Collins						. $fullvalue
1702c165b184SJames Collins					);
1703c165b184SJames Collins				}
1704c165b184SJames Collins
1705c165b184SJames Collins				if (!empty($fullvalue)) {
1706c165b184SJames Collins					$success = preg_match(
1707c165b184SJames Collins						'/charset=(.+)/i',
1708c165b184SJames Collins						$fullvalue,
1709c165b184SJames Collins						$matches
1710c165b184SJames Collins					);
1711c165b184SJames Collins
1712c165b184SJames Collins					if ($success) {
1713c165b184SJames Collins						$charset = $matches[1];
1714c165b184SJames Collins					} else {
1715c165b184SJames Collins						// If there is a meta tag, and they don't specify the
1716c165b184SJames Collins						// character set, research says that it's typically
1717c165b184SJames Collins						// ISO-8859-1
1718c165b184SJames Collins						if (is_object($debug_object)) {
1719c165b184SJames Collins							$debug_object->debug_log(2,
1720c165b184SJames Collins								'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
1721c165b184SJames Collins							);
1722c165b184SJames Collins						}
1723c165b184SJames Collins
1724c165b184SJames Collins						$charset = 'ISO-8859-1';
1725c165b184SJames Collins					}
1726c165b184SJames Collins				}
1727c165b184SJames Collins			}
1728c165b184SJames Collins		}
1729c165b184SJames Collins
1730c165b184SJames Collins		if (empty($charset)) {
1731c165b184SJames Collins			// https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
1732c165b184SJames Collins			if ($meta = $this->root->find('meta[charset]', 0)) {
1733c165b184SJames Collins				$charset = $meta->charset;
1734c165b184SJames Collins				if (is_object($debug_object)) {
1735c165b184SJames Collins					$debug_object->debug_log(2, 'meta charset: ' . $charset);
1736c165b184SJames Collins				}
1737c165b184SJames Collins			}
1738c165b184SJames Collins		}
1739c165b184SJames Collins
1740c165b184SJames Collins		if (empty($charset)) {
1741c165b184SJames Collins			// Try to guess the charset based on the content
1742c165b184SJames Collins			// Requires Multibyte String (mbstring) support (optional)
1743c165b184SJames Collins			if (function_exists('mb_detect_encoding')) {
1744c165b184SJames Collins				/**
1745c165b184SJames Collins				 * mb_detect_encoding() is not intended to distinguish between
1746c165b184SJames Collins				 * charsets, especially single-byte charsets. Its primary
1747c165b184SJames Collins				 * purpose is to detect which multibyte encoding is in use,
1748c165b184SJames Collins				 * i.e. UTF-8, UTF-16, shift-JIS, etc.
1749c165b184SJames Collins				 *
1750c165b184SJames Collins				 * -- https://bugs.php.net/bug.php?id=38138
1751c165b184SJames Collins				 *
1752c165b184SJames Collins				 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
1753c165b184SJames Collins				 * always result in CP1251/ISO-8859-5 and vice versa.
1754c165b184SJames Collins				 *
1755c165b184SJames Collins				 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
1756c165b184SJames Collins				 * to stay compatible.
1757c165b184SJames Collins				 */
1758c165b184SJames Collins				$encoding = mb_detect_encoding(
1759c165b184SJames Collins					$this->doc,
1760c165b184SJames Collins					array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
1761c165b184SJames Collins				);
1762c165b184SJames Collins
1763c165b184SJames Collins				if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
1764c165b184SJames Collins					// Due to a limitation of mb_detect_encoding
1765c165b184SJames Collins					// 'CP1251'/'ISO-8859-5' will be detected as
1766c165b184SJames Collins					// 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
1767c165b184SJames Collins					// which case we can simply assume it is the other charset.
1768c165b184SJames Collins					if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
1769c165b184SJames Collins						$encoding = 'CP1251';
1770c165b184SJames Collins					}
1771c165b184SJames Collins				}
1772c165b184SJames Collins
1773c165b184SJames Collins				if ($encoding !== false) {
1774c165b184SJames Collins					$charset = $encoding;
1775c165b184SJames Collins					if (is_object($debug_object)) {
1776c165b184SJames Collins						$debug_object->debug_log(2, 'mb_detect: ' . $charset);
1777c165b184SJames Collins					}
1778c165b184SJames Collins				}
1779c165b184SJames Collins			}
1780c165b184SJames Collins		}
1781c165b184SJames Collins
1782c165b184SJames Collins		if (empty($charset)) {
1783c165b184SJames Collins			// Assume it's UTF-8 as it is the most likely charset to be used
1784c165b184SJames Collins			$charset = 'UTF-8';
1785c165b184SJames Collins			if (is_object($debug_object)) {
1786c165b184SJames Collins				$debug_object->debug_log(2, 'No match found, assume ' . $charset);
1787c165b184SJames Collins			}
1788c165b184SJames Collins		}
1789c165b184SJames Collins
1790c165b184SJames Collins		// Since CP1252 is a superset, if we get one of it's subsets, we want
1791c165b184SJames Collins		// it instead.
1792c165b184SJames Collins		if ((strtolower($charset) == 'iso-8859-1')
1793c165b184SJames Collins			|| (strtolower($charset) == 'latin1')
1794c165b184SJames Collins			|| (strtolower($charset) == 'latin-1')) {
1795c165b184SJames Collins			$charset = 'CP1252';
1796c165b184SJames Collins			if (is_object($debug_object)) {
1797c165b184SJames Collins				$debug_object->debug_log(2,
1798c165b184SJames Collins					'replacing ' . $charset . ' with CP1252 as its a superset'
1799c165b184SJames Collins				);
1800c165b184SJames Collins			}
1801c165b184SJames Collins		}
1802c165b184SJames Collins
1803c165b184SJames Collins		if (is_object($debug_object)) {
1804c165b184SJames Collins			$debug_object->debug_log(1, 'EXIT - ' . $charset);
1805c165b184SJames Collins		}
1806c165b184SJames Collins
1807c165b184SJames Collins		return $this->_charset = $charset;
1808c165b184SJames Collins	}
1809c165b184SJames Collins
1810c165b184SJames Collins	protected function read_tag()
1811c165b184SJames Collins	{
1812c165b184SJames Collins		// Set end position if no further tags found
1813c165b184SJames Collins		if ($this->char !== '<') {
1814c165b184SJames Collins			$this->root->_[HDOM_INFO_END] = $this->cursor;
1815c165b184SJames Collins			return false;
1816c165b184SJames Collins		}
1817c165b184SJames Collins
1818c165b184SJames Collins		$begin_tag_pos = $this->pos;
1819c165b184SJames Collins		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1820c165b184SJames Collins
1821c165b184SJames Collins		// end tag
1822c165b184SJames Collins		if ($this->char === '/') {
1823c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1824c165b184SJames Collins
1825c165b184SJames Collins			// Skip whitespace in end tags (i.e. in "</   html>")
1826c165b184SJames Collins			$this->skip($this->token_blank);
1827c165b184SJames Collins			$tag = $this->copy_until_char('>');
1828c165b184SJames Collins
1829c165b184SJames Collins			// Skip attributes in end tags
1830c165b184SJames Collins			if (($pos = strpos($tag, ' ')) !== false) {
1831c165b184SJames Collins				$tag = substr($tag, 0, $pos);
1832c165b184SJames Collins			}
1833c165b184SJames Collins
1834c165b184SJames Collins			$parent_lower = strtolower($this->parent->tag);
1835c165b184SJames Collins			$tag_lower = strtolower($tag);
1836c165b184SJames Collins
1837c165b184SJames Collins			// The end tag is supposed to close the parent tag. Handle situations
1838c165b184SJames Collins			// when it doesn't
1839c165b184SJames Collins			if ($parent_lower !== $tag_lower) {
1840c165b184SJames Collins				// Parent tag does not have to be closed necessarily (optional closing tag)
1841c165b184SJames Collins				// Current tag is a block tag, so it may close an ancestor
1842c165b184SJames Collins				if (isset($this->optional_closing_tags[$parent_lower])
1843c165b184SJames Collins					&& isset($this->block_tags[$tag_lower])) {
1844c165b184SJames Collins
1845c165b184SJames Collins					$this->parent->_[HDOM_INFO_END] = 0;
1846c165b184SJames Collins					$org_parent = $this->parent;
1847c165b184SJames Collins
1848c165b184SJames Collins					// Traverse ancestors to find a matching opening tag
1849c165b184SJames Collins					// Stop at root node
1850c165b184SJames Collins					while (($this->parent->parent)
1851c165b184SJames Collins						&& strtolower($this->parent->tag) !== $tag_lower
1852c165b184SJames Collins					){
1853c165b184SJames Collins						$this->parent = $this->parent->parent;
1854c165b184SJames Collins					}
1855c165b184SJames Collins
1856c165b184SJames Collins					// If we don't have a match add current tag as text node
1857c165b184SJames Collins					if (strtolower($this->parent->tag) !== $tag_lower) {
1858c165b184SJames Collins						$this->parent = $org_parent; // restore origonal parent
1859c165b184SJames Collins
1860c165b184SJames Collins						if ($this->parent->parent) {
1861c165b184SJames Collins							$this->parent = $this->parent->parent;
1862c165b184SJames Collins						}
1863c165b184SJames Collins
1864c165b184SJames Collins						$this->parent->_[HDOM_INFO_END] = $this->cursor;
1865c165b184SJames Collins						return $this->as_text_node($tag);
1866c165b184SJames Collins					}
1867c165b184SJames Collins				} elseif (($this->parent->parent)
1868c165b184SJames Collins					&& isset($this->block_tags[$tag_lower])
1869c165b184SJames Collins				) {
1870c165b184SJames Collins					// Grandparent exists and current tag is a block tag, so our
1871c165b184SJames Collins					// parent doesn't have an end tag
1872c165b184SJames Collins					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
1873c165b184SJames Collins					$org_parent = $this->parent;
1874c165b184SJames Collins
1875c165b184SJames Collins					// Traverse ancestors to find a matching opening tag
1876c165b184SJames Collins					// Stop at root node
1877c165b184SJames Collins					while (($this->parent->parent)
1878c165b184SJames Collins						&& strtolower($this->parent->tag) !== $tag_lower
1879c165b184SJames Collins					) {
1880c165b184SJames Collins						$this->parent = $this->parent->parent;
1881c165b184SJames Collins					}
1882c165b184SJames Collins
1883c165b184SJames Collins					// If we don't have a match add current tag as text node
1884c165b184SJames Collins					if (strtolower($this->parent->tag) !== $tag_lower) {
1885c165b184SJames Collins						$this->parent = $org_parent; // restore origonal parent
1886c165b184SJames Collins						$this->parent->_[HDOM_INFO_END] = $this->cursor;
1887c165b184SJames Collins						return $this->as_text_node($tag);
1888c165b184SJames Collins					}
1889c165b184SJames Collins				} elseif (($this->parent->parent)
1890c165b184SJames Collins					&& strtolower($this->parent->parent->tag) === $tag_lower
1891c165b184SJames Collins				) { // Grandparent exists and current tag closes it
1892c165b184SJames Collins					$this->parent->_[HDOM_INFO_END] = 0;
1893c165b184SJames Collins					$this->parent = $this->parent->parent;
1894c165b184SJames Collins				} else { // Random tag, add as text node
1895c165b184SJames Collins					return $this->as_text_node($tag);
1896c165b184SJames Collins				}
1897c165b184SJames Collins			}
1898c165b184SJames Collins
1899c165b184SJames Collins			// Set end position of parent tag to current cursor position
1900c165b184SJames Collins			$this->parent->_[HDOM_INFO_END] = $this->cursor;
1901c165b184SJames Collins
1902c165b184SJames Collins			if ($this->parent->parent) {
1903c165b184SJames Collins				$this->parent = $this->parent->parent;
1904c165b184SJames Collins			}
1905c165b184SJames Collins
1906c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1907c165b184SJames Collins			return true;
1908c165b184SJames Collins		}
1909c165b184SJames Collins
1910c165b184SJames Collins		// start tag
1911c165b184SJames Collins		$node = new simple_html_dom_node($this);
1912c165b184SJames Collins		$node->_[HDOM_INFO_BEGIN] = $this->cursor;
1913c165b184SJames Collins		++$this->cursor;
1914c165b184SJames Collins		$tag = $this->copy_until($this->token_slash); // Get tag name
1915c165b184SJames Collins		$node->tag_start = $begin_tag_pos;
1916c165b184SJames Collins
1917c165b184SJames Collins		// doctype, cdata & comments...
1918c165b184SJames Collins		// <!DOCTYPE html>
1919c165b184SJames Collins		// <![CDATA[ ... ]]>
1920c165b184SJames Collins		// <!-- Comment -->
1921c165b184SJames Collins		if (isset($tag[0]) && $tag[0] === '!') {
1922c165b184SJames Collins			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
1923c165b184SJames Collins
1924c165b184SJames Collins			if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
1925c165b184SJames Collins				$node->nodetype = HDOM_TYPE_COMMENT;
1926c165b184SJames Collins				$node->tag = 'comment';
1927c165b184SJames Collins			} else { // Could be doctype or CDATA but we don't care
1928c165b184SJames Collins				$node->nodetype = HDOM_TYPE_UNKNOWN;
1929c165b184SJames Collins				$node->tag = 'unknown';
1930c165b184SJames Collins			}
1931c165b184SJames Collins
1932c165b184SJames Collins			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
1933c165b184SJames Collins
1934c165b184SJames Collins			$this->link_nodes($node, true);
1935c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1936c165b184SJames Collins			return true;
1937c165b184SJames Collins		}
1938c165b184SJames Collins
1939c165b184SJames Collins		// The start tag cannot contain another start tag, if so add as text
1940c165b184SJames Collins		// i.e. "<<html>"
1941c165b184SJames Collins		if ($pos = strpos($tag, '<') !== false) {
1942c165b184SJames Collins			$tag = '<' . substr($tag, 0, -1);
1943c165b184SJames Collins			$node->_[HDOM_INFO_TEXT] = $tag;
1944c165b184SJames Collins			$this->link_nodes($node, false);
1945c165b184SJames Collins			$this->char = $this->doc[--$this->pos]; // prev
1946c165b184SJames Collins			return true;
1947c165b184SJames Collins		}
1948c165b184SJames Collins
1949c165b184SJames Collins		// Handle invalid tag names (i.e. "<html#doc>")
1950c165b184SJames Collins		if (!preg_match('/^\w[\w:-]*$/', $tag)) {
1951c165b184SJames Collins			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
1952c165b184SJames Collins
1953c165b184SJames Collins			// Next char is the beginning of a new tag, don't touch it.
1954c165b184SJames Collins			if ($this->char === '<') {
1955c165b184SJames Collins				$this->link_nodes($node, false);
1956c165b184SJames Collins				return true;
1957c165b184SJames Collins			}
1958c165b184SJames Collins
1959c165b184SJames Collins			// Next char closes current tag, add and be done with it.
1960c165b184SJames Collins			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
1961c165b184SJames Collins			$this->link_nodes($node, false);
1962c165b184SJames Collins			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1963c165b184SJames Collins			return true;
1964c165b184SJames Collins		}
1965c165b184SJames Collins
1966c165b184SJames Collins		// begin tag, add new node
1967c165b184SJames Collins		$node->nodetype = HDOM_TYPE_ELEMENT;
1968c165b184SJames Collins		$tag_lower = strtolower($tag);
1969c165b184SJames Collins		$node->tag = ($this->lowercase) ? $tag_lower : $tag;
1970c165b184SJames Collins
1971c165b184SJames Collins		// handle optional closing tags
1972c165b184SJames Collins		if (isset($this->optional_closing_tags[$tag_lower])) {
1973c165b184SJames Collins			// Traverse ancestors to close all optional closing tags
1974c165b184SJames Collins			while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
1975c165b184SJames Collins				$this->parent->_[HDOM_INFO_END] = 0;
1976c165b184SJames Collins				$this->parent = $this->parent->parent;
1977c165b184SJames Collins			}
1978c165b184SJames Collins			$node->parent = $this->parent;
1979c165b184SJames Collins		}
1980c165b184SJames Collins
1981c165b184SJames Collins		$guard = 0; // prevent infinity loop
1982c165b184SJames Collins
1983c165b184SJames Collins		// [0] Space between tag and first attribute
1984c165b184SJames Collins		$space = array($this->copy_skip($this->token_blank), '', '');
1985c165b184SJames Collins
1986c165b184SJames Collins		// attributes
1987c165b184SJames Collins		do {
1988c165b184SJames Collins			// Everything until the first equal sign should be the attribute name
1989c165b184SJames Collins			$name = $this->copy_until($this->token_equal);
1990c165b184SJames Collins
1991c165b184SJames Collins			if ($name === '' && $this->char !== null && $space[0] === '') {
1992c165b184SJames Collins				break;
1993c165b184SJames Collins			}
1994c165b184SJames Collins
1995c165b184SJames Collins			if ($guard === $this->pos) { // Escape infinite loop
1996c165b184SJames Collins				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1997c165b184SJames Collins				continue;
1998c165b184SJames Collins			}
1999c165b184SJames Collins
2000c165b184SJames Collins			$guard = $this->pos;
2001c165b184SJames Collins
2002c165b184SJames Collins			// handle endless '<'
2003c165b184SJames Collins			// Out of bounds before the tag ended
2004c165b184SJames Collins			if ($this->pos >= $this->size - 1 && $this->char !== '>') {
2005c165b184SJames Collins				$node->nodetype = HDOM_TYPE_TEXT;
2006c165b184SJames Collins				$node->_[HDOM_INFO_END] = 0;
2007c165b184SJames Collins				$node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
2008c165b184SJames Collins				$node->tag = 'text';
2009c165b184SJames Collins				$this->link_nodes($node, false);
2010c165b184SJames Collins				return true;
2011c165b184SJames Collins			}
2012c165b184SJames Collins
2013c165b184SJames Collins			// handle mismatch '<'
2014c165b184SJames Collins			// Attributes cannot start after opening tag
2015c165b184SJames Collins			if ($this->doc[$this->pos - 1] == '<') {
2016c165b184SJames Collins				$node->nodetype = HDOM_TYPE_TEXT;
2017c165b184SJames Collins				$node->tag = 'text';
2018c165b184SJames Collins				$node->attr = array();
2019c165b184SJames Collins				$node->_[HDOM_INFO_END] = 0;
2020c165b184SJames Collins				$node->_[HDOM_INFO_TEXT] = substr(
2021c165b184SJames Collins					$this->doc,
2022c165b184SJames Collins					$begin_tag_pos,
2023c165b184SJames Collins					$this->pos - $begin_tag_pos - 1
2024c165b184SJames Collins				);
2025c165b184SJames Collins				$this->pos -= 2;
2026c165b184SJames Collins				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2027c165b184SJames Collins				$this->link_nodes($node, false);
2028c165b184SJames Collins				return true;
2029c165b184SJames Collins			}
2030c165b184SJames Collins
2031c165b184SJames Collins			if ($name !== '/' && $name !== '') { // this is a attribute name
2032c165b184SJames Collins				// [1] Whitespace after attribute name
2033c165b184SJames Collins				$space[1] = $this->copy_skip($this->token_blank);
2034c165b184SJames Collins
2035c165b184SJames Collins				$name = $this->restore_noise($name); // might be a noisy name
2036c165b184SJames Collins
2037c165b184SJames Collins				if ($this->lowercase) { $name = strtolower($name); }
2038c165b184SJames Collins
2039c165b184SJames Collins				if ($this->char === '=') { // attribute with value
2040c165b184SJames Collins					$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2041c165b184SJames Collins					$this->parse_attr($node, $name, $space); // get attribute value
2042c165b184SJames Collins				} else {
2043c165b184SJames Collins					//no value attr: nowrap, checked selected...
2044c165b184SJames Collins					$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2045c165b184SJames Collins					$node->attr[$name] = true;
2046c165b184SJames Collins					if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
2047c165b184SJames Collins				}
2048c165b184SJames Collins
2049c165b184SJames Collins				$node->_[HDOM_INFO_SPACE][] = $space;
2050c165b184SJames Collins
2051c165b184SJames Collins				// prepare for next attribute
2052c165b184SJames Collins				$space = array(
2053c165b184SJames Collins					$this->copy_skip($this->token_blank),
2054c165b184SJames Collins					'',
2055c165b184SJames Collins					''
2056c165b184SJames Collins				);
2057c165b184SJames Collins			} else { // no more attributes
2058c165b184SJames Collins				break;
2059c165b184SJames Collins			}
2060c165b184SJames Collins		} while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
2061c165b184SJames Collins
2062c165b184SJames Collins		$this->link_nodes($node, true);
2063c165b184SJames Collins		$node->_[HDOM_INFO_ENDSPACE] = $space[0];
2064c165b184SJames Collins
2065c165b184SJames Collins		// handle empty tags (i.e. "<div/>")
2066c165b184SJames Collins		if ($this->copy_until_char('>') === '/') {
2067c165b184SJames Collins			$node->_[HDOM_INFO_ENDSPACE] .= '/';
2068c165b184SJames Collins			$node->_[HDOM_INFO_END] = 0;
2069c165b184SJames Collins		} else {
2070c165b184SJames Collins			// reset parent
2071c165b184SJames Collins			if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
2072c165b184SJames Collins				$this->parent = $node;
2073c165b184SJames Collins			}
2074c165b184SJames Collins		}
2075c165b184SJames Collins
2076c165b184SJames Collins		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2077c165b184SJames Collins
2078c165b184SJames Collins		// If it's a BR tag, we need to set it's text to the default text.
2079c165b184SJames Collins		// This way when we see it in plaintext, we can generate formatting that the user wants.
2080c165b184SJames Collins		// since a br tag never has sub nodes, this works well.
2081c165b184SJames Collins		if ($node->tag === 'br') {
2082c165b184SJames Collins			$node->_[HDOM_INFO_INNER] = $this->default_br_text;
2083c165b184SJames Collins		}
2084c165b184SJames Collins
2085c165b184SJames Collins		return true;
2086c165b184SJames Collins	}
2087c165b184SJames Collins
/**
 * Read the value of attribute $name at the current parser position and
 * store it on $node.
 *
 * Called right after the "=" of an attribute has been consumed. Handles
 * double-quoted, single-quoted and unquoted values. If $node already has an
 * attribute called $name, the value is still consumed from the input but
 * neither the value nor its quote type is recorded (first occurrence wins).
 *
 * @param object $node  Node being built; receives $node->attr[$name] and an
 *                      entry in $node->_[HDOM_INFO_QUOTE].
 * @param string $name  Attribute name as prepared by the caller.
 * @param array  $space Whitespace triple of the current attribute, passed by
 *                      reference; index 2 receives the blanks found between
 *                      "=" and the value (first occurrence only).
 * @return void
 */
protected function parse_attr($node, $name, &$space)
{
	$is_duplicate = isset($node->attr[$name]);

	// Always skip the blanks between "=" and the value. Previously they were
	// only skipped for the first occurrence, so for a duplicate such as
	// href = "y" the value was read starting at the blank instead of at the
	// quote, leaving the real value in the input to be misparsed afterwards.
	$blank = $this->copy_skip($this->token_blank);

	if (!$is_duplicate) {
		// Only the recorded (first) occurrence keeps its whitespace info.
		$space[2] = $blank;
	}

	switch ($this->char) {
		case '"':
			$quote_type = HDOM_QUOTE_DOUBLE;
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step over opening quote
			$value = $this->copy_until_char('"');
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step over closing quote
			break;
		case '\'':
			$quote_type = HDOM_QUOTE_SINGLE;
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step over opening quote
			$value = $this->copy_until_char('\'');
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step over closing quote
			break;
		default:
			// Unquoted value: runs until one of the characters in token_attr.
			$quote_type = HDOM_QUOTE_NO;
			$value = $this->copy_until($this->token_attr);
	}

	// The value may contain noise placeholders; put the original text back.
	$value = $this->restore_noise($value);

	// PaperG: Attributes should not have \r or \n in them, that counts as
	// html whitespace.

//      The following was commented out as it interferes with DokuWiki edit mode - nomadjimbob
//
//		$value = str_replace("\r", '', $value);
//		$value = str_replace("\n", '', $value);

	// PaperG: For "class" drop leading and trailing whitespace, since some
	// people leave it in when listing multiple classes.
	if ($name === 'class') {
		$value = trim($value);
	}

	if (!$is_duplicate) {
		$node->_[HDOM_INFO_QUOTE][] = $quote_type;
		$node->attr[$name] = $value;
	}
}
2134c165b184SJames Collins
/**
 * Attach $node to the node the parser is currently inside of.
 *
 * The node's parent is set to the parser's current parent and the node is
 * appended to that parent's node list. When $is_child is true it is also
 * appended to the parent's children list.
 *
 * @param object $node     Node to attach (passed by reference).
 * @param bool   $is_child Whether the node also goes into `children`.
 * @return void
 */
protected function link_nodes(&$node, $is_child)
{
	$owner = $this->parent;

	$node->parent = $owner;
	$owner->nodes[] = $node;

	if (!$is_child) {
		return;
	}

	$owner->children[] = $node;
}
2143c165b184SJames Collins
/**
 * Record a closing tag as literal text instead of treating it as markup.
 *
 * Creates a new node whose text is "</$tag>", links it to the current parent
 * as a plain node (not a child), bumps the cursor and moves the parser one
 * character forward.
 *
 * @param string $tag Tag name to wrap in "</" and ">".
 * @return bool Always true.
 */
protected function as_text_node($tag)
{
	$text_node = new simple_html_dom_node($this);
	++$this->cursor;

	$text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
	$this->link_nodes($text_node, false);

	// Advance one character; null marks the end of the document.
	++$this->pos;
	if ($this->pos < $this->size) {
		$this->char = $this->doc[$this->pos];
	} else {
		$this->char = null;
	}

	return true;
}
2153c165b184SJames Collins
/**
 * Move the parser past a run of characters that all occur in $chars.
 *
 * Updates the position and the current character; the current character
 * becomes null when the end of the document is reached.
 *
 * @param string $chars Set of characters to skip over.
 * @return void
 */
protected function skip($chars)
{
	$run = strspn($this->doc, $chars, $this->pos);
	$this->pos = $this->pos + $run;

	if ($this->pos < $this->size) {
		$this->char = $this->doc[$this->pos];
	} else {
		$this->char = null;
	}
}
2159c165b184SJames Collins
/**
 * Skip a run of characters from $chars and return what was skipped.
 *
 * Position and current character are updated the same way as in skip().
 *
 * @param string $chars Set of characters to skip over.
 * @return string The skipped characters, or '' when nothing was skipped.
 */
protected function copy_skip($chars)
{
	$start = $this->pos;
	$run = strspn($this->doc, $chars, $start);

	$this->pos += $run;
	$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;

	return ($run === 0) ? '' : substr($this->doc, $start, $run);
}
2169c165b184SJames Collins
/**
 * Copy characters up to (not including) the first character found in $chars.
 *
 * The parser stops on that character, or at the end of the document when
 * none of the characters occurs; the current character is then null.
 *
 * @param string $chars Set of stop characters.
 * @return string The characters that were passed over.
 */
protected function copy_until($chars)
{
	$start = $this->pos;
	$span = strcspn($this->doc, $chars, $start);

	$this->pos += $span;

	if ($this->pos < $this->size) {
		$this->char = $this->doc[$this->pos];
	} else {
		$this->char = null;
	}

	return substr($this->doc, $start, $span);
}
2178c165b184SJames Collins
/**
 * Copy characters up to (not including) the next occurrence of $char.
 *
 * - Returns '' right away when the parser is already at the end (current
 *   character is null) or already positioned on $char.
 * - When $char does not occur any more, the rest of the document is returned
 *   and the parser is moved to the end.
 * - Otherwise the parser is left positioned on $char.
 *
 * @param string $char Text to search for.
 * @return string The text between the old position and the match.
 */
protected function copy_until_char($char)
{
	if ($this->char === null) {
		return '';
	}

	$start = $this->pos;
	$hit = strpos($this->doc, $char, $start);

	if ($hit === false) {
		// No further match: consume everything that is left.
		$rest = substr($this->doc, $start, $this->size - $start);
		$this->pos = $this->size;
		$this->char = null;
		return $rest;
	}

	if ($hit === $start) {
		return '';
	}

	$this->pos = $hit;
	$this->char = $this->doc[$hit];

	return substr($this->doc, $start, $hit - $start);
}
2197c165b184SJames Collins
/**
 * Cut text matching $pattern out of the document and park it in $this->noise.
 *
 * Each match is replaced in the document by a placeholder key made of
 * "___noise___" plus a number (current noise count + 1000, formatted with
 * '% 5d'). The removed text is stored under that key. Afterwards the cached
 * document size is refreshed and, for a non-empty document, the current
 * character is reset to the first one.
 *
 * @param string $pattern    Regular expression handed to preg_match_all().
 * @param bool   $remove_tag true: replace the entire match; false: replace
 *                           only the first sub-match.
 * @return void
 */
protected function remove_noise($pattern, $remove_tag = false)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	$flags = PREG_SET_ORDER | PREG_OFFSET_CAPTURE;
	$count = preg_match_all($pattern, $this->doc, $matches, $flags);

	// 0 = entire match, 1 = submatch
	$group = ($remove_tag) ? 0 : 1;

	// Walk the matches back to front so that the recorded offsets of the
	// matches still to be processed are not shifted by earlier replacements.
	$i = $count - 1;
	while ($i > -1) {
		$key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'key is: ' . $key);
		}

		$snippet = $matches[$i][$group][0];
		$offset = $matches[$i][$group][1];

		$this->noise[$key] = $snippet;
		$this->doc = substr_replace($this->doc, $key, $offset, strlen($snippet));

		--$i;
	}

	// The document changed, so refresh the cached length.
	$this->size = strlen($this->doc);

	if ($this->size > 0) {
		$this->char = $this->doc[0];
	}
}
2229c165b184SJames Collins
/**
 * Replace noise placeholders in $text with the text they stand for.
 *
 * A placeholder is "___noise___" followed by five more characters (16 in
 * total). Three cases are handled:
 * - key known in $this->noise: replaced by the stored text; the result is
 *   scanned again so placeholders inside the stored text are restored too;
 * - key unknown: replaced by "UNDEFINED NOISE FOR KEY: <key>";
 * - marker too close to the end of $text to carry a full key: the marker is
 *   replaced by "NO NUMERIC NOISE KEY".
 *
 * The "unknown key" message repeats the key, and therefore the marker.
 * Scanning used to restart at the beginning of $text after every
 * replacement, so such a key was found again and again and the loop never
 * ended (any text like "___noise___12345" with no matching entry triggered
 * it). The scan now resumes behind the inserted message.
 *
 * NOTE: a stored noise text that contains its own key is still expanded on
 * every pass; that case is not guarded here.
 *
 * @param string $text Text that may contain noise placeholders.
 * @return string Text with placeholders replaced.
 */
function restore_noise($text)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	$marker = '___noise___';
	$marker_len = strlen($marker); // 11
	$key_len = $marker_len + 5;    // marker plus five key characters = 16

	// Everything before $floor is final output and must not be scanned again.
	$floor = 0;
	$offset = 0;

	while (($pos = strpos($text, $marker, $offset)) !== false) {
		// Sometimes there is a broken piece of markup and the characters
		// behind the marker are missing.
		if (strlen($text) > $pos + $key_len - 1) {
			$key = substr($text, $pos, $key_len);

			if (is_object($debug_object)) {
				$debug_object->debug_log(2, 'located key of: ' . $key);
			}

			if (isset($this->noise[$key])) {
				$insert = $this->noise[$key];
			} else {
				$insert = 'UNDEFINED NOISE FOR KEY: ' . $key;
				// The message echoes the key; skip it on the next scan,
				// otherwise it would be matched again forever.
				$floor = $pos + strlen($insert);
			}

			$text = substr($text, 0, $pos)
				. $insert
				. substr($text, $pos + $key_len);
		} else {
			// There is no valid key being given back to us... We must get
			// rid of the ___noise___ or we will have a problem.
			$text = substr($text, 0, $pos)
				. 'NO NUMERIC NOISE KEY'
				. substr($text, $pos + $marker_len);
		}

		// No marker starts before $pos, so a marker in the new text can begin
		// at most (marker length - 1) characters earlier, formed across the
		// seam of the replacement. Never go back behind $floor.
		$offset = max($floor, $pos - ($marker_len - 1));
	}

	return $text;
}
2275c165b184SJames Collins
/**
 * Find the first stored noise text that contains $text.
 *
 * @param string $text Text to look for inside the stored noise entries.
 * @return string|null The first matching noise entry; nothing (null) is
 *                     returned when no entry contains $text.
 */
function search_noise($text)
{
	global $debug_object;
	if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

	foreach ($this->noise as $stored) {
		if (strpos($stored, $text) === false) {
			continue;
		}

		return $stored;
	}
}
2287c165b184SJames Collins
/**
 * String conversion: yields the inner text of the root node.
 *
 * @return string
 */
function __toString()
{
	$root = $this->root;

	return $root->innertext();
}
2292c165b184SJames Collins
/**
 * Magic read access for a handful of virtual properties.
 *
 * - outertext, innertext: inner text of the root node
 * - plaintext: text of the root node
 * - charset, target_charset: the stored charset values
 *
 * Any other name yields nothing (null).
 *
 * @param string $name Name of the property being read.
 * @return mixed
 */
function __get($name)
{
	if ($name === 'outertext' || $name === 'innertext') {
		return $this->root->innertext();
	}

	if ($name === 'plaintext') {
		return $this->root->text();
	}

	if ($name === 'charset') {
		return $this->_charset;
	}

	if ($name === 'target_charset') {
		return $this->_target_charset;
	}
}
2308c165b184SJames Collins
/**
 * camelCase alias: forwards to childNodes() of the root node.
 *
 * @param int $idx Index passed through unchanged (default -1).
 * @return mixed Whatever the root node's childNodes() returns.
 */
function childNodes($idx = -1)
{
	$root = $this->root;

	return $root->childNodes($idx);
}
2313c165b184SJames Collins
/**
 * camelCase alias: forwards to first_child() of the root node.
 *
 * @return mixed Whatever the root node's first_child() returns.
 */
function firstChild()
{
	$root = $this->root;

	return $root->first_child();
}
2318c165b184SJames Collins
/**
 * camelCase alias: forwards to last_child() of the root node.
 *
 * @return mixed Whatever the root node's last_child() returns.
 */
function lastChild()
{
	$root = $this->root;

	return $root->last_child();
}
2323c165b184SJames Collins
/**
 * Build a new element by parsing "<$name>$value</$name>" and returning the
 * first child of the resulting document.
 *
 * The parse result is checked before it is used: calling a method on a
 * non-object is a fatal error that the "@" operator cannot suppress.
 *
 * @param string      $name  Tag name of the element.
 * @param string|null $value Inner markup of the element.
 * @return mixed The first child of the parsed fragment, or null when
 *               str_get_html() did not return an object.
 */
function createElement($name, $value = null)
{
	// "@" is kept on the parser call so diagnostics that were silenced
	// before stay silenced.
	$dom = @str_get_html("<$name>$value</$name>");

	if (!is_object($dom)) {
		return null;
	}

	return $dom->firstChild();
}
2328c165b184SJames Collins
/**
 * Build a text node by parsing $value and returning the last node of the
 * resulting document.
 *
 * end() takes its argument by reference, so the node list is copied into a
 * local variable first. The parse result is checked as well: handing end()
 * something that is not an array raises a TypeError on PHP 8, which the "@"
 * operator cannot suppress.
 *
 * @param string $value Text (or markup) to parse.
 * @return mixed The last node of the parsed fragment, false when the node
 *               list is empty, or null when str_get_html() did not return an
 *               object with a node list.
 */
function createTextNode($value)
{
	// "@" is kept on the parser call so diagnostics that were silenced
	// before stay silenced.
	$dom = @str_get_html($value);

	if (!is_object($dom) || !is_array($dom->nodes)) {
		return null;
	}

	$nodes = $dom->nodes;

	return end($nodes);
}
2333c165b184SJames Collins
/**
 * Look up a single element by id.
 *
 * Runs find() with the selector "#<id>" and index 0.
 *
 * @param string $id Element id (without the leading "#").
 * @return mixed Whatever find() returns for that selector and index.
 */
function getElementById($id)
{
	$selector = "#$id";

	return $this->find($selector, 0);
}
2338c165b184SJames Collins
/**
 * Look up elements by id.
 *
 * Runs find() with the selector "#<id>" and passes $idx through unchanged.
 *
 * @param string   $id  Element id (without the leading "#").
 * @param int|null $idx Index handed to find() (default null).
 * @return mixed Whatever find() returns for that selector and index.
 */
function getElementsById($id, $idx = null)
{
	$selector = "#$id";

	return $this->find($selector, $idx);
}
2343c165b184SJames Collins
/**
 * Look up a single element by tag name.
 *
 * Runs find() with $name as the selector and index 0.
 *
 * @param string $name Tag name, used as the selector as-is.
 * @return mixed Whatever find() returns for that selector and index.
 */
function getElementByTagName($name)
{
	$first = $this->find($name, 0);

	return $first;
}
2348c165b184SJames Collins
/**
 * Look up elements by tag name.
 *
 * Runs find() with $name as the selector and passes $idx through unchanged.
 *
 * @param string $name Tag name, used as the selector as-is.
 * @param int    $idx  Index handed to find() (default -1).
 * @return mixed Whatever find() returns for that selector and index.
 */
function getElementsByTagName($name, $idx = -1)
{
	$found = $this->find($name, $idx);

	return $found;
}
2353c165b184SJames Collins
/**
 * camelCase alias of load_file().
 *
 * Forwards every argument it receives to load_file() and returns that
 * method's result. Before, the collected arguments were handed over as one
 * array parameter, so load_file() never saw the individual arguments, and
 * its result was dropped.
 *
 * NOTE(review): load_file() is defined outside this excerpt; this assumes it
 * takes its arguments positionally (path first) - confirm.
 *
 * @return mixed Whatever load_file() returns.
 */
function loadFile()
{
	$args = func_get_args();

	return call_user_func_array(array($this, 'load_file'), $args);
}
2359c165b184SJames Collins}
2360