1<?php
2/**
3 * SimplePie
4 *
5 * A PHP-Based RSS and Atom Feed Framework.
6 * Takes the hard work out of managing a complete RSS/Atom solution.
7 *
8 * Copyright (c) 2004-2016, Ryan Parman, Sam Sneddon, Ryan McCue, and contributors
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without modification, are
12 * permitted provided that the following conditions are met:
13 *
14 * 	* Redistributions of source code must retain the above copyright notice, this list of
15 * 	  conditions and the following disclaimer.
16 *
17 * 	* Redistributions in binary form must reproduce the above copyright notice, this list
18 * 	  of conditions and the following disclaimer in the documentation and/or other materials
19 * 	  provided with the distribution.
20 *
21 * 	* Neither the name of the SimplePie Team nor the names of its contributors may be used
22 * 	  to endorse or promote products derived from this software without specific prior
23 * 	  written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
26 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
27 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
28 * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
34 *
35 * @package SimplePie
36 * @copyright 2004-2016 Ryan Parman, Sam Sneddon, Ryan McCue
37 * @author Ryan Parman
38 * @author Sam Sneddon
39 * @author Ryan McCue
40 * @link http://simplepie.org/ SimplePie
41 * @license http://www.opensource.org/licenses/bsd-license.php BSD License
42 */
43
44/**
45 * Used for data cleanup and post-processing
46 *
47 *
48 * This class can be overloaded with {@see SimplePie::set_sanitize_class()}
49 *
50 * @package SimplePie
51 * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
52 */
class SimplePie_Sanitize
{
	// Private vars

	/**
	 * Base URL that relative URLs are resolved against.
	 * Assigned by sanitize() just before URL attributes are rewritten.
	 */
	public $base;

	/**
	 * Registry used to reach the Misc, Cache and File services.
	 * Assigned by set_registry(); must be set before sanitize() needs to
	 * resolve URLs, cache images or convert the output encoding.
	 */
	public $registry;

	// Options
	public $remove_div = true;
	public $image_handler = '';
	public $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');
	public $encode_instead_of_strip = false;
	public $strip_attributes = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
	public $add_attributes = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none'));
	public $strip_comments = false;
	public $output_encoding = 'UTF-8';
	public $enable_cache = true;
	public $cache_location = './cache';
	public $cache_name_function = 'md5';
	public $timeout = 10;
	public $useragent = '';
	public $force_fsockopen = false;
	public $replace_url_attributes = null;

	/**
	 * Create a sanitizer with the default URL replacement map installed.
	 */
	public function __construct()
	{
		// Set defaults
		$this->set_url_replacements(null);
	}

	/**
	 * Choose whether the outer wrapping <div> is removed from sanitized markup.
	 *
	 * @param bool $enable True to drop the wrapper, false to keep a bare <div>
	 */
	public function remove_div($enable = true)
	{
		$this->remove_div = (bool) $enable;
	}

	/**
	 * Set the prefix that cached image names are appended to when <img> src
	 * attributes are rewritten.
	 *
	 * @param string|false $page Prefix string; any falsy value disables image rewriting
	 */
	public function set_image_handler($page = false)
	{
		if ($page)
		{
			$this->image_handler = (string) $page;
		}
		else
		{
			$this->image_handler = false;
		}
	}

	/**
	 * Inject the registry used to call Misc, Cache and File.
	 *
	 * @param SimplePie_Registry $registry
	 */
	public function set_registry(SimplePie_Registry $registry)
	{
		$this->registry = $registry;
	}

	/**
	 * Receive the cache settings used when caching images.
	 *
	 * Empty values for $cache_location and $cache_name_function leave the
	 * current setting untouched. $cache_class is accepted for signature
	 * compatibility but is not used by this method.
	 *
	 * @param bool $enable_cache
	 * @param string $cache_location
	 * @param string $cache_name_function Callable name used to derive cache keys
	 * @param string $cache_class Unused
	 */
	public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
	{
		if (isset($enable_cache))
		{
			$this->enable_cache = (bool) $enable_cache;
		}

		if ($cache_location)
		{
			$this->cache_location = (string) $cache_location;
		}

		if ($cache_name_function)
		{
			$this->cache_name_function = (string) $cache_name_function;
		}
	}

	/**
	 * Receive the HTTP settings used when fetching images for the cache.
	 *
	 * Falsy arguments leave the corresponding current setting untouched.
	 * $file_class is accepted for signature compatibility but is not used
	 * by this method.
	 *
	 * @param string $file_class Unused
	 * @param int|string $timeout
	 * @param string $useragent
	 * @param bool $force_fsockopen
	 */
	public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
	{
		if ($timeout)
		{
			// The string cast is kept as-is for backward compatibility.
			$this->timeout = (string) $timeout;
		}

		if ($useragent)
		{
			$this->useragent = (string) $useragent;
		}

		if ($force_fsockopen)
		{
			// This is a flag, so store it as a boolean like its default value.
			$this->force_fsockopen = (bool) $force_fsockopen;
		}
	}

	/**
	 * Set the list of element names removed (or encoded) by sanitize().
	 *
	 * @param array|string|false $tags Array of names, a comma-separated string, or a falsy value to disable stripping
	 */
	public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
	{
		if ($tags)
		{
			if (is_array($tags))
			{
				$this->strip_htmltags = $tags;
			}
			else
			{
				$this->strip_htmltags = explode(',', $tags);
			}
		}
		else
		{
			$this->strip_htmltags = false;
		}
	}

	/**
	 * Choose whether stripped elements are emitted as escaped text instead of
	 * being removed.
	 *
	 * @param bool $encode
	 */
	public function encode_instead_of_strip($encode = false)
	{
		$this->encode_instead_of_strip = (bool) $encode;
	}

	/**
	 * Set the list of attribute names removed from every element by sanitize().
	 *
	 * @param array|string|false $attribs Array of names, a comma-separated string, or a falsy value to disable stripping
	 */
	public function strip_attributes($attribs = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
	{
		if ($attribs)
		{
			if (is_array($attribs))
			{
				$this->strip_attributes = $attribs;
			}
			else
			{
				$this->strip_attributes = explode(',', $attribs);
			}
		}
		else
		{
			$this->strip_attributes = false;
		}
	}

	/**
	 * Set the attributes that sanitize() forces onto specific elements.
	 *
	 * The useful form is array('tag' => array('attribute' => 'value')).
	 * A string is split on commas for backward compatibility, but the
	 * resulting list carries no attribute/value pairs and is skipped when
	 * attributes are applied.
	 *
	 * @param array|string|false $attribs Map of tag name to attribute/value pairs, or a falsy value to disable
	 */
	public function add_attributes($attribs = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none')))
	{
		if ($attribs)
		{
			if (is_array($attribs))
			{
				$this->add_attributes = $attribs;
			}
			else
			{
				$this->add_attributes = explode(',', $attribs);
			}
		}
		else
		{
			$this->add_attributes = false;
		}
	}

	/**
	 * Choose whether comment nodes are removed from sanitized markup.
	 *
	 * @param bool $strip
	 */
	public function strip_comments($strip = false)
	{
		$this->strip_comments = (bool) $strip;
	}

	/**
	 * Set the character encoding of the strings returned by sanitize().
	 *
	 * @param string $encoding
	 */
	public function set_output_encoding($encoding = 'UTF-8')
	{
		$this->output_encoding = (string) $encoding;
	}

	/**
	 * Set element/attribute key/value pairs of HTML attributes
	 * containing URLs that need to be resolved relative to the feed
	 *
	 * Defaults to |a|@href, |area|@href, |blockquote|@cite, |del|@cite,
	 * |form|@action, |img|@longdesc, |img|@src, |input|@src, |ins|@cite,
	 * |q|@cite
	 *
	 * @since 1.0
	 * @param array|null $element_attribute Element/attribute key/value pairs, null for default
	 */
	public function set_url_replacements($element_attribute = null)
	{
		if ($element_attribute === null)
		{
			$element_attribute = array(
				'a' => 'href',
				'area' => 'href',
				'blockquote' => 'cite',
				'del' => 'cite',
				'form' => 'action',
				'img' => array(
					'longdesc',
					'src'
				),
				'input' => 'src',
				'ins' => 'cite',
				'q' => 'cite'
			);
		}
		$this->replace_url_attributes = (array) $element_attribute;
	}

	/**
	 * Clean a piece of feed data according to its construct type.
	 *
	 * HTML/XHTML constructs are parsed with DOMDocument, then comments, tags
	 * and attributes are stripped as configured, configured attributes are
	 * added, URL attributes are made absolute and images are optionally
	 * redirected to the image handler. IRI constructs are made absolute, and
	 * text/IRI constructs are HTML-escaped. The result is finally converted
	 * to the configured output encoding when that is not UTF-8.
	 *
	 * @param string $data Raw data
	 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
	 * @param string $base Base URL for resolving relative URLs
	 * @return string Sanitized data
	 * @throws SimplePie_Exception When markup must be parsed but DOMDocument is unavailable
	 */
	public function sanitize($data, $type, $base = '')
	{
		$data = trim($data);
		if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI)
		{
			if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML)
			{
				// An entity reference or a closing tag is taken as evidence of HTML.
				if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data))
				{
					$type |= SIMPLEPIE_CONSTRUCT_HTML;
				}
				else
				{
					$type |= SIMPLEPIE_CONSTRUCT_TEXT;
				}
			}

			if ($type & SIMPLEPIE_CONSTRUCT_BASE64)
			{
				$data = base64_decode($data);
			}

			if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
			{
				if (!class_exists('DOMDocument'))
				{
					throw new SimplePie_Exception('DOMDocument not found, unable to use sanitizer');
				}
				$document = new DOMDocument();
				$document->encoding = 'UTF-8';

				$data = $this->preprocess($data, $type);

				// Parser diagnostics are routed to SimplePie_Misc::silence_errors while loading.
				set_error_handler(array('SimplePie_Misc', 'silence_errors'));
				$document->loadHTML($data);
				restore_error_handler();

				$xpath = new DOMXPath($document);

				// Strip comments
				if ($this->strip_comments)
				{
					$comments = $xpath->query('//comment()');

					foreach ($comments as $comment)
					{
						$comment->parentNode->removeChild($comment);
					}
				}

				// Strip out HTML tags and attributes that might cause various security problems.
				// Based on recommendations by Mark Pilgrim at:
				// http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
				if ($this->strip_htmltags)
				{
					foreach ($this->strip_htmltags as $tag)
					{
						$this->strip_tag($tag, $document, $xpath, $type);
					}
				}

				if ($this->strip_attributes)
				{
					foreach ($this->strip_attributes as $attrib)
					{
						$this->strip_attr($attrib, $xpath);
					}
				}

				if ($this->add_attributes)
				{
					foreach ($this->add_attributes as $tag => $valuePairs)
					{
						$this->add_attr($tag, $valuePairs, $document);
					}
				}

				// Replace relative URLs
				$this->base = $base;
				foreach ($this->replace_url_attributes as $element => $attributes)
				{
					$this->replace_urls($document, $element, $attributes);
				}

				// If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
				if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache)
				{
					$this->cache_images($document);
				}

				// Get content node. Either the <body> or its first child may be
				// missing (e.g. the input consisted only of <body> tags), in which
				// case there is nothing to serialize.
				$div = null;
				$body = $document->getElementsByTagName('body')->item(0);
				if ($body !== null)
				{
					$div = $body->firstChild;
				}

				// Finally, convert to a HTML string
				if ($div !== null)
				{
					$data = trim($document->saveHTML($div));
				}
				else
				{
					$data = '';
				}

				if ($this->remove_div)
				{
					$data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
					$data = preg_replace('/<\/div>$/', '', $data);
				}
				else
				{
					$data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
				}
			}

			if ($type & SIMPLEPIE_CONSTRUCT_IRI)
			{
				$absolute = $this->registry->call('Misc', 'absolutize_url', array($data, $base));
				if ($absolute !== false)
				{
					$data = $absolute;
				}
			}

			if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI))
			{
				$data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
			}

			if ($this->output_encoding !== 'UTF-8')
			{
				$data = $this->registry->call('Misc', 'change_encoding', array($data, 'UTF-8', $this->output_encoding));
			}
		}
		return $data;
	}

	/**
	 * Point every <img src> in the document at the image handler, fetching
	 * and caching images that are not in the cache yet.
	 *
	 * The cache key is produced by the configured cache name function from
	 * the original src. An image whose download fails, or whose response is
	 * not acceptable, keeps its original src.
	 *
	 * @param DOMDocument $document Document whose images are rewritten in place
	 */
	protected function cache_images($document)
	{
		$images = $document->getElementsByTagName('img');
		foreach ($images as $img)
		{
			if (!$img->hasAttribute('src'))
			{
				continue;
			}

			$image_url = call_user_func($this->cache_name_function, $img->getAttribute('src'));
			$cache = $this->registry->call('Cache', 'get_handler', array($this->cache_location, $image_url, 'spi'));

			if ($cache->load())
			{
				$img->setAttribute('src', $this->image_handler . $image_url);
				continue;
			}

			// REMOTE_ADDR is absent outside a web request (e.g. CLI or cron).
			$remote_addr = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : null;
			$file = $this->registry->create('File', array($img->getAttribute('src'), $this->timeout, 5, array('X-FORWARDED-FOR' => $remote_addr), $this->useragent, $this->force_fsockopen));

			// The bitmask must be parenthesised: === binds tighter than &, so
			// without the parentheses the "not a remote source" test is always false.
			$is_local = ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE) === 0;
			$status_ok = $file->status_code === 200 || ($file->status_code > 206 && $file->status_code < 300);

			if ($file->success && ($is_local || $status_ok))
			{
				if ($cache->save(array('headers' => $file->headers, 'body' => $file->body)))
				{
					$img->setAttribute('src', $this->image_handler . $image_url);
				}
				else
				{
					trigger_error("$this->cache_location is not writable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
				}
			}
		}
	}

	/**
	 * Wrap a markup fragment in a complete document so DOMDocument parses it
	 * as UTF-8.
	 *
	 * Any <html>/<body> tags in the fragment are removed first. The fragment
	 * is wrapped in a <div> and given an HTML doctype whenever $type has any
	 * bit other than SIMPLEPIE_CONSTRUCT_XHTML set; otherwise it is left
	 * unwrapped and given an XHTML doctype.
	 *
	 * @param string $html Markup fragment
	 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
	 * @return string Complete document source
	 */
	protected function preprocess($html, $type)
	{
		$ret = '';
		$html = preg_replace('%</?(?:html|body)[^>]*?'.'>%is', '', $html);
		if ($type & ~SIMPLEPIE_CONSTRUCT_XHTML)
		{
			// Atom XHTML constructs are wrapped with a div by default
			// Note: No protection if $html contains a stray </div>!
			$html = '<div>' . $html . '</div>';
			$ret .= '<!DOCTYPE html>';
			$content_type = 'text/html';
		}
		else
		{
			$ret .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
			$content_type = 'application/xhtml+xml';
		}

		$ret .= '<html><head>';
		$ret .= '<meta http-equiv="Content-Type" content="' . $content_type . '; charset=utf-8" />';
		$ret .= '</head><body>' . $html . '</body></html>';
		return $ret;
	}

	/**
	 * Make the given URL attributes of every $tag element absolute, relative
	 * to the current base URL.
	 *
	 * Nothing is done for tags that are configured to be stripped. A value
	 * that cannot be made absolute is left unchanged.
	 *
	 * @param DOMDocument $document Document modified in place
	 * @param string $tag Element name
	 * @param string|array $attributes Attribute name or list of attribute names
	 */
	public function replace_urls($document, $tag, $attributes)
	{
		if (!is_array($attributes))
		{
			$attributes = array($attributes);
		}

		if (!is_array($this->strip_htmltags) || !in_array($tag, $this->strip_htmltags))
		{
			$elements = $document->getElementsByTagName($tag);
			foreach ($elements as $element)
			{
				foreach ($attributes as $attribute)
				{
					if ($element->hasAttribute($attribute))
					{
						$value = $this->registry->call('Misc', 'absolutize_url', array($element->getAttribute($attribute), $this->base));
						if ($value !== false)
						{
							$element->setAttribute($attribute, $value);
						}
					}
				}
			}
		}
	}

	/**
	 * Turn one regular-expression match of an element into its replacement.
	 *
	 * NOTE(review): nothing in this class calls this method; it looks like a
	 * legacy preg_replace_callback() handler kept for compatibility. The
	 * meaning of the capture groups is inferred from how they are used here:
	 * [1] tag name, [2] attribute text, [3]/[4] inner content.
	 *
	 * @param array $match Regular-expression match
	 * @return string Replacement text
	 */
	public function do_strip_htmltags($match)
	{
		if ($this->encode_instead_of_strip)
		{
			if (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
			{
				$match[1] = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
				$match[2] = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
				return "&lt;$match[1]$match[2]&gt;$match[3]&lt;/$match[1]&gt;";
			}
			else
			{
				return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
			}
		}
		elseif (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
		{
			return $match[4];
		}
		else
		{
			return '';
		}
	}

	/**
	 * Remove, unwrap or encode every $tag element found below <body>.
	 *
	 * - With encode_instead_of_strip enabled, the element is replaced by its
	 *   children; for tags other than script/style the children are also
	 *   surrounded by text nodes spelling out the start and end tag.
	 * - Otherwise script and style elements are removed with their content.
	 * - Otherwise the element is replaced by its children.
	 *
	 * @param string $tag Element name
	 * @param DOMDocument $document Document modified in place
	 * @param DOMXPath $xpath XPath evaluator bound to $document
	 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
	 */
	protected function strip_tag($tag, $document, $xpath, $type)
	{
		$elements = $xpath->query('body//' . $tag);
		$is_raw_text = in_array($tag, array('script', 'style'));

		if (!$this->encode_instead_of_strip && $is_raw_text)
		{
			foreach ($elements as $element)
			{
				$element->parentNode->removeChild($element);
			}

			return;
		}

		// Tag text is only written when encoding, and never for script/style.
		$write_tag_text = $this->encode_instead_of_strip && !$is_raw_text;

		foreach ($elements as $element)
		{
			$fragment = $document->createDocumentFragment();

			if ($write_tag_text)
			{
				$text = '<' . $tag;
				if ($element->hasAttributes())
				{
					$attrs = array();
					foreach ($element->attributes as $name => $attr)
					{
						$value = $attr->value;

						// Compare against '' rather than using empty(), which
						// would also treat the value "0" as missing.
						if ($value === '')
						{
							// In XHTML, empty values should never exist, so we repeat the name
							if ($type & SIMPLEPIE_CONSTRUCT_XHTML)
							{
								$value = $name;
							}
							// For HTML, empty is fine
							elseif ($type & SIMPLEPIE_CONSTRUCT_HTML)
							{
								$attrs[] = $name;
								continue;
							}
						}

						// Standard attribute text
						$attrs[] = $name . '="' . $value . '"';
					}
					$text .= ' ' . implode(' ', $attrs);
				}
				$text .= '>';
				$fragment->appendChild($document->createTextNode($text));
			}

			// Appending a child to the fragment detaches it from the element,
			// so the first child changes on every pass.
			while ($element->firstChild !== null)
			{
				$fragment->appendChild($element->firstChild);
			}

			if ($write_tag_text)
			{
				$fragment->appendChild($document->createTextNode('</' . $tag . '>'));
			}

			$element->parentNode->replaceChild($fragment, $element);
		}
	}

	/**
	 * Remove the named attribute from every element that carries it.
	 *
	 * @param string $attrib Attribute name
	 * @param DOMXPath $xpath XPath evaluator bound to the document
	 */
	protected function strip_attr($attrib, $xpath)
	{
		$elements = $xpath->query('//*[@' . $attrib . ']');

		foreach ($elements as $element)
		{
			$element->removeAttribute($attrib);
		}
	}

	/**
	 * Set the given attributes on every $tag element, overwriting existing values.
	 *
	 * @param string $tag Element name
	 * @param array $valuePairs Map of attribute name to value
	 * @param DOMDocument $document Document modified in place
	 */
	protected function add_attr($tag, $valuePairs, $document)
	{
		// A comma-separated string passed to add_attributes() yields plain
		// strings here, which carry no attribute/value pairs to apply.
		if (!is_array($valuePairs))
		{
			return;
		}

		$elements = $document->getElementsByTagName($tag);
		foreach ($elements as $element)
		{
			foreach ($valuePairs as $attrib => $value)
			{
				$element->setAttribute($attrib, $value);
			}
		}
	}
}
585