<?php

declare(strict_types=1);
/**
 * SimplePie
 *
 * A PHP-Based RSS and Atom Feed Framework.
 * Takes the hard work out of managing a complete RSS/Atom solution.
 *
 * Copyright (c) 2004-2022, Ryan Parman, Sam Sneddon, Ryan McCue, and contributors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are
 * permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this list of
 *   conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice, this list
 *   of conditions and the following disclaimer in the documentation and/or other materials
 *   provided with the distribution.
 *
 * * Neither the name of the SimplePie Team nor the names of its contributors may be used
 *   to endorse or promote products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
 * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * @package SimplePie
 * @copyright 2004-2016 Ryan Parman, Sam Sneddon, Ryan McCue
 * @author Ryan Parman
 * @author Sam Sneddon
 * @author Ryan McCue
 * @link http://simplepie.org/ SimplePie
 * @license http://www.opensource.org/licenses/bsd-license.php BSD License
 */

namespace SimplePie;

use InvalidArgumentException;
use SimplePie\Cache\Base;
use SimplePie\Cache\BaseDataCache;
use SimplePie\Cache\CallableNameFilter;
use SimplePie\Cache\DataCache;
use SimplePie\Cache\NameFilter;

/**
 * Used for data cleanup and post-processing
 *
 *
 * This class can be overloaded with {@see \SimplePie\SimplePie::set_sanitize_class()}
 *
 * @package SimplePie
 * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
 */
class Sanitize implements RegistryAware
{
    // Private vars
    public $base;

    // Options
    public $remove_div = true;
    public $image_handler = '';
    public $strip_htmltags = ['base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'];
    public $encode_instead_of_strip = false;
    public $strip_attributes = ['bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'];
    public $rename_attributes = [];
    public $add_attributes = ['audio' => ['preload' => 'none'], 'iframe' => ['sandbox' => 'allow-scripts allow-same-origin'], 'video' => ['preload' => 'none']];
    public $strip_comments = false;
    public $output_encoding = 'UTF-8';
    public $enable_cache = true;
    public $cache_location = './cache';
    public $cache_name_function = 'md5';

    /**
     * @var NameFilter
     */
    private $cache_namefilter;
    public $timeout = 10;
    public $useragent = '';
    public $force_fsockopen = false;
    public $replace_url_attributes = null;
    public $registry;

    /**
     * @var DataCache|null
     */
    private $cache = null;

    /**
     * @var int Cache duration (in seconds)
     */
    private $cache_duration = 3600;

    /**
     * List of domains for which to force HTTPS.
     * @see \SimplePie\Sanitize::set_https_domains()
     * Array is a tree split at DNS levels. Example:
     * array('biz' => true, 'com' => array('example' => true), 'net' => array('example' => array('www' => true)))
     */
    public $https_domains = [];

    /**
     * Initialise the URL-attribute map with its defaults.
     */
    public function __construct()
    {
        // Set defaults
        $this->set_url_replacements(null);
    }

    /**
     * Choose whether sanitize() removes the outer wrapping <div> from HTML output.
     *
     * @param bool $enable
     */
    public function remove_div($enable = true)
    {
        $this->remove_div = (bool) $enable;
    }

    /**
     * Set the prefix that cached image identifiers are appended to.
     *
     * @param string|false $page Handler prefix; any falsy value stores false (image handling off)
     */
    public function set_image_handler($page = false)
    {
        if ($page) {
            $this->image_handler = (string) $page;
        } else {
            $this->image_handler = false;
        }
    }

    /**
     * Store the registry used to create File objects and to call Misc/Cache helpers.
     */
    public function set_registry(\SimplePie\Registry $registry)/* : void */
    {
        $this->registry = $registry;
    }

    /**
     * Pass the cache configuration used for image caching.
     *
     * @param bool|null $enable_cache Ignored when null
     * @param string $cache_location Ignored when empty
     * @param string|NameFilter $cache_name_function Callable name as string (BC) or a NameFilter
     * @param string $cache_class Not used by this method
     * @param DataCache|null $cache Cache implementation; ignored when null
     *
     * @throws InvalidArgumentException When $cache_name_function is neither a string nor a NameFilter
     */
    public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie\Cache', DataCache $cache = null)
    {
        if ($enable_cache !== null) {
            $this->enable_cache = (bool) $enable_cache;
        }

        if ($cache_location) {
            $this->cache_location = (string) $cache_location;
        }

        // Only a string (legacy callable name) or a NameFilter can be used further down;
        // any other value would be stored and fail later when ->filter() is called.
        if (! is_string($cache_name_function) && ! $cache_name_function instanceof NameFilter) {
            throw new InvalidArgumentException(sprintf(
                '%s(): Argument #3 ($cache_name_function) must be of type %s',
                __METHOD__,
                NameFilter::class
            ), 1);
        }

        // BC: $cache_name_function could be a callable as string
        if (is_string($cache_name_function)) {
            // trigger_error(sprintf('Providing $cache_name_function as string in "%s()" is deprecated since SimplePie 1.8.0, provide as "%s" instead.', __METHOD__, NameFilter::class), \E_USER_DEPRECATED);
            $this->cache_name_function = (string) $cache_name_function;

            $cache_name_function = new CallableNameFilter($cache_name_function);
        }

        $this->cache_namefilter = $cache_name_function;

        if ($cache !== null) {
            $this->cache = $cache;
        }
    }

    /**
     * Pass the settings used when fetching remote images.
     *
     * Falsy arguments leave the corresponding property untouched.
     *
     * @param string $file_class Not used by this method
     * @param int|float $timeout
     * @param string $useragent
     * @param bool $force_fsockopen
     */
    public function pass_file_data($file_class = 'SimplePie\File', $timeout = 10, $useragent = '', $force_fsockopen = false)
    {
        if ($timeout) {
            // Keep the value numeric, like the property default; a string would be
            // rejected by numeric parameters in strictly typed code.
            $this->timeout = is_float($timeout) ? $timeout : (int) $timeout;
        }

        if ($useragent) {
            $this->useragent = (string) $useragent;
        }

        if ($force_fsockopen) {
            // Boolean flag, like the property default.
            $this->force_fsockopen = (bool) $force_fsockopen;
        }
    }

    /**
     * Set the tags that sanitize() strips.
     *
     * @param array|string|false $tags List of tag names, a comma-separated string, or a falsy value to disable
     */
    public function strip_htmltags($tags = ['base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'])
    {
        if ($tags) {
            if (is_array($tags)) {
                $this->strip_htmltags = $tags;
            } else {
                $this->strip_htmltags = explode(',', $tags);
            }
        } else {
            $this->strip_htmltags = false;
        }
    }

    /**
     * Choose whether stripped tags are kept as visible text instead of being dropped.
     *
     * @param bool $encode
     */
    public function encode_instead_of_strip($encode = false)
    {
        $this->encode_instead_of_strip = (bool) $encode;
    }

    /**
     * Set the attributes that sanitize() renames to "data-sanitized-<name>".
     *
     * @param array|string|false $attribs List of attribute names, a comma-separated string, or a falsy value to disable
     */
    public function rename_attributes($attribs = [])
    {
        if ($attribs) {
            if (is_array($attribs)) {
                $this->rename_attributes = $attribs;
            } else {
                $this->rename_attributes = explode(',', $attribs);
            }
        } else {
            $this->rename_attributes = false;
        }
    }

    /**
     * Set the attributes that sanitize() removes from every element.
     *
     * @param array|string|false $attribs List of attribute names, a comma-separated string, or a falsy value to disable
     */
    public function strip_attributes($attribs = ['bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'])
    {
        if ($attribs) {
            if (is_array($attribs)) {
                $this->strip_attributes = $attribs;
            } else {
                $this->strip_attributes = explode(',', $attribs);
            }
        } else {
            $this->strip_attributes = false;
        }
    }

    /**
     * Set the attributes that sanitize() adds to elements, keyed by tag name.
     *
     * @param array|string|false $attribs Map of tag name => [attribute => value]; a falsy value disables.
     *                                    A string is split on commas as in the other setters.
     */
    public function add_attributes($attribs = ['audio' => ['preload' => 'none'], 'iframe' => ['sandbox' => 'allow-scripts allow-same-origin'], 'video' => ['preload' => 'none']])
    {
        if ($attribs) {
            if (is_array($attribs)) {
                $this->add_attributes = $attribs;
            } else {
                $this->add_attributes = explode(',', $attribs);
            }
        } else {
            $this->add_attributes = false;
        }
    }

    /**
     * Choose whether sanitize() removes comment nodes.
     *
     * @param bool $strip
     */
    public function strip_comments($strip = false)
    {
        $this->strip_comments = (bool) $strip;
    }

    /**
     * Set the encoding sanitize() converts its result to (no conversion for 'UTF-8').
     *
     * @param string $encoding
     */
    public function set_output_encoding($encoding = 'UTF-8')
    {
        $this->output_encoding = (string) $encoding;
    }

    /**
     * Set element/attribute key/value pairs of HTML attributes
     * containing URLs that need to be resolved relative to the feed
     *
     * Defaults to |a|@href, |area|@href, |audio|@src, |blockquote|@cite,
     * |del|@cite, |form|@action, |img|@longdesc, |img|@src, |input|@src,
     * |ins|@cite, |q|@cite, |source|@src, |video|@poster, |video|@src
     *
     * @since 1.0
     * @param array|null $element_attribute Element/attribute key/value pairs, null for default
     */
    public function set_url_replacements($element_attribute = null)
    {
        if ($element_attribute === null) {
            $element_attribute = [
                'a' => 'href',
                'area' => 'href',
                'audio' => 'src',
                'blockquote' => 'cite',
                'del' => 'cite',
                'form' => 'action',
                'img' => [
                    'longdesc',
                    'src'
                ],
                'input' => 'src',
                'ins' => 'cite',
                'q' => 'cite',
                'source' => 'src',
                'video' => [
                    'poster',
                    'src'
                ]
            ];
        }
        $this->replace_url_attributes = (array) $element_attribute;
    }

    /**
     * Set the list of domains for which to force HTTPS.
     * @see \SimplePie\Sanitize::https_url()
     * Example array('biz', 'example.com', 'example.org', 'www.example.net');
     */
    public function set_https_domains($domains)
    {
        $this->https_domains = [];
        foreach ($domains as $domain) {
            $domain = trim($domain, ". \t\n\r\0\x0B");
            $segments = array_reverse(explode('.', $domain));
            $node =& $this->https_domains;
            foreach ($segments as $segment) {//Build a tree
                if ($node === true) {
                    // A parent domain is already forced; it covers this one.
                    break;
                }
                if (!isset($node[$segment])) {
                    $node[$segment] = [];
                }
                $node =& $node[$segment];
            }
            $node = true;
        }
        // Drop the reference so a later write to $node cannot touch the tree.
        unset($node);
    }

    /**
     * Check if the domain is in the list of forced HTTPS.
     *
     * @param mixed $domain Host name; anything that is not a string is never a match
     * @return bool
     */
    protected function is_https_domain($domain)
    {
        // parse_url() yields null/false when a URL has no usable host.
        if (!is_string($domain)) {
            return false;
        }

        $domain = trim($domain, '. ');
        $segments = array_reverse(explode('.', $domain));
        $node = $this->https_domains;
        foreach ($segments as $segment) {//Explore the tree
            if (isset($node[$segment])) {
                $node = $node[$segment];
            } else {
                break;
            }
        }
        return $node === true;
    }

    /**
     * Force HTTPS for selected Web sites.
     *
     * @param string $url
     * @return string The URL with "http" turned into "https" when its host is a forced domain, otherwise unchanged
     */
    public function https_url($url)
    {
        return (strtolower(substr($url, 0, 7)) === 'http://') &&
            $this->is_https_domain(parse_url($url, PHP_URL_HOST)) ?
            substr_replace($url, 's', 4, 0) : //Add the 's' to HTTPS
            $url;
    }

    /**
     * Clean up a piece of feed data according to its construct type.
     *
     * HTML/XHTML constructs are parsed with DOMDocument, then tags, attributes and
     * comments are stripped/renamed/added as configured, URLs are made absolute and
     * images are optionally cached. IRI constructs are made absolute; TEXT and IRI
     * constructs are HTML-escaped. The result is converted to the output encoding.
     *
     * @param string $data Data to sanitize
     * @param int $type Bit mask of \SimplePie\SimplePie::CONSTRUCT_* constants
     * @param string $base Base URL used to resolve relative URLs
     * @return string Sanitized data
     *
     * @throws \SimplePie\Exception When HTML must be processed but DOMDocument is unavailable
     */
    public function sanitize($data, $type, $base = '')
    {
        $data = trim($data);
        if ($data !== '' || $type & \SimplePie\SimplePie::CONSTRUCT_IRI) {
            if ($type & \SimplePie\SimplePie::CONSTRUCT_MAYBE_HTML) {
                // Treat the data as HTML when it contains an entity or a closing tag.
                if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . \SimplePie\SimplePie::PCRE_HTML_ATTRIBUTE . '>)/', $data)) {
                    $type |= \SimplePie\SimplePie::CONSTRUCT_HTML;
                } else {
                    $type |= \SimplePie\SimplePie::CONSTRUCT_TEXT;
                }
            }

            if ($type & \SimplePie\SimplePie::CONSTRUCT_BASE64) {
                $data = base64_decode($data);
            }

            if ($type & (\SimplePie\SimplePie::CONSTRUCT_HTML | \SimplePie\SimplePie::CONSTRUCT_XHTML)) {
                if (!class_exists('DOMDocument')) {
                    throw new \SimplePie\Exception('DOMDocument not found, unable to use sanitizer');
                }
                $document = new \DOMDocument();
                $document->encoding = 'UTF-8';

                $data = $this->preprocess($data, $type);

                set_error_handler(['SimplePie\Misc', 'silence_errors']);
                try {
                    $document->loadHTML($data);
                } finally {
                    // Always give the previous handler back, even if parsing throws.
                    restore_error_handler();
                }

                $xpath = new \DOMXPath($document);

                // Strip comments
                if ($this->strip_comments) {
                    $comments = $xpath->query('//comment()');

                    foreach ($comments as $comment) {
                        $comment->parentNode->removeChild($comment);
                    }
                }

                // Strip out HTML tags and attributes that might cause various security problems.
                // Based on recommendations by Mark Pilgrim at:
                // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
                if ($this->strip_htmltags) {
                    foreach ($this->strip_htmltags as $tag) {
                        $this->strip_tag($tag, $document, $xpath, $type);
                    }
                }

                if ($this->rename_attributes) {
                    foreach ($this->rename_attributes as $attrib) {
                        $this->rename_attr($attrib, $xpath);
                    }
                }

                if ($this->strip_attributes) {
                    foreach ($this->strip_attributes as $attrib) {
                        $this->strip_attr($attrib, $xpath);
                    }
                }

                if ($this->add_attributes) {
                    foreach ($this->add_attributes as $tag => $valuePairs) {
                        $this->add_attr($tag, $valuePairs, $document);
                    }
                }

                // Replace relative URLs
                $this->base = $base;
                foreach ($this->replace_url_attributes as $element => $attributes) {
                    $this->replace_urls($document, $element, $attributes);
                }

                // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
                if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache) {
                    $this->cache_images($document);
                }

                // Get content node
                $body = $document->getElementsByTagName('body')->item(0);
                $div = $body !== null ? $body->firstChild : null;
                // Finally, convert to a HTML string. Without a content node there is
                // nothing to output; saveHTML(null) would dump the whole wrapper document.
                $data = $div !== null ? trim($document->saveHTML($div)) : '';

                if ($this->remove_div) {
                    $data = preg_replace('/^<div' . \SimplePie\SimplePie::PCRE_XML_ATTRIBUTE . '>/', '', $data);
                    $data = preg_replace('/<\/div>$/', '', $data);
                } else {
                    $data = preg_replace('/^<div' . \SimplePie\SimplePie::PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
                }

                $data = str_replace('</source>', '', $data);
            }

            if ($type & \SimplePie\SimplePie::CONSTRUCT_IRI) {
                $absolute = $this->registry->call(Misc::class, 'absolutize_url', [$data, $base]);
                if ($absolute !== false) {
                    $data = $absolute;
                }
            }

            if ($type & (\SimplePie\SimplePie::CONSTRUCT_TEXT | \SimplePie\SimplePie::CONSTRUCT_IRI)) {
                $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
            }

            if ($this->output_encoding !== 'UTF-8') {
                $data = $this->registry->call(Misc::class, 'change_encoding', [$data, 'UTF-8', $this->output_encoding]);
            }
        }
        return $data;
    }

    /**
     * Cache every <img> that has a src and point it at the image handler.
     *
     * Images already in the cache are rewritten directly; others are fetched first
     * and rewritten only when they could be stored.
     */
    private function cache_images(\DOMDocument $document)
    {
        $images = $document->getElementsByTagName('img');

        foreach ($images as $img) {
            if (!$img->hasAttribute('src')) {
                continue;
            }

            $image_url = $this->cache_namefilter->filter($img->getAttribute('src'));
            $cache = $this->get_cache($image_url);

            if ($cache->get_data($image_url, false)) {
                $img->setAttribute('src', $this->image_handler . $image_url);
                continue;
            }

            // REMOTE_ADDR is not set outside of a web request (e.g. on the CLI).
            $headers = isset($_SERVER['REMOTE_ADDR']) ? ['X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']] : [];
            $file = $this->registry->create(File::class, [$img->getAttribute('src'), $this->timeout, 5, $headers, $this->useragent, $this->force_fsockopen]);

            // The bit test needs its own parentheses: "===" binds tighter than "&".
            if ($file->success && (($file->method & \SimplePie\SimplePie::FILE_SOURCE_REMOTE) === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300))) {
                if ($cache->set_data($image_url, ['headers' => $file->headers, 'body' => $file->body], $this->cache_duration)) {
                    $img->setAttribute('src', $this->image_handler . $image_url);
                } else {
                    trigger_error("$this->cache_location is not writable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
                }
            }
        }
    }

    /**
     * Wrap a fragment in a complete document so DOMDocument can parse it.
     *
     * Existing <html>/<body> tags are removed first. Unless $type is XHTML only,
     * the fragment is additionally wrapped in a <div>.
     *
     * @param string $html
     * @param int $type Bit mask of \SimplePie\SimplePie::CONSTRUCT_* constants
     * @return string
     */
    protected function preprocess($html, $type)
    {
        $ret = '';
        $html = preg_replace('%</?(?:html|body)[^>]*?'.'>%is', '', $html);
        if ($type & ~\SimplePie\SimplePie::CONSTRUCT_XHTML) {
            // Atom XHTML constructs are wrapped with a div by default
            // Note: No protection if $html contains a stray </div>!
            $html = '<div>' . $html . '</div>';
            $ret .= '<!DOCTYPE html>';
            $content_type = 'text/html';
        } else {
            $ret .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
            $content_type = 'application/xhtml+xml';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="' . $content_type . '; charset=utf-8" />';
        $ret .= '</head><body>' . $html . '</body></html>';
        return $ret;
    }

    /**
     * Make the given URL attributes of all $tag elements absolute against $this->base
     * and apply the forced-HTTPS rules. Tags listed in strip_htmltags are skipped.
     *
     * @param \DOMDocument $document
     * @param string $tag
     * @param string|array $attributes One attribute name or a list of names
     */
    public function replace_urls($document, $tag, $attributes)
    {
        if (!is_array($attributes)) {
            $attributes = [$attributes];
        }

        if (!is_array($this->strip_htmltags) || !in_array($tag, $this->strip_htmltags)) {
            $elements = $document->getElementsByTagName($tag);
            foreach ($elements as $element) {
                foreach ($attributes as $attribute) {
                    if ($element->hasAttribute($attribute)) {
                        $value = $this->registry->call(Misc::class, 'absolutize_url', [$element->getAttribute($attribute), $this->base]);
                        if ($value !== false) {
                            $value = $this->https_url($value);
                            $element->setAttribute($attribute, $value);
                        }
                    }
                }
            }
        }
    }

    /**
     * Turn a matched tag into its replacement text.
     *
     * Not called from within this class; it takes an array of regex capture groups
     * (indexes 0-4 are read), so presumably it is a preg callback kept for
     * backward compatibility -- confirm against callers.
     *
     * @param array $match
     * @return string
     */
    public function do_strip_htmltags($match)
    {
        if ($this->encode_instead_of_strip) {
            if (isset($match[4]) && !in_array(strtolower($match[1]), ['script', 'style'])) {
                $match[1] = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
                $match[2] = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
                return "&lt;$match[1]$match[2]&gt;$match[3]&lt;/$match[1]&gt;";
            } else {
                return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
            }
        } elseif (isset($match[4]) && !in_array(strtolower($match[1]), ['script', 'style'])) {
            return $match[4];
        } else {
            return '';
        }
    }

    /**
     * Remove every $tag element below <body>.
     *
     * With encode_instead_of_strip the tag is replaced by its markup as text (script
     * and style keep only their children); otherwise script and style are removed
     * entirely and any other element is replaced by its children.
     *
     * @param string $tag
     * @param \DOMDocument $document
     * @param \DOMXPath $xpath
     * @param int $type Bit mask of \SimplePie\SimplePie::CONSTRUCT_* constants
     */
    protected function strip_tag($tag, $document, $xpath, $type)
    {
        $elements = $xpath->query('body//' . $tag);
        if ($this->encode_instead_of_strip) {
            foreach ($elements as $element) {
                $fragment = $document->createDocumentFragment();

                // For elements which aren't script or style, include the tag itself
                if (!in_array($tag, ['script', 'style'])) {
                    $text = '<' . $tag;
                    if ($element->hasAttributes()) {
                        $attrs = [];
                        foreach ($element->attributes as $name => $attr) {
                            $value = $attr->value;

                            // Compare against '' so that a real value such as "0" is kept.
                            if ($value === '') {
                                // In XHTML, empty values should never exist, so we repeat the value
                                if ($type & \SimplePie\SimplePie::CONSTRUCT_XHTML) {
                                    $value = $name;
                                }
                                // For HTML, empty is fine
                                elseif ($type & \SimplePie\SimplePie::CONSTRUCT_HTML) {
                                    $attrs[] = $name;
                                    continue;
                                }
                            }

                            // Standard attribute text
                            $attrs[] = $name . '="' . $value . '"';
                        }
                        $text .= ' ' . implode(' ', $attrs);
                    }
                    $text .= '>';
                    $fragment->appendChild(new \DOMText($text));
                }

                // Appending a child moves it, so always take the current first child.
                $number = $element->childNodes->length;
                for ($i = $number; $i > 0; $i--) {
                    $child = $element->childNodes->item(0);
                    $fragment->appendChild($child);
                }

                if (!in_array($tag, ['script', 'style'])) {
                    $fragment->appendChild(new \DOMText('</' . $tag . '>'));
                }

                $element->parentNode->replaceChild($fragment, $element);
            }

            return;
        } elseif (in_array($tag, ['script', 'style'])) {
            foreach ($elements as $element) {
                $element->parentNode->removeChild($element);
            }

            return;
        } else {
            foreach ($elements as $element) {
                $fragment = $document->createDocumentFragment();
                // Appending a child moves it, so always take the current first child.
                $number = $element->childNodes->length;
                for ($i = $number; $i > 0; $i--) {
                    $child = $element->childNodes->item(0);
                    $fragment->appendChild($child);
                }

                $element->parentNode->replaceChild($fragment, $element);
            }
        }
    }

    /**
     * Remove the attribute $attrib from every element that has it.
     *
     * @param string $attrib
     * @param \DOMXPath $xpath
     */
    protected function strip_attr($attrib, $xpath)
    {
        $elements = $xpath->query('//*[@' . $attrib . ']');

        foreach ($elements as $element) {
            $element->removeAttribute($attrib);
        }
    }

    /**
     * Rename the attribute $attrib to "data-sanitized-<attrib>" on every element that has it.
     *
     * @param string $attrib
     * @param \DOMXPath $xpath
     */
    protected function rename_attr($attrib, $xpath)
    {
        $elements = $xpath->query('//*[@' . $attrib . ']');

        foreach ($elements as $element) {
            $element->setAttribute('data-sanitized-' . $attrib, $element->getAttribute($attrib));
            $element->removeAttribute($attrib);
        }
    }

    /**
     * Set the given attributes on every $tag element, overwriting existing values.
     *
     * @param string $tag
     * @param array $valuePairs Map of attribute name => value
     * @param \DOMDocument $document
     */
    protected function add_attr($tag, $valuePairs, $document)
    {
        $elements = $document->getElementsByTagName($tag);
        foreach ($elements as $element) {
            foreach ($valuePairs as $attrib => $value) {
                $element->setAttribute($attrib, $value);
            }
        }
    }

    /**
     * Get a DataCache
     *
     * Returns the cache passed to pass_cache_data() when there is one; otherwise
     * wraps a legacy cache handler obtained through the registry.
     *
     * @param string $image_url Only needed for BC, can be removed in SimplePie 2.0.0
     *
     * @return DataCache
     */
    private function get_cache($image_url = '')
    {
        if ($this->cache === null) {
            // @trigger_error(sprintf('Not providing as PSR-16 cache implementation is deprecated since SimplePie 1.8.0, please use "SimplePie\SimplePie::set_cache()".'), \E_USER_DEPRECATED);
            $cache = $this->registry->call(Cache::class, 'get_handler', [
                $this->cache_location,
                $image_url,
                Base::TYPE_IMAGE
            ]);

            return new BaseDataCache($cache);
        }

        return $this->cache;
    }
}

class_alias('SimplePie\Sanitize', 'SimplePie_Sanitize');