<?php

/**
 * @file
 * This file was auto-generated by generate-includes.php and includes all of
 * the core files required by HTML Purifier. Use this if performance is a
 * primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
 * FILE, changes will be overwritten the next time the script is run.
 *
 * @version 4.3.0
 *
 * @warning
 *      You must *not* include any other HTML Purifier files before this file,
 *      because 'require' not 'require_once' is used.
 *
 * @warning
 *      This file requires that the include path contains the HTML Purifier
 *      library directory; this is not auto-set.
 */

/*! @mainpage
 *
 * HTML Purifier is an HTML filter that will take an arbitrary snippet of
 * HTML and rigorously test, validate and filter it into a version that
 * is safe for output onto webpages. It achieves this by:
 *
 *  -# Lexing (parsing into tokens) the document,
 *  -# Executing various strategies on the tokens:
 *      -# Removing all elements not in the whitelist,
 *      -# Making the tokens well-formed,
 *      -# Fixing the nesting of the nodes, and
 *      -# Validating attributes of the nodes; and
 *  -# Generating HTML from the purified tokens.
 *
 * However, most users will only need to interface with the HTMLPurifier
 * and HTMLPurifier_Config.
 */

/*
    HTML Purifier 4.3.0 - Standards Compliant HTML Filtering
    Copyright (C) 2006-2008 Edward Z. Yang

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

/**
 * Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
 *
 * @note There are several points in which configuration can be specified
 *       for HTML Purifier.  The precedence of these (from lowest to
 *       highest) is as follows:
 *          -# Instance: new HTMLPurifier($config)
 *          -# Invocation: purify($html, $config)
 *       These configurations are entirely independent of each other and
 *       are *not* merged (this behavior may change in the future).
 *
 * @todo We need an easier way to inject strategies using the configuration
 *       object.
 */
class HTMLPurifier
{

    /** Version of HTML Purifier */
    public $version = '4.3.0';

    /** Constant with version of HTML Purifier */
    const VERSION = '4.3.0';

    /** Global configuration object */
    public $config;

    /** Array of extra HTMLPurifier_Filter objects to run on HTML, for backwards compatibility */
    private $filters = array();

    /** Single instance of HTML Purifier */
    private static $instance;

    /**
     * $strategy is created in the constructor; $generator is (re)created
     * on every call to purify().
     */
    protected $strategy, $generator;

    /**
     * Resultant HTMLPurifier_Context of last run purification. Is an array
     * of contexts if the last called method was purifyArray().
     */
    public $context;

    /**
     * Initializes the purifier.
     * @param $config Optional HTMLPurifier_Config object for all instances of
     *                the purifier, if omitted, a default configuration is
     *                supplied (which can be overridden on a per-use basis).
     *                The parameter can also be any type that
     *                HTMLPurifier_Config::create() supports.
     */
    public function __construct($config = null) {

        $this->config = HTMLPurifier_Config::create($config);

        $this->strategy = new HTMLPurifier_Strategy_Core();

    }

    /**
     * Adds a filter to process the output. First come first serve.
     * Always raises an E_USER_WARNING because the method is deprecated;
     * the filter is still appended to the internal list.
     * @param $filter HTMLPurifier_Filter object
     */
    public function addFilter($filter) {
        trigger_error('HTMLPurifier->addFilter() is deprecated, use configuration directives in the Filter namespace or Filter.Custom', E_USER_WARNING);
        $this->filters[] = $filter;
    }

    /**
     * Filters an HTML snippet/document to be XSS-free and standards-compliant.
     *
     * As a side effect, the context used for this run is stored in
     * $this->context and the generator in $this->generator.
     *
     * @param $html String of HTML to purify
     * @param $config HTMLPurifier_Config object for this operation, if omitted,
     *                defaults to the config object specified during this
     *                object's construction. The parameter can also be any type
     *                that HTMLPurifier_Config::create() supports.
     * @return Purified HTML
     */
    public function purify($html, $config = null) {

        // :TODO: make the config merge in, instead of replace
        $config = $config ? HTMLPurifier_Config::create($config) : $this->config;

        // implementation is partially environment dependent, partially
        // configuration dependent
        $lexer = HTMLPurifier_Lexer::create($config);

        $context = new HTMLPurifier_Context();

        // setup HTML generator
        $this->generator = new HTMLPurifier_Generator($config, $context);
        $context->register('Generator', $this->generator);

        // set up global context variables
        if ($config->get('Core.CollectErrors')) {
            // may get moved out if other facilities use it
            $language_factory = HTMLPurifier_LanguageFactory::instance();
            $language = $language_factory->create($config, $context);
            $context->register('Locale', $language);

            $error_collector = new HTMLPurifier_ErrorCollector($context);
            $context->register('ErrorCollector', $error_collector);
        }

        // setup id_accumulator context, necessary due to the fact that
        // AttrValidator can be called from many places
        $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
        $context->register('IDAccumulator', $id_accumulator);

        $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);

        // setup filters
        $filter_flags = $config->getBatch('Filter');
        $custom_filters = $filter_flags['Custom'];
        unset($filter_flags['Custom']);
        $filters = array();
        foreach ($filter_flags as $filter => $flag) {
            if (!$flag) continue;
            // dotted keys are skipped; only plain names map to a filter class
            if (strpos($filter, '.') !== false) continue;
            $class = "HTMLPurifier_Filter_$filter";
            $filters[] = new $class;
        }
        foreach ($custom_filters as $filter) {
            // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
            $filters[] = $filter;
        }
        // filters registered through the deprecated addFilter() run last
        $filters = array_merge($filters, $this->filters);
        // maybe prepare(), but later

        for ($i = 0, $filter_size = count($filters); $i < $filter_size; $i++) {
            $html = $filters[$i]->preFilter($html, $config, $context);
        }

        // purified HTML
        $html =
            $this->generator->generateFromTokens(
                // list of tokens
                $this->strategy->execute(
                    // list of un-purified tokens
                    $lexer->tokenizeHTML(
                        // un-purified HTML
                        $html, $config, $context
                    ),
                    $config, $context
                )
            );

        // post-filters run in the reverse order of the pre-filters
        for ($i = $filter_size - 1; $i >= 0; $i--) {
            $html = $filters[$i]->postFilter($html, $config, $context);
        }

        $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
        $this->context =& $context;
        return $html;
    }

    /**
     * Filters an array of HTML snippets.
     *
     * After the call, $this->context holds an array of the per-snippet
     * contexts, indexed by the same keys as the input array.
     *
     * @param $array_of_html Array of HTML strings; keys are preserved
     * @param $config Optional HTMLPurifier_Config object for this operation.
     *                See HTMLPurifier::purify() for more details.
     * @return Array of purified HTML
     */
    public function purifyArray($array_of_html, $config = null) {
        $context_array = array();
        foreach ($array_of_html as $key => $html) {
            $array_of_html[$key] = $this->purify($html, $config);
            $context_array[$key] = $this->context;
        }
        $this->context = $context_array;
        return $array_of_html;
    }

    /**
     * Singleton for enforcing just one HTML Purifier in your system
     * @param $prototype Optional prototype HTMLPurifier instance to
     *                   overload singleton with, or HTMLPurifier_Config
     *                   instance to configure the generated version with.
     * @return The shared HTMLPurifier instance; passing a truthy $prototype
     *         replaces any previously stored instance.
     */
    public static function instance($prototype = null) {
        if (!self::$instance || $prototype) {
            if ($prototype instanceof HTMLPurifier) {
                self::$instance = $prototype;
            } elseif ($prototype) {
                self::$instance = new HTMLPurifier($prototype);
            } else {
                self::$instance = new HTMLPurifier();
            }
        }
        return self::$instance;
    }

    /**
     * @note Backwards compatibility, see instance()
     */
    public static function getInstance($prototype = null) {
        return HTMLPurifier::instance($prototype);
    }

}





/**
 * Defines common attribute collections that modules reference
 */

class HTMLPurifier_AttrCollections
{

    /**
     * Associative array of attribute collections, indexed by name
     */
    public $info = array();

    /**
     * Performs all expansions on internal data for use by other inclusions
     * It also collects all attribute collection extensions from
     * modules
     * @param $attr_types HTMLPurifier_AttrTypes instance
     * @param $modules Hash array of HTMLPurifier_HTMLModule members
     */
    public function __construct($attr_types, $modules) {
        // load extensions from the modules
        foreach ($modules as $module) {
            foreach ($module->attr_collections as $coll_i => $coll) {
                if (!isset($this->info[$coll_i])) {
                    $this->info[$coll_i] = array();
                }
                foreach ($coll as $attr_i => $attr) {
                    // index 0 holds the list of included collections
                    if ($attr_i === 0 && isset($this->info[$coll_i][$attr_i])) {
                        // merge in includes
                        $this->info[$coll_i][$attr_i] = array_merge(
                            $this->info[$coll_i][$attr_i], $attr);
                        continue;
                    }
                    $this->info[$coll_i][$attr_i] = $attr;
                }
            }
        }
        // perform internal expansions and inclusions
        foreach ($this->info as $name => $attr) {
            // merge attribute collections that include others
            $this->performInclusions($this->info[$name]);
            // replace string identifiers with actual attribute objects
            $this->expandIdentifiers($this->info[$name], $attr_types);
        }
    }

    /**
     * Takes a reference to an attribute associative array and performs
     * all inclusions specified by the zero index. The zero index is
     * removed from the array afterwards.
     * @param &$attr Reference to attribute array
     */
    public function performInclusions(&$attr) {
        if (!isset($attr[0])) return;
        $merge = $attr[0];
        $seen  = array(); // recursion guard
        // loop through all the inclusions; $merge may grow while we iterate
        for ($i = 0; isset($merge[$i]); $i++) {
            if (isset($seen[$merge[$i]])) continue;
            $seen[$merge[$i]] = true;
            // foreach attribute of the inclusion, copy it over
            if (!isset($this->info[$merge[$i]])) continue;
            foreach ($this->info[$merge[$i]] as $key => $value) {
                if (isset($attr[$key])) continue; // also catches more inclusions
                $attr[$key] = $value;
            }
            if (isset($this->info[$merge[$i]][0])) {
                // recursion
                $merge = array_merge($merge, $this->info[$merge[$i]][0]);
            }
        }
        unset($attr[0]);
    }

    /**
     * Expands all string identifiers in an attribute array by replacing
     * them with the appropriate values inside HTMLPurifier_AttrTypes
     * @param &$attr Reference to attribute array
     * @param $attr_types HTMLPurifier_AttrTypes instance
     */
    public function expandIdentifiers(&$attr, $attr_types) {

        // because foreach will process new elements we add, make sure we
        // skip duplicates
        $processed = array();

        foreach ($attr as $def_i => $def) {
            // skip inclusions
            if ($def_i === 0) continue;

            if (isset($processed[$def_i])) continue;

            // determine whether or not attribute is required
            if ($required = (strpos($def_i, '*') !== false)) {
                // rename the definition
                unset($attr[$def_i]);
                $def_i = trim($def_i, '*');
                $attr[$def_i] = $def;
            }

            $processed[$def_i] = true;

            // if we've already got a literal object, move on
            if (is_object($def)) {
                // preserve previous required
                $attr[$def_i]->required = ($required || $attr[$def_i]->required);
                continue;
            }

            // an explicit false removes the attribute from the collection
            if ($def === false) {
                unset($attr[$def_i]);
                continue;
            }

            if ($t = $attr_types->get($def)) {
                $attr[$def_i] = $t;
                $attr[$def_i]->required = $required;
            } else {
                unset($attr[$def_i]);
            }
        }

    }

}





/**
 * Base class for all validating attribute definitions.
 *
 * This family of classes forms the core for not only HTML attribute validation,
 * but also any sort of string that needs to be validated or cleaned (which
 * means CSS properties and composite definitions are defined here too).
 * Besides defining (through code) what precisely makes the string valid,
 * subclasses are also responsible for cleaning the code if possible.
 */

abstract class HTMLPurifier_AttrDef
{

    /**
     * Tells us whether or not an HTML attribute is minimized. Has no
     * meaning in other contexts.
     */
    public $minimized = false;

    /**
     * Tells us whether or not an HTML attribute is required. Has no
     * meaning in other contexts
     */
    public $required = false;

    /**
     * Validates and cleans passed string according to a definition.
     *
     * @param $string String to be validated and cleaned.
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_AttrContext object.
     */
    abstract public function validate($string, $config, $context);

    /**
     * Convenience method that parses a string as if it were CDATA.
     *
     * This method processes a string in the manner specified at
     * <http://www.w3.org/TR/html4/types.html#h-6.2> by removing
     * leading and trailing whitespace, ignoring line feeds, and replacing
     * carriage returns and tabs with spaces.  While most useful for HTML
     * attributes specified as CDATA, it can also be applied to most CSS
     * values.
     *
     * @note This method is not entirely standards compliant, as trim() removes
     *       more types of whitespace than specified in the spec. In practice,
     *       this is rarely a problem, as those extra characters usually have
     *       already been removed by HTMLPurifier_Encoder.
     *
     * @warning This processing is inconsistent with XML's whitespace handling
     *          as specified by section 3.3.3 and referenced XHTML 1.0 section
     *          4.7.  However, note that we are NOT necessarily
     *          parsing XML, thus, this behavior may still be correct. We
     *          assume that newlines have been normalized.
     *
     * @param $string String to process
     * @return Trimmed string with every "\n", "\t" and "\r" replaced by a space
     */
    public function parseCDATA($string) {
        $string = trim($string);
        $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
        return $string;
    }

    /**
     * Factory method for creating this class from a string.
     * @param $string String construction info
     * @return Created AttrDef object corresponding to $string
     */
    public function make($string) {
        // default implementation, return a flyweight of this object.
        // If $string has an effect on the returned object (i.e. you
        // need to overload this method), it is best
        // to clone or instantiate new copies. (Instantiation is safer.)
        return $this;
    }

    /**
     * Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work
     * properly. THIS IS A HACK!
     */
    protected function mungeRgb($string) {
        return preg_replace('/rgb\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)/', 'rgb(\1,\2,\3)', $string);
    }

    /**
     * Parses a possibly escaped CSS string and returns the "pure"
     * version of it.
     */
    protected function expandCSSEscape($string) {
        // flexibly parse it
        $ret = '';
        for ($i = 0, $c = strlen($string); $i < $c; $i++) {
            if ($string[$i] === '\\') {
                $i++;
                if ($i >= $c) {
                    // trailing backslash: keep it literally
                    $ret .= '\\';
                    break;
                }
                if (ctype_xdigit($string[$i])) {
                    // collect up to six hex digits of the escape
                    $code = $string[$i];
                    for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
                        if (!ctype_xdigit($string[$i])) break;
                        $code .= $string[$i];
                    }
                    // We have to be extremely careful when adding
                    // new characters, to make sure we're not breaking
                    // the encoding.
                    $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                    if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
                    $ret .= $char;
                    // a single whitespace character after the escape is
                    // consumed; anything else must be re-read by the loop
                    if ($i < $c && trim($string[$i]) !== '') $i--;
                    continue;
                }
                // backslash-newline is dropped entirely
                if ($string[$i] === "\n") continue;
            }
            $ret .= $string[$i];
        }
        return $ret;
    }

}





/**
 * Processes an entire attribute array for corrections needing multiple values.
 *
 * Occasionally, a certain attribute will need to be removed and popped onto
 * another value.  Instead of creating a complex return syntax for
 * HTMLPurifier_AttrDef, we just pass the whole attribute array to a
 * specialized object and have that do the special work.  That is the
 * family of HTMLPurifier_AttrTransform.
 *
 * An attribute transformation can be assigned to run before or after
 * HTMLPurifier_AttrDef validation.  See HTMLPurifier_HTMLDefinition for
 * more details.
 */

abstract class HTMLPurifier_AttrTransform
{

    /**
     * Abstract: makes changes to the attributes dependent on multiple values.
     *
     * @param $attr Assoc array of attributes, usually from
     *              HTMLPurifier_Token_Tag::$attr
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_Context object
     * @returns Processed attribute array.
     */
    abstract public function transform($attr, $config, $context);

    /**
     * Prepends CSS properties to the style attribute, creating the
     * attribute if it doesn't exist.
     * @param $attr Attribute array to process (passed by reference)
     * @param $css CSS to prepend
     */
    public function prependCSS(&$attr, $css) {
        $attr['style'] = isset($attr['style']) ? $attr['style'] : '';
        $attr['style'] = $css . $attr['style'];
    }

    /**
     * Retrieves and removes an attribute
     * @param $attr Attribute array to process (passed by reference)
     * @param $key Key of attribute to confiscate
     * @return Value that was stored under $key, or null if it was not set
     */
    public function confiscateAttr(&$attr, $key) {
        if (!isset($attr[$key])) return null;
        $value = $attr[$key];
        unset($attr[$key]);
        return $value;
    }

}





/**
 * Provides lookup array of attribute types to HTMLPurifier_AttrDef objects
 */
class HTMLPurifier_AttrTypes
{
    /**
     * Lookup array of attribute string identifiers to concrete implementations
     */
    protected $info = array();

    /**
     * Constructs the info array, supplying default implementations for attribute
     * types.
     */
    public function __construct() {
        // pseudo-types, must be instantiated via shorthand
        $this->info['Enum']    = new HTMLPurifier_AttrDef_Enum();
        $this->info['Bool']    = new HTMLPurifier_AttrDef_HTML_Bool();

        $this->info['CDATA']    = new HTMLPurifier_AttrDef_Text();
        $this->info['ID']       = new HTMLPurifier_AttrDef_HTML_ID();
        $this->info['Length']   = new HTMLPurifier_AttrDef_HTML_Length();
        $this->info['MultiLength'] = new HTMLPurifier_AttrDef_HTML_MultiLength();
        $this->info['NMTOKENS'] = new HTMLPurifier_AttrDef_HTML_Nmtokens();
        $this->info['Pixels']   = new HTMLPurifier_AttrDef_HTML_Pixels();
        $this->info['Text']     = new HTMLPurifier_AttrDef_Text();
        $this->info['URI']      = new HTMLPurifier_AttrDef_URI();
        $this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang();
        $this->info['Color']    = new HTMLPurifier_AttrDef_HTML_Color();

        // unimplemented aliases
        $this->info['ContentType'] = new HTMLPurifier_AttrDef_Text();
        $this->info['ContentTypes'] = new HTMLPurifier_AttrDef_Text();
        $this->info['Charsets'] = new HTMLPurifier_AttrDef_Text();
        $this->info['Character'] = new HTMLPurifier_AttrDef_Text();

        // "proprietary" types
        $this->info['Class'] = new HTMLPurifier_AttrDef_HTML_Class();

        // number is really a positive integer (one or more digits)
        // FIXME: ^^ not always, see start and value of list items
        $this->info['Number']   = new HTMLPurifier_AttrDef_Integer(false, false, true);
    }

    /**
     * Retrieves a type
     * @param $type String type name, optionally followed by '#' and extra
     *              construction info that is handed to the type's make()
     * @return Object AttrDef for type; an E_USER_ERROR is triggered (and
     *         nothing is returned) when the type is not registered
     */
    public function get($type) {

        // determine if there is any extra info tacked on
        if (strpos($type, '#') !== false) list($type, $string) = explode('#', $type, 2);
        else $string = '';

        if (!isset($this->info[$type])) {
            trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR);
            return;
        }

        return $this->info[$type]->make($string);

    }

    /**
     * Sets a new implementation for a type
     * @param $type String type name
     * @param $impl Object AttrDef for type
     */
    public function set($type, $impl) {
        $this->info[$type] = $impl;
    }
}





/**
 * Validates the attributes of a token. Doesn't manage required attributes
 * very well. The only reason we factored this out was because RemoveForeignElements
 * also needed it besides ValidateAttributes.
 */
class HTMLPurifier_AttrValidator
{

    /**
     * Validates the attributes of a token, returning a modified token
     * that has valid tokens
     * @param $token Reference to token to validate. We require a reference
     *     because the operations this class performs on the token are
     *     not atomic, so the context CurrentToken has to be updated
     *     throughout
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return The unmodified token when it is neither a start nor an empty
     *     token; otherwise nothing (the token is modified through the
     *     reference).
     */
    public function validateToken(&$token, &$config, $context) {

        $definition = $config->getHTMLDefinition();
        $e =& $context->get('ErrorCollector', true);

        // initialize IDAccumulator if necessary
        $ok =& $context->get('IDAccumulator', true);
        if (!$ok) {
            $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
            $context->register('IDAccumulator', $id_accumulator);
        }

        // initialize CurrentToken if necessary
        $current_token =& $context->get('CurrentToken', true);
        if (!$current_token) $context->register('CurrentToken', $token);

        if (
            !$token instanceof HTMLPurifier_Token_Start &&
            !$token instanceof HTMLPurifier_Token_Empty
        ) return $token;

        // create alias to global definition array, see also $defs
        // DEFINITION CALL
        $d_defs = $definition->info_global_attr;

        // don't update token until the very end, to ensure an atomic update
        $attr = $token->attr;

        // do global transformations (pre)
        // nothing currently utilizes this
        foreach ($definition->info_attr_transform_pre as $transform) {
            // $o keeps the pre-transform attributes for error reporting
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // do local transformations only applicable to this element (pre)
        // ex. <p align="right"> to <p style="text-align:right;">
        foreach ($definition->info[$token->name]->attr_transform_pre as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // create alias to this element's attribute definition array, see
        // also $d_defs (global attribute definition array)
        // DEFINITION CALL
        $defs = $definition->info[$token->name]->attr;

        $attr_key = false;
        $context->register('CurrentAttr', $attr_key);

        // iterate through all the attribute keypairs
        // Watch out for name collisions: $key has previously been used
        foreach ($attr as $attr_key => $value) {

            // call the definition
            if ( isset($defs[$attr_key]) ) {
                // there is a local definition defined
                if ($defs[$attr_key] === false) {
                    // We've explicitly been told not to allow this element.
                    // This is usually when there's a global definition
                    // that must be overridden.
                    // Theoretically speaking, we could have a
                    // AttrDef_DenyAll, but this is faster!
                    $result = false;
                } else {
                    // validate according to the element's definition
                    $result = $defs[$attr_key]->validate(
                                    $value, $config, $context
                               );
                }
            } elseif ( isset($d_defs[$attr_key]) ) {
                // there is a global definition defined, validate according
                // to the global definition
                $result = $d_defs[$attr_key]->validate(
                                $value, $config, $context
                           );
            } else {
                // system never heard of the attribute? DELETE!
                $result = false;
            }

            // put the results into effect
            if ($result === false || $result === null) {
                // this is a generic error message that should be replaced
                // with more specific ones when possible
                if ($e) $e->send(E_ERROR, 'AttrValidator: Attribute removed');

                // remove the attribute
                unset($attr[$attr_key]);
            } elseif (is_string($result)) {
                // generally, if a substitution is happening, there
                // was some sort of implicit correction going on. We'll
                // delegate it to the attribute classes to say exactly what.

                // simple substitution
                $attr[$attr_key] = $result;
            } else {
                // nothing happens
            }

            // we'd also want slightly more complicated substitution
            // involving an array as the return value,
            // although we're not sure how colliding attributes would
            // resolve (certain ones would be completely overridden,
            // others would prepend themselves).
        }

        $context->destroy('CurrentAttr');

        // post transforms

        // global (error reporting untested)
        foreach ($definition->info_attr_transform_post as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        // local (error reporting untested)
        foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
            $attr = $transform->transform($o = $attr, $config, $context);
            if ($e) {
                if ($attr != $o) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
            }
        }

        $token->attr = $attr;

        // destroy CurrentToken if we made it ourselves
        if (!$current_token) $context->destroy('CurrentToken');

    }


}





// constants are slow, so we use as few as possible
if (!defined('HTMLPURIFIER_PREFIX')) {
    define('HTMLPURIFIER_PREFIX', dirname(__FILE__) . '/standalone');
    set_include_path(HTMLPURIFIER_PREFIX . PATH_SEPARATOR . get_include_path());
}

// accommodations for versions earlier than 5.0.2
// borrowed from PHP_Compat, LGPL licensed, by Aidan Lister <aidan@php.net>
// NOTE(review): the 'DAR' branch defines "\r" for Darwin; modern PHP is
// believed to define PHP_EOL as "\n" there -- confirm before relying on
// this shim.
if (!defined('PHP_EOL')) {
    switch (strtoupper(substr(PHP_OS, 0, 3))) {
        case 'WIN':
            define('PHP_EOL', "\r\n");
            break;
        case 'DAR':
            define('PHP_EOL', "\r");
            break;
        default:
            define('PHP_EOL', "\n");
    }
}

/**
 * Bootstrap class that contains meta-functionality for HTML Purifier such as
 * the autoload function.
 *
 * @note
 *      This class may be used without any other files from HTML Purifier.
 */
class HTMLPurifier_Bootstrap
{

    /**
     * Autoload function for HTML Purifier
     * @param $class Class to load
     * @return True if a file for the class was found and included,
     *         false otherwise
     */
    public static function autoload($class) {
        $file = HTMLPurifier_Bootstrap::getPath($class);
        if (!$file) return false;
        // Technically speaking, it should be ok and more efficient to
        // just do 'require', but Antonio Parraga reports that with
        // Zend extensions such as Zend debugger and APC, this invariant
        // may be broken.  Since we have efficient alternatives, pay
        // the cost here and avoid the bug.
        require_once HTMLPURIFIER_PREFIX . '/' . $file;
        return true;
    }

    /**
     * Returns the path for a specific class.
     * @param $class Class name to resolve
     * @return Path relative to HTMLPURIFIER_PREFIX, or false when the class
     *         name does not start with 'HTMLPurifier' or the file does not
     *         exist
     */
    public static function getPath($class) {
        if (strncmp('HTMLPurifier', $class, 12) !== 0) return false;
        // Custom implementations
        if (strncmp('HTMLPurifier_Language_', $class, 22) === 0) {
            // language classes live in a flat directory, keyed by language code
            $code = str_replace('_', '-', substr($class, 22));
            $file = 'HTMLPurifier/Language/classes/' . $code . '.php';
        } else {
            $file = str_replace('_', '/', $class) . '.php';
        }
        if (!file_exists(HTMLPURIFIER_PREFIX . '/' . $file)) return false;
        return $file;
    }

    /**
     * "Pre-registers" our autoloader on the SPL stack.
     */
    public static function registerAutoload() {
        $autoload = array('HTMLPurifier_Bootstrap', 'autoload');
        if ( ($funcs = spl_autoload_functions()) === false ) {
            spl_autoload_register($autoload);
        } elseif (function_exists('spl_autoload_unregister')) {
            $buggy  = version_compare(PHP_VERSION, '5.2.11', '<');
            $compat = version_compare(PHP_VERSION, '5.1.2', '<=') &&
                      version_compare(PHP_VERSION, '5.1.0', '>=');
            // unregister every existing autoloader, register ours, then
            // re-register the others so that ours ends up first
            foreach ($funcs as $func) {
                if ($buggy && is_array($func)) {
                    // :TRICKY: There are some compatibility issues and some
                    // places where we need to error out
                    $reflector = new ReflectionMethod($func[0], $func[1]);
                    if (!$reflector->isStatic()) {
                        throw new Exception('
                            HTML Purifier autoloader registrar is not compatible
                            with non-static object methods due to PHP Bug #44144;
                            Please do not use HTMLPurifier.autoload.php (or any
                            file that includes this file); instead, place the code:
                            spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\'))
                            after your own autoloaders.
                        ');
                    }
                    // Surprisingly, spl_autoload_register supports the
                    // Class::staticMethod callback format, although call_user_func doesn't
                    if ($compat) $func = implode('::', $func);
                }
                spl_autoload_unregister($func);
            }
            spl_autoload_register($autoload);
            foreach ($funcs as $func) spl_autoload_register($func);
        }
    }

}





/**
 * Super-class for definition datatype objects, implements serialization
 * functions for the class.
 */
abstract class HTMLPurifier_Definition
{

    /**
     * Has setup() been called yet?
     */
    public $setup = false;

    /**
     * If true, write out the final definition object to the cache after
     * setup.  This will be true only if all invocations to get a raw
     * definition object are also optimized.  This does not cause file
     * system thrashing because on subsequent calls the cached object
     * is used and any writes to the raw definition object are short
     * circuited.  See enduser-customize.html for the high-level
     * picture.
     */
    public $optimized = null;

    /**
     * What type of definition is it?
     */
    public $type;

    /**
     * Sets up the definition object into the final form, something
     * not done by the constructor
     * @param $config HTMLPurifier_Config instance
     */
    abstract protected function doSetup($config);

    /**
     * Setup function that aborts if already setup
     * @param $config HTMLPurifier_Config instance
     */
    public function setup($config) {
        if ($this->setup) return;
        // flag is set before doSetup() runs, so a re-entrant call is a no-op
        $this->setup = true;
        $this->doSetup($config);
    }

}





/**
 * Defines allowed CSS attributes and what their values are.
 * @see HTMLPurifier_HTMLDefinition
 */
class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
{

    public $type = 'CSS';

    /**
     * Assoc array of attribute name to definition object.
     */
    public $info = array();

    /**
     * Constructs the info array.  The meat of this class.
985 */ 986 protected function doSetup($config) { 987 988 $this->info['text-align'] = new HTMLPurifier_AttrDef_Enum( 989 array('left', 'right', 'center', 'justify'), false); 990 991 $border_style = 992 $this->info['border-bottom-style'] = 993 $this->info['border-right-style'] = 994 $this->info['border-left-style'] = 995 $this->info['border-top-style'] = new HTMLPurifier_AttrDef_Enum( 996 array('none', 'hidden', 'dotted', 'dashed', 'solid', 'double', 997 'groove', 'ridge', 'inset', 'outset'), false); 998 999 $this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_style); 1000 1001 $this->info['clear'] = new HTMLPurifier_AttrDef_Enum( 1002 array('none', 'left', 'right', 'both'), false); 1003 $this->info['float'] = new HTMLPurifier_AttrDef_Enum( 1004 array('none', 'left', 'right'), false); 1005 $this->info['font-style'] = new HTMLPurifier_AttrDef_Enum( 1006 array('normal', 'italic', 'oblique'), false); 1007 $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum( 1008 array('normal', 'small-caps'), false); 1009 1010 $uri_or_none = new HTMLPurifier_AttrDef_CSS_Composite( 1011 array( 1012 new HTMLPurifier_AttrDef_Enum(array('none')), 1013 new HTMLPurifier_AttrDef_CSS_URI() 1014 ) 1015 ); 1016 1017 $this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum( 1018 array('inside', 'outside'), false); 1019 $this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum( 1020 array('disc', 'circle', 'square', 'decimal', 'lower-roman', 1021 'upper-roman', 'lower-alpha', 'upper-alpha', 'none'), false); 1022 $this->info['list-style-image'] = $uri_or_none; 1023 1024 $this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config); 1025 1026 $this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum( 1027 array('capitalize', 'uppercase', 'lowercase', 'none'), false); 1028 $this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color(); 1029 1030 $this->info['background-image'] = $uri_or_none; 1031 $this->info['background-repeat'] = new 
HTMLPurifier_AttrDef_Enum( 1032 array('repeat', 'repeat-x', 'repeat-y', 'no-repeat') 1033 ); 1034 $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum( 1035 array('scroll', 'fixed') 1036 ); 1037 $this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition(); 1038 1039 $border_color = 1040 $this->info['border-top-color'] = 1041 $this->info['border-bottom-color'] = 1042 $this->info['border-left-color'] = 1043 $this->info['border-right-color'] = 1044 $this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1045 new HTMLPurifier_AttrDef_Enum(array('transparent')), 1046 new HTMLPurifier_AttrDef_CSS_Color() 1047 )); 1048 1049 $this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config); 1050 1051 $this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_color); 1052 1053 $border_width = 1054 $this->info['border-top-width'] = 1055 $this->info['border-bottom-width'] = 1056 $this->info['border-left-width'] = 1057 $this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1058 new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')), 1059 new HTMLPurifier_AttrDef_CSS_Length('0') //disallow negative 1060 )); 1061 1062 $this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width); 1063 1064 $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1065 new HTMLPurifier_AttrDef_Enum(array('normal')), 1066 new HTMLPurifier_AttrDef_CSS_Length() 1067 )); 1068 1069 $this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1070 new HTMLPurifier_AttrDef_Enum(array('normal')), 1071 new HTMLPurifier_AttrDef_CSS_Length() 1072 )); 1073 1074 $this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1075 new HTMLPurifier_AttrDef_Enum(array('xx-small', 'x-small', 1076 'small', 'medium', 'large', 'x-large', 'xx-large', 1077 'larger', 'smaller')), 1078 new HTMLPurifier_AttrDef_CSS_Percentage(), 
1079 new HTMLPurifier_AttrDef_CSS_Length() 1080 )); 1081 1082 $this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1083 new HTMLPurifier_AttrDef_Enum(array('normal')), 1084 new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives 1085 new HTMLPurifier_AttrDef_CSS_Length('0'), 1086 new HTMLPurifier_AttrDef_CSS_Percentage(true) 1087 )); 1088 1089 $margin = 1090 $this->info['margin-top'] = 1091 $this->info['margin-bottom'] = 1092 $this->info['margin-left'] = 1093 $this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1094 new HTMLPurifier_AttrDef_CSS_Length(), 1095 new HTMLPurifier_AttrDef_CSS_Percentage(), 1096 new HTMLPurifier_AttrDef_Enum(array('auto')) 1097 )); 1098 1099 $this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($margin); 1100 1101 // non-negative 1102 $padding = 1103 $this->info['padding-top'] = 1104 $this->info['padding-bottom'] = 1105 $this->info['padding-left'] = 1106 $this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1107 new HTMLPurifier_AttrDef_CSS_Length('0'), 1108 new HTMLPurifier_AttrDef_CSS_Percentage(true) 1109 )); 1110 1111 $this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($padding); 1112 1113 $this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1114 new HTMLPurifier_AttrDef_CSS_Length(), 1115 new HTMLPurifier_AttrDef_CSS_Percentage() 1116 )); 1117 1118 $trusted_wh = new HTMLPurifier_AttrDef_CSS_Composite(array( 1119 new HTMLPurifier_AttrDef_CSS_Length('0'), 1120 new HTMLPurifier_AttrDef_CSS_Percentage(true), 1121 new HTMLPurifier_AttrDef_Enum(array('auto')) 1122 )); 1123 $max = $config->get('CSS.MaxImgLength'); 1124 1125 $this->info['width'] = 1126 $this->info['height'] = 1127 $max === null ? 
1128 $trusted_wh : 1129 new HTMLPurifier_AttrDef_Switch('img', 1130 // For img tags: 1131 new HTMLPurifier_AttrDef_CSS_Composite(array( 1132 new HTMLPurifier_AttrDef_CSS_Length('0', $max), 1133 new HTMLPurifier_AttrDef_Enum(array('auto')) 1134 )), 1135 // For everyone else: 1136 $trusted_wh 1137 ); 1138 1139 $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration(); 1140 1141 $this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily(); 1142 1143 // this could use specialized code 1144 $this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum( 1145 array('normal', 'bold', 'bolder', 'lighter', '100', '200', '300', 1146 '400', '500', '600', '700', '800', '900'), false); 1147 1148 // MUST be called after other font properties, as it references 1149 // a CSSDefinition object 1150 $this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config); 1151 1152 // same here 1153 $this->info['border'] = 1154 $this->info['border-bottom'] = 1155 $this->info['border-top'] = 1156 $this->info['border-left'] = 1157 $this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config); 1158 1159 $this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(array( 1160 'collapse', 'separate')); 1161 1162 $this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(array( 1163 'top', 'bottom')); 1164 1165 $this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(array( 1166 'auto', 'fixed')); 1167 1168 $this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1169 new HTMLPurifier_AttrDef_Enum(array('baseline', 'sub', 'super', 1170 'top', 'text-top', 'middle', 'bottom', 'text-bottom')), 1171 new HTMLPurifier_AttrDef_CSS_Length(), 1172 new HTMLPurifier_AttrDef_CSS_Percentage() 1173 )); 1174 1175 $this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(new HTMLPurifier_AttrDef_CSS_Length(), 2); 1176 1177 // partial support 1178 $this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap')); 1179 1180 if 
($config->get('CSS.Proprietary')) { 1181 $this->doSetupProprietary($config); 1182 } 1183 1184 if ($config->get('CSS.AllowTricky')) { 1185 $this->doSetupTricky($config); 1186 } 1187 1188 if ($config->get('CSS.Trusted')) { 1189 $this->doSetupTrusted($config); 1190 } 1191 1192 $allow_important = $config->get('CSS.AllowImportant'); 1193 // wrap all attr-defs with decorator that handles !important 1194 foreach ($this->info as $k => $v) { 1195 $this->info[$k] = new HTMLPurifier_AttrDef_CSS_ImportantDecorator($v, $allow_important); 1196 } 1197 1198 $this->setupConfigStuff($config); 1199 } 1200 1201 protected function doSetupProprietary($config) { 1202 // Internet Explorer only scrollbar colors 1203 $this->info['scrollbar-arrow-color'] = new HTMLPurifier_AttrDef_CSS_Color(); 1204 $this->info['scrollbar-base-color'] = new HTMLPurifier_AttrDef_CSS_Color(); 1205 $this->info['scrollbar-darkshadow-color'] = new HTMLPurifier_AttrDef_CSS_Color(); 1206 $this->info['scrollbar-face-color'] = new HTMLPurifier_AttrDef_CSS_Color(); 1207 $this->info['scrollbar-highlight-color'] = new HTMLPurifier_AttrDef_CSS_Color(); 1208 $this->info['scrollbar-shadow-color'] = new HTMLPurifier_AttrDef_CSS_Color(); 1209 1210 // technically not proprietary, but CSS3, and no one supports it 1211 $this->info['opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue(); 1212 $this->info['-moz-opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue(); 1213 $this->info['-khtml-opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue(); 1214 1215 // only opacity, for now 1216 $this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter(); 1217 1218 } 1219 1220 protected function doSetupTricky($config) { 1221 $this->info['display'] = new HTMLPurifier_AttrDef_Enum(array( 1222 'inline', 'block', 'list-item', 'run-in', 'compact', 1223 'marker', 'table', 'inline-table', 'table-row-group', 1224 'table-header-group', 'table-footer-group', 'table-row', 1225 'table-column-group', 'table-column', 'table-cell', 'table-caption', 'none' 
1226 )); 1227 $this->info['visibility'] = new HTMLPurifier_AttrDef_Enum(array( 1228 'visible', 'hidden', 'collapse' 1229 )); 1230 $this->info['overflow'] = new HTMLPurifier_AttrDef_Enum(array('visible', 'hidden', 'auto', 'scroll')); 1231 } 1232 1233 protected function doSetupTrusted($config) { 1234 $this->info['position'] = new HTMLPurifier_AttrDef_Enum(array( 1235 'static', 'relative', 'absolute', 'fixed' 1236 )); 1237 $this->info['top'] = 1238 $this->info['left'] = 1239 $this->info['right'] = 1240 $this->info['bottom'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1241 new HTMLPurifier_AttrDef_CSS_Length(), 1242 new HTMLPurifier_AttrDef_CSS_Percentage(), 1243 new HTMLPurifier_AttrDef_Enum(array('auto')), 1244 )); 1245 $this->info['z-index'] = new HTMLPurifier_AttrDef_CSS_Composite(array( 1246 new HTMLPurifier_AttrDef_Integer(), 1247 new HTMLPurifier_AttrDef_Enum(array('auto')), 1248 )); 1249 } 1250 1251 /** 1252 * Performs extra config-based processing. Based off of 1253 * HTMLPurifier_HTMLDefinition. 1254 * @todo Refactor duplicate elements into common class (probably using 1255 * composition, not inheritance). 1256 */ 1257 protected function setupConfigStuff($config) { 1258 1259 // setup allowed elements 1260 $support = "(for information on implementing this, see the ". 1261 "support forums) "; 1262 $allowed_properties = $config->get('CSS.AllowedProperties'); 1263 if ($allowed_properties !== null) { 1264 foreach ($this->info as $name => $d) { 1265 if(!isset($allowed_properties[$name])) unset($this->info[$name]); 1266 unset($allowed_properties[$name]); 1267 } 1268 // emit errors 1269 foreach ($allowed_properties as $name => $d) { 1270 // :TODO: Is this htmlspecialchars() call really necessary? 
1271 $name = htmlspecialchars($name); 1272 trigger_error("Style attribute '$name' is not supported $support", E_USER_WARNING); 1273 } 1274 } 1275 1276 $forbidden_properties = $config->get('CSS.ForbiddenProperties'); 1277 if ($forbidden_properties !== null) { 1278 foreach ($this->info as $name => $d) { 1279 if (isset($forbidden_properties[$name])) { 1280 unset($this->info[$name]); 1281 } 1282 } 1283 } 1284 1285 } 1286} 1287 1288 1289 1290 1291 1292/** 1293 * Defines allowed child nodes and validates tokens against it. 1294 */ 1295abstract class HTMLPurifier_ChildDef 1296{ 1297 /** 1298 * Type of child definition, usually right-most part of class name lowercase. 1299 * Used occasionally in terms of context. 1300 */ 1301 public $type; 1302 1303 /** 1304 * Bool that indicates whether or not an empty array of children is okay 1305 * 1306 * This is necessary for redundant checking when changes affecting 1307 * a child node may cause a parent node to now be disallowed. 1308 */ 1309 public $allow_empty; 1310 1311 /** 1312 * Lookup array of all elements that this definition could possibly allow 1313 */ 1314 public $elements = array(); 1315 1316 /** 1317 * Get lookup of tag names that should not close this element automatically. 1318 * All other elements will do so. 1319 */ 1320 public function getAllowedElements($config) { 1321 return $this->elements; 1322 } 1323 1324 /** 1325 * Validates nodes according to definition and returns modification. 1326 * 1327 * @param $tokens_of_children Array of HTMLPurifier_Token 1328 * @param $config HTMLPurifier_Config object 1329 * @param $context HTMLPurifier_Context object 1330 * @return bool true to leave nodes as is 1331 * @return bool false to remove parent node 1332 * @return array of replacement child tokens 1333 */ 1334 abstract public function validateChildren($tokens_of_children, $config, $context); 1335} 1336 1337 1338 1339 1340 1341/** 1342 * Configuration object that triggers customizable behavior. 
1343 * 1344 * @warning This class is strongly defined: that means that the class 1345 * will fail if an undefined directive is retrieved or set. 1346 * 1347 * @note Many classes that could (although many times don't) use the 1348 * configuration object make it a mandatory parameter. This is 1349 * because a configuration object should always be forwarded, 1350 * otherwise, you run the risk of missing a parameter and then 1351 * being stumped when a configuration directive doesn't work. 1352 * 1353 * @todo Reconsider some of the public member variables 1354 */ 1355class HTMLPurifier_Config 1356{ 1357 1358 /** 1359 * HTML Purifier's version 1360 */ 1361 public $version = '4.3.0'; 1362 1363 /** 1364 * Bool indicator whether or not to automatically finalize 1365 * the object if a read operation is done 1366 */ 1367 public $autoFinalize = true; 1368 1369 // protected member variables 1370 1371 /** 1372 * Namespace indexed array of serials for specific namespaces (see 1373 * getSerial() for more info). 1374 */ 1375 protected $serials = array(); 1376 1377 /** 1378 * Serial for entire configuration object 1379 */ 1380 protected $serial; 1381 1382 /** 1383 * Parser for variables 1384 */ 1385 protected $parser; 1386 1387 /** 1388 * Reference HTMLPurifier_ConfigSchema for value checking 1389 * @note This is public for introspective purposes. Please don't 1390 * abuse! 1391 */ 1392 public $def; 1393 1394 /** 1395 * Indexed array of definitions 1396 */ 1397 protected $definitions; 1398 1399 /** 1400 * Bool indicator whether or not config is finalized 1401 */ 1402 protected $finalized = false; 1403 1404 /** 1405 * Property list containing configuration directives. 1406 */ 1407 protected $plist; 1408 1409 /** 1410 * Whether or not a set is taking place due to an 1411 * alias lookup. 1412 */ 1413 private $aliasMode; 1414 1415 /** 1416 * Set to false if you do not want line and file numbers in errors 1417 * (useful when unit testing). 
This will also compress some errors 1418 * and exceptions. 1419 */ 1420 public $chatty = true; 1421 1422 /** 1423 * Current lock; only gets to this namespace are allowed. 1424 */ 1425 private $lock; 1426 1427 /** 1428 * @param $definition HTMLPurifier_ConfigSchema that defines what directives 1429 * are allowed. 1430 */ 1431 public function __construct($definition, $parent = null) { 1432 $parent = $parent ? $parent : $definition->defaultPlist; 1433 $this->plist = new HTMLPurifier_PropertyList($parent); 1434 $this->def = $definition; // keep a copy around for checking 1435 $this->parser = new HTMLPurifier_VarParser_Flexible(); 1436 } 1437 1438 /** 1439 * Convenience constructor that creates a config object based on a mixed var 1440 * @param mixed $config Variable that defines the state of the config 1441 * object. Can be: a HTMLPurifier_Config() object, 1442 * an array of directives based on loadArray(), 1443 * or a string filename of an ini file. 1444 * @param HTMLPurifier_ConfigSchema Schema object 1445 * @return Configured HTMLPurifier_Config object 1446 */ 1447 public static function create($config, $schema = null) { 1448 if ($config instanceof HTMLPurifier_Config) { 1449 // pass-through 1450 return $config; 1451 } 1452 if (!$schema) { 1453 $ret = HTMLPurifier_Config::createDefault(); 1454 } else { 1455 $ret = new HTMLPurifier_Config($schema); 1456 } 1457 if (is_string($config)) $ret->loadIni($config); 1458 elseif (is_array($config)) $ret->loadArray($config); 1459 return $ret; 1460 } 1461 1462 /** 1463 * Creates a new config object that inherits from a previous one. 1464 * @param HTMLPurifier_Config $config Configuration object to inherit 1465 * from. 1466 * @return HTMLPurifier_Config object with $config as its parent. 1467 */ 1468 public static function inherit(HTMLPurifier_Config $config) { 1469 return new HTMLPurifier_Config($config->def, $config->plist); 1470 } 1471 1472 /** 1473 * Convenience constructor that creates a default configuration object. 
1474 * @return Default HTMLPurifier_Config object. 1475 */ 1476 public static function createDefault() { 1477 $definition = HTMLPurifier_ConfigSchema::instance(); 1478 $config = new HTMLPurifier_Config($definition); 1479 return $config; 1480 } 1481 1482 /** 1483 * Retreives a value from the configuration. 1484 * @param $key String key 1485 */ 1486 public function get($key, $a = null) { 1487 if ($a !== null) { 1488 $this->triggerError("Using deprecated API: use \$config->get('$key.$a') instead", E_USER_WARNING); 1489 $key = "$key.$a"; 1490 } 1491 if (!$this->finalized) $this->autoFinalize(); 1492 if (!isset($this->def->info[$key])) { 1493 // can't add % due to SimpleTest bug 1494 $this->triggerError('Cannot retrieve value of undefined directive ' . htmlspecialchars($key), 1495 E_USER_WARNING); 1496 return; 1497 } 1498 if (isset($this->def->info[$key]->isAlias)) { 1499 $d = $this->def->info[$key]; 1500 $this->triggerError('Cannot get value from aliased directive, use real name ' . $d->key, 1501 E_USER_ERROR); 1502 return; 1503 } 1504 if ($this->lock) { 1505 list($ns) = explode('.', $key); 1506 if ($ns !== $this->lock) { 1507 $this->triggerError('Cannot get value of namespace ' . $ns . ' when lock for ' . $this->lock . ' is active, this probably indicates a Definition setup method is accessing directives that are not within its namespace', E_USER_ERROR); 1508 return; 1509 } 1510 } 1511 return $this->plist->get($key); 1512 } 1513 1514 /** 1515 * Retreives an array of directives to values from a given namespace 1516 * @param $namespace String namespace 1517 */ 1518 public function getBatch($namespace) { 1519 if (!$this->finalized) $this->autoFinalize(); 1520 $full = $this->getAll(); 1521 if (!isset($full[$namespace])) { 1522 $this->triggerError('Cannot retrieve undefined namespace ' . 
htmlspecialchars($namespace), 1523 E_USER_WARNING); 1524 return; 1525 } 1526 return $full[$namespace]; 1527 } 1528 1529 /** 1530 * Returns a md5 signature of a segment of the configuration object 1531 * that uniquely identifies that particular configuration 1532 * @note Revision is handled specially and is removed from the batch 1533 * before processing! 1534 * @param $namespace Namespace to get serial for 1535 */ 1536 public function getBatchSerial($namespace) { 1537 if (empty($this->serials[$namespace])) { 1538 $batch = $this->getBatch($namespace); 1539 unset($batch['DefinitionRev']); 1540 $this->serials[$namespace] = md5(serialize($batch)); 1541 } 1542 return $this->serials[$namespace]; 1543 } 1544 1545 /** 1546 * Returns a md5 signature for the entire configuration object 1547 * that uniquely identifies that particular configuration 1548 */ 1549 public function getSerial() { 1550 if (empty($this->serial)) { 1551 $this->serial = md5(serialize($this->getAll())); 1552 } 1553 return $this->serial; 1554 } 1555 1556 /** 1557 * Retrieves all directives, organized by namespace 1558 * @warning This is a pretty inefficient function, avoid if you can 1559 */ 1560 public function getAll() { 1561 if (!$this->finalized) $this->autoFinalize(); 1562 $ret = array(); 1563 foreach ($this->plist->squash() as $name => $value) { 1564 list($ns, $key) = explode('.', $name, 2); 1565 $ret[$ns][$key] = $value; 1566 } 1567 return $ret; 1568 } 1569 1570 /** 1571 * Sets a value to configuration. 1572 * @param $key String key 1573 * @param $value Mixed value 1574 */ 1575 public function set($key, $value, $a = null) { 1576 if (strpos($key, '.') === false) { 1577 $namespace = $key; 1578 $directive = $value; 1579 $value = $a; 1580 $key = "$key.$directive"; 1581 $this->triggerError("Using deprecated API: use \$config->set('$key', ...) 
instead", E_USER_NOTICE); 1582 } else { 1583 list($namespace) = explode('.', $key); 1584 } 1585 if ($this->isFinalized('Cannot set directive after finalization')) return; 1586 if (!isset($this->def->info[$key])) { 1587 $this->triggerError('Cannot set undefined directive ' . htmlspecialchars($key) . ' to value', 1588 E_USER_WARNING); 1589 return; 1590 } 1591 $def = $this->def->info[$key]; 1592 1593 if (isset($def->isAlias)) { 1594 if ($this->aliasMode) { 1595 $this->triggerError('Double-aliases not allowed, please fix '. 1596 'ConfigSchema bug with' . $key, E_USER_ERROR); 1597 return; 1598 } 1599 $this->aliasMode = true; 1600 $this->set($def->key, $value); 1601 $this->aliasMode = false; 1602 $this->triggerError("$key is an alias, preferred directive name is {$def->key}", E_USER_NOTICE); 1603 return; 1604 } 1605 1606 // Raw type might be negative when using the fully optimized form 1607 // of stdclass, which indicates allow_null == true 1608 $rtype = is_int($def) ? $def : $def->type; 1609 if ($rtype < 0) { 1610 $type = -$rtype; 1611 $allow_null = true; 1612 } else { 1613 $type = $rtype; 1614 $allow_null = isset($def->allow_null); 1615 } 1616 1617 try { 1618 $value = $this->parser->parse($value, $type, $allow_null); 1619 } catch (HTMLPurifier_VarParserException $e) { 1620 $this->triggerError('Value for ' . $key . ' is of invalid type, should be ' . HTMLPurifier_VarParser::getTypeName($type), E_USER_WARNING); 1621 return; 1622 } 1623 if (is_string($value) && is_object($def)) { 1624 // resolve value alias if defined 1625 if (isset($def->aliases[$value])) { 1626 $value = $def->aliases[$value]; 1627 } 1628 // check to see if the value is allowed 1629 if (isset($def->allowed) && !isset($def->allowed[$value])) { 1630 $this->triggerError('Value not supported, valid values are: ' . 
1631 $this->_listify($def->allowed), E_USER_WARNING); 1632 return; 1633 } 1634 } 1635 $this->plist->set($key, $value); 1636 1637 // reset definitions if the directives they depend on changed 1638 // this is a very costly process, so it's discouraged 1639 // with finalization 1640 if ($namespace == 'HTML' || $namespace == 'CSS' || $namespace == 'URI') { 1641 $this->definitions[$namespace] = null; 1642 } 1643 1644 $this->serials[$namespace] = false; 1645 } 1646 1647 /** 1648 * Convenience function for error reporting 1649 */ 1650 private function _listify($lookup) { 1651 $list = array(); 1652 foreach ($lookup as $name => $b) $list[] = $name; 1653 return implode(', ', $list); 1654 } 1655 1656 /** 1657 * Retrieves object reference to the HTML definition. 1658 * @param $raw Return a copy that has not been setup yet. Must be 1659 * called before it's been setup, otherwise won't work. 1660 * @param $optimized If true, this method may return null, to 1661 * indicate that a cached version of the modified 1662 * definition object is available and no further edits 1663 * are necessary. Consider using 1664 * maybeGetRawHTMLDefinition, which is more explicitly 1665 * named, instead. 1666 */ 1667 public function getHTMLDefinition($raw = false, $optimized = false) { 1668 return $this->getDefinition('HTML', $raw, $optimized); 1669 } 1670 1671 /** 1672 * Retrieves object reference to the CSS definition 1673 * @param $raw Return a copy that has not been setup yet. Must be 1674 * called before it's been setup, otherwise won't work. 1675 * @param $optimized If true, this method may return null, to 1676 * indicate that a cached version of the modified 1677 * definition object is available and no further edits 1678 * are necessary. Consider using 1679 * maybeGetRawCSSDefinition, which is more explicitly 1680 * named, instead. 
     */
    public function getCSSDefinition($raw = false, $optimized = false) {
        return $this->getDefinition('CSS', $raw, $optimized);
    }

    /**
     * Retrieves object reference to the URI definition
     * @param $raw Return a copy that has not been setup yet. Must be
     *             called before it's been setup, otherwise won't work.
     * @param $optimized If true, this method may return null, to
     *             indicate that a cached version of the modified
     *             definition object is available and no further edits
     *             are necessary. Consider using
     *             maybeGetRawURIDefinition, which is more explicitly
     *             named, instead.
     */
    public function getURIDefinition($raw = false, $optimized = false) {
        return $this->getDefinition('URI', $raw, $optimized);
    }

    /**
     * Retrieves a definition
     * @param $type Type of definition: HTML, CSS, etc
     * @param $raw  Whether or not definition should be returned raw
     * @param $optimized Only has an effect when $raw is true. Whether
     *        or not to return null if the result is already present in
     *        the cache. This is off by default for backwards
     *        compatibility reasons, but you need to do things this
     *        way in order to ensure that caching is done properly.
     *        Check out enduser-customize.html for more details.
     *        We probably won't ever change this default, as much as the
     *        maybe semantics is the "right thing to do."
     * @return The definition object; in raw+optimized mode null when a
     *         set-up definition is already in memory or in the cache
     * @throws HTMLPurifier_Exception when $optimized is true but $raw is
     *         false, when a raw definition is requested after setup, when
     *         optimized/unoptimized raw retrievals are mixed, when
     *         %$type.DefinitionID is missing in optimized mode, or (via
     *         initDefinition) when $type is not supported
     */
    public function getDefinition($type, $raw = false, $optimized = false) {
        if ($optimized && !$raw) {
            throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false");
        }
        if (!$this->finalized) $this->autoFinalize();
        // temporarily suspend locks, so we can handle recursive definition calls
        $lock = $this->lock;
        $this->lock = null;
        $factory = HTMLPurifier_DefinitionCacheFactory::instance();
        $cache = $factory->create($type, $this);
        $this->lock = $lock;
        if (!$raw) {
            // full definition
            // ---------------
            // check if definition is in memory
            if (!empty($this->definitions[$type])) {
                $def = $this->definitions[$type];
                // check if the definition is setup
                if ($def->setup) {
                    return $def;
                } else {
                    // a raw definition was handed out earlier; finish it
                    // now and cache it only if it was an optimized one
                    $def->setup($this);
                    if ($def->optimized) $cache->add($def, $this);
                    return $def;
                }
            }
            // check if definition is in cache
            $def = $cache->get($this);
            if ($def) {
                // definition in cache, save to memory and return it
                $this->definitions[$type] = $def;
                return $def;
            }
            // initialize it
            $def = $this->initDefinition($type);
            // set it up
            // (the lock restricts get() to this definition's namespace
            // for the duration of setup)
            $this->lock = $type;
            $def->setup($this);
            $this->lock = null;
            // save in cache
            $cache->add($def, $this);
            // return it
            return $def;
        } else {
            // raw definition
            // --------------
            // check preconditions
            $def = null;
            if ($optimized) {
                if (is_null($this->get($type . '.DefinitionID'))) {
                    // fatally error out if definition ID not set
                    throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
                }
            }
            if (!empty($this->definitions[$type])) {
                $def = $this->definitions[$type];
                if ($def->setup && !$optimized) {
                    $extra = $this->chatty ? " (try moving this code block earlier in your initialization)" : "";
                    throw new HTMLPurifier_Exception("Cannot retrieve raw definition after it has already been setup" . $extra);
                }
                if ($def->optimized === null) {
                    $extra = $this->chatty ? " (try flushing your cache)" : "";
                    throw new HTMLPurifier_Exception("Optimization status of definition is unknown" . $extra);
                }
                if ($def->optimized !== $optimized) {
                    $msg = $optimized ? "optimized" : "unoptimized";
                    $extra = $this->chatty ? " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" : "";
                    throw new HTMLPurifier_Exception("Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra);
                }
            }
            // check if definition was in memory
            if ($def) {
                if ($def->setup) {
                    // invariant: $optimized === true (checked above)
                    return null;
                } else {
                    return $def;
                }
            }
            // if optimized, check if definition was in cache
            // (because we do the memory check first, this formulation
            // is prone to cache slamming, but I think
            // guaranteeing that either /all/ of the raw
            // setup code or /none/ of it is run is more important.)
            if ($optimized) {
                // This code path only gets run once; once we put
                // something in $definitions (which is guaranteed by the
                // trailing code), we always short-circuit above.
                $def = $cache->get($this);
                if ($def) {
                    // save the full definition for later, but don't
                    // return it yet
                    $this->definitions[$type] = $def;
                    return null;
                }
            }
            // check invariants for creation
            if (!$optimized) {
                if (!is_null($this->get($type . '.DefinitionID'))) {
                    if ($this->chatty) {
                        $this->triggerError("Due to a documentation error in previous version of HTML Purifier, your definitions are not being cached. If this is OK, you can remove the %$type.DefinitionRev and %$type.DefinitionID declaration. Otherwise, modify your code to use maybeGetRawDefinition, and test if the returned value is null before making any edits (if it is null, that means that a cached version is available, and no raw operations are necessary). See <a href='http://htmlpurifier.org/docs/enduser-customize.html#optimized'>Customize</a> for more details", E_USER_WARNING);
                    } else {
                        $this->triggerError("Useless DefinitionID declaration", E_USER_WARNING);
                    }
                }
            }
            // initialize it
            $def = $this->initDefinition($type);
            $def->optimized = $optimized;
            return $def;
        }
        // both branches above return or throw, so this is never reached
        throw new HTMLPurifier_Exception("The impossible happened!");
    }

    /**
     * Creates a fresh, not-yet-set-up definition object of the given
     * type and stores it in $this->definitions[$type].
     * @param $type 'HTML', 'CSS' or 'URI'
     * @return The new definition object
     * @throws HTMLPurifier_Exception for any other $type
     */
    private function initDefinition($type) {
        // quick checks failed, let's create the object
        if ($type == 'HTML') {
            $def = new HTMLPurifier_HTMLDefinition();
        } elseif ($type == 'CSS') {
            $def = new HTMLPurifier_CSSDefinition();
        } elseif ($type == 'URI') {
            $def = new HTMLPurifier_URIDefinition();
        } else {
            throw new HTMLPurifier_Exception("Definition of $type type not supported");
        }
        $this->definitions[$type] = $def;
        return $def;
    }

    /**
     * Shortcut for getDefinition($name, true, true): the raw definition
     * in optimized mode, which may be null (see getDefinition()).
     * @param $name Type of definition: HTML, CSS, etc
     */
    public function maybeGetRawDefinition($name) {
        return $this->getDefinition($name, true, true);
    }

    /**
     * Shortcut for getDefinition('HTML', true, true); may return null.
     */
    public function maybeGetRawHTMLDefinition() {
        return $this->getDefinition('HTML', true, true);
    }

    /**
     * Shortcut for getDefinition('CSS', true, true); may return null.
     */
    public function maybeGetRawCSSDefinition() {
        return $this->getDefinition('CSS', true, true);
    }

    /**
     * Shortcut for getDefinition('URI', true, true); may return null.
     */
    public function maybeGetRawURIDefinition() {
        return $this->getDefinition('URI', true, true);
    }

    /**
     * Loads configuration values from an array with the following structure:
     * Namespace.Directive => Value
     *
     * Underscores in top-level keys are converted to dots first. A
     * top-level key without a dot is treated as a namespace whose value
     * is an array of Directive => Value.
     *
     * @param $config_array Configuration associative array
     */
    public function loadArray($config_array) {
        if ($this->isFinalized('Cannot load directives after finalization')) return;
        foreach ($config_array as $key => $value) {
            // accept 'Namespace_Directive' as well as 'Namespace.Directive'
            $key = str_replace('_', '.', $key);
            if (strpos($key, '.') !== false) {
                $this->set($key, $value);
            } else {
                // nested form: namespace => array(directive => value)
                $namespace = $key;
                $namespace_values = $value;
                foreach ($namespace_values as $directive => $value) {
                    $this->set($namespace .'.'. $directive, $value);
                }
            }
        }
    }

    /**
     * Returns a list of array(namespace, directive) for all directives
     * that are allowed in a web-form context as per an allowed
     * namespaces/directives list.
     *
     * Aliases and the DefinitionID / DefinitionRev directives are always
     * left out. Within $allowed, an entry containing a dot names a
     * directive (a leading '-' blacklists it), any other entry names a
     * whole namespace.
     *
     * @param $allowed List of allowed namespaces/directives; a single
     *                 string is treated as a one-element list, and true
     *                 disables filtering
     * @param $schema HTMLPurifier_ConfigSchema to read directives from;
     *                the global instance is used when omitted
     */
    public static function getAllowedDirectivesForForm($allowed, $schema = null) {
        if (!$schema) {
            $schema = HTMLPurifier_ConfigSchema::instance();
        }
        if ($allowed !== true) {
             if (is_string($allowed)) $allowed = array($allowed);
             $allowed_ns = array();
             $allowed_directives = array();
             $blacklisted_directives = array();
             foreach ($allowed as $ns_or_directive) {
                 if (strpos($ns_or_directive, '.') !== false) {
                     // directive
                     if ($ns_or_directive[0] == '-') {
                         $blacklisted_directives[substr($ns_or_directive, 1)] = true;
                     } else {
                         $allowed_directives[$ns_or_directive] = true;
                     }
                 } else {
                     // namespace
                     $allowed_ns[$ns_or_directive] = true;
                 }
             }
        }
        $ret = array();
        foreach ($schema->info as $key => $def) {
            list($ns, $directive) = explode('.', $key, 2);
            if ($allowed !== true) {
                // the blacklist wins over both allow lists
                if (isset($blacklisted_directives["$ns.$directive"])) continue;
                if (!isset($allowed_directives["$ns.$directive"]) && !isset($allowed_ns[$ns])) continue;
            }
            if (isset($def->isAlias)) continue;
            if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') continue;
            $ret[] = array($ns, $directive);
        }
        return $ret;
    }

    /**
     * Loads configuration values from $_GET/$_POST that were posted
     * via ConfigForm
     * @param $array $_GET or $_POST array to import
1927 * @param $index Index/name that the config variables are in 1928 * @param $allowed List of allowed namespaces/directives 1929 * @param $mq_fix Boolean whether or not to enable magic quotes fix 1930 * @param $schema Instance of HTMLPurifier_ConfigSchema to use, if not global copy 1931 */ 1932 public static function loadArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) { 1933 $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema); 1934 $config = HTMLPurifier_Config::create($ret, $schema); 1935 return $config; 1936 } 1937 1938 /** 1939 * Merges in configuration values from $_GET/$_POST to object. NOT STATIC. 1940 * @note Same parameters as loadArrayFromForm 1941 */ 1942 public function mergeArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true) { 1943 $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $this->def); 1944 $this->loadArray($ret); 1945 } 1946 1947 /** 1948 * Prepares an array from a form into something usable for the more 1949 * strict parts of HTMLPurifier_Config 1950 */ 1951 public static function prepareArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) { 1952 if ($index !== false) $array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array(); 1953 $mq = $mq_fix && function_exists('get_magic_quotes_gpc') && get_magic_quotes_gpc(); 1954 1955 $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed, $schema); 1956 $ret = array(); 1957 foreach ($allowed as $key) { 1958 list($ns, $directive) = $key; 1959 $skey = "$ns.$directive"; 1960 if (!empty($array["Null_$skey"])) { 1961 $ret[$ns][$directive] = null; 1962 continue; 1963 } 1964 if (!isset($array[$skey])) continue; 1965 $value = $mq ? 
stripslashes($array[$skey]) : $array[$skey]; 1966 $ret[$ns][$directive] = $value; 1967 } 1968 return $ret; 1969 } 1970 1971 /** 1972 * Loads configuration values from an ini file 1973 * @param $filename Name of ini file 1974 */ 1975 public function loadIni($filename) { 1976 if ($this->isFinalized('Cannot load directives after finalization')) return; 1977 $array = parse_ini_file($filename, true); 1978 $this->loadArray($array); 1979 } 1980 1981 /** 1982 * Checks whether or not the configuration object is finalized. 1983 * @param $error String error message, or false for no error 1984 */ 1985 public function isFinalized($error = false) { 1986 if ($this->finalized && $error) { 1987 $this->triggerError($error, E_USER_ERROR); 1988 } 1989 return $this->finalized; 1990 } 1991 1992 /** 1993 * Finalizes configuration only if auto finalize is on and not 1994 * already finalized 1995 */ 1996 public function autoFinalize() { 1997 if ($this->autoFinalize) { 1998 $this->finalize(); 1999 } else { 2000 $this->plist->squash(true); 2001 } 2002 } 2003 2004 /** 2005 * Finalizes a configuration object, prohibiting further change 2006 */ 2007 public function finalize() { 2008 $this->finalized = true; 2009 unset($this->parser); 2010 } 2011 2012 /** 2013 * Produces a nicely formatted error message by supplying the 2014 * stack frame information OUTSIDE of HTMLPurifier_Config. 2015 */ 2016 protected function triggerError($msg, $no) { 2017 // determine previous stack frame 2018 $extra = ''; 2019 if ($this->chatty) { 2020 $trace = debug_backtrace(); 2021 // zip(tail(trace), trace) -- but PHP is not Haskell har har 2022 for ($i = 0, $c = count($trace); $i < $c - 1; $i++) { 2023 if ($trace[$i + 1]['class'] === 'HTMLPurifier_Config') { 2024 continue; 2025 } 2026 $frame = $trace[$i]; 2027 $extra = " invoked on line {$frame['line']} in file {$frame['file']}"; 2028 break; 2029 } 2030 } 2031 trigger_error($msg . 
/**
 * Configuration definition, defines directives and their defaults.
 */
class HTMLPurifier_ConfigSchema {

    /**
     * Default values of the directives, indexed by directive key
     * ("Namespace.Directive").
     */
    public $defaults = array();

    /**
     * The default property list. Do not edit this property list.
     */
    public $defaultPlist;

    /**
     * Definition of the directives. The structure of this is:
     *
     *  array(
     *      'Namespace.Directive' => new stdclass(),
     *  )
     *
     * The stdclass may have the following properties:
     *
     *  - If isAlias isn't set:
     *      - type: Integer type of directive, see HTMLPurifier_VarParser for definitions
     *      - allow_null: If set, this directive allows null values
     *      - aliases: If set, an associative array of value aliases to real values
     *      - allowed: If set, a lookup array of allowed (string) values
     *  - If isAlias is set:
     *      - key: Key of the directive this directive aliases to
     *
     * In certain degenerate cases, stdclass will actually be an integer. In
     * that case, the value is equivalent to an stdclass with the type
     * property set to the integer. If the integer is negative, type is
     * equal to the absolute value of integer, and allow_null is true.
     *
     * This class is friendly with HTMLPurifier_Config. If you need introspection
     * about the schema, you're better off using the ConfigSchema_Interchange,
     * which uses more memory but has much richer information.
     */
    public $info = array();

    /**
     * Application-wide singleton
     */
    static protected $singleton;

    public function __construct() {
        $this->defaultPlist = new HTMLPurifier_PropertyList();
    }

    /**
     * Unserializes the default ConfigSchema from
     * HTMLPURIFIER_PREFIX/HTMLPurifier/ConfigSchema/schema.ser.
     * @return The unserialized schema; an E_USER_ERROR is raised when
     *         unserialization yields a falsy value
     */
    public static function makeFromSerial() {
        $contents = file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser');
        $r = unserialize($contents);
        if (!$r) {
            $hash = sha1($contents);
            trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR);
        }
        return $r;
    }

    /**
     * Retrieves an instance of the application-wide configuration definition.
     * @param $prototype null to fetch the current singleton (loading it on
     *                   first use), true to force a reload from the
     *                   serialized schema, or an object to install as the
     *                   new singleton
     */
    public static function instance($prototype = null) {
        // true must be tested before the generic "prototype given" case,
        // otherwise the literal true would be stored as the singleton
        if ($prototype === true || ($prototype === null && HTMLPurifier_ConfigSchema::$singleton === null)) {
            HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
        } elseif ($prototype !== null) {
            HTMLPurifier_ConfigSchema::$singleton = $prototype;
        }
        return HTMLPurifier_ConfigSchema::$singleton;
    }

    /**
     * Defines a directive for configuration.
     * @warning This method's signature is slightly different from the legacy
     *          define() static method! Beware!
     * @param $key Directive key, "Namespace.Directive"
     * @param $default Default value of directive
     * @param $type Allowed type of the directive: either the integer type
     *              itself or a name looked up in HTMLPurifier_VarParser::$types
     * @param $allow_null Whether or not to allow null values
     */
    public function add($key, $default, $type, $allow_null) {
        $obj = new stdclass();
        $obj->type = is_int($type) ? $type : HTMLPurifier_VarParser::$types[$type];
        if ($allow_null) $obj->allow_null = true;
        $this->info[$key] = $obj;
        $this->defaults[$key] = $default;
        $this->defaultPlist->set($key, $default);
    }

    /**
     * Defines a directive value alias.
     *
     * Directive value aliases are convenient for developers because it lets
     * them set a directive to several values and get the same result.
     * @param $key Directive key; the directive must already be defined
     * @param $aliases Hash of aliased values to the real value
     */
    public function addValueAliases($key, $aliases) {
        if (!isset($this->info[$key]->aliases)) {
            $this->info[$key]->aliases = array();
        }
        foreach ($aliases as $alias => $real) {
            $this->info[$key]->aliases[$alias] = $real;
        }
    }

    /**
     * Defines a set of allowed values for a directive.
     * @warning This is slightly different from the corresponding static
     *          method definition.
     * @param $key Directive key; the directive must already be defined
     * @param $allowed Lookup array of allowed values
     */
    public function addAllowedValues($key, $allowed) {
        $this->info[$key]->allowed = $allowed;
    }

    /**
     * Defines a directive alias for backwards compatibility.
     * @param $key Key of the directive that will be aliased
     * @param $new_key Key of the directive that the alias will point to
     */
    public function addAlias($key, $new_key) {
        $obj = new stdclass;
        $obj->key = $new_key;
        $obj->isAlias = true;
        $this->info[$key] = $obj;
    }

    /**
     * Replaces any stdclass that only has the type property with the type
     * integer, and any stdclass that only has type and allow_null with the
     * negated type integer.
     */
    public function postProcess() {
        foreach ($this->info as $key => $v) {
            if (count((array) $v) == 1) {
                $this->info[$key] = $v->type;
            } elseif (count((array) $v) == 2 && isset($v->allow_null)) {
                $this->info[$key] = -$v->type;
            }
        }
    }

}
2245 * @param $modules List of HTMLPurifier_HTMLModule 2246 */ 2247 public function __construct($modules) { 2248 if (!is_array($modules)) $modules = array($modules); 2249 // populate content_sets based on module hints 2250 // sorry, no way of overloading 2251 foreach ($modules as $module_i => $module) { 2252 foreach ($module->content_sets as $key => $value) { 2253 $temp = $this->convertToLookup($value); 2254 if (isset($this->lookup[$key])) { 2255 // add it into the existing content set 2256 $this->lookup[$key] = array_merge($this->lookup[$key], $temp); 2257 } else { 2258 $this->lookup[$key] = $temp; 2259 } 2260 } 2261 } 2262 $old_lookup = false; 2263 while ($old_lookup !== $this->lookup) { 2264 $old_lookup = $this->lookup; 2265 foreach ($this->lookup as $i => $set) { 2266 $add = array(); 2267 foreach ($set as $element => $x) { 2268 if (isset($this->lookup[$element])) { 2269 $add += $this->lookup[$element]; 2270 unset($this->lookup[$i][$element]); 2271 } 2272 } 2273 $this->lookup[$i] += $add; 2274 } 2275 } 2276 2277 foreach ($this->lookup as $key => $lookup) { 2278 $this->info[$key] = implode(' | ', array_keys($lookup)); 2279 } 2280 $this->keys = array_keys($this->info); 2281 $this->values = array_values($this->info); 2282 } 2283 2284 /** 2285 * Accepts a definition; generates and assigns a ChildDef for it 2286 * @param $def HTMLPurifier_ElementDef reference 2287 * @param $module Module that defined the ElementDef 2288 */ 2289 public function generateChildDef(&$def, $module) { 2290 if (!empty($def->child)) return; // already done! 2291 $content_model = $def->content_model; 2292 if (is_string($content_model)) { 2293 // Assume that $this->keys is alphanumeric 2294 $def->content_model = preg_replace_callback( 2295 '/\b(' . implode('|', $this->keys) . 
')\b/', 2296 array($this, 'generateChildDefCallback'), 2297 $content_model 2298 ); 2299 //$def->content_model = str_replace( 2300 // $this->keys, $this->values, $content_model); 2301 } 2302 $def->child = $this->getChildDef($def, $module); 2303 } 2304 2305 public function generateChildDefCallback($matches) { 2306 return $this->info[$matches[0]]; 2307 } 2308 2309 /** 2310 * Instantiates a ChildDef based on content_model and content_model_type 2311 * member variables in HTMLPurifier_ElementDef 2312 * @note This will also defer to modules for custom HTMLPurifier_ChildDef 2313 * subclasses that need content set expansion 2314 * @param $def HTMLPurifier_ElementDef to have ChildDef extracted 2315 * @return HTMLPurifier_ChildDef corresponding to ElementDef 2316 */ 2317 public function getChildDef($def, $module) { 2318 $value = $def->content_model; 2319 if (is_object($value)) { 2320 trigger_error( 2321 'Literal object child definitions should be stored in '. 2322 'ElementDef->child not ElementDef->content_model', 2323 E_USER_NOTICE 2324 ); 2325 return $value; 2326 } 2327 switch ($def->content_model_type) { 2328 case 'required': 2329 return new HTMLPurifier_ChildDef_Required($value); 2330 case 'optional': 2331 return new HTMLPurifier_ChildDef_Optional($value); 2332 case 'empty': 2333 return new HTMLPurifier_ChildDef_Empty(); 2334 case 'custom': 2335 return new HTMLPurifier_ChildDef_Custom($value); 2336 } 2337 // defer to its module 2338 $return = false; 2339 if ($module->defines_child_def) { // save a func call 2340 $return = $module->getChildDef($def); 2341 } 2342 if ($return !== false) return $return; 2343 // error-out 2344 trigger_error( 2345 'Could not determine which ChildDef class to instantiate', 2346 E_USER_ERROR 2347 ); 2348 return false; 2349 } 2350 2351 /** 2352 * Converts a string list of elements separated by pipes into 2353 * a lookup array. 
/**
 * Registry object that contains information about the current context.
 * @note Existence is tracked by key, so a variable whose current value is
 *       null still counts as registered.
 * @note Since the variables Context deals with may not be objects,
 *       references are very important here! Do not remove!
 */
class HTMLPurifier_Context
{

    /**
     * Private array that stores the references.
     */
    private $_storage = array();

    /**
     * Registers a variable into the context.
     * @param $name String name
     * @param $ref Reference to variable to be registered
     * @note Raises E_USER_ERROR and leaves the context untouched if the
     *       name is already taken.
     */
    public function register($name, &$ref) {
        // array_key_exists, not isset: a null value must still block re-registration
        if (array_key_exists($name, $this->_storage)) {
            trigger_error("Name $name produces collision, cannot re-register",
                          E_USER_ERROR);
            return;
        }
        $this->_storage[$name] =& $ref;
    }

    /**
     * Retrieves a variable reference from the context.
     * @param $name String name
     * @param $ignore_error Boolean whether or not to ignore error
     * @return Reference to the registered variable, or a reference to a
     *         fresh null if the name is unknown
     */
    public function &get($name, $ignore_error = false) {
        if (!array_key_exists($name, $this->_storage)) {
            if (!$ignore_error) {
                trigger_error("Attempted to retrieve non-existent variable $name",
                              E_USER_ERROR);
            }
            $var = null; // so we can return by reference
            return $var;
        }
        return $this->_storage[$name];
    }

    /**
     * Destroys a variable in the context.
     * @param $name String name
     * @note Raises E_USER_ERROR if the name is unknown.
     */
    public function destroy($name) {
        if (!array_key_exists($name, $this->_storage)) {
            trigger_error("Attempted to destroy non-existent variable $name",
                          E_USER_ERROR);
            return;
        }
        unset($this->_storage[$name]);
    }

    /**
     * Checks whether or not the variable exists.
     * @param $name String name
     * @return Boolean true if the name is registered, even with a null value
     */
    public function exists($name) {
        return array_key_exists($name, $this->_storage);
    }

    /**
     * Loads a series of variables from an associative array
     * @param $context_array Assoc array of variables to load
     */
    public function loadArray($context_array) {
        foreach ($context_array as $key => $discard) {
            $this->register($key, $context_array[$key]);
        }
    }

}
2489 * @param $key Key to test 2490 * @param $config Instance of HTMLPurifier_Config to test against 2491 */ 2492 public function isOld($key, $config) { 2493 if (substr_count($key, ',') < 2) return true; 2494 list($version, $hash, $revision) = explode(',', $key, 3); 2495 $compare = version_compare($version, $config->version); 2496 // version mismatch, is always old 2497 if ($compare != 0) return true; 2498 // versions match, ids match, check revision number 2499 if ( 2500 $hash == $config->getBatchSerial($this->type) && 2501 $revision < $config->get($this->type . '.DefinitionRev') 2502 ) return true; 2503 return false; 2504 } 2505 2506 /** 2507 * Checks if a definition's type jives with the cache's type 2508 * @note Throws an error on failure 2509 * @param $def Definition object to check 2510 * @return Boolean true if good, false if not 2511 */ 2512 public function checkDefType($def) { 2513 if ($def->type !== $this->type) { 2514 trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}"); 2515 return false; 2516 } 2517 return true; 2518 } 2519 2520 /** 2521 * Adds a definition object to the cache 2522 */ 2523 abstract public function add($def, $config); 2524 2525 /** 2526 * Unconditionally saves a definition object to the cache 2527 */ 2528 abstract public function set($def, $config); 2529 2530 /** 2531 * Replace an object in the cache 2532 */ 2533 abstract public function replace($def, $config); 2534 2535 /** 2536 * Retrieves a definition object from the cache 2537 */ 2538 abstract public function get($config); 2539 2540 /** 2541 * Removes a definition object to the cache 2542 */ 2543 abstract public function remove($config); 2544 2545 /** 2546 * Clears all objects from cache 2547 */ 2548 abstract public function flush($config); 2549 2550 /** 2551 * Clears all expired (older version or revision) objects from cache 2552 * @note Be carefuly implementing this method as flush. 
/**
 * Responsible for creating definition caches.
 */
class HTMLPurifier_DefinitionCacheFactory
{

    /** Cache objects already built, indexed by implementation name, then type */
    protected $caches = array('Serializer' => array());

    /** Registered implementations: short name => class name */
    protected $implementations = array();

    /** Decorator objects applied to every new cache, indexed by decorator name */
    protected $decorators = array();

    /**
     * Initialize default decorators
     */
    public function setup() {
        $this->addDecorator('Cleanup');
    }

    /**
     * Retrieves an instance of global definition cache factory.
     * @param $prototype null to fetch the current instance (creating it on
     *                   first use), true to force a brand new factory, or
     *                   an object to install as the global instance
     */
    public static function instance($prototype = null) {
        static $instance;
        // true must be tested before the generic "prototype given" case,
        // otherwise the literal true would be stored as the instance
        if ($prototype === true || ($prototype === null && $instance === null)) {
            $instance = new HTMLPurifier_DefinitionCacheFactory();
            $instance->setup();
        } elseif ($prototype !== null) {
            $instance = $prototype;
        }
        return $instance;
    }

    /**
     * Registers a new definition cache object
     * @param $short Short name of cache object, for reference
     * @param $long Full class name of cache object, for construction
     */
    public function register($short, $long) {
        $this->implementations[$short] = $long;
    }

    /**
     * Factory method that creates a cache object based on configuration
     * @param $type Name of definitions handled by cache
     * @param $config Instance of HTMLPurifier_Config; its
     *                Cache.DefinitionImpl directive selects the implementation
     * @return Cache object; a Null cache when the directive is null, and a
     *         Serializer cache (with a warning) for unrecognized names
     */
    public function create($type, $config) {
        $method = $config->get('Cache.DefinitionImpl');
        if ($method === null) {
            return new HTMLPurifier_DefinitionCache_Null($type);
        }
        if (!empty($this->caches[$method][$type])) {
            return $this->caches[$method][$type];
        }
        if (
            isset($this->implementations[$method]) &&
            class_exists($class = $this->implementations[$method], false)
        ) {
            $cache = new $class($type);
        } else {
            if ($method != 'Serializer') {
                trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
            }
            $cache = new HTMLPurifier_DefinitionCache_Serializer($type);
        }
        // wrap the cache in every registered decorator, in registration order
        foreach ($this->decorators as $decorator) {
            $new_cache = $decorator->decorate($cache);
            // prevent infinite recursion in PHP 4
            unset($cache);
            $cache = $new_cache;
        }
        $this->caches[$method][$type] = $cache;
        return $this->caches[$method][$type];
    }

    /**
     * Registers a decorator to add to all new cache objects
     * @param $decorator Decorator object, or the short name of a
     *                   HTMLPurifier_DefinitionCache_Decorator_* class to
     *                   instantiate
     */
    public function addDecorator($decorator) {
        if (is_string($decorator)) {
            $class = "HTMLPurifier_DefinitionCache_Decorator_$decorator";
            $decorator = new $class;
        }
        $this->decorators[$decorator->name] = $decorator;
    }

}
2682 */ 2683 public $xml = true; 2684 2685 /** 2686 * List of aliases for this doctype 2687 */ 2688 public $aliases = array(); 2689 2690 /** 2691 * Public DTD identifier 2692 */ 2693 public $dtdPublic; 2694 2695 /** 2696 * System DTD identifier 2697 */ 2698 public $dtdSystem; 2699 2700 public function __construct($name = null, $xml = true, $modules = array(), 2701 $tidyModules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null 2702 ) { 2703 $this->name = $name; 2704 $this->xml = $xml; 2705 $this->modules = $modules; 2706 $this->tidyModules = $tidyModules; 2707 $this->aliases = $aliases; 2708 $this->dtdPublic = $dtd_public; 2709 $this->dtdSystem = $dtd_system; 2710 } 2711} 2712 2713 2714 2715 2716 2717class HTMLPurifier_DoctypeRegistry 2718{ 2719 2720 /** 2721 * Hash of doctype names to doctype objects 2722 */ 2723 protected $doctypes; 2724 2725 /** 2726 * Lookup table of aliases to real doctype names 2727 */ 2728 protected $aliases; 2729 2730 /** 2731 * Registers a doctype to the registry 2732 * @note Accepts a fully-formed doctype object, or the 2733 * parameters for constructing a doctype object 2734 * @param $doctype Name of doctype or literal doctype object 2735 * @param $modules Modules doctype will load 2736 * @param $modules_for_modes Modules doctype will load for certain modes 2737 * @param $aliases Alias names for doctype 2738 * @return Editable registered doctype 2739 */ 2740 public function register($doctype, $xml = true, $modules = array(), 2741 $tidy_modules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null 2742 ) { 2743 if (!is_array($modules)) $modules = array($modules); 2744 if (!is_array($tidy_modules)) $tidy_modules = array($tidy_modules); 2745 if (!is_array($aliases)) $aliases = array($aliases); 2746 if (!is_object($doctype)) { 2747 $doctype = new HTMLPurifier_Doctype( 2748 $doctype, $xml, $modules, $tidy_modules, $aliases, $dtd_public, $dtd_system 2749 ); 2750 } 2751 $this->doctypes[$doctype->name] = 
$doctype; 2752 $name = $doctype->name; 2753 // hookup aliases 2754 foreach ($doctype->aliases as $alias) { 2755 if (isset($this->doctypes[$alias])) continue; 2756 $this->aliases[$alias] = $name; 2757 } 2758 // remove old aliases 2759 if (isset($this->aliases[$name])) unset($this->aliases[$name]); 2760 return $doctype; 2761 } 2762 2763 /** 2764 * Retrieves reference to a doctype of a certain name 2765 * @note This function resolves aliases 2766 * @note When possible, use the more fully-featured make() 2767 * @param $doctype Name of doctype 2768 * @return Editable doctype object 2769 */ 2770 public function get($doctype) { 2771 if (isset($this->aliases[$doctype])) $doctype = $this->aliases[$doctype]; 2772 if (!isset($this->doctypes[$doctype])) { 2773 trigger_error('Doctype ' . htmlspecialchars($doctype) . ' does not exist', E_USER_ERROR); 2774 $anon = new HTMLPurifier_Doctype($doctype); 2775 return $anon; 2776 } 2777 return $this->doctypes[$doctype]; 2778 } 2779 2780 /** 2781 * Creates a doctype based on a configuration object, 2782 * will perform initialization on the doctype 2783 * @note Use this function to get a copy of doctype that config 2784 * can hold on to (this is necessary in order to tell 2785 * Generator whether or not the current document is XML 2786 * based or not). 
/**
 * Structure that stores an HTML element definition. Used by
 * HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
 * @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
 *       Please update that class too.
 * @warning If you add new properties to this class, you MUST update
 *          the mergeIn() method.
 */
class HTMLPurifier_ElementDef
{

    /**
     * Does the definition work by itself, or is it created solely
     * for the purpose of merging into another definition?
     */
    public $standalone = true;

    /**
     * Associative array of attribute name to HTMLPurifier_AttrDef
     * @note Before being processed by HTMLPurifier_AttrCollections
     *       when modules are finalized during
     *       HTMLPurifier_HTMLDefinition->setup(), this array may also
     *       contain an array at index 0 that indicates which attribute
     *       collections to load into the full array. It may also
     *       contain string identifiers in lieu of HTMLPurifier_AttrDef,
     *       see HTMLPurifier_AttrTypes on how they are expanded during
     *       HTMLPurifier_HTMLDefinition->setup() processing.
     */
    public $attr = array();

    /**
     * Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation
     */
    public $attr_transform_pre = array();

    /**
     * Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation
     */
    public $attr_transform_post = array();

    /**
     * HTMLPurifier_ChildDef of this tag.
     */
    public $child;

    /**
     * Abstract string representation of internal ChildDef rules. See
     * HTMLPurifier_ContentSets for how this is parsed and then transformed
     * into an HTMLPurifier_ChildDef.
     * @warning This is a temporary variable that is not available after
     *      being processed by HTMLDefinition
     */
    public $content_model;

    /**
     * Value of $child->type, used to determine which ChildDef to use,
     * used in combination with $content_model.
     * @warning This must be lowercase
     * @warning This is a temporary variable that is not available after
     *      being processed by HTMLDefinition
     */
    public $content_model_type;

    /**
     * Does the element have a content model (#PCDATA | Inline)*? This
     * is important for chameleon ins and del processing in
     * HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
     * have to worry about this one.
     */
    public $descendants_are_inline = false;

    /**
     * List of the names of required attributes this element has. Dynamically
     * populated by HTMLPurifier_HTMLDefinition::getElement
     */
    public $required_attr = array();

    /**
     * Lookup table of tags excluded from all descendants of this tag.
     * @note SGML permits exclusions for all descendants, but this is
     *       not possible with DTDs or XML Schemas. W3C has elected to
     *       use complicated compositions of content_models to simulate
     *       exclusion for children, but we go the simpler, SGML-style
     *       route of flat-out exclusions, which correctly apply to
     *       all descendants and not just children. Note that the XHTML
     *       Modularization Abstract Modules are blithely unaware of such
     *       distinctions.
     */
    public $excludes = array();

    /**
     * This tag is explicitly auto-closed by the following tags.
     */
    public $autoclose = array();

    /**
     * If a foreign element is found in this element, test if it is
     * allowed by this sub-element; if it is, instead of closing the
     * current element, place it inside this element.
     */
    public $wrap;

    /**
     * Whether or not this is a formatting element affected by the
     * "Active Formatting Elements" algorithm.
     */
    public $formatting;

    /**
     * Low-level factory constructor for creating new standalone element defs
     * @param $content_model Value for the content_model member
     * @param $content_model_type Value for the content_model_type member
     * @param $attr Value for the attr member
     * @return New HTMLPurifier_ElementDef
     */
    public static function create($content_model, $content_model_type, $attr) {
        $def = new HTMLPurifier_ElementDef();
        $def->content_model = $content_model;
        $def->content_model_type = $content_model_type;
        $def->attr = $attr;
        return $def;
    }

    /**
     * Merges the values of another element definition into this one.
     * Values from the new element def take precedence if a value is
     * not mergeable.
     * @param $def HTMLPurifier_ElementDef to merge in
     * @note Only attr, attr_transform_pre, attr_transform_post, excludes,
     *       content_model, content_model_type, child, formatting and
     *       descendants_are_inline are merged.
     */
    public function mergeIn($def) {

        // later keys takes precedence
        foreach($def->attr as $k => $v) {
            if ($k === 0) {
                // merge in the includes
                // sorry, no way to override an include
                foreach ($v as $v2) {
                    $this->attr[0][] = $v2;
                }
                continue;
            }
            if ($v === false) {
                // false marks an attribute for removal
                if (isset($this->attr[$k])) unset($this->attr[$k]);
                continue;
            }
            $this->attr[$k] = $v;
        }
        $this->_mergeAssocArray($this->attr_transform_pre, $def->attr_transform_pre);
        $this->_mergeAssocArray($this->attr_transform_post, $def->attr_transform_post);
        $this->_mergeAssocArray($this->excludes, $def->excludes);

        if(!empty($def->content_model)) {
            // #SUPER stands for the content model being extended; the cast
            // keeps an unset (null) model from reaching str_replace(), which
            // rejects null arguments with a deprecation notice on PHP 8.1+
            $this->content_model =
                str_replace("#SUPER", (string) $this->content_model, $def->content_model);
            $this->child = false;
        }
        if(!empty($def->content_model_type)) {
            $this->content_model_type = $def->content_model_type;
            $this->child = false;
        }
        if(!is_null($def->child)) $this->child = $def->child;
        if(!is_null($def->formatting)) $this->formatting = $def->formatting;
        if($def->descendants_are_inline) $this->descendants_are_inline = $def->descendants_are_inline;

    }

    /**
     * Merges one array into another, removes values which equal false
     * @param $a1 Array by reference that is merged into
     * @param $a2 Array that merges into $a1
     */
    private function _mergeAssocArray(&$a1, $a2) {
        foreach ($a2 as $k => $v) {
            if ($v === false) {
                if (isset($a1[$k])) unset($a1[$k]);
                continue;
            }
            $a1[$k] = $v;
        }
    }

}
3008 */ 3009class HTMLPurifier_Encoder 3010{ 3011 3012 /** 3013 * Constructor throws fatal error if you attempt to instantiate class 3014 */ 3015 private function __construct() { 3016 trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR); 3017 } 3018 3019 /** 3020 * Error-handler that mutes errors, alternative to shut-up operator. 3021 */ 3022 public static function muteErrorHandler() {} 3023 3024 /** 3025 * Cleans a UTF-8 string for well-formedness and SGML validity 3026 * 3027 * It will parse according to UTF-8 and return a valid UTF8 string, with 3028 * non-SGML codepoints excluded. 3029 * 3030 * @note Just for reference, the non-SGML code points are 0 to 31 and 3031 * 127 to 159, inclusive. However, we allow code points 9, 10 3032 * and 13, which are the tab, line feed and carriage return 3033 * respectively. 128 and above the code points map to multibyte 3034 * UTF-8 representations. 3035 * 3036 * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and 3037 * hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the 3038 * LGPL license. Notes on what changed are inside, but in general, 3039 * the original code transformed UTF-8 text into an array of integer 3040 * Unicode codepoints. Understandably, transforming that back to 3041 * a string would be somewhat expensive, so the function was modded to 3042 * directly operate on the string. However, this discourages code 3043 * reuse, and the logic enumerated here would be useful for any 3044 * function that needs to be able to understand UTF-8 characters. 3045 * As of right now, only smart lossless character encoding converters 3046 * would need that, and I'm probably not going to implement them. 3047 * Once again, PHP 6 should solve all our problems. 
3048 */ 3049 public static function cleanUTF8($str, $force_php = false) { 3050 3051 // UTF-8 validity is checked since PHP 4.3.5 3052 // This is an optimization: if the string is already valid UTF-8, no 3053 // need to do PHP stuff. 99% of the time, this will be the case. 3054 // The regexp matches the XML char production, as well as well as excluding 3055 // non-SGML codepoints U+007F to U+009F 3056 if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) { 3057 return $str; 3058 } 3059 3060 $mState = 0; // cached expected number of octets after the current octet 3061 // until the beginning of the next UTF8 character sequence 3062 $mUcs4 = 0; // cached Unicode character 3063 $mBytes = 1; // cached expected number of octets in the current sequence 3064 3065 // original code involved an $out that was an array of Unicode 3066 // codepoints. Instead of having to convert back into UTF-8, we've 3067 // decided to directly append valid UTF-8 characters onto a string 3068 // $out once they're done. $char accumulates raw bytes, while $mUcs4 3069 // turns into the Unicode code point, so there's some redundancy. 3070 3071 $out = ''; 3072 $char = ''; 3073 3074 $len = strlen($str); 3075 for($i = 0; $i < $len; $i++) { 3076 $in = ord($str{$i}); 3077 $char .= $str[$i]; // append byte to char 3078 if (0 == $mState) { 3079 // When mState is zero we expect either a US-ASCII character 3080 // or a multi-octet sequence. 3081 if (0 == (0x80 & ($in))) { 3082 // US-ASCII, pass straight through. 
3083 if (($in <= 31 || $in == 127) && 3084 !($in == 9 || $in == 13 || $in == 10) // save \r\t\n 3085 ) { 3086 // control characters, remove 3087 } else { 3088 $out .= $char; 3089 } 3090 // reset 3091 $char = ''; 3092 $mBytes = 1; 3093 } elseif (0xC0 == (0xE0 & ($in))) { 3094 // First octet of 2 octet sequence 3095 $mUcs4 = ($in); 3096 $mUcs4 = ($mUcs4 & 0x1F) << 6; 3097 $mState = 1; 3098 $mBytes = 2; 3099 } elseif (0xE0 == (0xF0 & ($in))) { 3100 // First octet of 3 octet sequence 3101 $mUcs4 = ($in); 3102 $mUcs4 = ($mUcs4 & 0x0F) << 12; 3103 $mState = 2; 3104 $mBytes = 3; 3105 } elseif (0xF0 == (0xF8 & ($in))) { 3106 // First octet of 4 octet sequence 3107 $mUcs4 = ($in); 3108 $mUcs4 = ($mUcs4 & 0x07) << 18; 3109 $mState = 3; 3110 $mBytes = 4; 3111 } elseif (0xF8 == (0xFC & ($in))) { 3112 // First octet of 5 octet sequence. 3113 // 3114 // This is illegal because the encoded codepoint must be 3115 // either: 3116 // (a) not the shortest form or 3117 // (b) outside the Unicode range of 0-0x10FFFF. 3118 // Rather than trying to resynchronize, we will carry on 3119 // until the end of the sequence and let the later error 3120 // handling code catch it. 3121 $mUcs4 = ($in); 3122 $mUcs4 = ($mUcs4 & 0x03) << 24; 3123 $mState = 4; 3124 $mBytes = 5; 3125 } elseif (0xFC == (0xFE & ($in))) { 3126 // First octet of 6 octet sequence, see comments for 5 3127 // octet sequence. 3128 $mUcs4 = ($in); 3129 $mUcs4 = ($mUcs4 & 1) << 30; 3130 $mState = 5; 3131 $mBytes = 6; 3132 } else { 3133 // Current octet is neither in the US-ASCII range nor a 3134 // legal first octet of a multi-octet sequence. 3135 $mState = 0; 3136 $mUcs4 = 0; 3137 $mBytes = 1; 3138 $char = ''; 3139 } 3140 } else { 3141 // When mState is non-zero, we expect a continuation of the 3142 // multi-octet sequence 3143 if (0x80 == (0xC0 & ($in))) { 3144 // Legal continuation. 
3145 $shift = ($mState - 1) * 6; 3146 $tmp = $in; 3147 $tmp = ($tmp & 0x0000003F) << $shift; 3148 $mUcs4 |= $tmp; 3149 3150 if (0 == --$mState) { 3151 // End of the multi-octet sequence. mUcs4 now contains 3152 // the final Unicode codepoint to be output 3153 3154 // Check for illegal sequences and codepoints. 3155 3156 // From Unicode 3.1, non-shortest form is illegal 3157 if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || 3158 ((3 == $mBytes) && ($mUcs4 < 0x0800)) || 3159 ((4 == $mBytes) && ($mUcs4 < 0x10000)) || 3160 (4 < $mBytes) || 3161 // From Unicode 3.2, surrogate characters = illegal 3162 (($mUcs4 & 0xFFFFF800) == 0xD800) || 3163 // Codepoints outside the Unicode range are illegal 3164 ($mUcs4 > 0x10FFFF) 3165 ) { 3166 3167 } elseif (0xFEFF != $mUcs4 && // omit BOM 3168 // check for valid Char unicode codepoints 3169 ( 3170 0x9 == $mUcs4 || 3171 0xA == $mUcs4 || 3172 0xD == $mUcs4 || 3173 (0x20 <= $mUcs4 && 0x7E >= $mUcs4) || 3174 // 7F-9F is not strictly prohibited by XML, 3175 // but it is non-SGML, and thus we don't allow it 3176 (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) || 3177 (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4) 3178 ) 3179 ) { 3180 $out .= $char; 3181 } 3182 // initialize UTF8 cache (reset) 3183 $mState = 0; 3184 $mUcs4 = 0; 3185 $mBytes = 1; 3186 $char = ''; 3187 } 3188 } else { 3189 // ((0xC0 & (*in) != 0x80) && (mState != 0)) 3190 // Incomplete multi-octet sequence. 3191 // used to result in complete fail, but we'll reset 3192 $mState = 0; 3193 $mUcs4 = 0; 3194 $mBytes = 1; 3195 $char =''; 3196 } 3197 } 3198 } 3199 return $out; 3200 } 3201 3202 /** 3203 * Translates a Unicode codepoint into its corresponding UTF-8 character. 3204 * @note Based on Feyd's function at 3205 * <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>, 3206 * which is in public domain. 3207 * @note While we're going to do code point parsing anyway, a good 3208 * optimization would be to refuse to translate code points that 3209 * are non-SGML characters. 
However, this could lead to duplication. 3210 * @note This is very similar to the unichr function in 3211 * maintenance/generate-entity-file.php (although this is superior, 3212 * due to its sanity checks). 3213 */ 3214 3215 // +----------+----------+----------+----------+ 3216 // | 33222222 | 22221111 | 111111 | | 3217 // | 10987654 | 32109876 | 54321098 | 76543210 | bit 3218 // +----------+----------+----------+----------+ 3219 // | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F 3220 // | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF 3221 // | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF 3222 // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF 3223 // +----------+----------+----------+----------+ 3224 // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF) 3225 // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes 3226 // +----------+----------+----------+----------+ 3227 3228 public static function unichr($code) { 3229 if($code > 1114111 or $code < 0 or 3230 ($code >= 55296 and $code <= 57343) ) { 3231 // bits are set outside the "valid" range as defined 3232 // by UNICODE 4.1.0 3233 return ''; 3234 } 3235 3236 $x = $y = $z = $w = 0; 3237 if ($code < 128) { 3238 // regular ASCII character 3239 $x = $code; 3240 } else { 3241 // set up bits for UTF-8 3242 $x = ($code & 63) | 128; 3243 if ($code < 2048) { 3244 $y = (($code & 2047) >> 6) | 192; 3245 } else { 3246 $y = (($code & 4032) >> 6) | 128; 3247 if($code < 65536) { 3248 $z = (($code >> 12) & 15) | 224; 3249 } else { 3250 $z = (($code >> 12) & 63) | 128; 3251 $w = (($code >> 18) & 7) | 240; 3252 } 3253 } 3254 } 3255 // set up the actual character 3256 $ret = ''; 3257 if($w) $ret .= chr($w); 3258 if($z) $ret .= chr($z); 3259 if($y) $ret .= chr($y); 3260 $ret .= chr($x); 3261 3262 return $ret; 3263 } 3264 3265 /** 3266 * Converts a string to UTF-8 based on 
configuration. 3267 */ 3268 public static function convertToUTF8($str, $config, $context) { 3269 $encoding = $config->get('Core.Encoding'); 3270 if ($encoding === 'utf-8') return $str; 3271 static $iconv = null; 3272 if ($iconv === null) $iconv = function_exists('iconv'); 3273 set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); 3274 if ($iconv && !$config->get('Test.ForceNoIconv')) { 3275 $str = iconv($encoding, 'utf-8//IGNORE', $str); 3276 if ($str === false) { 3277 // $encoding is not a valid encoding 3278 restore_error_handler(); 3279 trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR); 3280 return ''; 3281 } 3282 // If the string is bjorked by Shift_JIS or a similar encoding 3283 // that doesn't support all of ASCII, convert the naughty 3284 // characters to their true byte-wise ASCII/UTF-8 equivalents. 3285 $str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding)); 3286 restore_error_handler(); 3287 return $str; 3288 } elseif ($encoding === 'iso-8859-1') { 3289 $str = utf8_encode($str); 3290 restore_error_handler(); 3291 return $str; 3292 } 3293 trigger_error('Encoding not supported, please install iconv', E_USER_ERROR); 3294 } 3295 3296 /** 3297 * Converts a string from UTF-8 based on configuration. 3298 * @note Currently, this is a lossy conversion, with unexpressable 3299 * characters being omitted. 
3300 */ 3301 public static function convertFromUTF8($str, $config, $context) { 3302 $encoding = $config->get('Core.Encoding'); 3303 if ($encoding === 'utf-8') return $str; 3304 static $iconv = null; 3305 if ($iconv === null) $iconv = function_exists('iconv'); 3306 if ($escape = $config->get('Core.EscapeNonASCIICharacters')) { 3307 $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str); 3308 } 3309 set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); 3310 if ($iconv && !$config->get('Test.ForceNoIconv')) { 3311 // Undo our previous fix in convertToUTF8, otherwise iconv will barf 3312 $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding); 3313 if (!$escape && !empty($ascii_fix)) { 3314 $clear_fix = array(); 3315 foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = ''; 3316 $str = strtr($str, $clear_fix); 3317 } 3318 $str = strtr($str, array_flip($ascii_fix)); 3319 // Normal stuff 3320 $str = iconv('utf-8', $encoding . '//IGNORE', $str); 3321 restore_error_handler(); 3322 return $str; 3323 } elseif ($encoding === 'iso-8859-1') { 3324 $str = utf8_decode($str); 3325 restore_error_handler(); 3326 return $str; 3327 } 3328 trigger_error('Encoding not supported', E_USER_ERROR); 3329 } 3330 3331 /** 3332 * Lossless (character-wise) conversion of HTML to ASCII 3333 * @param $str UTF-8 string to be converted to ASCII 3334 * @returns ASCII encoded string with non-ASCII character entity-ized 3335 * @warning Adapted from MediaWiki, claiming fair use: this is a common 3336 * algorithm. If you disagree with this license fudgery, 3337 * implement it yourself. 3338 * @note Uses decimal numeric entities since they are best supported. 3339 * @note This is a DUMB function: it has no concept of keeping 3340 * character entities that the projected character encoding 3341 * can allow. 
We could possibly implement a smart version 3342 * but that would require it to also know which Unicode 3343 * codepoints the charset supported (not an easy task). 3344 * @note Sort of with cleanUTF8() but it assumes that $str is 3345 * well-formed UTF-8 3346 */ 3347 public static function convertToASCIIDumbLossless($str) { 3348 $bytesleft = 0; 3349 $result = ''; 3350 $working = 0; 3351 $len = strlen($str); 3352 for( $i = 0; $i < $len; $i++ ) { 3353 $bytevalue = ord( $str[$i] ); 3354 if( $bytevalue <= 0x7F ) { //0xxx xxxx 3355 $result .= chr( $bytevalue ); 3356 $bytesleft = 0; 3357 } elseif( $bytevalue <= 0xBF ) { //10xx xxxx 3358 $working = $working << 6; 3359 $working += ($bytevalue & 0x3F); 3360 $bytesleft--; 3361 if( $bytesleft <= 0 ) { 3362 $result .= "&#" . $working . ";"; 3363 } 3364 } elseif( $bytevalue <= 0xDF ) { //110x xxxx 3365 $working = $bytevalue & 0x1F; 3366 $bytesleft = 1; 3367 } elseif( $bytevalue <= 0xEF ) { //1110 xxxx 3368 $working = $bytevalue & 0x0F; 3369 $bytesleft = 2; 3370 } else { //1111 0xxx 3371 $working = $bytevalue & 0x07; 3372 $bytesleft = 3; 3373 } 3374 } 3375 return $result; 3376 } 3377 3378 /** 3379 * This expensive function tests whether or not a given character 3380 * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will 3381 * fail this test, and require special processing. Variable width 3382 * encodings shouldn't ever fail. 3383 * 3384 * @param string $encoding Encoding name to test, as per iconv format 3385 * @param bool $bypass Whether or not to bypass the precompiled arrays. 3386 * @return Array of UTF-8 characters to their corresponding ASCII, 3387 * which can be used to "undo" any overzealous iconv action. 
3388 */ 3389 public static function testEncodingSupportsASCII($encoding, $bypass = false) { 3390 static $encodings = array(); 3391 if (!$bypass) { 3392 if (isset($encodings[$encoding])) return $encodings[$encoding]; 3393 $lenc = strtolower($encoding); 3394 switch ($lenc) { 3395 case 'shift_jis': 3396 return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'); 3397 case 'johab': 3398 return array("\xE2\x82\xA9" => '\\'); 3399 } 3400 if (strpos($lenc, 'iso-8859-') === 0) return array(); 3401 } 3402 $ret = array(); 3403 set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); 3404 if (iconv('UTF-8', $encoding, 'a') === false) return false; 3405 for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars 3406 $c = chr($i); // UTF-8 char 3407 $r = iconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion 3408 if ( 3409 $r === '' || 3410 // This line is needed for iconv implementations that do not 3411 // omit characters that do not exist in the target character set 3412 ($r === $c && iconv($encoding, 'UTF-8//IGNORE', $r) !== $c) 3413 ) { 3414 // Reverse engineer: what's the UTF-8 equiv of this byte 3415 // sequence? This assumes that there's no variable width 3416 // encoding that doesn't support ASCII. 3417 $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c; 3418 } 3419 } 3420 restore_error_handler(); 3421 $encodings[$encoding] = $ret; 3422 return $ret; 3423 } 3424 3425 3426} 3427 3428 3429 3430 3431 3432/** 3433 * Object that provides entity lookup table from entity name to character 3434 */ 3435class HTMLPurifier_EntityLookup { 3436 3437 /** 3438 * Assoc array of entity name to character represented. 3439 */ 3440 public $table; 3441 3442 /** 3443 * Sets up the entity lookup table from the serialized file contents. 
3444 * @note The serialized contents are versioned, but were generated 3445 * using the maintenance script generate_entity_file.php 3446 * @warning This is not in constructor to help enforce the Singleton 3447 */ 3448 public function setup($file = false) { 3449 if (!$file) { 3450 $file = HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser'; 3451 } 3452 $this->table = unserialize(file_get_contents($file)); 3453 } 3454 3455 /** 3456 * Retrieves sole instance of the object. 3457 * @param Optional prototype of custom lookup table to overload with. 3458 */ 3459 public static function instance($prototype = false) { 3460 // no references, since PHP doesn't copy unless modified 3461 static $instance = null; 3462 if ($prototype) { 3463 $instance = $prototype; 3464 } elseif (!$instance) { 3465 $instance = new HTMLPurifier_EntityLookup(); 3466 $instance->setup(); 3467 } 3468 return $instance; 3469 } 3470 3471} 3472 3473 3474 3475 3476 3477// if want to implement error collecting here, we'll need to use some sort 3478// of global data (probably trigger_error) because it's impossible to pass 3479// $config or $context to the callback functions. 3480 3481/** 3482 * Handles referencing and derefencing character entities 3483 */ 3484class HTMLPurifier_EntityParser 3485{ 3486 3487 /** 3488 * Reference to entity lookup table. 3489 */ 3490 protected $_entity_lookup; 3491 3492 /** 3493 * Callback regex string for parsing entities. 3494 */ 3495 protected $_substituteEntitiesRegex = 3496'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/'; 3497// 1. hex 2. dec 3. string (XML style) 3498 3499 3500 /** 3501 * Decimal to parsed string conversion table for special entities. 3502 */ 3503 protected $_special_dec2str = 3504 array( 3505 34 => '"', 3506 38 => '&', 3507 39 => "'", 3508 60 => '<', 3509 62 => '>' 3510 ); 3511 3512 /** 3513 * Stripped entity names to decimal conversion table for special entities. 
3514 */ 3515 protected $_special_ent2dec = 3516 array( 3517 'quot' => 34, 3518 'amp' => 38, 3519 'lt' => 60, 3520 'gt' => 62 3521 ); 3522 3523 /** 3524 * Substitutes non-special entities with their parsed equivalents. Since 3525 * running this whenever you have parsed character is t3h 5uck, we run 3526 * it before everything else. 3527 * 3528 * @param $string String to have non-special entities parsed. 3529 * @returns Parsed string. 3530 */ 3531 public function substituteNonSpecialEntities($string) { 3532 // it will try to detect missing semicolons, but don't rely on it 3533 return preg_replace_callback( 3534 $this->_substituteEntitiesRegex, 3535 array($this, 'nonSpecialEntityCallback'), 3536 $string 3537 ); 3538 } 3539 3540 /** 3541 * Callback function for substituteNonSpecialEntities() that does the work. 3542 * 3543 * @param $matches PCRE matches array, with 0 the entire match, and 3544 * either index 1, 2 or 3 set with a hex value, dec value, 3545 * or string (respectively). 3546 * @returns Replacement string. 3547 */ 3548 3549 protected function nonSpecialEntityCallback($matches) { 3550 // replaces all but big five 3551 $entity = $matches[0]; 3552 $is_num = (@$matches[0][1] === '#'); 3553 if ($is_num) { 3554 $is_hex = (@$entity[2] === 'x'); 3555 $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; 3556 3557 // abort for special characters 3558 if (isset($this->_special_dec2str[$code])) return $entity; 3559 3560 return HTMLPurifier_Encoder::unichr($code); 3561 } else { 3562 if (isset($this->_special_ent2dec[$matches[3]])) return $entity; 3563 if (!$this->_entity_lookup) { 3564 $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); 3565 } 3566 if (isset($this->_entity_lookup->table[$matches[3]])) { 3567 return $this->_entity_lookup->table[$matches[3]]; 3568 } else { 3569 return $entity; 3570 } 3571 } 3572 } 3573 3574 /** 3575 * Substitutes only special entities with their parsed equivalents. 
3576 * 3577 * @notice We try to avoid calling this function because otherwise, it 3578 * would have to be called a lot (for every parsed section). 3579 * 3580 * @param $string String to have non-special entities parsed. 3581 * @returns Parsed string. 3582 */ 3583 public function substituteSpecialEntities($string) { 3584 return preg_replace_callback( 3585 $this->_substituteEntitiesRegex, 3586 array($this, 'specialEntityCallback'), 3587 $string); 3588 } 3589 3590 /** 3591 * Callback function for substituteSpecialEntities() that does the work. 3592 * 3593 * This callback has same syntax as nonSpecialEntityCallback(). 3594 * 3595 * @param $matches PCRE-style matches array, with 0 the entire match, and 3596 * either index 1, 2 or 3 set with a hex value, dec value, 3597 * or string (respectively). 3598 * @returns Replacement string. 3599 */ 3600 protected function specialEntityCallback($matches) { 3601 $entity = $matches[0]; 3602 $is_num = (@$matches[0][1] === '#'); 3603 if ($is_num) { 3604 $is_hex = (@$entity[2] === 'x'); 3605 $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; 3606 return isset($this->_special_dec2str[$int]) ? 3607 $this->_special_dec2str[$int] : 3608 $entity; 3609 } else { 3610 return isset($this->_special_ent2dec[$matches[3]]) ? 3611 $this->_special_ent2dec[$matches[3]] : 3612 $entity; 3613 } 3614 } 3615 3616} 3617 3618 3619 3620 3621 3622/** 3623 * Error collection class that enables HTML Purifier to report HTML 3624 * problems back to the user 3625 */ 3626class HTMLPurifier_ErrorCollector 3627{ 3628 3629 /** 3630 * Identifiers for the returned error array. These are purposely numeric 3631 * so list() can be used. 
     */
    const LINENO   = 0;
    const SEVERITY = 1;
    const MESSAGE  = 2;
    const CHILDREN = 3;

    /** Flat error list; bound by reference to $_stacks[0] in the constructor */
    protected $errors;
    /** List that send() appends to; bound by reference to $_stacks[0] in the constructor */
    protected $_current;
    protected $_stacks = array(array());
    /** Value of the 'Locale' context entry, used to look up and format messages */
    protected $locale;
    /** HTMLPurifier_Generator created by getHTMLFormatted(), used to escape messages */
    protected $generator;
    protected $context;

    /**
     * Error structs built by send(): indexed [line][col] when both are
     * integers, otherwise a single struct stored under key -1.
     */
    protected $lines = array();

    /**
     * @param $context Context object; its 'Locale' entry is bound by
     *                 reference and the context itself is kept for send().
     */
    public function __construct($context) {
        $this->locale   =& $context->get('Locale');
        $this->context  = $context;
        $this->_current =& $this->_stacks[0];
        $this->errors   =& $this->_stacks[0];
    }

    /**
     * Sends an error message to the collector for later use
     * @param $severity int Error severity, PHP error style (don't use E_USER_)
     * @param $msg string Error message text
     * @param $subst1 string First substitution for $msg
     * @param $subst2 string ...
     */
    public function send($severity, $msg) {

        $args = array();
        if (func_num_args() > 2) {
            // Drop $severity (shift re-indexes from 0) and then $msg, so the
            // extra substitution arguments remain under keys 1, 2, ...
            $args = func_get_args();
            array_shift($args);
            unset($args[0]);
        }

        $token = $this->context->get('CurrentToken', true);
        $line  = $token ? $token->line : $this->context->get('CurrentLine', true);
        $col   = $token ? $token->col  : $this->context->get('CurrentCol',  true);
        $attr  = $this->context->get('CurrentAttr', true);

        // perform special substitutions, also add custom parameters
        $subst = array();
        if (!is_null($token)) {
            $args['CurrentToken'] = $token;
        }
        if (!is_null($attr)) {
            $subst['$CurrentAttr.Name'] = $attr;
            if (isset($token->attr[$attr])) $subst['$CurrentAttr.Value'] = $token->attr[$attr];
        }

        if (empty($args)) {
            $msg = $this->locale->getMessage($msg);
        } else {
            $msg = $this->locale->formatMessage($msg, $args);
        }

        if (!empty($subst)) $msg = strtr($msg, $subst);

        // (numerically indexed)
        $error = array(
            self::LINENO   => $line,
            self::SEVERITY => $severity,
            self::MESSAGE  => $msg,
            self::CHILDREN => array()
        );
        $this->_current[] = $error;


        // NEW CODE BELOW ...

        $struct = null;
        // Top-level errors are either:
        //  TOKEN type, if $value is set appropriately, or
        //  "syntax" type, if $value is null
        $new_struct = new HTMLPurifier_ErrorStruct();
        $new_struct->type = HTMLPurifier_ErrorStruct::TOKEN;
        if ($token) $new_struct->value = clone $token;
        if (is_int($line) && is_int($col)) {
            // Reuse the struct already registered at this position, if any
            if (isset($this->lines[$line][$col])) {
                $struct = $this->lines[$line][$col];
            } else {
                $struct = $this->lines[$line][$col] = $new_struct;
            }
            // These ksorts may present a performance problem
            ksort($this->lines[$line], SORT_NUMERIC);
        } else {
            // No usable position: everything is collected under key -1
            if (isset($this->lines[-1])) {
                $struct = $this->lines[-1];
            } else {
                $struct = $this->lines[-1] = $new_struct;
            }
        }
        ksort($this->lines, SORT_NUMERIC);

        // Now, check if we need to operate on a lower structure
        if (!empty($attr)) {
            $struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr);
            if (!$struct->value) {
                $struct->value = array($attr, 'PUT VALUE HERE');
            }
        }
        // NOTE(review): $cssprop is never assigned anywhere in this method,
        // so this branch does not run as written.
        if (!empty($cssprop)) {
            $struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop);
            if (!$struct->value) {
                // if we tokenize CSS this might be a little more difficult to do
                $struct->value = array($cssprop, 'PUT VALUE HERE');
            }
        }

        // Ok, structs are all setup, now time to register the error
        $struct->addError($severity, $msg);
    }

    /**
     * Retrieves raw error data for custom formatter to use
     * @return List of arrays in format of array(line of error,
     *         error severity, error message,
     *         recursive sub-errors array)
     */
    public function getRaw() {
        return $this->errors;
    }

    /**
     * Default HTML formatting implementation for error messages
     * @param $config Configuration array, vital for HTML output nature
     * @param $errors Errors array; defaults to the collected errors. In this
     *                method it only decides whether the "no errors" message
     *                is returned; the rendered items come from $this->lines.
     * @return HTML string: a paragraph when $errors is empty, otherwise an
     *         unordered list of the rendered error structs.
     */
    public function getHTMLFormatted($config, $errors = null) {
        $ret = array();

        $this->generator = new HTMLPurifier_Generator($config, $this->context);
        if ($errors === null) $errors = $this->errors;

        // 'At line' message needs to be removed

        // generation code for new structure goes here. It needs to be recursive.
        foreach ($this->lines as $line => $col_array) {
            if ($line == -1) continue;
            foreach ($col_array as $col => $struct) {
                $this->_renderStruct($ret, $struct, $line, $col);
            }
        }
        // Position-less errors (key -1) are rendered last
        if (isset($this->lines[-1])) {
            $this->_renderStruct($ret, $this->lines[-1]);
        }

        if (empty($errors)) {
            return '<p>' . $this->locale->getMessage('ErrorCollector: No errors') . '</p>';
        } else {
            return '<ul><li>' . implode('</li><li>', $ret) . '</li></ul>';
        }

    }

    /**
     * Appends one HTML snippet per error of $struct and of its child
     * structs to $ret, walking the children with an explicit stack.
     *
     * @param $ret    Array (by reference) that receives the HTML snippets
     * @param $struct HTMLPurifier_ErrorStruct to render
     * @param $line   Line to report; when $line or $col is null the
     *                location is shown as "End of Document"
     * @param $col    Column to report
     */
    private function _renderStruct(&$ret, $struct, $line = null, $col = null) {
        $stack = array($struct);
        $context_stack = array(array());
        while ($current = array_pop($stack)) {
            $context = array_pop($context_stack);
            foreach ($current->errors as $error) {
                list($severity, $msg) = $error;
                $string = '';
                $string .= '<div>';
                // W3C uses an icon to indicate the severity of the error.
                $error = $this->locale->getErrorName($severity);
                $string .= "<span class=\"error e$severity\"><strong>$error</strong></span> ";
                if (!is_null($line) && !is_null($col)) {
                    $string .= "<em class=\"location\">Line $line, Column $col: </em> ";
                } else {
                    $string .= '<em class="location">End of Document: </em> ';
                }
                $string .= '<strong class="description">' . $this->generator->escape($msg) . '</strong> ';
                $string .= '</div>';
                // Here, have a marker for the character on the column appropriate.
                // Be sure to clip extremely long lines.
                //$string .= '<pre>';
                //$string .= '';
                //$string .= '</pre>';
                $ret[] = $string;
            }
            foreach ($current->children as $type => $array) {
                $context[] = $current;
                // Push the children in reverse so they are popped in order
                $stack = array_merge($stack, array_reverse($array, true));
                for ($i = count($array); $i > 0; $i--) {
                    $context_stack[] = $context;
                }
            }
        }
    }

}





/**
 * Records errors for particular segments of an HTML document such as tokens,
 * attributes or CSS properties. They can contain error structs (which apply
 * to components of what they represent), but their main purpose is to hold
 * errors applying to whatever struct is being used.
 */
class HTMLPurifier_ErrorStruct
{

    /**
     * Possible values for $children first-key. Note that top-level structures
     * are automatically token-level.
3844 */ 3845 const TOKEN = 0; 3846 const ATTR = 1; 3847 const CSSPROP = 2; 3848 3849 /** 3850 * Type of this struct. 3851 */ 3852 public $type; 3853 3854 /** 3855 * Value of the struct we are recording errors for. There are various 3856 * values for this: 3857 * - TOKEN: Instance of HTMLPurifier_Token 3858 * - ATTR: array('attr-name', 'value') 3859 * - CSSPROP: array('prop-name', 'value') 3860 */ 3861 public $value; 3862 3863 /** 3864 * Errors registered for this structure. 3865 */ 3866 public $errors = array(); 3867 3868 /** 3869 * Child ErrorStructs that are from this structure. For example, a TOKEN 3870 * ErrorStruct would contain ATTR ErrorStructs. This is a multi-dimensional 3871 * array in structure: [TYPE]['identifier'] 3872 */ 3873 public $children = array(); 3874 3875 public function getChild($type, $id) { 3876 if (!isset($this->children[$type][$id])) { 3877 $this->children[$type][$id] = new HTMLPurifier_ErrorStruct(); 3878 $this->children[$type][$id]->type = $type; 3879 } 3880 return $this->children[$type][$id]; 3881 } 3882 3883 public function addError($severity, $message) { 3884 $this->errors[] = array($severity, $message); 3885 } 3886 3887} 3888 3889 3890 3891 3892 3893/** 3894 * Global exception class for HTML Purifier; any exceptions we throw 3895 * are from here. 3896 */ 3897class HTMLPurifier_Exception extends Exception 3898{ 3899 3900} 3901 3902 3903 3904 3905 3906/** 3907 * Represents a pre or post processing filter on HTML Purifier's output 3908 * 3909 * Sometimes, a little ad-hoc fixing of HTML has to be done before 3910 * it gets sent through HTML Purifier: you can use filters to acheive 3911 * this effect. For instance, YouTube videos can be preserved using 3912 * this manner. You could have used a decorator for this task, but 3913 * PHP's support for them is not terribly robust, so we're going 3914 * to just loop through the filters. 3915 * 3916 * Filters should be exited first in, last out. 
If there are three filters,
 * named 1, 2 and 3, the order of execution should go 1->preFilter,
 * 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
 * 1->postFilter.
 *
 * @note Methods are not declared abstract as it is perfectly legitimate
 *       for an implementation not to want anything to happen on a step
 */

class HTMLPurifier_Filter
{

    /**
     * Name of the filter for identification purposes
     */
    public $name;

    /**
     * Pre-processor function, handles HTML before HTML Purifier.
     * This base implementation hands the HTML back untouched.
     * @param $html HTML to pre-process
     * @param $config Configuration object (unused here)
     * @param $context Context object (unused here)
     * @return HTML to pass on
     */
    public function preFilter($html, $config, $context) {
        return $html;
    }

    /**
     * Post-processor function, handles HTML after HTML Purifier.
     * This base implementation hands the HTML back untouched.
     * @param $html HTML to post-process
     * @param $config Configuration object (unused here)
     * @param $context Context object (unused here)
     * @return HTML to pass on
     */
    public function postFilter($html, $config, $context) {
        return $html;
    }

}





/**
 * Generates HTML from tokens.
 * @todo Refactor interface so that configuration/context is determined
 *       upon instantiation, no need for messy generateFromTokens() calls
 * @todo Make some of the more internal functions protected, and have
 *       unit tests work around that
 */
class HTMLPurifier_Generator
{

    /**
     * Whether or not generator should produce XML output
     */
    private $_xhtml = true;

    /**
     * :HACK: Whether or not generator should comment the insides of <script> tags
     */
    private $_scriptFix = false;

    /**
     * Cache of HTMLDefinition during HTML output to determine whether or
     * not attributes should be minimized.
     */
    private $_def;

    /**
     * Cache of %Output.SortAttr
     */
    private $_sortAttr;

    /**
     * Cache of %Output.FlashCompat
     */
    private $_flashCompat;

    /**
     * Cache of %Output.FixInnerHTML
     */
    private $_innerHTMLFix;

    /**
     * Stack for keeping track of object information when outputting IE
     * compatibility code. One entry is pushed per <object> start tag and
     * popped again at the matching end tag.
     */
    private $_flashStack = array();

    /**
     * Configuration for the generator
     */
    protected $config;

    /**
     * Caches the output-related directives and the HTML definition.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context (not used by the
     *        constructor)
     */
    public function __construct($config, $context) {
        $this->config = $config;
        $this->_scriptFix    = $config->get('Output.CommentScriptContents');
        $this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
        $this->_sortAttr     = $config->get('Output.SortAttr');
        $this->_flashCompat  = $config->get('Output.FlashCompat');
        $this->_def = $config->getHTMLDefinition();
        $this->_xhtml = $this->_def->doctype->xml;
    }

    /**
     * Generates HTML from an array of tokens.
     * @param $tokens Array of HTMLPurifier_Token
     * @return Generated HTML
     */
    public function generateFromTokens($tokens) {
        if (!$tokens) return '';

        // Basic algorithm
        $html = '';
        for ($i = 0, $size = count($tokens); $i < $size; $i++) {
            // Only an opening <script> tag may start the special case.
            // Without the Start check an end tag (e.g. of an empty
            // <script></script>) followed by text and another end tag
            // would get that unrelated text wrapped in the CDATA hack.
            if ($this->_scriptFix
                && $tokens[$i] instanceof HTMLPurifier_Token_Start
                && $tokens[$i]->name === 'script'
                && $i + 2 < $size && $tokens[$i+2] instanceof HTMLPurifier_Token_End) {
                // script special case
                // the contents of the script block must be ONE token
                // for this to work.
                $html .= $this->generateFromToken($tokens[$i++]);
                $html .= $this->generateScriptFromToken($tokens[$i++]);
            }
            $html .= $this->generateFromToken($tokens[$i]);
        }

        // Tidy cleanup
        if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
            $tidy = new Tidy;
            $tidy->parseString($html, array(
               'indent'=> true,
               'output-xhtml' => $this->_xhtml,
               'show-body-only' => true,
               'indent-spaces' => 2,
               'wrap' => 68,
            ), 'utf8');
            $tidy->cleanRepair();
            $html = (string) $tidy; // explicit cast necessary
        }

        // Normalize newlines to system defined value
        if ($this->config->get('Core.NormalizeNewlines')) {
            $nl = $this->config->get('Output.Newline');
            if ($nl === null) $nl = PHP_EOL;
            if ($nl !== "\n") $html = str_replace("\n", $nl, $html);
        }
        return $html;
    }

    /**
     * Generates HTML from a single token.
     * @param $token HTMLPurifier_Token object.
     * @return Generated HTML; empty string (plus an E_USER_WARNING) when
     *         $token is not an HTMLPurifier_Token
     */
    public function generateFromToken($token) {
        if (!$token instanceof HTMLPurifier_Token) {
            trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
            return '';

        } elseif ($token instanceof HTMLPurifier_Token_Start) {
            $attr = $this->generateAttributes($token->attr, $token->name);
            if ($this->_flashCompat && $token->name === "object") {
                $flash = new stdClass();
                $flash->attr = $token->attr;
                $flash->param = array();
                $this->_flashStack[] = $flash;
            }
            return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';

        } elseif ($token instanceof HTMLPurifier_Token_End) {
            if ($this->_flashCompat && $token->name === "object" && !empty($this->_flashStack)) {
                // Nothing extra is emitted for now, but the entry pushed by
                // the start tag must be dropped so that later <param> tokens
                // are not attributed to an object that is already closed.
                array_pop($this->_flashStack);
            }
            return '</' . $token->name . '>';

        } elseif ($token instanceof HTMLPurifier_Token_Empty) {
            if ($this->_flashCompat && $token->name === "param"
                && !empty($this->_flashStack) && isset($token->attr['name'])) {
                // guard the lookups: a param without name/value must not
                // raise undefined-index diagnostics
                $value = isset($token->attr['value']) ? $token->attr['value'] : null;
                $this->_flashStack[count($this->_flashStack)-1]->param[$token->attr['name']] = $value;
            }
            $attr = $this->generateAttributes($token->attr, $token->name);
            return '<' . $token->name . ($attr ? ' ' : '') . $attr .
                ( $this->_xhtml ? ' /': '' ) // <br /> v. <br>
                . '>';

        } elseif ($token instanceof HTMLPurifier_Token_Text) {
            return $this->escape($token->data, ENT_NOQUOTES);

        } elseif ($token instanceof HTMLPurifier_Token_Comment) {
            return '<!--' . $token->data . '-->';
        } else {
            return '';

        }
    }

    /**
     * Special case processor for the contents of script tags
     * @warning This runs into problems if there's already a literal
     *          --> somewhere inside the script contents.
     * @param $token Token holding the script body; anything that is not a
     *        text token is passed to generateFromToken() instead
     * @return Script body wrapped in the comment/CDATA hack
     */
    public function generateScriptFromToken($token) {
        if (!$token instanceof HTMLPurifier_Token_Text) return $this->generateFromToken($token);
        // Thanks <http://lachy.id.au/log/2005/05/script-comments>
        // strip a trailing "//" so it cannot comment out our closing marker
        $data = preg_replace('#//\s*$#', '', $token->data);
        return '<!--//--><![CDATA[//><!--' . "\n" . trim($data) . "\n" . '//--><!]]>';
    }

    /**
     * Generates attribute declarations from attribute array.
     * @note This does not include the leading or trailing space.
     * @param $assoc_array_of_attributes Attribute array
     * @param $element Name of element attributes are for, used to check
     *        attribute minimization.
     * @return Generate HTML fragment for insertion.
     */
    public function generateAttributes($assoc_array_of_attributes, $element = false) {
        $html = '';
        if ($this->_sortAttr) ksort($assoc_array_of_attributes);
        foreach ($assoc_array_of_attributes as $key => $value) {
            if (!$this->_xhtml) {
                // Remove namespaced attributes
                if (strpos($key, ':') !== false) continue;
                // Check if we should minimize the attribute: val="val" -> val
                if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
                    $html .= $key . ' ';
                    continue;
                }
            }
            // Workaround for Internet Explorer innerHTML bug.
            // Essentially, Internet Explorer, when calculating
            // innerHTML, omits quotes if there are no instances of
            // angled brackets, quotes or spaces.  However, when parsing
            // HTML (for example, when you assign to innerHTML), it
            // treats backticks as quotes.  Thus,
            //      <img alt="``" />
            // becomes
            //      <img alt=`` />
            // becomes
            //      <img alt='' />
            // Fortunately, all we need to do is trigger an appropriate
            // quoting style, which we do by adding an extra space.
            // This also is consistent with the W3C spec, which states
            // that user agents may ignore leading or trailing
            // whitespace (in fact, most don't, at least for attributes
            // like alt, but an extra space at the end is barely
            // noticeable).  Still, we have a configuration knob for
            // this, since this transformation is not necessary if you
            // don't process user input with innerHTML or you don't plan
            // on supporting Internet Explorer.
            if ($this->_innerHTMLFix) {
                if (strpos($value, '`') !== false) {
                    // check if correct quoting style would not already be
                    // triggered
                    if (strcspn($value, '"\' <>') === strlen($value)) {
                        // protect!
                        $value .= ' ';
                    }
                }
            }
            $html .= $key.'="'.$this->escape($value).'" ';
        }
        return rtrim($html);
    }

    /**
     * Escapes raw text data.
     * @todo This really ought to be protected, but until we have a facility
     *       for properly generating HTML here w/o using tokens, it stays
     *       public.
     * @param $string String data to escape for HTML.
     * @param $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
     *               permissible for non-attribute output.
     * @return String escaped data.
     */
    public function escape($string, $quote = null) {
        // Workaround for APC bug on Mac Leopard reported by sidepodcast
        // http://htmlpurifier.org/phorum/read.php?3,4823,4846
        if ($quote === null) $quote = ENT_COMPAT;
        return htmlspecialchars($string, $quote, 'UTF-8');
    }

}





/**
 * Definition of the purified HTML that describes allowed children,
 * attributes, and many other things.
 *
 * Conventions:
 *
 * All member variables that are prefixed with info
 * (including the main $info array) are used by HTML Purifier internals
 * and should not be directly edited when customizing the HTMLDefinition.
 * They can usually be set via configuration directives or custom
 * modules.
 *
 * On the other hand, member variables without the info prefix are used
 * internally by the HTMLDefinition and MUST NOT be used by other HTML
 * Purifier internals. Many of them, however, are public, and may be
 * edited by userspace code to tweak the behavior of HTMLDefinition.
 *
 * @note This class is inspected by Printer_HTMLDefinition; please
 *       update that class if things here change.
 *
 * @warning Directives that change this object's structure must be in
 *          the HTML or Attr namespace!
*/
class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
{

    // FULLY-PUBLIC VARIABLES ---------------------------------------------

    /**
     * Associative array of element names to HTMLPurifier_ElementDef
     */
    public $info = array();

    /**
     * Associative array of global attribute name to attribute definition.
     */
    public $info_global_attr = array();

    /**
     * String name of parent element HTML will be going into.
     */
    public $info_parent = 'div';

    /**
     * Definition for parent element, allows parent element to be a
     * tag that's not allowed inside the HTML fragment.
     */
    public $info_parent_def;

    /**
     * String name of element used to wrap inline elements in block context
     * @note This is rarely used except for BLOCKQUOTEs in strict mode
     */
    public $info_block_wrapper = 'p';

    /**
     * Associative array of deprecated tag name to HTMLPurifier_TagTransform
     */
    public $info_tag_transform = array();

    /**
     * Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
     */
    public $info_attr_transform_pre = array();

    /**
     * Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
     */
    public $info_attr_transform_post = array();

    /**
     * Nested lookup array of content set name (Block, Inline) to
     * element name to whether or not it belongs in that content set.
     */
    public $info_content_sets = array();

    /**
     * Indexed list of HTMLPurifier_Injector to be used.
     */
    public $info_injector = array();

    /**
     * Doctype object
     */
    public $doctype;



    // RAW CUSTOMIZATION STUFF --------------------------------------------

    /**
     * Adds a custom attribute to a pre-existing element
     * @note This is strictly convenience, and does not have a corresponding
     *       method in HTMLPurifier_HTMLModule
     * @param $element_name String element name to add attribute to
     * @param $attr_name String name of attribute
     * @param $def Attribute definition, can be string or object, see
     *             HTMLPurifier_AttrTypes for details
     */
    public function addAttribute($element_name, $attr_name, $def) {
        $module = $this->getAnonymousModule();
        if (!isset($module->info[$element_name])) {
            $element = $module->addBlankElement($element_name);
        } else {
            $element = $module->info[$element_name];
        }
        $element->attr[$attr_name] = $def;
    }

    /**
     * Adds a custom element to your HTML definition
     * @note See HTMLPurifier_HTMLModule::addElement for detailed
     *       parameter and return value descriptions.
     */
    public function addElement($element_name, $type, $contents, $attr_collections, $attributes = array()) {
        $module = $this->getAnonymousModule();
        // assume that if the user is calling this, the element
        // is safe. This may not be a good idea
        $element = $module->addElement($element_name, $type, $contents, $attr_collections, $attributes);
        return $element;
    }

    /**
     * Adds a blank element to your HTML definition, for overriding
     * existing behavior
     * @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
     *       parameter and return value descriptions.
     */
    public function addBlankElement($element_name) {
        $module  = $this->getAnonymousModule();
        $element = $module->addBlankElement($element_name);
        return $element;
    }

    /**
     * Retrieves a reference to the anonymous module, so you can
     * bust out advanced features without having to make your own
     * module. The module is created on first use.
     */
    public function getAnonymousModule() {
        // empty() rather than a bare read: processModules() unset()s the
        // property, and reading an unset property raises a diagnostic.
        if (empty($this->_anonModule)) {
            $this->_anonModule = new HTMLPurifier_HTMLModule();
            $this->_anonModule->name = 'Anonymous';
        }
        return $this->_anonModule;
    }

    private $_anonModule;


    // PUBLIC BUT INTERNAL VARIABLES --------------------------------------

    public $type = 'HTML';
    public $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */

    /**
     * Performs low-cost, preliminary initialization.
     */
    public function __construct() {
        $this->manager = new HTMLPurifier_HTMLModuleManager();
    }

    /**
     * Builds the definition from the modules and the configuration, then
     * drops the manager and the per-element content model fields that are
     * not needed afterwards.
     * @param $config Configuration object handed on to the setup steps
     */
    protected function doSetup($config) {
        $this->processModules($config);
        $this->setupConfigStuff($config);
        unset($this->manager);

        // cleanup some of the element definitions
        foreach ($this->info as $k => $v) {
            unset($this->info[$k]->content_model);
            unset($this->info[$k]->content_model_type);
        }
    }

    /**
     * Extract out the information from the manager
     */
    protected function processModules($config) {

        if (!empty($this->_anonModule)) {
            // for user specific changes
            // this is late-loaded so we don't have to deal with PHP4
            // reference wonky-ness
            $this->manager->addModule($this->_anonModule);
            unset($this->_anonModule);
        }

        $this->manager->setup($config);
        $this->doctype = $this->manager->doctype;

        // In every list below a value of false removes the entry
        // registered under that key; anything else overrides it.
        foreach ($this->manager->modules as $module) {
            foreach($module->info_tag_transform as $k => $v) {
                if ($v === false) unset($this->info_tag_transform[$k]);
                else $this->info_tag_transform[$k] = $v;
            }
            foreach($module->info_attr_transform_pre as $k => $v) {
                if ($v === false) unset($this->info_attr_transform_pre[$k]);
                else $this->info_attr_transform_pre[$k] = $v;
            }
            foreach($module->info_attr_transform_post as $k => $v) {
                if ($v === false) unset($this->info_attr_transform_post[$k]);
                else $this->info_attr_transform_post[$k] = $v;
            }
            foreach ($module->info_injector as $k => $v) {
                if ($v === false) unset($this->info_injector[$k]);
                else $this->info_injector[$k] = $v;
            }
        }

        $this->info = $this->manager->getElements();
        $this->info_content_sets = $this->manager->contentSets->lookup;

    }

    /**
     * Sets up stuff based on config. We need a better way of doing this.
     */
    protected function setupConfigStuff($config) {

        $block_wrapper = $config->get('HTML.BlockWrapper');
        if (isset($this->info_content_sets['Block'][$block_wrapper])) {
            $this->info_block_wrapper = $block_wrapper;
        } else {
            trigger_error('Cannot use non-block element as block wrapper',
                E_USER_ERROR);
        }

        $parent = $config->get('HTML.Parent');
        $def = $this->manager->getElement($parent, true);
        if ($def) {
            $this->info_parent = $parent;
            $this->info_parent_def = $def;
        } else {
            trigger_error('Cannot use unrecognized element as parent',
                E_USER_ERROR);
            $this->info_parent_def = $this->manager->getElement($this->info_parent, true);
        }

        // support template text
        $support = "(for information on implementing this, see the ".
                   "support forums) ";

        // setup allowed elements -----------------------------------------

        $allowed_elements = $config->get('HTML.AllowedElements');
        $allowed_attributes = $config->get('HTML.AllowedAttributes'); // retrieve early

        if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
            $allowed = $config->get('HTML.Allowed');
            if (is_string($allowed)) {
                list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
            }
        }

        if (is_array($allowed_elements)) {
            foreach ($this->info as $name => $d) {
                if(!isset($allowed_elements[$name])) unset($this->info[$name]);
                unset($allowed_elements[$name]);
            }
            // emit errors
            foreach ($allowed_elements as $element => $d) {
                $element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
                trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
            }
        }

        // setup allowed attributes ---------------------------------------

        $allowed_attributes_mutable = $allowed_attributes; // by copy!
        if (is_array($allowed_attributes)) {

            // This actually doesn't do anything, since we went away from
            // global attributes. It's possible that userland code uses
            // it, but HTMLModuleManager doesn't!
            foreach ($this->info_global_attr as $attr => $x) {
                $keys = array($attr, "*@$attr", "*.$attr");
                $delete = true;
                foreach ($keys as $key) {
                    if ($delete && isset($allowed_attributes[$key])) {
                        $delete = false;
                    }
                    if (isset($allowed_attributes_mutable[$key])) {
                        unset($allowed_attributes_mutable[$key]);
                    }
                }
                if ($delete) unset($this->info_global_attr[$attr]);
            }

            foreach ($this->info as $tag => $info) {
                foreach ($info->attr as $attr => $x) {
                    $keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
                    $delete = true;
                    foreach ($keys as $key) {
                        if ($delete && isset($allowed_attributes[$key])) {
                            $delete = false;
                        }
                        if (isset($allowed_attributes_mutable[$key])) {
                            unset($allowed_attributes_mutable[$key]);
                        }
                    }
                    if ($delete) {
                        if ($this->info[$tag]->attr[$attr]->required) {
                            trigger_error("Required attribute '$attr' in element '$tag' was not allowed, which means '$tag' will not be allowed either", E_USER_WARNING);
                        }
                        unset($this->info[$tag]->attr[$attr]);
                    }
                }
            }
            // emit errors
            foreach ($allowed_attributes_mutable as $elattr => $d) {
                $bits = preg_split('/[.@]/', $elattr, 2);
                $c = count($bits);
                switch ($c) {
                    case 2:
                        if ($bits[0] !== '*') {
                            $element = htmlspecialchars($bits[0]);
                            $attribute = htmlspecialchars($bits[1]);
                            if (!isset($this->info[$element])) {
                                // severity made explicit so this message is
                                // reported at the same level as its siblings
                                trigger_error("Cannot allow attribute '$attribute' if element '$element' is not allowed/supported $support",
                                    E_USER_WARNING);
                            } else {
                                trigger_error("Attribute '$attribute' in element '$element' not supported $support",
                                    E_USER_WARNING);
                            }
                            break;
                        }
                        // otherwise fall through
                    case 1:
                        $attribute = htmlspecialchars($bits[0]);
                        trigger_error("Global attribute '$attribute' is not ".
                            "supported in any elements $support",
                            E_USER_WARNING);
                        break;
                }
            }

        }

        // setup forbidden elements ---------------------------------------

        $forbidden_elements   = $config->get('HTML.ForbiddenElements');
        $forbidden_attributes = $config->get('HTML.ForbiddenAttributes');

        foreach ($this->info as $tag => $info) {
            if (isset($forbidden_elements[$tag])) {
                unset($this->info[$tag]);
                continue;
            }
            foreach ($info->attr as $attr => $x) {
                if (
                    isset($forbidden_attributes["$tag@$attr"]) ||
                    isset($forbidden_attributes["*@$attr"]) ||
                    isset($forbidden_attributes[$attr])
                ) {
                    unset($this->info[$tag]->attr[$attr]);
                    continue;
                } // this segment might get removed eventually
                elseif (isset($forbidden_attributes["$tag.$attr"])) {
                    // $tag.$attr are not user supplied, so no worries!
                    trigger_error("Error with $tag.$attr: tag.attr syntax not supported for HTML.ForbiddenAttributes; use tag@attr instead", E_USER_WARNING);
                }
            }
        }
        foreach ($forbidden_attributes as $key => $v) {
            if (strlen($key) < 2) continue;
            if ($key[0] != '*') continue;
            if ($key[1] == '.') {
                trigger_error("Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead", E_USER_WARNING);
            }
        }

        // setup injectors -----------------------------------------------------
        foreach ($this->info_injector as $i => $injector) {
            // anything other than false from checkNeeded() drops the injector
            if ($injector->checkNeeded($config) !== false) {
                // remove injector that does not have its required
                // elements/attributes present, and is thus not needed.
                unset($this->info_injector[$i]);
            }
        }
    }

    /**
     * Parses a TinyMCE-flavored Allowed Elements and Attributes list into
     * separate lists for processing. Format is element[attr1|attr2],element2...
     * @warning Although it's largely drawn from TinyMCE's implementation,
     *      it is different, and you'll probably have to modify your lists
     * @param $list String list to parse
     * @return array($allowed_elements, $allowed_attributes); both are lookup
     *         arrays (name => true), attributes are keyed "element.attr"
     * @todo Give this its own class, probably static interface
     */
    public function parseTinyMCEAllowedList($list) {

        $list = str_replace(array(' ', "\t"), '', $list);

        $elements = array();
        $attributes = array();

        $chunks = preg_split('/(,|[\n\r]+)/', $list);
        foreach ($chunks as $chunk) {
            if (empty($chunk)) continue;
            // remove TinyMCE element control characters
            if (!strpos($chunk, '[')) {
                $element = $chunk;
                $attr = false;
            } else {
                list($element, $attr) = explode('[', $chunk);
            }
            if ($element !== '*') $elements[$element] = true;
            if (!$attr) continue;
            // remove trailing ] -- but only if it is really there, so a
            // missing bracket does not eat the last letter of the last
            // attribute name
            if (substr($attr, -1) === ']') {
                $attr = substr($attr, 0, -1);
            }
            $attr = explode('|', $attr);
            foreach ($attr as $key) {
                $attributes["$element.$key"] = true;
            }
        }

        return array($elements, $attributes);

    }


}





/**
 * Represents an XHTML 1.1 module, with information on elements, tags
 * and attributes.
 * @note Even though this is technically XHTML 1.1, it is also used for
 *       regular HTML parsing. We are using modularization as a convenient
 *       way to represent the internals of HTMLDefinition, and our
 *       implementation is by no means conforming and does not directly
 *       use the normative DTDs or XML schemas.
 * @note The public variables in a module should almost directly
 *       correspond to the variables in HTMLPurifier_HTMLDefinition.
 *       However, the prefix info carries no special meaning in these
 *       objects (include it anyway if that's the correspondence though).
4646 * @todo Consider making some member functions protected 4647 */ 4648 4649class HTMLPurifier_HTMLModule 4650{ 4651 4652 // -- Overloadable ---------------------------------------------------- 4653 4654 /** 4655 * Short unique string identifier of the module 4656 */ 4657 public $name; 4658 4659 /** 4660 * Informally, a list of elements this module changes. Not used in 4661 * any significant way. 4662 */ 4663 public $elements = array(); 4664 4665 /** 4666 * Associative array of element names to element definitions. 4667 * Some definitions may be incomplete, to be merged in later 4668 * with the full definition. 4669 */ 4670 public $info = array(); 4671 4672 /** 4673 * Associative array of content set names to content set additions. 4674 * This is commonly used to, say, add an A element to the Inline 4675 * content set. This corresponds to an internal variable $content_sets 4676 * and NOT info_content_sets member variable of HTMLDefinition. 4677 */ 4678 public $content_sets = array(); 4679 4680 /** 4681 * Associative array of attribute collection names to attribute 4682 * collection additions. More rarely used for adding attributes to 4683 * the global collections. Example is the StyleAttribute module adding 4684 * the style attribute to the Core. Corresponds to HTMLDefinition's 4685 * attr_collections->info, since the object's data is only info, 4686 * with extra behavior associated with it. 4687 */ 4688 public $attr_collections = array(); 4689 4690 /** 4691 * Associative array of deprecated tag name to HTMLPurifier_TagTransform 4692 */ 4693 public $info_tag_transform = array(); 4694 4695 /** 4696 * List of HTMLPurifier_AttrTransform to be performed before validation. 4697 */ 4698 public $info_attr_transform_pre = array(); 4699 4700 /** 4701 * List of HTMLPurifier_AttrTransform to be performed after validation. 4702 */ 4703 public $info_attr_transform_post = array(); 4704 4705 /** 4706 * List of HTMLPurifier_Injector to be performed during well-formedness fixing. 
4707 * An injector will only be invoked if all of it's pre-requisites are met; 4708 * if an injector fails setup, there will be no error; it will simply be 4709 * silently disabled. 4710 */ 4711 public $info_injector = array(); 4712 4713 /** 4714 * Boolean flag that indicates whether or not getChildDef is implemented. 4715 * For optimization reasons: may save a call to a function. Be sure 4716 * to set it if you do implement getChildDef(), otherwise it will have 4717 * no effect! 4718 */ 4719 public $defines_child_def = false; 4720