1<?php
2
3/**
4 * @file
5 * This file was auto-generated by generate-includes.php and includes all of
6 * the core files required by HTML Purifier. Use this if performance is a
7 * primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
8 * FILE, changes will be overwritten the next time the script is run.
9 *
10 * @version 4.3.0
11 *
12 * @warning
13 *      You must *not* include any other HTML Purifier files before this file,
14 *      because 'require' not 'require_once' is used.
15 *
16 * @warning
17 *      This file requires that the include path contains the HTML Purifier
18 *      library directory; this is not auto-set.
19 */
20
21
22
23/*! @mainpage
24 *
25 * HTML Purifier is an HTML filter that will take an arbitrary snippet of
26 * HTML and rigorously test, validate and filter it into a version that
27 * is safe for output onto webpages. It achieves this by:
28 *
29 *  -# Lexing (parsing into tokens) the document,
30 *  -# Executing various strategies on the tokens:
31 *      -# Removing all elements not in the whitelist,
32 *      -# Making the tokens well-formed,
33 *      -# Fixing the nesting of the nodes, and
34 *      -# Validating attributes of the nodes; and
35 *  -# Generating HTML from the purified tokens.
36 *
37 * However, most users will only need to interface with the HTMLPurifier
38 * and HTMLPurifier_Config.
39 */
40
41/*
42    HTML Purifier 4.3.0 - Standards Compliant HTML Filtering
43    Copyright (C) 2006-2008 Edward Z. Yang
44
45    This library is free software; you can redistribute it and/or
46    modify it under the terms of the GNU Lesser General Public
47    License as published by the Free Software Foundation; either
48    version 2.1 of the License, or (at your option) any later version.
49
50    This library is distributed in the hope that it will be useful,
51    but WITHOUT ANY WARRANTY; without even the implied warranty of
52    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
53    Lesser General Public License for more details.
54
55    You should have received a copy of the GNU Lesser General Public
56    License along with this library; if not, write to the Free Software
57    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
58 */
59
60/**
61 * Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
62 *
63 * @note There are several points in which configuration can be specified
64 *       for HTML Purifier.  The precedence of these (from lowest to
65 *       highest) is as follows:
66 *          -# Instance: new HTMLPurifier($config)
67 *          -# Invocation: purify($html, $config)
68 *       These configurations are entirely independent of each other and
69 *       are *not* merged (this behavior may change in the future).
70 *
71 * @todo We need an easier way to inject strategies using the configuration
72 *       object.
73 */
class HTMLPurifier
{

    /** Release number of HTML Purifier, available on every instance */
    public $version = '4.3.0';

    /** Release number of HTML Purifier, available without an instance */
    const VERSION = '4.3.0';

    /** Configuration used by purify() whenever the caller does not pass one */
    public $config;

    /** Legacy HTMLPurifier_Filter objects added through the deprecated addFilter() */
    private $filters = array();

    /** Shared instance handed out by instance() */
    private static $instance;

    protected $strategy, $generator;

    /**
     * HTMLPurifier_Context produced by the most recent purify() call. After
     * purifyArray() this is an array of contexts, keyed like the input array.
     */
    public $context;

    /**
     * Builds the purifier.
     * @param $config Optional default configuration. Anything accepted by
     *                HTMLPurifier_Config::create() may be passed; the result
     *                is used by purify() unless a per-call config is given.
     */
    public function __construct($config = null) {
        $this->config   = HTMLPurifier_Config::create($config);
        $this->strategy = new HTMLPurifier_Strategy_Core();
    }

    /**
     * Appends a filter to the legacy filter list. Filters run in the order
     * they were added.
     * @param $filter HTMLPurifier_Filter object
     */
    public function addFilter($filter) {
        trigger_error('HTMLPurifier->addFilter() is deprecated, use configuration directives in the Filter namespace or Filter.Custom', E_USER_WARNING);
        $this->filters[] = $filter;
    }

    /**
     * Purifies a snippet or document of HTML so that it is XSS-free and
     * standards-compliant.
     *
     * @param $html String of HTML to purify
     * @param $config Optional configuration for this call only. When it is
     *                omitted (or falsy) the configuration given to the
     *                constructor is used. Anything accepted by
     *                HTMLPurifier_Config::create() may be passed.
     * @return Purified HTML
     */
    public function purify($html, $config = null) {

        // :TODO: merge the per-call config into the instance config
        // instead of replacing it outright
        if ($config) {
            $config = HTMLPurifier_Config::create($config);
        } else {
            $config = $this->config;
        }

        // which lexer we get depends partly on the environment and
        // partly on the configuration
        $lexer   = HTMLPurifier_Lexer::create($config);
        $context = new HTMLPurifier_Context();

        // the generator is shared with other components via the context
        $this->generator = new HTMLPurifier_Generator($config, $context);
        $context->register('Generator', $this->generator);

        // error collection is opt-in; it needs a locale and a collector
        // (may get moved out if other facilities start using it)
        if ($config->get('Core.CollectErrors')) {
            $language = HTMLPurifier_LanguageFactory::instance()->create($config, $context);
            $context->register('Locale', $language);

            $error_collector = new HTMLPurifier_ErrorCollector($context);
            $context->register('ErrorCollector', $error_collector);
        }

        // AttrValidator can be reached from many places, so the ID
        // accumulator is published in the context up front
        $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
        $context->register('IDAccumulator', $id_accumulator);

        $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);

        // assemble the filter chain: built-in filters switched on by
        // configuration, then Filter.Custom objects, then legacy filters
        $directives     = $config->getBatch('Filter');
        $custom_filters = $directives['Custom'];
        unset($directives['Custom']);

        $chain = array();
        foreach ($directives as $filter => $enabled) {
            // dotted names are sub-directives, not filter switches
            if ($enabled && strpos($filter, '.') === false) {
                $class = "HTMLPurifier_Filter_$filter";
                $chain[] = new $class;
            }
        }
        foreach ($custom_filters as $custom) {
            // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
            $chain[] = $custom;
        }
        $chain = array_merge($chain, $this->filters);
        // maybe prepare(), but later

        foreach ($chain as $active) {
            $html = $active->preFilter($html, $config, $context);
        }

        // un-purified HTML -> un-purified tokens -> purified tokens -> HTML
        $tokens = $lexer->tokenizeHTML($html, $config, $context);
        $tokens = $this->strategy->execute($tokens, $config, $context);
        $html   = $this->generator->generateFromTokens($tokens);

        // post-filters unwind in the opposite order of the pre-filters
        foreach (array_reverse($chain) as $active) {
            $html = $active->postFilter($html, $config, $context);
        }

        $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
        $this->context =& $context;
        return $html;
    }

    /**
     * Purifies every entry of an array of HTML snippets.
     * @param $array_of_html Array of HTML strings; keys are preserved
     * @param $config Optional HTMLPurifier_Config object for this operation.
     *                See HTMLPurifier::purify() for more details.
     * @return Array of purified HTML
     */
    public function purifyArray($array_of_html, $config = null) {
        $contexts = array();
        foreach ($array_of_html as $index => $snippet) {
            $array_of_html[$index] = $this->purify($snippet, $config);
            // purify() leaves its context in $this->context; keep one per entry
            $contexts[$index] = $this->context;
        }
        $this->context = $contexts;
        return $array_of_html;
    }

    /**
     * Returns the shared HTML Purifier instance, creating or replacing it
     * as needed.
     * @param $prototype Optional HTMLPurifier instance that becomes the
     *                   shared instance, or a configuration value used to
     *                   construct a new shared instance.
     */
    public static function instance($prototype = null) {
        if ($prototype instanceof HTMLPurifier) {
            // adopt the supplied purifier as-is
            self::$instance = $prototype;
        } elseif ($prototype) {
            // treat anything else truthy as configuration
            self::$instance = new HTMLPurifier($prototype);
        } elseif (!self::$instance) {
            // first use without arguments: default purifier
            self::$instance = new HTMLPurifier();
        }
        return self::$instance;
    }

    /**
     * @note Backwards compatibility, see instance()
     */
    public static function getInstance($prototype = null) {
        return self::instance($prototype);
    }

}
256
257
258
259
260
261/**
262 * Defines common attribute collections that modules reference
263 */
264
class HTMLPurifier_AttrCollections
{

    /**
     * Attribute collections keyed by collection name. Each collection maps
     * attribute names to their definitions.
     */
    public $info = array();

    /**
     * Gathers the attribute collections declared by all modules, then
     * resolves inclusions and string identifiers for every collection.
     * @param $attr_types HTMLPurifier_AttrTypes instance
     * @param $modules Hash array of HTMLPurifier_HTMLModule members
     */
    public function __construct($attr_types, $modules) {
        // pass 1: pile up what each module contributes
        foreach ($modules as $module) {
            foreach ($module->attr_collections as $collection_name => $collection) {
                if (!isset($this->info[$collection_name])) {
                    $this->info[$collection_name] = array();
                }
                foreach ($collection as $attr_name => $definition) {
                    // index 0 holds the list of included collections; lists
                    // from different modules accumulate instead of replacing
                    if ($attr_name === 0 && isset($this->info[$collection_name][0])) {
                        $definition = array_merge($this->info[$collection_name][0], $definition);
                    }
                    $this->info[$collection_name][$attr_name] = $definition;
                }
            }
        }
        // pass 2: resolve each collection in place
        foreach (array_keys($this->info) as $collection_name) {
            // pull in attributes of included collections
            $this->performInclusions($this->info[$collection_name]);
            // swap string identifiers for attribute definition objects
            $this->expandIdentifiers($this->info[$collection_name], $attr_types);
        }
    }

    /**
     * Copies into $attr the attributes of every collection named in its
     * zero index (following nested inclusions), then drops the zero index.
     * Attributes already present in $attr are never overwritten.
     * @param &$attr Reference to attribute array
     */
    public function performInclusions(&$attr) {
        if (!isset($attr[0])) {
            return;
        }
        $pending = $attr[0];
        $visited = array(); // guards against circular inclusions
        $cursor  = 0;
        while (isset($pending[$cursor])) {
            $included = $pending[$cursor];
            $cursor++;
            if (isset($visited[$included])) {
                continue;
            }
            $visited[$included] = true;
            if (!isset($this->info[$included])) {
                continue;
            }
            $source = $this->info[$included];
            foreach ($source as $key => $value) {
                // existing keys win; this also keeps the source's own
                // inclusion list (key 0) from overwriting ours
                if (!isset($attr[$key])) {
                    $attr[$key] = $value;
                }
            }
            if (isset($source[0])) {
                // queue up whatever the included collection includes
                $pending = array_merge($pending, $source[0]);
            }
        }
        unset($attr[0]);
    }

    /**
     * Replaces every string identifier in an attribute array with the
     * object returned by $attr_types->get(). A '*' in an attribute name
     * marks it as required and is stripped from the name.
     * @param &$attr Reference to attribute array
     * @param $attr_types HTMLPurifier_AttrTypes instance
     */
    public function expandIdentifiers(&$attr, $attr_types) {

        // names already handled; protects against visiting an entry again
        // under the name we just renamed it to
        $handled = array();

        foreach ($attr as $name => $definition) {
            // the inclusion list is not an attribute
            if ($name === 0) {
                continue;
            }
            if (isset($handled[$name])) {
                continue;
            }

            $required = (strpos($name, '*') !== false);
            if ($required) {
                // re-key the entry without the marker
                unset($attr[$name]);
                $name = trim($name, '*');
                $attr[$name] = $definition;
            }

            $handled[$name] = true;

            if (is_object($definition)) {
                // already a definition object: only widen the required flag
                $attr[$name]->required = ($required || $attr[$name]->required);
            } elseif ($definition === false) {
                // explicitly disabled attribute
                unset($attr[$name]);
            } elseif ($resolved = $attr_types->get($definition)) {
                $attr[$name] = $resolved;
                $attr[$name]->required = $required;
            } else {
                // unknown type: drop the attribute
                unset($attr[$name]);
            }
        }

    }

}
385
386
387
388
389
390/**
391 * Base class for all validating attribute definitions.
392 *
393 * This family of classes forms the core for not only HTML attribute validation,
394 * but also any sort of string that needs to be validated or cleaned (which
395 * means CSS properties and composite definitions are defined here too).
396 * Besides defining (through code) what precisely makes the string valid,
397 * subclasses are also responsible for cleaning the code if possible.
398 */
399
abstract class HTMLPurifier_AttrDef
{

    /**
     * Whether the HTML attribute is minimized. Meaningless outside of
     * HTML attributes.
     */
    public $minimized = false;

    /**
     * Whether the HTML attribute is required. Meaningless outside of
     * HTML attributes.
     */
    public $required = false;

    /**
     * Validates and cleans the given string according to this definition.
     *
     * @param $string String to be validated and cleaned.
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_AttrContext object.
     */
    abstract public function validate($string, $config, $context);

    /**
     * Convenience method that normalizes a string as if it were CDATA.
     *
     * Leading and trailing whitespace is trimmed, then every remaining
     * line feed, tab and carriage return is replaced by a single space.
     * The reference for this processing is
     * <http://www.w3.org/TR/html4/types.html#h-6.2>. It is most useful for
     * HTML attributes declared as CDATA but also suits most CSS values.
     *
     * @note Not entirely standards compliant: trim() strips more kinds of
     *       whitespace than the spec lists. This rarely matters because
     *       HTMLPurifier_Encoder has usually removed those characters
     *       already.
     *
     * @warning This differs from XML's whitespace handling (section 3.3.3,
     *          referenced by XHTML 1.0 section 4.7). We are NOT necessarily
     *          parsing XML, so the behavior may still be correct. Newlines
     *          are assumed to have been normalized beforehand.
     */
    public function parseCDATA($string) {
        return str_replace(array("\n", "\t", "\r"), ' ', trim($string));
    }

    /**
     * Factory method for creating this class from a string.
     * @param $string String construction info
     * @return Created AttrDef object corresponding to $string
     */
    public function make($string) {
        // The default ignores $string and hands back this very object as a
        // flyweight. Subclasses whose result depends on $string should
        // override this and return a clone or, more safely, a new instance.
        return $this;
    }

    /**
     * Strips the spaces around the commas of rgb(0, 0, 0) so that
     * shorthand CSS properties work properly. THIS IS A HACK!
     */
    protected function mungeRgb($string) {
        $pattern     = '/rgb\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)/';
        $replacement = 'rgb(\1,\2,\3)';
        return preg_replace($pattern, $replacement, $string);
    }

    /**
     * Resolves the backslash escapes of a possibly escaped CSS string and
     * returns the "pure" version of it.
     */
    protected function expandCSSEscape($string) {
        $length = strlen($string);
        $output = '';
        $pos    = 0;
        while ($pos < $length) {
            $current = $string[$pos];
            if ($current !== '\\') {
                // ordinary character
                $output .= $current;
                $pos++;
                continue;
            }

            // step onto the character following the backslash
            $pos++;
            if ($pos >= $length) {
                // a lone trailing backslash is kept literally
                $output .= '\\';
                break;
            }

            $escaped = $string[$pos];
            if (ctype_xdigit($escaped)) {
                // hexadecimal escape: collect up to six hex digits
                $hex = $escaped;
                $pos++;
                while ($pos < $length && strlen($hex) < 6 && ctype_xdigit($string[$pos])) {
                    $hex .= $string[$pos];
                    $pos++;
                }
                // We have to be extremely careful when adding new
                // characters, to make sure we're not breaking the encoding.
                $char = HTMLPurifier_Encoder::unichr(hexdec($hex));
                if (HTMLPurifier_Encoder::cleanUTF8($char) === '') {
                    // unusable code point: drop it together with the
                    // character right after the escape
                    $pos++;
                    continue;
                }
                $output .= $char;
                // a single whitespace character terminating the escape is
                // swallowed; anything else is processed normally
                if (!($pos < $length && trim($string[$pos]) !== '')) {
                    $pos++;
                }
                continue;
            }

            if ($escaped === "\n") {
                // escaped newline: contributes nothing
                $pos++;
                continue;
            }

            // any other escaped character stands for itself
            $output .= $escaped;
            $pos++;
        }
        return $output;
    }

}
509
510
511
512
513
514/**
515 * Processes an entire attribute array for corrections needing multiple values.
516 *
517 * Occasionally, a certain attribute will need to be removed and popped onto
518 * another value.  Instead of creating a complex return syntax for
519 * HTMLPurifier_AttrDef, we just pass the whole attribute array to a
520 * specialized object and have that do the special work.  That is the
521 * family of HTMLPurifier_AttrTransform.
522 *
523 * An attribute transformation can be assigned to run before or after
524 * HTMLPurifier_AttrDef validation.  See HTMLPurifier_HTMLDefinition for
525 * more details.
526 */
527
abstract class HTMLPurifier_AttrTransform
{

    /**
     * Abstract: rewrites an attribute array where the change depends on
     * more than one attribute value.
     *
     * @param $attr Assoc array of attributes, usually from
     *              HTMLPurifier_Token_Tag::$attr
     * @param $config Mandatory HTMLPurifier_Config object.
     * @param $context Mandatory HTMLPurifier_Context object
     * @returns Processed attribute array.
     */
    abstract public function transform($attr, $config, $context);

    /**
     * Puts CSS declarations in front of the current style attribute. The
     * attribute is created when it is not set.
     * @param $attr Attribute array to process (passed by reference)
     * @param $css CSS to prepend
     */
    public function prependCSS(&$attr, $css) {
        $existing = '';
        if (isset($attr['style'])) {
            $existing = $attr['style'];
        }
        $attr['style'] = $css . $existing;
    }

    /**
     * Removes an attribute from the array and hands back its value.
     * Returns null, leaving the array untouched, when the attribute is
     * not set.
     * @param $attr Attribute array to process (passed by reference)
     * @param $key Key of attribute to confiscate
     */
    public function confiscateAttr(&$attr, $key) {
        if (isset($attr[$key])) {
            $taken = $attr[$key];
            unset($attr[$key]);
            return $taken;
        }
        return null;
    }

}
566
567
568
569
570
571/**
572 * Provides lookup array of attribute types to HTMLPurifier_AttrDef objects
573 */
class HTMLPurifier_AttrTypes
{
    /**
     * Maps attribute type identifiers to the objects implementing them
     */
    protected $info = array();

    /**
     * Fills the lookup with the default implementation of every attribute
     * type.
     */
    public function __construct() {
        // type identifier => implementing class, for every type whose
        // implementation is constructed without arguments
        $defaults = array(
            // pseudo-types, must be instantiated via shorthand
            'Enum'         => 'HTMLPurifier_AttrDef_Enum',
            'Bool'         => 'HTMLPurifier_AttrDef_HTML_Bool',

            'CDATA'        => 'HTMLPurifier_AttrDef_Text',
            'ID'           => 'HTMLPurifier_AttrDef_HTML_ID',
            'Length'       => 'HTMLPurifier_AttrDef_HTML_Length',
            'MultiLength'  => 'HTMLPurifier_AttrDef_HTML_MultiLength',
            'NMTOKENS'     => 'HTMLPurifier_AttrDef_HTML_Nmtokens',
            'Pixels'       => 'HTMLPurifier_AttrDef_HTML_Pixels',
            'Text'         => 'HTMLPurifier_AttrDef_Text',
            'URI'          => 'HTMLPurifier_AttrDef_URI',
            'LanguageCode' => 'HTMLPurifier_AttrDef_Lang',
            'Color'        => 'HTMLPurifier_AttrDef_HTML_Color',

            // unimplemented aliases
            'ContentType'  => 'HTMLPurifier_AttrDef_Text',
            'ContentTypes' => 'HTMLPurifier_AttrDef_Text',
            'Charsets'     => 'HTMLPurifier_AttrDef_Text',
            'Character'    => 'HTMLPurifier_AttrDef_Text',

            // "proprietary" types
            'Class'        => 'HTMLPurifier_AttrDef_HTML_Class',
        );
        foreach ($defaults as $identifier => $class) {
            $this->info[$identifier] = new $class();
        }

        // number is really a positive integer (one or more digits)
        // FIXME: ^^ not always, see start and value of list items
        $this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true);
    }

    /**
     * Retrieves a type. Anything after the first '#' in the identifier is
     * passed to the implementation's make() method as construction info.
     * @param $type String type name
     * @return Object AttrDef for type
     */
    public function get($type) {

        // split off any extra info tacked on after the first '#'
        $pieces = explode('#', $type, 2);
        $type   = $pieces[0];
        $string = isset($pieces[1]) ? $pieces[1] : '';

        if (isset($this->info[$type])) {
            return $this->info[$type]->make($string);
        }

        trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR);
        return;

    }

    /**
     * Registers (or replaces) the implementation of a type
     * @param $type String type name
     * @param $impl Object AttrDef for type
     */
    public function set($type, $impl) {
        $this->info[$type] = $impl;
    }
}
644
645
646
647
648
649/**
650 * Validates the attributes of a token. Doesn't manage required attributes
651 * very well. The only reason we factored this out was because RemoveForeignElements
652 * also needed it besides ValidateAttributes.
653 */
class HTMLPurifier_AttrValidator
{

    /**
     * Validates the attributes of a token and writes the cleaned attribute
     * array back onto the token.
     *
     * The work happens in this order: global pre-transforms, element
     * pre-transforms, per-attribute validation (element definitions take
     * precedence over global ones; unknown attributes are removed), global
     * post-transforms, element post-transforms.
     *
     * While running, the method makes sure the context holds an
     * 'IDAccumulator' and a 'CurrentToken', and temporarily registers
     * 'CurrentAttr'. A 'CurrentToken' that this method had to register
     * itself is always destroyed again before returning.
     *
     * @param $token Reference to token to validate. We require a reference
     *     because the operations this class performs on the token are
     *     not atomic, so the context CurrentToken has to be updated
     *     throughout
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return The untouched $token when it is neither a start nor an empty
     *     tag; nothing otherwise (the token is modified in place)
     */
    public function validateToken(&$token, &$config, $context) {

        $definition = $config->getHTMLDefinition();
        $e =& $context->get('ErrorCollector', true);

        // initialize IDAccumulator if necessary
        $ok =& $context->get('IDAccumulator', true);
        if (!$ok) {
            $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
            $context->register('IDAccumulator', $id_accumulator);
        }

        // initialize CurrentToken if necessary; $current_token stays empty
        // when we were the ones registering it
        $current_token =& $context->get('CurrentToken', true);
        if (!$current_token) $context->register('CurrentToken', $token);

        if (
            !$token instanceof HTMLPurifier_Token_Start &&
            !$token instanceof HTMLPurifier_Token_Empty
        ) {
            // only tags carry attributes. Clean up the CurrentToken we may
            // have registered above, otherwise it would linger in the
            // context as a stale value for later calls.
            if (!$current_token) $context->destroy('CurrentToken');
            return $token;
        }

        // create alias to global definition array, see also $defs
        // DEFINITION CALL
        $d_defs = $definition->info_global_attr;

        // don't update token until the very end, to ensure an atomic update
        $attr = $token->attr;

        // do global transformations (pre)
        // nothing currently utilizes this
        $attr = $this->applyTransforms($definition->info_attr_transform_pre, $attr, $config, $context, $e);

        // do local transformations only applicable to this element (pre)
        // ex. <p align="right"> to <p style="text-align:right;">
        $element = $definition->info[$token->name];
        $attr = $this->applyTransforms($element->attr_transform_pre, $attr, $config, $context, $e);

        // create alias to this element's attribute definition array, see
        // also $d_defs (global attribute definition array)
        // DEFINITION CALL
        $defs = $element->attr;

        // register() is handed this variable and the loop below assigns
        // into it, so the context can report the attribute currently being
        // validated
        $attr_key = false;
        $context->register('CurrentAttr', $attr_key);

        // iterate through all the attribute keypairs
        // Watch out for name collisions: $key has previously been used
        foreach ($attr as $attr_key => $value) {

            // call the definition
            if ( isset($defs[$attr_key]) ) {
                // there is a local definition defined
                if ($defs[$attr_key] === false) {
                    // We've explicitly been told not to allow this element.
                    // This is usually when there's a global definition
                    // that must be overridden.
                    // Theoretically speaking, we could have a
                    // AttrDef_DenyAll, but this is faster!
                    $result = false;
                } else {
                    // validate according to the element's definition
                    $result = $defs[$attr_key]->validate(
                                    $value, $config, $context
                               );
                }
            } elseif ( isset($d_defs[$attr_key]) ) {
                // there is a global definition defined, validate according
                // to the global definition
                $result = $d_defs[$attr_key]->validate(
                                $value, $config, $context
                           );
            } else {
                // system never heard of the attribute? DELETE!
                $result = false;
            }

            // put the results into effect
            if ($result === false || $result === null) {
                // this is a generic error message that should be replaced
                // with more specific ones when possible
                if ($e) $e->send(E_ERROR, 'AttrValidator: Attribute removed');

                // remove the attribute
                unset($attr[$attr_key]);
            } elseif (is_string($result)) {
                // generally, if a substitution is happening, there
                // was some sort of implicit correction going on. We'll
                // delegate it to the attribute classes to say exactly what.

                // simple substitution
                $attr[$attr_key] = $result;
            } else {
                // any other result (e.g. true): keep the value unchanged
            }

            // we'd also want slightly more complicated substitution
            // involving an array as the return value,
            // although we're not sure how colliding attributes would
            // resolve (certain ones would be completely overridden,
            // others would prepend themselves).
        }

        $context->destroy('CurrentAttr');

        // post transforms

        // global (error reporting untested)
        $attr = $this->applyTransforms($definition->info_attr_transform_post, $attr, $config, $context, $e);

        // local (error reporting untested)
        $attr = $this->applyTransforms($definition->info[$token->name]->attr_transform_post, $attr, $config, $context, $e);

        $token->attr = $attr;

        // destroy CurrentToken if we made it ourselves
        if (!$current_token) $context->destroy('CurrentToken');

    }

    /**
     * Runs a list of attribute transforms in order, reporting each change
     * to the error collector when one is available.
     * @param $transforms List of objects offering transform($attr, $config, $context)
     * @param $attr Attribute array to start from
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @param $e Error collector, or a falsy value when errors are not collected
     * @return Attribute array after all transforms have been applied
     */
    private function applyTransforms($transforms, $attr, $config, $context, $e) {
        foreach ($transforms as $transform) {
            $before = $attr;
            $attr = $transform->transform($before, $config, $context);
            if ($e) {
                if ($attr != $before) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $before, $attr);
            }
        }
        return $attr;
    }

}
807
808
809
810
811
812// constants are slow, so we use as few as possible
if (defined('HTMLPURIFIER_PREFIX') === false) {
    // the library lives in the "standalone" directory next to this file;
    // put that directory at the front of the include path
    define('HTMLPURIFIER_PREFIX', dirname(__FILE__) . '/standalone');
    set_include_path(implode(PATH_SEPARATOR, array(HTMLPURIFIER_PREFIX, get_include_path())));
}

// accommodations for versions earlier than 5.0.2
// borrowed from PHP_Compat, LGPL licensed, by Aidan Lister <aidan@php.net>
if (defined('PHP_EOL') === false) {
    // choose the line ending from the first three letters of the OS name
    if (strncasecmp(PHP_OS, 'WIN', 3) === 0) {
        define('PHP_EOL', "\r\n");
    } elseif (strncasecmp(PHP_OS, 'DAR', 3) === 0) {
        define('PHP_EOL', "\r");
    } else {
        define('PHP_EOL', "\n");
    }
}
832
833/**
834 * Bootstrap class that contains meta-functionality for HTML Purifier such as
835 * the autoload function.
836 *
837 * @note
838 *      This class may be used without any other files from HTML Purifier.
839 */
class HTMLPurifier_Bootstrap
{

    /**
     * Autoload callback for HTML Purifier classes.
     * @param $class Name of the class to load
     * @return bool true if a file was included for the class, false if
     *         getPath() could not resolve it
     */
    public static function autoload($class) {
        $path = HTMLPurifier_Bootstrap::getPath($class);
        if ($path) {
            // A plain 'require' ought to be sufficient and cheaper, but
            // Antonio Parraga reports that Zend extensions such as Zend
            // debugger and APC can break that assumption, so the
            // *_once variant is used to stay on the safe side.
            require_once HTMLPURIFIER_PREFIX . '/' . $path;
            return true;
        }
        return false;
    }

    /**
     * Maps a class name to its file path relative to HTMLPURIFIER_PREFIX.
     * @param $class Name of the class to look up
     * @return Relative path string, or false if the class name does not
     *         start with 'HTMLPurifier' or the file does not exist
     */
    public static function getPath($class) {
        $is_ours = strncmp('HTMLPurifier', $class, 12) === 0;
        if (!$is_ours) {
            return false;
        }
        // Language classes live in their own directory and use dashes
        // instead of underscores in the language code
        $is_language = strncmp('HTMLPurifier_Language_', $class, 22) === 0;
        if ($is_language) {
            $relative = 'HTMLPurifier/Language/classes/'
                . str_replace('_', '-', substr($class, 22))
                . '.php';
        } else {
            $relative = str_replace('_', '/', $class) . '.php';
        }
        return file_exists(HTMLPURIFIER_PREFIX . '/' . $relative) ? $relative : false;
    }

    /**
     * Puts our autoloader in front of whatever is already on the SPL
     * autoload stack by unregistering the existing callbacks, registering
     * ours, and then re-registering the others in their original order.
     */
    public static function registerAutoload() {
        $ours     = array('HTMLPurifier_Bootstrap', 'autoload');
        $existing = spl_autoload_functions();
        if ($existing === false) {
            spl_autoload_register($ours);
        } elseif (function_exists('spl_autoload_unregister')) {
            $buggy  = version_compare(PHP_VERSION, '5.2.11', '<');
            $compat = version_compare(PHP_VERSION, '5.1.0', '>=') &&
                      version_compare(PHP_VERSION, '5.1.2', '<=');
            foreach ($existing as $callback) {
                if ($buggy && is_array($callback)) {
                    // :TRICKY: on these PHP versions array callbacks need
                    // special treatment, and some cannot be handled at all
                    $reflector = new ReflectionMethod($callback[0], $callback[1]);
                    if (!$reflector->isStatic()) {
                        throw new Exception('
                            HTML Purifier autoloader registrar is not compatible
                            with non-static object methods due to PHP Bug #44144;
                            Please do not use HTMLPurifier.autoload.php (or any
                            file that includes this file); instead, place the code:
                            spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\'))
                            after your own autoloaders.
                        ');
                    }
                    // Surprisingly, spl_autoload_register supports the
                    // Class::staticMethod callback format, although
                    // call_user_func doesn't
                    if ($compat) {
                        $callback = implode('::', $callback);
                    }
                }
                spl_autoload_unregister($callback);
            }
            spl_autoload_register($ours);
            foreach ($existing as $callback) {
                spl_autoload_register($callback);
            }
        }
    }

}
913
914
915
916
917
918/**
919 * Super-class for definition datatype objects, implements serialization
920 * functions for the class.
921 */
abstract class HTMLPurifier_Definition
{

    /**
     * Flag recording whether setup() has already run.
     */
    public $setup = false;

    /**
     * When true, the finished definition object is written to the cache
     * once setup is done.  This is only the case if every request for a
     * raw definition object is optimized as well.  File system thrashing
     * is avoided because later calls use the cached object and writes to
     * the raw definition object are short circuited.  The high-level
     * picture is described in enduser-customize.html.
     */
    public $optimized = null;

    /**
     * Kind of definition this object represents.
     */
    public $type;

    /**
     * Brings the definition object into its final form; this work is
     * deliberately left out of the constructor.
     * @param $config HTMLPurifier_Config instance
     */
    abstract protected function doSetup($config);

    /**
     * Runs doSetup() the first time it is called and does nothing on
     * any later call.
     * @param $config HTMLPurifier_Config instance
     */
    public function setup($config) {
        if (!$this->setup) {
            // the flag is raised before doSetup() runs
            $this->setup = true;
            $this->doSetup($config);
        }
    }

}
964
965
966
967
968
969/**
970 * Defines allowed CSS attributes and what their values are.
971 * @see HTMLPurifier_HTMLDefinition
972 */
class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
{

    public $type = 'CSS';

    /**
     * Map of CSS property name to the attribute definition object that
     * validates its value.
     */
    public $info = array();

    /**
     * Builds the $info map.  The statements are order-sensitive: the
     * shorthand definitions that receive $config are created after the
     * individual properties they relate to.
     * @param $config HTMLPurifier_Config instance
     */
    protected function doSetup($config) {

        $this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
            array('left', 'right', 'center', 'justify'),
            false
        );

        // a single Enum object serves all four sides and is also what
        // the 'border-style' shorthand wraps
        $side_style =
        $this->info['border-bottom-style'] =
        $this->info['border-right-style'] =
        $this->info['border-left-style'] =
        $this->info['border-top-style'] = new HTMLPurifier_AttrDef_Enum(
            array(
                'none', 'hidden', 'dotted', 'dashed', 'solid',
                'double', 'groove', 'ridge', 'inset', 'outset',
            ),
            false
        );

        $this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($side_style);

        $this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
            array('none', 'left', 'right', 'both'),
            false
        );
        $this->info['float'] = new HTMLPurifier_AttrDef_Enum(
            array('none', 'left', 'right'),
            false
        );
        $this->info['font-style'] = new HTMLPurifier_AttrDef_Enum(
            array('normal', 'italic', 'oblique'),
            false
        );
        $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
            array('normal', 'small-caps'),
            false
        );

        // reused for every property that takes either 'none' or a URI
        $image_or_none = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_Enum(array('none')),
            new HTMLPurifier_AttrDef_CSS_URI()
        ));

        $this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
            array('inside', 'outside'),
            false
        );
        $this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
            array(
                'disc', 'circle', 'square', 'decimal', 'lower-roman',
                'upper-roman', 'lower-alpha', 'upper-alpha', 'none',
            ),
            false
        );
        $this->info['list-style-image'] = $image_or_none;

        $this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);

        $this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
            array('capitalize', 'uppercase', 'lowercase', 'none'),
            false
        );
        $this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();

        $this->info['background-image'] = $image_or_none;
        $this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(array(
            'repeat', 'repeat-x', 'repeat-y', 'no-repeat'
        ));
        $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(array(
            'scroll', 'fixed'
        ));
        $this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();

        // per-side border colors and 'background-color' share one object
        $side_color =
        $this->info['border-top-color'] =
        $this->info['border-bottom-color'] =
        $this->info['border-left-color'] =
        $this->info['border-right-color'] =
        $this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_Enum(array('transparent')),
            new HTMLPurifier_AttrDef_CSS_Color()
        ));

        $this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);

        $this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($side_color);

        $side_width =
        $this->info['border-top-width'] =
        $this->info['border-bottom-width'] =
        $this->info['border-left-width'] =
        $this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
            // negative lengths are disallowed
            new HTMLPurifier_AttrDef_CSS_Length('0')
        ));

        $this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($side_width);

        $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_Enum(array('normal')),
            new HTMLPurifier_AttrDef_CSS_Length()
        ));

        $this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_Enum(array('normal')),
            new HTMLPurifier_AttrDef_CSS_Length()
        ));

        $this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_Enum(array(
                'xx-small', 'x-small', 'small', 'medium', 'large',
                'x-large', 'xx-large', 'larger', 'smaller',
            )),
            new HTMLPurifier_AttrDef_CSS_Percentage(),
            new HTMLPurifier_AttrDef_CSS_Length()
        ));

        $this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_Enum(array('normal')),
            // no negatives
            new HTMLPurifier_AttrDef_CSS_Number(true),
            new HTMLPurifier_AttrDef_CSS_Length('0'),
            new HTMLPurifier_AttrDef_CSS_Percentage(true)
        ));

        $side_margin =
        $this->info['margin-top'] =
        $this->info['margin-bottom'] =
        $this->info['margin-left'] =
        $this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_CSS_Length(),
            new HTMLPurifier_AttrDef_CSS_Percentage(),
            new HTMLPurifier_AttrDef_Enum(array('auto'))
        ));

        $this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($side_margin);

        // padding is non-negative
        $side_padding =
        $this->info['padding-top'] =
        $this->info['padding-bottom'] =
        $this->info['padding-left'] =
        $this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_CSS_Length('0'),
            new HTMLPurifier_AttrDef_CSS_Percentage(true)
        ));

        $this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($side_padding);

        $this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_CSS_Length(),
            new HTMLPurifier_AttrDef_CSS_Percentage()
        ));

        // width/height definition used when no size restriction applies
        $unrestricted_size = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_CSS_Length('0'),
            new HTMLPurifier_AttrDef_CSS_Percentage(true),
            new HTMLPurifier_AttrDef_Enum(array('auto'))
        ));
        $max = $config->get('CSS.MaxImgLength');

        if ($max === null) {
            $size_def = $unrestricted_size;
        } else {
            $size_def = new HTMLPurifier_AttrDef_Switch(
                'img',
                // For img tags:
                new HTMLPurifier_AttrDef_CSS_Composite(array(
                    new HTMLPurifier_AttrDef_CSS_Length('0', $max),
                    new HTMLPurifier_AttrDef_Enum(array('auto'))
                )),
                // For everyone else:
                $unrestricted_size
            );
        }
        $this->info['width'] =
        $this->info['height'] = $size_def;

        $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();

        $this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();

        // this could use specialized code
        $this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
            array(
                'normal', 'bold', 'bolder', 'lighter',
                '100', '200', '300', '400', '500', '600', '700', '800', '900',
            ),
            false
        );

        // MUST come after the other font properties, as it references
        // a CSSDefinition object
        $this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);

        // the same ordering constraint applies here
        $this->info['border'] =
        $this->info['border-bottom'] =
        $this->info['border-top'] =
        $this->info['border-left'] =
        $this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);

        $this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(
            array('collapse', 'separate')
        );

        $this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(
            array('top', 'bottom')
        );

        $this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(
            array('auto', 'fixed')
        );

        $this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_Enum(array(
                'baseline', 'sub', 'super', 'top',
                'text-top', 'middle', 'bottom', 'text-bottom',
            )),
            new HTMLPurifier_AttrDef_CSS_Length(),
            new HTMLPurifier_AttrDef_CSS_Percentage()
        ));

        $this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(
            new HTMLPurifier_AttrDef_CSS_Length(),
            2
        );

        // partial support
        $this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap'));

        // optional property groups, each behind its own directive
        if ($config->get('CSS.Proprietary')) {
            $this->doSetupProprietary($config);
        }

        if ($config->get('CSS.AllowTricky')) {
            $this->doSetupTricky($config);
        }

        if ($config->get('CSS.Trusted')) {
            $this->doSetupTrusted($config);
        }

        // every definition gets wrapped in the decorator that deals
        // with !important
        $allow_important = $config->get('CSS.AllowImportant');
        foreach ($this->info as $property => $inner_def) {
            $this->info[$property] = new HTMLPurifier_AttrDef_CSS_ImportantDecorator(
                $inner_def,
                $allow_important
            );
        }

        $this->setupConfigStuff($config);
    }

    /**
     * Registers the properties enabled by CSS.Proprietary.
     * @param $config HTMLPurifier_Config instance (not used here)
     */
    protected function doSetupProprietary($config) {
        // Internet Explorer only scrollbar colors; each property gets
        // its own Color object
        $scrollbar_colors = array(
            'scrollbar-arrow-color',
            'scrollbar-base-color',
            'scrollbar-darkshadow-color',
            'scrollbar-face-color',
            'scrollbar-highlight-color',
            'scrollbar-shadow-color',
        );
        foreach ($scrollbar_colors as $property) {
            $this->info[$property] = new HTMLPurifier_AttrDef_CSS_Color();
        }

        // technically not proprietary, but CSS3, and no one supports it
        $opacity_names = array('opacity', '-moz-opacity', '-khtml-opacity');
        foreach ($opacity_names as $property) {
            $this->info[$property] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
        }

        // only opacity, for now
        $this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter();

    }

    /**
     * Registers the properties enabled by CSS.AllowTricky.
     * @param $config HTMLPurifier_Config instance (not used here)
     */
    protected function doSetupTricky($config) {
        $display_values = array(
            'inline', 'block', 'list-item', 'run-in', 'compact',
            'marker', 'table', 'inline-table', 'table-row-group',
            'table-header-group', 'table-footer-group', 'table-row',
            'table-column-group', 'table-column', 'table-cell', 'table-caption', 'none'
        );
        $this->info['display'] = new HTMLPurifier_AttrDef_Enum($display_values);

        $visibility_values = array('visible', 'hidden', 'collapse');
        $this->info['visibility'] = new HTMLPurifier_AttrDef_Enum($visibility_values);

        $overflow_values = array('visible', 'hidden', 'auto', 'scroll');
        $this->info['overflow'] = new HTMLPurifier_AttrDef_Enum($overflow_values);
    }

    /**
     * Registers the properties enabled by CSS.Trusted.
     * @param $config HTMLPurifier_Config instance (not used here)
     */
    protected function doSetupTrusted($config) {
        $position_values = array('static', 'relative', 'absolute', 'fixed');
        $this->info['position'] = new HTMLPurifier_AttrDef_Enum($position_values);

        // the four offsets share one definition object
        $this->info['top'] =
        $this->info['left'] =
        $this->info['right'] =
        $this->info['bottom'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_CSS_Length(),
            new HTMLPurifier_AttrDef_CSS_Percentage(),
            new HTMLPurifier_AttrDef_Enum(array('auto'))
        ));

        $this->info['z-index'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
            new HTMLPurifier_AttrDef_Integer(),
            new HTMLPurifier_AttrDef_Enum(array('auto'))
        ));
    }

    /**
     * Applies CSS.AllowedProperties and CSS.ForbiddenProperties to the
     * $info map.  Modelled on HTMLPurifier_HTMLDefinition.
     * @param $config HTMLPurifier_Config instance
     * @todo Refactor duplicate elements into common class (probably using
     *       composition, not inheritance).
     */
    protected function setupConfigStuff($config) {

        // suffix appended to the "not supported" warning below
        $support = "(for information on implementing this, see the ".
                   "support forums) ";

        // whitelist: drop everything not listed, then warn about listed
        // names that do not correspond to any known property
        $allowed_properties = $config->get('CSS.AllowedProperties');
        if ($allowed_properties !== null) {
            foreach (array_keys($this->info) as $name) {
                if (!isset($allowed_properties[$name])) {
                    unset($this->info[$name]);
                }
                unset($allowed_properties[$name]);
            }
            // whatever is left over was never a known property
            foreach (array_keys($allowed_properties) as $name) {
                // :TODO: Is this htmlspecialchars() call really necessary?
                $name = htmlspecialchars($name);
                trigger_error("Style attribute '$name' is not supported $support", E_USER_WARNING);
            }
        }

        // blacklist: drop everything explicitly forbidden
        $forbidden_properties = $config->get('CSS.ForbiddenProperties');
        if ($forbidden_properties !== null) {
            foreach (array_keys($this->info) as $name) {
                if (isset($forbidden_properties[$name])) {
                    unset($this->info[$name]);
                }
            }
        }

    }
}
1287
1288
1289
1290
1291
1292/**
1293 * Defines allowed child nodes and validates tokens against it.
1294 */
abstract class HTMLPurifier_ChildDef
{
    /**
     * Kind of child definition; usually the lowercased right-most part
     * of the class name.  Occasionally used in terms of context.
     */
    public $type;

    /**
     * Bool stating whether an empty array of children is acceptable.
     *
     * Needed for redundant checking: a change to a child node may make
     * its parent node disallowed.
     */
    public $allow_empty;

    /**
     * Lookup array of every element this definition could possibly allow.
     */
    public $elements = array();

    /**
     * Returns the lookup of tag names that should not close this element
     * automatically; all other elements will do so.  This base
     * implementation hands back $elements and does not consult $config.
     */
    public function getAllowedElements($config) {
        $lookup = $this->elements;
        return $lookup;
    }

    /**
     * Checks the child nodes against this definition and reports what
     * should happen to them.
     *
     * @param $tokens_of_children Array of HTMLPurifier_Token
     * @param $config HTMLPurifier_Config object
     * @param $context HTMLPurifier_Context object
     * @return bool true to leave nodes as is
     * @return bool false to remove parent node
     * @return array of replacement child tokens
     */
    abstract public function validateChildren($tokens_of_children, $config, $context);
}
1336
1337
1338
1339
1340
1341/**
1342 * Configuration object that triggers customizable behavior.
1343 *
1344 * @warning This class is strongly defined: that means that the class
1345 *          will fail if an undefined directive is retrieved or set.
1346 *
1347 * @note Many classes that could (although many times don't) use the
1348 *       configuration object make it a mandatory parameter.  This is
1349 *       because a configuration object should always be forwarded,
1350 *       otherwise, you run the risk of missing a parameter and then
1351 *       being stumped when a configuration directive doesn't work.
1352 *
1353 * @todo Reconsider some of the public member variables
1354 */
1355class HTMLPurifier_Config
1356{
1357
1358    /**
1359     * HTML Purifier's version
1360     */
1361    public $version = '4.3.0';
1362
1363    /**
1364     * Bool indicator whether or not to automatically finalize
1365     * the object if a read operation is done
1366     */
1367    public $autoFinalize = true;
1368
1369    // protected member variables
1370
1371    /**
1372     * Namespace indexed array of serials for specific namespaces (see
1373     * getSerial() for more info).
1374     */
1375    protected $serials = array();
1376
1377    /**
1378     * Serial for entire configuration object
1379     */
1380    protected $serial;
1381
1382    /**
1383     * Parser for variables
1384     */
1385    protected $parser;
1386
1387    /**
1388     * Reference HTMLPurifier_ConfigSchema for value checking
1389     * @note This is public for introspective purposes. Please don't
1390     *       abuse!
1391     */
1392    public $def;
1393
1394    /**
1395     * Indexed array of definitions
1396     */
1397    protected $definitions;
1398
1399    /**
1400     * Bool indicator whether or not config is finalized
1401     */
1402    protected $finalized = false;
1403
1404    /**
1405     * Property list containing configuration directives.
1406     */
1407    protected $plist;
1408
1409    /**
1410     * Whether or not a set is taking place due to an
1411     * alias lookup.
1412     */
1413    private $aliasMode;
1414
1415    /**
1416     * Set to false if you do not want line and file numbers in errors
1417     * (useful when unit testing).  This will also compress some errors
1418     * and exceptions.
1419     */
1420    public $chatty = true;
1421
1422    /**
1423     * Current lock; only gets to this namespace are allowed.
1424     */
1425    private $lock;
1426
1427    /**
1428     * @param $definition HTMLPurifier_ConfigSchema that defines what directives
1429     *                    are allowed.
1430     */
1431    public function __construct($definition, $parent = null) {
1432        $parent = $parent ? $parent : $definition->defaultPlist;
1433        $this->plist = new HTMLPurifier_PropertyList($parent);
1434        $this->def = $definition; // keep a copy around for checking
1435        $this->parser = new HTMLPurifier_VarParser_Flexible();
1436    }
1437
1438    /**
1439     * Convenience constructor that creates a config object based on a mixed var
1440     * @param mixed $config Variable that defines the state of the config
1441     *                      object. Can be: a HTMLPurifier_Config() object,
1442     *                      an array of directives based on loadArray(),
1443     *                      or a string filename of an ini file.
1444     * @param HTMLPurifier_ConfigSchema Schema object
1445     * @return Configured HTMLPurifier_Config object
1446     */
1447    public static function create($config, $schema = null) {
1448        if ($config instanceof HTMLPurifier_Config) {
1449            // pass-through
1450            return $config;
1451        }
1452        if (!$schema) {
1453            $ret = HTMLPurifier_Config::createDefault();
1454        } else {
1455            $ret = new HTMLPurifier_Config($schema);
1456        }
1457        if (is_string($config)) $ret->loadIni($config);
1458        elseif (is_array($config)) $ret->loadArray($config);
1459        return $ret;
1460    }
1461
1462    /**
1463     * Creates a new config object that inherits from a previous one.
1464     * @param HTMLPurifier_Config $config Configuration object to inherit
1465     *        from.
1466     * @return HTMLPurifier_Config object with $config as its parent.
1467     */
1468    public static function inherit(HTMLPurifier_Config $config) {
1469        return new HTMLPurifier_Config($config->def, $config->plist);
1470    }
1471
1472    /**
1473     * Convenience constructor that creates a default configuration object.
1474     * @return Default HTMLPurifier_Config object.
1475     */
1476    public static function createDefault() {
1477        $definition = HTMLPurifier_ConfigSchema::instance();
1478        $config = new HTMLPurifier_Config($definition);
1479        return $config;
1480    }
1481
1482    /**
     * Retrieves a value from the configuration.
1484     * @param $key String key
1485     */
1486    public function get($key, $a = null) {
1487        if ($a !== null) {
1488            $this->triggerError("Using deprecated API: use \$config->get('$key.$a') instead", E_USER_WARNING);
1489            $key = "$key.$a";
1490        }
1491        if (!$this->finalized) $this->autoFinalize();
1492        if (!isset($this->def->info[$key])) {
1493            // can't add % due to SimpleTest bug
1494            $this->triggerError('Cannot retrieve value of undefined directive ' . htmlspecialchars($key),
1495                E_USER_WARNING);
1496            return;
1497        }
1498        if (isset($this->def->info[$key]->isAlias)) {
1499            $d = $this->def->info[$key];
1500            $this->triggerError('Cannot get value from aliased directive, use real name ' . $d->key,
1501                E_USER_ERROR);
1502            return;
1503        }
1504        if ($this->lock) {
1505            list($ns) = explode('.', $key);
1506            if ($ns !== $this->lock) {
1507                $this->triggerError('Cannot get value of namespace ' . $ns . ' when lock for ' . $this->lock . ' is active, this probably indicates a Definition setup method is accessing directives that are not within its namespace', E_USER_ERROR);
1508                return;
1509            }
1510        }
1511        return $this->plist->get($key);
1512    }
1513
1514    /**
     * Retrieves an array of directives to values from a given namespace
1516     * @param $namespace String namespace
1517     */
1518    public function getBatch($namespace) {
1519        if (!$this->finalized) $this->autoFinalize();
1520        $full = $this->getAll();
1521        if (!isset($full[$namespace])) {
1522            $this->triggerError('Cannot retrieve undefined namespace ' . htmlspecialchars($namespace),
1523                E_USER_WARNING);
1524            return;
1525        }
1526        return $full[$namespace];
1527    }
1528
1529    /**
1530     * Returns a md5 signature of a segment of the configuration object
1531     * that uniquely identifies that particular configuration
1532     * @note Revision is handled specially and is removed from the batch
1533     *       before processing!
1534     * @param $namespace Namespace to get serial for
1535     */
1536    public function getBatchSerial($namespace) {
1537        if (empty($this->serials[$namespace])) {
1538            $batch = $this->getBatch($namespace);
1539            unset($batch['DefinitionRev']);
1540            $this->serials[$namespace] = md5(serialize($batch));
1541        }
1542        return $this->serials[$namespace];
1543    }
1544
1545    /**
1546     * Returns a md5 signature for the entire configuration object
1547     * that uniquely identifies that particular configuration
1548     */
1549    public function getSerial() {
1550        if (empty($this->serial)) {
1551            $this->serial = md5(serialize($this->getAll()));
1552        }
1553        return $this->serial;
1554    }
1555
1556    /**
1557     * Retrieves all directives, organized by namespace
1558     * @warning This is a pretty inefficient function, avoid if you can
1559     */
1560    public function getAll() {
1561        if (!$this->finalized) $this->autoFinalize();
1562        $ret = array();
1563        foreach ($this->plist->squash() as $name => $value) {
1564            list($ns, $key) = explode('.', $name, 2);
1565            $ret[$ns][$key] = $value;
1566        }
1567        return $ret;
1568    }
1569
1570    /**
1571     * Sets a value to configuration.
1572     * @param $key String key
1573     * @param $value Mixed value
1574     */
1575    public function set($key, $value, $a = null) {
1576        if (strpos($key, '.') === false) {
1577            $namespace = $key;
1578            $directive = $value;
1579            $value = $a;
1580            $key = "$key.$directive";
1581            $this->triggerError("Using deprecated API: use \$config->set('$key', ...) instead", E_USER_NOTICE);
1582        } else {
1583            list($namespace) = explode('.', $key);
1584        }
1585        if ($this->isFinalized('Cannot set directive after finalization')) return;
1586        if (!isset($this->def->info[$key])) {
1587            $this->triggerError('Cannot set undefined directive ' . htmlspecialchars($key) . ' to value',
1588                E_USER_WARNING);
1589            return;
1590        }
1591        $def = $this->def->info[$key];
1592
1593        if (isset($def->isAlias)) {
1594            if ($this->aliasMode) {
1595                $this->triggerError('Double-aliases not allowed, please fix '.
1596                    'ConfigSchema bug with' . $key, E_USER_ERROR);
1597                return;
1598            }
1599            $this->aliasMode = true;
1600            $this->set($def->key, $value);
1601            $this->aliasMode = false;
1602            $this->triggerError("$key is an alias, preferred directive name is {$def->key}", E_USER_NOTICE);
1603            return;
1604        }
1605
1606        // Raw type might be negative when using the fully optimized form
1607        // of stdclass, which indicates allow_null == true
1608        $rtype = is_int($def) ? $def : $def->type;
1609        if ($rtype < 0) {
1610            $type = -$rtype;
1611            $allow_null = true;
1612        } else {
1613            $type = $rtype;
1614            $allow_null = isset($def->allow_null);
1615        }
1616
1617        try {
1618            $value = $this->parser->parse($value, $type, $allow_null);
1619        } catch (HTMLPurifier_VarParserException $e) {
1620            $this->triggerError('Value for ' . $key . ' is of invalid type, should be ' . HTMLPurifier_VarParser::getTypeName($type), E_USER_WARNING);
1621            return;
1622        }
1623        if (is_string($value) && is_object($def)) {
1624            // resolve value alias if defined
1625            if (isset($def->aliases[$value])) {
1626                $value = $def->aliases[$value];
1627            }
1628            // check to see if the value is allowed
1629            if (isset($def->allowed) && !isset($def->allowed[$value])) {
1630                $this->triggerError('Value not supported, valid values are: ' .
1631                    $this->_listify($def->allowed), E_USER_WARNING);
1632                return;
1633            }
1634        }
1635        $this->plist->set($key, $value);
1636
1637        // reset definitions if the directives they depend on changed
1638        // this is a very costly process, so it's discouraged
1639        // with finalization
1640        if ($namespace == 'HTML' || $namespace == 'CSS' || $namespace == 'URI') {
1641            $this->definitions[$namespace] = null;
1642        }
1643
1644        $this->serials[$namespace] = false;
1645    }
1646
1647    /**
1648     * Convenience function for error reporting
1649     */
1650    private function _listify($lookup) {
1651        $list = array();
1652        foreach ($lookup as $name => $b) $list[] = $name;
1653        return implode(', ', $list);
1654    }
1655
1656    /**
1657     * Retrieves object reference to the HTML definition.
1658     * @param $raw Return a copy that has not been setup yet. Must be
1659     *             called before it's been setup, otherwise won't work.
1660     * @param $optimized If true, this method may return null, to
1661     *             indicate that a cached version of the modified
1662     *             definition object is available and no further edits
1663     *             are necessary.  Consider using
1664     *             maybeGetRawHTMLDefinition, which is more explicitly
1665     *             named, instead.
1666     */
1667    public function getHTMLDefinition($raw = false, $optimized = false) {
1668        return $this->getDefinition('HTML', $raw, $optimized);
1669    }
1670
1671    /**
1672     * Retrieves object reference to the CSS definition
1673     * @param $raw Return a copy that has not been setup yet. Must be
1674     *             called before it's been setup, otherwise won't work.
1675     * @param $optimized If true, this method may return null, to
1676     *             indicate that a cached version of the modified
1677     *             definition object is available and no further edits
1678     *             are necessary.  Consider using
1679     *             maybeGetRawCSSDefinition, which is more explicitly
1680     *             named, instead.
1681     */
1682    public function getCSSDefinition($raw = false, $optimized = false) {
1683        return $this->getDefinition('CSS', $raw, $optimized);
1684    }
1685
1686    /**
1687     * Retrieves object reference to the URI definition
1688     * @param $raw Return a copy that has not been setup yet. Must be
1689     *             called before it's been setup, otherwise won't work.
1690     * @param $optimized If true, this method may return null, to
1691     *             indicate that a cached version of the modified
1692     *             definition object is available and no further edits
1693     *             are necessary.  Consider using
1694     *             maybeGetRawURIDefinition, which is more explicitly
1695     *             named, instead.
1696     */
1697    public function getURIDefinition($raw = false, $optimized = false) {
1698        return $this->getDefinition('URI', $raw, $optimized);
1699    }
1700
1701    /**
1702     * Retrieves a definition
1703     * @param $type Type of definition: HTML, CSS, etc
1704     * @param $raw  Whether or not definition should be returned raw
1705     * @param $optimized Only has an effect when $raw is true.  Whether
1706     *        or not to return null if the result is already present in
1707     *        the cache.  This is off by default for backwards
1708     *        compatibility reasons, but you need to do things this
1709     *        way in order to ensure that caching is done properly.
1710     *        Check out enduser-customize.html for more details.
1711     *        We probably won't ever change this default, as much as the
1712     *        maybe semantics is the "right thing to do."
1713     */
1714    public function getDefinition($type, $raw = false, $optimized = false) {
1715        if ($optimized && !$raw) {
1716            throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false");
1717        }
1718        if (!$this->finalized) $this->autoFinalize();
1719        // temporarily suspend locks, so we can handle recursive definition calls
1720        $lock = $this->lock;
1721        $this->lock = null;
1722        $factory = HTMLPurifier_DefinitionCacheFactory::instance();
1723        $cache = $factory->create($type, $this);
1724        $this->lock = $lock;
1725        if (!$raw) {
1726            // full definition
1727            // ---------------
1728            // check if definition is in memory
1729            if (!empty($this->definitions[$type])) {
1730                $def = $this->definitions[$type];
1731                // check if the definition is setup
1732                if ($def->setup) {
1733                    return $def;
1734                } else {
1735                    $def->setup($this);
1736                    if ($def->optimized) $cache->add($def, $this);
1737                    return $def;
1738                }
1739            }
1740            // check if definition is in cache
1741            $def = $cache->get($this);
1742            if ($def) {
1743                // definition in cache, save to memory and return it
1744                $this->definitions[$type] = $def;
1745                return $def;
1746            }
1747            // initialize it
1748            $def = $this->initDefinition($type);
1749            // set it up
1750            $this->lock = $type;
1751            $def->setup($this);
1752            $this->lock = null;
1753            // save in cache
1754            $cache->add($def, $this);
1755            // return it
1756            return $def;
1757        } else {
1758            // raw definition
1759            // --------------
1760            // check preconditions
1761            $def = null;
1762            if ($optimized) {
1763                if (is_null($this->get($type . '.DefinitionID'))) {
1764                    // fatally error out if definition ID not set
1765                    throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
1766                }
1767            }
1768            if (!empty($this->definitions[$type])) {
1769                $def = $this->definitions[$type];
1770                if ($def->setup && !$optimized) {
1771                    $extra = $this->chatty ? " (try moving this code block earlier in your initialization)" : "";
1772                    throw new HTMLPurifier_Exception("Cannot retrieve raw definition after it has already been setup" . $extra);
1773                }
1774                if ($def->optimized === null) {
1775                    $extra = $this->chatty ? " (try flushing your cache)" : "";
1776                    throw new HTMLPurifier_Exception("Optimization status of definition is unknown" . $extra);
1777                }
1778                if ($def->optimized !== $optimized) {
1779                    $msg = $optimized ? "optimized" : "unoptimized";
1780                    $extra = $this->chatty ? " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" : "";
1781                    throw new HTMLPurifier_Exception("Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra);
1782                }
1783            }
1784            // check if definition was in memory
1785            if ($def) {
1786                if ($def->setup) {
1787                    // invariant: $optimized === true (checked above)
1788                    return null;
1789                } else {
1790                    return $def;
1791                }
1792            }
1793            // if optimized, check if definition was in cache
1794            // (because we do the memory check first, this formulation
1795            // is prone to cache slamming, but I think
1796            // guaranteeing that either /all/ of the raw
1797            // setup code or /none/ of it is run is more important.)
1798            if ($optimized) {
1799                // This code path only gets run once; once we put
1800                // something in $definitions (which is guaranteed by the
1801                // trailing code), we always short-circuit above.
1802                $def = $cache->get($this);
1803                if ($def) {
1804                    // save the full definition for later, but don't
1805                    // return it yet
1806                    $this->definitions[$type] = $def;
1807                    return null;
1808                }
1809            }
1810            // check invariants for creation
1811            if (!$optimized) {
1812                if (!is_null($this->get($type . '.DefinitionID'))) {
1813                    if ($this->chatty) {
1814                        $this->triggerError("Due to a documentation error in previous version of HTML Purifier, your definitions are not being cached.  If this is OK, you can remove the %$type.DefinitionRev and %$type.DefinitionID declaration.  Otherwise, modify your code to use maybeGetRawDefinition, and test if the returned value is null before making any edits (if it is null, that means that a cached version is available, and no raw operations are necessary).  See <a href='http://htmlpurifier.org/docs/enduser-customize.html#optimized'>Customize</a> for more details", E_USER_WARNING);
1815                    } else {
1816                        $this->triggerError("Useless DefinitionID declaration", E_USER_WARNING);
1817                    }
1818                }
1819            }
1820            // initialize it
1821            $def = $this->initDefinition($type);
1822            $def->optimized = $optimized;
1823            return $def;
1824        }
1825        throw new HTMLPurifier_Exception("The impossible happened!");
1826    }
1827
1828    private function initDefinition($type) {
1829        // quick checks failed, let's create the object
1830        if ($type == 'HTML') {
1831            $def = new HTMLPurifier_HTMLDefinition();
1832        } elseif ($type == 'CSS') {
1833            $def = new HTMLPurifier_CSSDefinition();
1834        } elseif ($type == 'URI') {
1835            $def = new HTMLPurifier_URIDefinition();
1836        } else {
1837            throw new HTMLPurifier_Exception("Definition of $type type not supported");
1838        }
1839        $this->definitions[$type] = $def;
1840        return $def;
1841    }
1842
1843    public function maybeGetRawDefinition($name) {
1844        return $this->getDefinition($name, true, true);
1845    }
1846
1847    public function maybeGetRawHTMLDefinition() {
1848        return $this->getDefinition('HTML', true, true);
1849    }
1850
1851    public function maybeGetRawCSSDefinition() {
1852        return $this->getDefinition('CSS', true, true);
1853    }
1854
1855    public function maybeGetRawURIDefinition() {
1856        return $this->getDefinition('URI', true, true);
1857    }
1858
1859    /**
1860     * Loads configuration values from an array with the following structure:
1861     * Namespace.Directive => Value
1862     * @param $config_array Configuration associative array
1863     */
1864    public function loadArray($config_array) {
1865        if ($this->isFinalized('Cannot load directives after finalization')) return;
1866        foreach ($config_array as $key => $value) {
1867            $key = str_replace('_', '.', $key);
1868            if (strpos($key, '.') !== false) {
1869                $this->set($key, $value);
1870            } else {
1871                $namespace = $key;
1872                $namespace_values = $value;
1873                foreach ($namespace_values as $directive => $value) {
1874                    $this->set($namespace .'.'. $directive, $value);
1875                }
1876            }
1877        }
1878    }
1879
1880    /**
1881     * Returns a list of array(namespace, directive) for all directives
1882     * that are allowed in a web-form context as per an allowed
1883     * namespaces/directives list.
1884     * @param $allowed List of allowed namespaces/directives
1885     */
1886    public static function getAllowedDirectivesForForm($allowed, $schema = null) {
1887        if (!$schema) {
1888            $schema = HTMLPurifier_ConfigSchema::instance();
1889        }
1890        if ($allowed !== true) {
1891             if (is_string($allowed)) $allowed = array($allowed);
1892             $allowed_ns = array();
1893             $allowed_directives = array();
1894             $blacklisted_directives = array();
1895             foreach ($allowed as $ns_or_directive) {
1896                 if (strpos($ns_or_directive, '.') !== false) {
1897                     // directive
1898                     if ($ns_or_directive[0] == '-') {
1899                         $blacklisted_directives[substr($ns_or_directive, 1)] = true;
1900                     } else {
1901                         $allowed_directives[$ns_or_directive] = true;
1902                     }
1903                 } else {
1904                     // namespace
1905                     $allowed_ns[$ns_or_directive] = true;
1906                 }
1907             }
1908        }
1909        $ret = array();
1910        foreach ($schema->info as $key => $def) {
1911            list($ns, $directive) = explode('.', $key, 2);
1912            if ($allowed !== true) {
1913                if (isset($blacklisted_directives["$ns.$directive"])) continue;
1914                if (!isset($allowed_directives["$ns.$directive"]) && !isset($allowed_ns[$ns])) continue;
1915            }
1916            if (isset($def->isAlias)) continue;
1917            if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') continue;
1918            $ret[] = array($ns, $directive);
1919        }
1920        return $ret;
1921    }
1922
1923    /**
1924     * Loads configuration values from $_GET/$_POST that were posted
1925     * via ConfigForm
1926     * @param $array $_GET or $_POST array to import
1927     * @param $index Index/name that the config variables are in
1928     * @param $allowed List of allowed namespaces/directives
1929     * @param $mq_fix Boolean whether or not to enable magic quotes fix
1930     * @param $schema Instance of HTMLPurifier_ConfigSchema to use, if not global copy
1931     */
1932    public static function loadArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
1933        $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema);
1934        $config = HTMLPurifier_Config::create($ret, $schema);
1935        return $config;
1936    }
1937
1938    /**
1939     * Merges in configuration values from $_GET/$_POST to object. NOT STATIC.
1940     * @note Same parameters as loadArrayFromForm
1941     */
1942    public function mergeArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true) {
1943         $ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $this->def);
1944         $this->loadArray($ret);
1945    }
1946
1947    /**
1948     * Prepares an array from a form into something usable for the more
1949     * strict parts of HTMLPurifier_Config
1950     */
1951    public static function prepareArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null) {
1952        if ($index !== false) $array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array();
1953        $mq = $mq_fix && function_exists('get_magic_quotes_gpc') && get_magic_quotes_gpc();
1954
1955        $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed, $schema);
1956        $ret = array();
1957        foreach ($allowed as $key) {
1958            list($ns, $directive) = $key;
1959            $skey = "$ns.$directive";
1960            if (!empty($array["Null_$skey"])) {
1961                $ret[$ns][$directive] = null;
1962                continue;
1963            }
1964            if (!isset($array[$skey])) continue;
1965            $value = $mq ? stripslashes($array[$skey]) : $array[$skey];
1966            $ret[$ns][$directive] = $value;
1967        }
1968        return $ret;
1969    }
1970
1971    /**
1972     * Loads configuration values from an ini file
1973     * @param $filename Name of ini file
1974     */
1975    public function loadIni($filename) {
1976        if ($this->isFinalized('Cannot load directives after finalization')) return;
1977        $array = parse_ini_file($filename, true);
1978        $this->loadArray($array);
1979    }
1980
1981    /**
1982     * Checks whether or not the configuration object is finalized.
1983     * @param $error String error message, or false for no error
1984     */
1985    public function isFinalized($error = false) {
1986        if ($this->finalized && $error) {
1987            $this->triggerError($error, E_USER_ERROR);
1988        }
1989        return $this->finalized;
1990    }
1991
1992    /**
1993     * Finalizes configuration only if auto finalize is on and not
1994     * already finalized
1995     */
1996    public function autoFinalize() {
1997        if ($this->autoFinalize) {
1998            $this->finalize();
1999        } else {
2000            $this->plist->squash(true);
2001        }
2002    }
2003
2004    /**
2005     * Finalizes a configuration object, prohibiting further change
2006     */
2007    public function finalize() {
2008        $this->finalized = true;
2009        unset($this->parser);
2010    }
2011
2012    /**
2013     * Produces a nicely formatted error message by supplying the
2014     * stack frame information OUTSIDE of HTMLPurifier_Config.
2015     */
2016    protected function triggerError($msg, $no) {
2017        // determine previous stack frame
2018        $extra = '';
2019        if ($this->chatty) {
2020            $trace = debug_backtrace();
2021            // zip(tail(trace), trace) -- but PHP is not Haskell har har
2022            for ($i = 0, $c = count($trace); $i < $c - 1; $i++) {
2023                if ($trace[$i + 1]['class'] === 'HTMLPurifier_Config') {
2024                    continue;
2025                }
2026                $frame = $trace[$i];
2027                $extra = " invoked on line {$frame['line']} in file {$frame['file']}";
2028                break;
2029            }
2030        }
2031        trigger_error($msg . $extra, $no);
2032    }
2033
2034    /**
2035     * Returns a serialized form of the configuration object that can
2036     * be reconstituted.
2037     */
2038    public function serialize() {
2039        $this->getDefinition('HTML');
2040        $this->getDefinition('CSS');
2041        $this->getDefinition('URI');
2042        return serialize($this);
2043    }
2044
2045}
2046
2047
2048
2049
2050
2051/**
2052 * Configuration definition, defines directives and their defaults.
2053 */
class HTMLPurifier_ConfigSchema {

    /**
     * Defaults of the directives, indexed by 'Namespace.Directive' key.
     */
    public $defaults = array();

    /**
     * The default property list. Do not edit this property list.
     */
    public $defaultPlist;

    /**
     * Definition of the directives. The structure of this is:
     *
     *  array(
     *      'Namespace.Directive' => new stdclass(),
     *  )
     *
     * The stdclass may have the following properties:
     *
     *  - If isAlias isn't set:
     *      - type: Integer type of directive, see HTMLPurifier_VarParser for definitions
     *      - allow_null: If set, this directive allows null values
     *      - aliases: If set, an associative array of value aliases to real values
     *      - allowed: If set, a lookup array of allowed (string) values
     *  - If isAlias is set:
     *      - key: 'Namespace.Directive' key this directive aliases to
     *
     * In certain degenerate cases, stdclass will actually be an integer. In
     * that case, the value is equivalent to an stdclass with the type
     * property set to the integer. If the integer is negative, type is
     * equal to the absolute value of integer, and allow_null is true.
     *
     * This class is friendly with HTMLPurifier_Config. If you need introspection
     * about the schema, you're better off using the ConfigSchema_Interchange,
     * which uses more memory but has much richer information.
     */
    public $info = array();

    /**
     * Application-wide singleton
     */
    static protected $singleton;

    public function __construct() {
        $this->defaultPlist = new HTMLPurifier_PropertyList();
    }

    /**
     * Unserializes the default ConfigSchema from the bundled schema.ser file.
     * @return The unserialized schema; an E_USER_ERROR is triggered when
     *         unserialization yields a falsy value.
     */
    public static function makeFromSerial() {
        $contents = file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser');
        $r = unserialize($contents);
        if (!$r) {
            // the hash lets a corrupted schema file be identified from the error
            $hash = sha1($contents);
            trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR);
        }
        return $r;
    }

    /**
     * Retrieves an instance of the application-wide configuration definition.
     * @param $prototype Schema object to install as the singleton, true to
     *        force a reload from the serialized schema, or null to simply
     *        fetch the current singleton (loading it on first use).
     */
    public static function instance($prototype = null) {
        // true is a reload request, not a prototype: it must not be stored
        // as the singleton itself.
        if ($prototype !== null && $prototype !== true) {
            HTMLPurifier_ConfigSchema::$singleton = $prototype;
        } elseif (HTMLPurifier_ConfigSchema::$singleton === null || $prototype === true) {
            HTMLPurifier_ConfigSchema::$singleton = HTMLPurifier_ConfigSchema::makeFromSerial();
        }
        return HTMLPurifier_ConfigSchema::$singleton;
    }

    /**
     * Defines a directive for configuration
     * @warning This method's signature is slightly different from the legacy
     *          define() static method! Beware!
     * @param $key Key of the directive, in 'Namespace.Directive' form
     * @param $default Default value of directive
     * @param $type Allowed type of the directive: either the integer type
     *      itself or a name looked up in HTMLPurifier_VarParser::$types
     * @param $allow_null Whether or not to allow null values
     */
    public function add($key, $default, $type, $allow_null) {
        $obj = new stdclass();
        $obj->type = is_int($type) ? $type : HTMLPurifier_VarParser::$types[$type];
        if ($allow_null) $obj->allow_null = true;
        $this->info[$key] = $obj;
        $this->defaults[$key] = $default;
        $this->defaultPlist->set($key, $default);
    }

    /**
     * Defines a directive value alias.
     *
     * Directive value aliases are convenient for developers because it lets
     * them set a directive to several values and get the same result.
     * @param $key Key of the directive, in 'Namespace.Directive' form
     * @param $aliases Hash of aliased values to the real alias
     */
    public function addValueAliases($key, $aliases) {
        if (!isset($this->info[$key]->aliases)) {
            $this->info[$key]->aliases = array();
        }
        foreach ($aliases as $alias => $real) {
            $this->info[$key]->aliases[$alias] = $real;
        }
    }

    /**
     * Defines a set of allowed values for a directive.
     * @warning This is slightly different from the corresponding static
     *          method definition.
     * @param $key Key of the directive, in 'Namespace.Directive' form
     * @param $allowed Lookup array of allowed values
     */
    public function addAllowedValues($key, $allowed) {
        $this->info[$key]->allowed = $allowed;
    }

    /**
     * Defines a directive alias for backwards compatibility
     * @param $key Key of the directive that will be aliased
     * @param $new_key Key of the directive that the alias will point to
     */
    public function addAlias($key, $new_key) {
        $obj = new stdclass;
        $obj->key = $new_key;
        $obj->isAlias = true;
        $this->info[$key] = $obj;
    }

    /**
     * Replaces any stdclass that only has the type property with type integer,
     * and any stdclass that only has type and allow_null with the negated
     * type integer. Safe to call more than once.
     */
    public function postProcess() {
        foreach ($this->info as $key => $v) {
            // entries already collapsed to an integer by an earlier run
            // have no properties to inspect; leave them untouched
            if (!is_object($v)) continue;
            $property_count = count((array) $v);
            if ($property_count == 1) {
                $this->info[$key] = $v->type;
            } elseif ($property_count == 2 && isset($v->allow_null)) {
                $this->info[$key] = -$v->type;
            }
        }
    }

}
2211
2212
2213
2214
2215
2216/**
2217 * @todo Unit test
2218 */
class HTMLPurifier_ContentSets
{

    /**
     * List of content set strings (pipe separators) indexed by name.
     */
    public $info = array();

    /**
     * List of content set lookups (element => true) indexed by name.
     * @note This is in HTMLPurifier_HTMLDefinition->info_content_sets
     */
    public $lookup = array();

    /**
     * Synchronized list of defined content sets (keys of info)
     */
    protected $keys = array();
    /**
     * Synchronized list of defined content values (values of info)
     */
    protected $values = array();

    /**
     * Merges in module's content sets, expands identifiers in the content
     * sets and populates the keys, values and lookup member variables.
     * @param $modules List of HTMLPurifier_HTMLModule (a single module is
     *        also accepted and wrapped in an array)
     */
    public function __construct($modules) {
        if (!is_array($modules)) $modules = array($modules);
        // populate content_sets based on module hints
        // sorry, no way of overloading
        foreach ($modules as $module_i => $module) {
            foreach ($module->content_sets as $key => $value) {
                $temp = $this->convertToLookup($value);
                if (isset($this->lookup[$key])) {
                    // add it into the existing content set
                    $this->lookup[$key] = array_merge($this->lookup[$key], $temp);
                } else {
                    $this->lookup[$key] = $temp;
                }
            }
        }
        // Expand references to other content sets: an element name that is
        // itself a content set name is replaced by that set's members.
        // Repeat until a full pass leaves the lookup table unchanged.
        $old_lookup = false;
        while ($old_lookup !== $this->lookup) {
            $old_lookup = $this->lookup;
            foreach ($this->lookup as $i => $set) {
                $add = array();
                foreach ($set as $element => $x) {
                    if (isset($this->lookup[$element])) {
                        $add += $this->lookup[$element];
                        unset($this->lookup[$i][$element]);
                    }
                }
                $this->lookup[$i] += $add;
            }
        }

        foreach ($this->lookup as $key => $lookup) {
            $this->info[$key] = implode(' | ', array_keys($lookup));
        }
        $this->keys   = array_keys($this->info);
        $this->values = array_values($this->info);
    }

    /**
     * Accepts a definition; generates and assigns a ChildDef for it
     * @param $def HTMLPurifier_ElementDef reference
     * @param $module Module that defined the ElementDef
     */
    public function generateChildDef(&$def, $module) {
        if (!empty($def->child)) return; // already done!
        $content_model = $def->content_model;
        // With no content sets defined there is nothing to substitute; an
        // empty alternation would match the empty string at every word
        // boundary and send the callback looking up a non-existent '' key.
        if (is_string($content_model) && !empty($this->keys)) {
            // quote the names so they are always matched literally
            $quoted_keys = array();
            foreach ($this->keys as $name) {
                $quoted_keys[] = preg_quote($name, '/');
            }
            $def->content_model = preg_replace_callback(
                '/\b(' . implode('|', $quoted_keys) . ')\b/',
                array($this, 'generateChildDefCallback'),
                $content_model
            );
        }
        $def->child = $this->getChildDef($def, $module);
    }

    /**
     * preg_replace_callback handler for generateChildDef: maps a matched
     * content set name to its expanded, pipe-separated element list.
     * @param $matches Match array; index 0 is the content set name
     */
    public function generateChildDefCallback($matches) {
        return $this->info[$matches[0]];
    }

    /**
     * Instantiates a ChildDef based on content_model and content_model_type
     * member variables in HTMLPurifier_ElementDef
     * @note This will also defer to modules for custom HTMLPurifier_ChildDef
     *       subclasses that need content set expansion
     * @param $def HTMLPurifier_ElementDef to have ChildDef extracted
     * @param $module Module consulted when content_model_type is not one
     *        of the built-in types
     * @return HTMLPurifier_ChildDef corresponding to ElementDef, or false
     *         (after an E_USER_ERROR) when none could be determined
     */
    public function getChildDef($def, $module) {
        $value = $def->content_model;
        if (is_object($value)) {
            trigger_error(
                'Literal object child definitions should be stored in '.
                'ElementDef->child not ElementDef->content_model',
                E_USER_NOTICE
            );
            return $value;
        }
        switch ($def->content_model_type) {
            case 'required':
                return new HTMLPurifier_ChildDef_Required($value);
            case 'optional':
                return new HTMLPurifier_ChildDef_Optional($value);
            case 'empty':
                return new HTMLPurifier_ChildDef_Empty();
            case 'custom':
                return new HTMLPurifier_ChildDef_Custom($value);
        }
        // defer to its module
        $return = false;
        if ($module->defines_child_def) { // save a func call
            $return = $module->getChildDef($def);
        }
        if ($return !== false) return $return;
        // error-out
        trigger_error(
            'Could not determine which ChildDef class to instantiate',
            E_USER_ERROR
        );
        return false;
    }

    /**
     * Converts a string list of elements separated by pipes into
     * a lookup array.
     * @param $string List of elements
     * @return Lookup array of elements
     */
    protected function convertToLookup($string) {
        // spaces are insignificant: 'a | b' and 'a|b' are the same list
        $array = explode('|', str_replace(' ', '', $string));
        $ret = array();
        foreach ($array as $i => $k) {
            $ret[$k] = true;
        }
        return $ret;
    }

}
2367
2368
2369
2370
2371
/**
 * Registry object that contains information about the current context.
 * @note A variable whose value is null still counts as registered:
 *       presence is decided by key (array_key_exists), not by isset().
 * @note Since the variables Context deals with may not be objects,
 *       references are very important here! Do not remove!
 */
class HTMLPurifier_Context
{

    /**
     * Private array that stores the references, keyed by variable name.
     */
    private $_storage = array();

    /**
     * Registers a variable into the context.
     * If the name is already taken an E_USER_ERROR is raised and the
     * existing entry is left untouched.
     * @param $name String name
     * @param $ref Reference to variable to be registered
     */
    public function register($name, &$ref) {
        // array_key_exists (not isset) so a name bound to null still collides
        if (array_key_exists($name, $this->_storage)) {
            trigger_error("Name $name produces collision, cannot re-register",
                          E_USER_ERROR);
            return;
        }
        $this->_storage[$name] =& $ref;
    }

    /**
     * Retrieves a variable reference from the context.
     * @param $name String name
     * @param $ignore_error Boolean whether or not to ignore error
     * @return Reference to the registered variable; a reference to a fresh
     *         null when the name is unknown (an E_USER_ERROR is raised
     *         first unless $ignore_error is true).
     */
    public function &get($name, $ignore_error = false) {
        if (!array_key_exists($name, $this->_storage)) {
            if (!$ignore_error) {
                trigger_error("Attempted to retrieve non-existent variable $name",
                              E_USER_ERROR);
            }
            $var = null; // so we can return by reference
            return $var;
        }
        return $this->_storage[$name];
    }

    /**
     * Destroys a variable in the context.
     * Only the registry entry is removed; the caller's own variable is
     * not touched. Raises E_USER_ERROR if the name is unknown.
     * @param $name String name
     */
    public function destroy($name) {
        if (!array_key_exists($name, $this->_storage)) {
            trigger_error("Attempted to destroy non-existent variable $name",
                          E_USER_ERROR);
            return;
        }
        unset($this->_storage[$name]);
    }

    /**
     * Checks whether or not the variable exists.
     * @param $name String name
     * @return Boolean true if the name is registered, even when the
     *         registered value is currently null
     */
    public function exists($name) {
        return array_key_exists($name, $this->_storage);
    }

    /**
     * Loads a series of variables from an associative array
     * @param $context_array Assoc array of variables to load
     */
    public function loadArray($context_array) {
        foreach ($context_array as $key => $discard) {
            // pass the array slot itself: register() binds by reference
            $this->register($key, $context_array[$key]);
        }
    }

}
2450
2451
2452
2453
2454
2455/**
2456 * Abstract class representing Definition cache managers that implements
2457 * useful common methods and is a factory.
2458 * @todo Create a separate maintenance file advanced users can use to
2459 *       cache their custom HTMLDefinition, which can be loaded
2460 *       via a configuration directive
2461 * @todo Implement memcached
2462 */
abstract class HTMLPurifier_DefinitionCache
{

    /**
     * Type of definition objects this cache instance handles; also used as
     * the configuration namespace when building and checking keys.
     */
    public $type;

    /**
     * @param $type Type of definition objects this instance of the
     *      cache will handle.
     */
    public function __construct($type) {
        $this->type = $type;
    }

    /**
     * Generates a unique identifier for a particular configuration
     * @param $config Instance of HTMLPurifier_Config
     * @return String of the form "version,batch-serial,revision"
     */
    public function generateKey($config) {
        return $config->version . ',' . // possibly replace with function calls
               $config->getBatchSerial($this->type) . ',' .
               $config->get($this->type . '.DefinitionRev');
    }

    /**
     * Tests whether or not a key is old with respect to the configuration's
     * version and revision number.
     * @param $key Key to test
     * @param $config Instance of HTMLPurifier_Config to test against
     * @return Boolean true if the key is malformed, belongs to another
     *         library version, or has the same batch serial but a lower
     *         revision; false otherwise
     */
    public function isOld($key, $config) {
        // a well-formed key has at least two commas (three fields)
        if (substr_count($key, ',') < 2) return true;
        list($version, $hash, $revision) = explode(',', $key, 3);
        // version mismatch, is always old
        if (version_compare($version, $config->version) !== 0) return true;
        // versions match, ids match, check revision number.
        // The serials are compared as strings with ===: a loose == would
        // treat two different numeric-looking hashes (e.g. "0e12" and
        // "0e34") as equal.
        if (
            $hash === (string) $config->getBatchSerial($this->type) &&
            $revision < $config->get($this->type . '.DefinitionRev')
        ) return true;
        return false;
    }

    /**
     * Checks if a definition's type jives with the cache's type
     * @note Triggers an error (default level) on mismatch
     * @param $def Definition object to check
     * @return Boolean true if good, false if not
     */
    public function checkDefType($def) {
        if ($def->type !== $this->type) {
            trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}");
            return false;
        }
        return true;
    }

    /**
     * Adds a definition object to the cache
     */
    abstract public function add($def, $config);

    /**
     * Unconditionally saves a definition object to the cache
     */
    abstract public function set($def, $config);

    /**
     * Replace an object in the cache
     */
    abstract public function replace($def, $config);

    /**
     * Retrieves a definition object from the cache
     */
    abstract public function get($config);

    /**
     * Removes a definition object from the cache
     */
    abstract public function remove($config);

    /**
     * Clears all objects from cache
     */
    abstract public function flush($config);

    /**
     * Clears all expired (older version or revision) objects from cache
     * @note Be careful implementing this method as flush. Flush must
     *       not interfere with other Definition types, and cleanup()
     *       should not be repeatedly called by userland code.
     */
    abstract public function cleanup($config);

}
2559
2560
2561
2562
2563
2564/**
2565 * Responsible for creating definition caches.
2566 */
class HTMLPurifier_DefinitionCacheFactory
{

    /**
     * Already-built cache objects, indexed by implementation name and then
     * by definition type.
     */
    protected $caches = array('Serializer' => array());

    /**
     * Map of short implementation name to the class that implements it.
     */
    protected $implementations = array();

    /**
     * Decorator objects applied to every newly created cache, indexed by
     * the decorator's name.
     */
    protected $decorators = array();

    /**
     * Initialize default decorators
     */
    public function setup() {
        $this->addDecorator('Cleanup');
    }

    /**
     * Retrieves an instance of global definition cache factory.
     * @param $prototype Optional. Pass true to discard the current
     *        instance and build a fresh default factory; pass any other
     *        non-null value to install it as the global instance.
     * @return The global factory instance
     */
    public static function instance($prototype = null) {
        static $instance;
        if ($prototype === true) {
            // Explicit reset. This has to be tested before the generic
            // "non-null prototype" case, otherwise boolean true itself
            // would be stored as the instance.
            $instance = null;
        } elseif ($prototype !== null) {
            $instance = $prototype;
        }
        if ($instance === null) {
            $instance = new HTMLPurifier_DefinitionCacheFactory();
            $instance->setup();
        }
        return $instance;
    }

    /**
     * Registers a new definition cache object
     * @param $short Short name of cache object, for reference
     * @param $long Full class name of cache object, for construction
     */
    public function register($short, $long) {
        $this->implementations[$short] = $long;
    }

    /**
     * Factory method that creates a cache object based on configuration
     * @param $type Name of definitions handled by cache
     * @param $config Instance of HTMLPurifier_Config
     * @return Cache object for $type; a Null cache when the
     *         Cache.DefinitionImpl directive is null
     */
    public function create($type, $config) {
        $method = $config->get('Cache.DefinitionImpl');
        if ($method === null) {
            return new HTMLPurifier_DefinitionCache_Null($type);
        }
        // reuse a previously built cache for this implementation/type pair
        if (!empty($this->caches[$method][$type])) {
            return $this->caches[$method][$type];
        }
        $class = isset($this->implementations[$method])
            ? $this->implementations[$method]
            : null;
        // second argument false: never trigger autoloading for the lookup
        if ($class !== null && class_exists($class, false)) {
            $cache = new $class($type);
        } else {
            if ($method != 'Serializer') {
                trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
            }
            $cache = new HTMLPurifier_DefinitionCache_Serializer($type);
        }
        foreach ($this->decorators as $decorator) {
            $new_cache = $decorator->decorate($cache);
            // Do not remove this unset: if decorate() keeps a reference to
            // $cache (NOTE(review): presumably it does - confirm in the
            // decorator class), assigning to $cache without detaching it
            // first would make the decorator wrap itself.
            unset($cache);
            $cache = $new_cache;
        }
        $this->caches[$method][$type] = $cache;
        return $this->caches[$method][$type];
    }

    /**
     * Registers a decorator to add to all new cache objects
     * @param $decorator Decorator object, or the short name of one; a
     *        string is expanded to the class
     *        HTMLPurifier_DefinitionCache_Decorator_<name> and instantiated
     */
    public function addDecorator($decorator) {
        if (is_string($decorator)) {
            $class = "HTMLPurifier_DefinitionCache_Decorator_$decorator";
            $decorator = new $class;
        }
        $this->decorators[$decorator->name] = $decorator;
    }

}
2651
2652
2653
2654
2655
2656/**
2657 * Represents a document type, contains information on which modules
2658 * need to be loaded.
2659 * @note This class is inspected by Printer_HTMLDefinition->renderDoctype.
2660 *       If structure changes, please update that function.
2661 */
class HTMLPurifier_Doctype
{
    /**
     * The doctype's full name.
     */
    public $name;

    /**
     * Standard modules used by this doctype; entries are either string
     * identifiers or literal module objects.
     */
    public $modules = array();

    /**
     * Modules used when tidying up code.
     */
    public $tidyModules = array();

    /**
     * True when the language derives from XML (i.e. XHTML).
     */
    public $xml = true;

    /**
     * Alternative names for this doctype.
     */
    public $aliases = array();

    /**
     * Public identifier of the DTD.
     */
    public $dtdPublic;

    /**
     * System identifier of the DTD.
     */
    public $dtdSystem;

    /**
     * Stores each argument in the property of the matching name
     * ($dtd_public and $dtd_system land in dtdPublic and dtdSystem).
     */
    public function __construct(
        $name = null,
        $xml = true,
        $modules = array(),
        $tidyModules = array(),
        $aliases = array(),
        $dtd_public = null,
        $dtd_system = null
    ) {
        // identity
        $this->name    = $name;
        $this->aliases = $aliases;
        $this->xml     = $xml;

        // module lists
        $this->modules     = $modules;
        $this->tidyModules = $tidyModules;

        // DTD identifiers
        $this->dtdPublic = $dtd_public;
        $this->dtdSystem = $dtd_system;
    }
}
2712
2713
2714
2715
2716
class HTMLPurifier_DoctypeRegistry
{

    /**
     * Doctype objects indexed by doctype name
     */
    protected $doctypes;

    /**
     * Alias name => real doctype name
     */
    protected $aliases;

    /**
     * Adds a doctype to the registry and wires up its aliases.
     * @note Either pass a ready doctype object as $doctype (the remaining
     *       arguments are then not used), or pass the constructor
     *       arguments for a new HTMLPurifier_Doctype.
     * @param $doctype Name of doctype or literal doctype object
     * @param $xml Whether the doctype is XML based
     * @param $modules Modules doctype will load
     * @param $tidy_modules Modules doctype will load for tidying
     * @param $aliases Alias names for doctype
     * @param $dtd_public Public DTD identifier
     * @param $dtd_system System DTD identifier
     * @return Editable registered doctype
     */
    public function register($doctype, $xml = true, $modules = array(),
        $tidy_modules = array(), $aliases = array(), $dtd_public = null, $dtd_system = null
    ) {
        if (!is_object($doctype)) {
            // a lone value is shorthand for a one-element list
            $modules      = is_array($modules)      ? $modules      : array($modules);
            $tidy_modules = is_array($tidy_modules) ? $tidy_modules : array($tidy_modules);
            $aliases      = is_array($aliases)      ? $aliases      : array($aliases);
            $doctype = new HTMLPurifier_Doctype(
                $doctype, $xml, $modules, $tidy_modules, $aliases, $dtd_public, $dtd_system
            );
        }

        $name = $doctype->name;
        $this->doctypes[$name] = $doctype;

        // An alias never shadows a doctype that is registered under that
        // very name.
        foreach ($doctype->aliases as $alias) {
            if (!isset($this->doctypes[$alias])) {
                $this->aliases[$alias] = $name;
            }
        }

        // The name now denotes a real doctype, so a stale alias of the
        // same spelling has to go.
        if (isset($this->aliases[$name])) {
            unset($this->aliases[$name]);
        }

        return $doctype;
    }

    /**
     * Looks up a doctype by name, following an alias if there is one.
     * @note When possible, use the more fully-featured make()
     * @param $doctype Name of doctype
     * @return Editable doctype object; if the name is unknown an
     *         E_USER_ERROR is raised and a blank doctype carrying that
     *         name is returned
     */
    public function get($doctype) {
        if (isset($this->aliases[$doctype])) {
            $doctype = $this->aliases[$doctype];
        }
        if (isset($this->doctypes[$doctype])) {
            return $this->doctypes[$doctype];
        }
        trigger_error('Doctype ' . htmlspecialchars($doctype) . ' does not exist', E_USER_ERROR);
        return new HTMLPurifier_Doctype($doctype);
    }

    /**
     * Returns a clone of the doctype selected by the configuration.
     * @note Use this function to get a copy of doctype that config
     *       can hold on to (this is necessary in order to tell
     *       Generator whether or not the current document is XML
     *       based or not).
     */
    public function make($config) {
        $selected = $this->getDoctypeFromConfig($config);
        return clone $this->get($selected);
    }

    /**
     * Works out the doctype name requested by the configuration object.
     * HTML.Doctype wins, then HTML.CustomDoctype; otherwise the name is
     * assembled from the legacy HTML.XHTML and HTML.Strict switches.
     */
    public function getDoctypeFromConfig($config) {
        // recommended directives, in order of precedence
        foreach (array('HTML.Doctype', 'HTML.CustomDoctype') as $directive) {
            $explicit = $config->get($directive);
            if (!empty($explicit)) {
                return $explicit;
            }
        }
        // backwards-compatibility
        $family = $config->get('HTML.XHTML')  ? 'XHTML 1.0' : 'HTML 4.01';
        $flavor = $config->get('HTML.Strict') ? ' Strict'   : ' Transitional';
        return $family . $flavor;
    }

}
2816
2817
2818
2819
2820
2821/**
2822 * Structure that stores an HTML element definition. Used by
2823 * HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
2824 * @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
2825 *       Please update that class too.
2826 * @warning If you add new properties to this class, you MUST update
2827 *          the mergeIn() method.
2828 */
class HTMLPurifier_ElementDef
{

    /**
     * Does the definition work by itself, or is it created solely
     * for the purpose of merging into another definition?
     */
    public $standalone = true;

    /**
     * Associative array of attribute name to HTMLPurifier_AttrDef
     * @note Before being processed by HTMLPurifier_AttrCollections
     *       when modules are finalized during
     *       HTMLPurifier_HTMLDefinition->setup(), this array may also
     *       contain an array at index 0 that indicates which attribute
     *       collections to load into the full array. It may also
     *       contain string identifiers in lieu of HTMLPurifier_AttrDef,
     *       see HTMLPurifier_AttrTypes on how they are expanded during
     *       HTMLPurifier_HTMLDefinition->setup() processing.
     */
    public $attr = array();

    /**
     * Indexed list of tag's HTMLPurifier_AttrTransform to be done before validation
     */
    public $attr_transform_pre = array();

    /**
     * Indexed list of tag's HTMLPurifier_AttrTransform to be done after validation
     */
    public $attr_transform_post = array();

    /**
     * HTMLPurifier_ChildDef of this tag.
     */
    public $child;

    /**
     * Abstract string representation of internal ChildDef rules. See
     * HTMLPurifier_ContentSets for how this is parsed and then transformed
     * into an HTMLPurifier_ChildDef.
     * @warning This is a temporary variable that is not available after
     *      being processed by HTMLDefinition
     */
    public $content_model;

    /**
     * Value of $child->type, used to determine which ChildDef to use,
     * used in combination with $content_model.
     * @warning This must be lowercase
     * @warning This is a temporary variable that is not available after
     *      being processed by HTMLDefinition
     */
    public $content_model_type;



    /**
     * Does the element have a content model (#PCDATA | Inline)*? This
     * is important for chameleon ins and del processing in
     * HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
     * have to worry about this one.
     */
    public $descendants_are_inline = false;

    /**
     * List of the names of required attributes this element has. Dynamically
     * populated by HTMLPurifier_HTMLDefinition::getElement
     */
    public $required_attr = array();

    /**
     * Lookup table of tags excluded from all descendants of this tag.
     * @note SGML permits exclusions for all descendants, but this is
     *       not possible with DTDs or XML Schemas. W3C has elected to
     *       use complicated compositions of content_models to simulate
     *       exclusion for children, but we go the simpler, SGML-style
     *       route of flat-out exclusions, which correctly apply to
     *       all descendants and not just children. Note that the XHTML
     *       Modularization Abstract Modules are blithely unaware of such
     *       distinctions.
     */
    public $excludes = array();

    /**
     * This tag is explicitly auto-closed by the following tags.
     */
    public $autoclose = array();

    /**
     * If a foreign element is found in this element, test if it is
     * allowed by this sub-element; if it is, instead of closing the
     * current element, place it inside this element.
     */
    public $wrap;

    /**
     * Whether or not this is a formatting element affected by the
     * "Active Formatting Elements" algorithm.
     */
    public $formatting;

    /**
     * Low-level factory constructor for creating new standalone element defs
     * @param $content_model Value for the content_model property
     * @param $content_model_type Value for the content_model_type property
     * @param $attr Value for the attr property
     * @return New HTMLPurifier_ElementDef with those three properties set
     */
    public static function create($content_model, $content_model_type, $attr) {
        $def = new HTMLPurifier_ElementDef();
        $def->content_model = $content_model;
        $def->content_model_type = $content_model_type;
        $def->attr = $attr;
        return $def;
    }

    /**
     * Merges the values of another element definition into this one.
     * Values from the new element def take precedence if a value is
     * not mergeable.
     * @param $def HTMLPurifier_ElementDef whose values are merged in
     */
    public function mergeIn($def) {

        // later keys takes precedence
        foreach($def->attr as $k => $v) {
            if ($k === 0) {
                // merge in the includes
                // sorry, no way to override an include
                foreach ($v as $v2) {
                    $this->attr[0][] = $v2;
                }
                continue;
            }
            if ($v === false) {
                // false is the marker for "remove this attribute"
                if (isset($this->attr[$k])) unset($this->attr[$k]);
                continue;
            }
            $this->attr[$k] = $v;
        }
        $this->_mergeAssocArray($this->attr_transform_pre, $def->attr_transform_pre);
        $this->_mergeAssocArray($this->attr_transform_post, $def->attr_transform_post);
        $this->_mergeAssocArray($this->excludes, $def->excludes);

        if(!empty($def->content_model)) {
            // #SUPER in the incoming model stands for our current model.
            // Our model may still be null; the explicit string cast keeps
            // the result identical ("" is substituted) while avoiding the
            // PHP 8.1+ deprecation for passing null to str_replace().
            $this->content_model =
                str_replace("#SUPER", (string) $this->content_model, $def->content_model);
            // the cached child definition no longer matches the model
            $this->child = false;
        }
        if(!empty($def->content_model_type)) {
            $this->content_model_type = $def->content_model_type;
            $this->child = false;
        }
        if(!is_null($def->child)) $this->child = $def->child;
        if(!is_null($def->formatting)) $this->formatting = $def->formatting;
        if($def->descendants_are_inline) $this->descendants_are_inline = $def->descendants_are_inline;

    }

    /**
     * Merges one array into another, removes values which equal false
     * @param $a1 Array by reference that is merged into
     * @param $a2 Array that merges into $a1
     */
    private function _mergeAssocArray(&$a1, $a2) {
        foreach ($a2 as $k => $v) {
            if ($v === false) {
                if (isset($a1[$k])) unset($a1[$k]);
                continue;
            }
            $a1[$k] = $v;
        }
    }

}
3000
3001
3002
3003
3004
3005/**
3006 * A UTF-8 specific character encoder that handles cleaning and transforming.
3007 * @note All functions in this class should be static.
3008 */
3009class HTMLPurifier_Encoder
3010{
3011
3012    /**
3013     * Constructor throws fatal error if you attempt to instantiate class
3014     */
3015    private function __construct() {
3016        trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
3017    }
3018
3019    /**
3020     * Error-handler that mutes errors, alternative to shut-up operator.
3021     */
3022    public static function muteErrorHandler() {}
3023
3024    /**
3025     * Cleans a UTF-8 string for well-formedness and SGML validity
3026     *
3027     * It will parse according to UTF-8 and return a valid UTF8 string, with
3028     * non-SGML codepoints excluded.
3029     *
3030     * @note Just for reference, the non-SGML code points are 0 to 31 and
3031     *       127 to 159, inclusive.  However, we allow code points 9, 10
3032     *       and 13, which are the tab, line feed and carriage return
3033     *       respectively. 128 and above the code points map to multibyte
3034     *       UTF-8 representations.
3035     *
3036     * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
3037     *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
3038     *       LGPL license.  Notes on what changed are inside, but in general,
3039     *       the original code transformed UTF-8 text into an array of integer
3040     *       Unicode codepoints. Understandably, transforming that back to
3041     *       a string would be somewhat expensive, so the function was modded to
3042     *       directly operate on the string.  However, this discourages code
3043     *       reuse, and the logic enumerated here would be useful for any
3044     *       function that needs to be able to understand UTF-8 characters.
3045     *       As of right now, only smart lossless character encoding converters
3046     *       would need that, and I'm probably not going to implement them.
3047     *       Once again, PHP 6 should solve all our problems.
3048     */
3049    public static function cleanUTF8($str, $force_php = false) {
3050
3051        // UTF-8 validity is checked since PHP 4.3.5
3052        // This is an optimization: if the string is already valid UTF-8, no
3053        // need to do PHP stuff. 99% of the time, this will be the case.
3054        // The regexp matches the XML char production, as well as well as excluding
3055        // non-SGML codepoints U+007F to U+009F
3056        if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
3057            return $str;
3058        }
3059
3060        $mState = 0; // cached expected number of octets after the current octet
3061                     // until the beginning of the next UTF8 character sequence
3062        $mUcs4  = 0; // cached Unicode character
3063        $mBytes = 1; // cached expected number of octets in the current sequence
3064
3065        // original code involved an $out that was an array of Unicode
3066        // codepoints.  Instead of having to convert back into UTF-8, we've
3067        // decided to directly append valid UTF-8 characters onto a string
3068        // $out once they're done.  $char accumulates raw bytes, while $mUcs4
3069        // turns into the Unicode code point, so there's some redundancy.
3070
3071        $out = '';
3072        $char = '';
3073
3074        $len = strlen($str);
3075        for($i = 0; $i < $len; $i++) {
3076            $in = ord($str{$i});
3077            $char .= $str[$i]; // append byte to char
3078            if (0 == $mState) {
3079                // When mState is zero we expect either a US-ASCII character
3080                // or a multi-octet sequence.
3081                if (0 == (0x80 & ($in))) {
3082                    // US-ASCII, pass straight through.
3083                    if (($in <= 31 || $in == 127) &&
3084                        !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
3085                    ) {
3086                        // control characters, remove
3087                    } else {
3088                        $out .= $char;
3089                    }
3090                    // reset
3091                    $char = '';
3092                    $mBytes = 1;
3093                } elseif (0xC0 == (0xE0 & ($in))) {
3094                    // First octet of 2 octet sequence
3095                    $mUcs4 = ($in);
3096                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
3097                    $mState = 1;
3098                    $mBytes = 2;
3099                } elseif (0xE0 == (0xF0 & ($in))) {
3100                    // First octet of 3 octet sequence
3101                    $mUcs4 = ($in);
3102                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
3103                    $mState = 2;
3104                    $mBytes = 3;
3105                } elseif (0xF0 == (0xF8 & ($in))) {
3106                    // First octet of 4 octet sequence
3107                    $mUcs4 = ($in);
3108                    $mUcs4 = ($mUcs4 & 0x07) << 18;
3109                    $mState = 3;
3110                    $mBytes = 4;
3111                } elseif (0xF8 == (0xFC & ($in))) {
3112                    // First octet of 5 octet sequence.
3113                    //
3114                    // This is illegal because the encoded codepoint must be
3115                    // either:
3116                    // (a) not the shortest form or
3117                    // (b) outside the Unicode range of 0-0x10FFFF.
3118                    // Rather than trying to resynchronize, we will carry on
3119                    // until the end of the sequence and let the later error
3120                    // handling code catch it.
3121                    $mUcs4 = ($in);
3122                    $mUcs4 = ($mUcs4 & 0x03) << 24;
3123                    $mState = 4;
3124                    $mBytes = 5;
3125                } elseif (0xFC == (0xFE & ($in))) {
3126                    // First octet of 6 octet sequence, see comments for 5
3127                    // octet sequence.
3128                    $mUcs4 = ($in);
3129                    $mUcs4 = ($mUcs4 & 1) << 30;
3130                    $mState = 5;
3131                    $mBytes = 6;
3132                } else {
3133                    // Current octet is neither in the US-ASCII range nor a
3134                    // legal first octet of a multi-octet sequence.
3135                    $mState = 0;
3136                    $mUcs4  = 0;
3137                    $mBytes = 1;
3138                    $char = '';
3139                }
3140            } else {
3141                // When mState is non-zero, we expect a continuation of the
3142                // multi-octet sequence
3143                if (0x80 == (0xC0 & ($in))) {
3144                    // Legal continuation.
3145                    $shift = ($mState - 1) * 6;
3146                    $tmp = $in;
3147                    $tmp = ($tmp & 0x0000003F) << $shift;
3148                    $mUcs4 |= $tmp;
3149
3150                    if (0 == --$mState) {
3151                        // End of the multi-octet sequence. mUcs4 now contains
3152                        // the final Unicode codepoint to be output
3153
3154                        // Check for illegal sequences and codepoints.
3155
3156                        // From Unicode 3.1, non-shortest form is illegal
3157                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
3158                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
3159                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
3160                            (4 < $mBytes) ||
3161                            // From Unicode 3.2, surrogate characters = illegal
3162                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
3163                            // Codepoints outside the Unicode range are illegal
3164                            ($mUcs4 > 0x10FFFF)
3165                        ) {
3166
3167                        } elseif (0xFEFF != $mUcs4 && // omit BOM
3168                            // check for valid Char unicode codepoints
3169                            (
3170                                0x9 == $mUcs4 ||
3171                                0xA == $mUcs4 ||
3172                                0xD == $mUcs4 ||
3173                                (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
3174                                // 7F-9F is not strictly prohibited by XML,
3175                                // but it is non-SGML, and thus we don't allow it
3176                                (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
3177                                (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
3178                            )
3179                        ) {
3180                            $out .= $char;
3181                        }
3182                        // initialize UTF8 cache (reset)
3183                        $mState = 0;
3184                        $mUcs4  = 0;
3185                        $mBytes = 1;
3186                        $char = '';
3187                    }
3188                } else {
3189                    // ((0xC0 & (*in) != 0x80) && (mState != 0))
3190                    // Incomplete multi-octet sequence.
3191                    // used to result in complete fail, but we'll reset
3192                    $mState = 0;
3193                    $mUcs4  = 0;
3194                    $mBytes = 1;
3195                    $char ='';
3196                }
3197            }
3198        }
3199        return $out;
3200    }
3201
3202    /**
3203     * Translates a Unicode codepoint into its corresponding UTF-8 character.
3204     * @note Based on Feyd's function at
3205     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
3206     *       which is in public domain.
3207     * @note While we're going to do code point parsing anyway, a good
3208     *       optimization would be to refuse to translate code points that
3209     *       are non-SGML characters.  However, this could lead to duplication.
3210     * @note This is very similar to the unichr function in
3211     *       maintenance/generate-entity-file.php (although this is superior,
3212     *       due to its sanity checks).
3213     */
3214
3215    // +----------+----------+----------+----------+
3216    // | 33222222 | 22221111 | 111111   |          |
3217    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
3218    // +----------+----------+----------+----------+
3219    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
3220    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
3221    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
3222    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
3223    // +----------+----------+----------+----------+
3224    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
3225    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
3226    // +----------+----------+----------+----------+
3227
public static function unichr($code) {
    // Anything above U+10FFFF, below zero, or inside the surrogate block
    // U+D800..U+DFFF has no UTF-8 form here: signal that with ''.
    $is_surrogate = ($code >= 0xD800 && $code <= 0xDFFF);
    if ($code > 0x10FFFF || $code < 0 || $is_surrogate) {
        return '';
    }

    // 1 byte: 0xxxxxxx
    if ($code < 0x80) {
        return chr($code);
    }

    // Every multi-byte form ends with 10xxxxxx (lowest six bits).
    $last = chr(($code & 0x3F) | 0x80);

    // 2 bytes: 110yyyyy 10xxxxxx
    if ($code < 0x800) {
        return chr((($code & 0x7FF) >> 6) | 0xC0) . $last;
    }

    // Forms of three bytes or more carry bits 6..11 in a 10yyyyyy byte.
    $middle = chr((($code & 0xFC0) >> 6) | 0x80);

    // 3 bytes: 1110zzzz 10yyyyyy 10xxxxxx
    if ($code < 0x10000) {
        return chr((($code >> 12) & 0x0F) | 0xE0) . $middle . $last;
    }

    // 4 bytes: 11110www 10wwzzzz 10yyyyyy 10xxxxxx
    $lead   = chr((($code >> 18) & 0x07) | 0xF0);
    $second = chr((($code >> 12) & 0x3F) | 0x80);
    return $lead . $second . $middle . $last;
}
3264
3265    /**
3266     * Converts a string to UTF-8 based on configuration.
3267     */
public static function convertToUTF8($str, $config, $context) {
    // Converts $str from the encoding named by %Core.Encoding to UTF-8.
    //   $str     string to convert
    //   $config  object answering get('Core.Encoding') and
    //            get('Test.ForceNoIconv')
    //   $context not used by this method
    // Returns the converted string, or '' when iconv() rejects the
    // encoding. When no conversion route exists an E_USER_ERROR is raised
    // and nothing is returned.
    $encoding = $config->get('Core.Encoding');
    if ($encoding === 'utf-8') return $str;
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');
    // Temporarily route PHP errors raised by the conversion calls to
    // HTMLPurifier_Encoder::muteErrorHandler. Every exit path below must
    // undo this with restore_error_handler().
    set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
    if ($iconv && !$config->get('Test.ForceNoIconv')) {
        $str = iconv($encoding, 'utf-8//IGNORE', $str);
        if ($str === false) {
            // $encoding is not a valid encoding
            restore_error_handler();
            trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
            return '';
        }
        // If the string is bjorked by Shift_JIS or a similar encoding
        // that doesn't support all of ASCII, convert the naughty
        // characters to their true byte-wise ASCII/UTF-8 equivalents.
        // The helper returns false when it cannot probe the encoding;
        // in that case there is nothing to translate.
        $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
        if (!empty($ascii_fix)) {
            $str = strtr($str, $ascii_fix);
        }
        restore_error_handler();
        return $str;
    } elseif ($encoding === 'iso-8859-1') {
        $str = utf8_encode($str);
        restore_error_handler();
        return $str;
    }
    // Restore the previous handler BEFORE raising the error: otherwise the
    // error is delivered to the temporary handler installed above, and that
    // handler stays installed for the rest of the request.
    restore_error_handler();
    trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
}
3295
3296    /**
3297     * Converts a string from UTF-8 based on configuration.
3298     * @note Currently, this is a lossy conversion, with unexpressable
3299     *       characters being omitted.
3300     */
public static function convertFromUTF8($str, $config, $context) {
    // Converts a UTF-8 string to the encoding named by %Core.Encoding.
    //   $str     UTF-8 string to convert
    //   $config  object answering get('Core.Encoding'),
    //            get('Core.EscapeNonASCIICharacters') and
    //            get('Test.ForceNoIconv')
    //   $context not used by this method
    // Returns the converted string (whatever iconv()/utf8_decode() give
    // back). When no conversion route exists an E_USER_ERROR is raised and
    // nothing is returned.
    $encoding = $config->get('Core.Encoding');
    if ($encoding === 'utf-8') return $str;
    static $iconv = null;
    if ($iconv === null) $iconv = function_exists('iconv');
    // Optionally turn every non-ASCII character into a numeric entity first.
    $escape = $config->get('Core.EscapeNonASCIICharacters');
    if ($escape) {
        $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
    }
    // Temporarily route PHP errors raised by the conversion calls to
    // HTMLPurifier_Encoder::muteErrorHandler. Every exit path below must
    // undo this with restore_error_handler().
    set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
    if ($iconv && !$config->get('Test.ForceNoIconv')) {
        // Undo our previous fix in convertToUTF8, otherwise iconv will barf
        $ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
        // The helper returns false when it cannot probe the encoding;
        // treat that as "no fix-ups" so array_flip()/strtr() below always
        // receive an array.
        if (!is_array($ascii_fix)) {
            $ascii_fix = array();
        }
        if (!$escape && !empty($ascii_fix)) {
            // Drop the UTF-8 characters that are about to be produced by
            // the reverse mapping, so pre-existing ones are not confused
            // with the mapped ASCII bytes.
            $clear_fix = array();
            foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
            $str = strtr($str, $clear_fix);
        }
        $str = strtr($str, array_flip($ascii_fix));
        // Normal stuff
        $str = iconv('utf-8', $encoding . '//IGNORE', $str);
        restore_error_handler();
        return $str;
    } elseif ($encoding === 'iso-8859-1') {
        $str = utf8_decode($str);
        restore_error_handler();
        return $str;
    }
    // Restore the previous handler BEFORE raising the error: otherwise the
    // error is delivered to the temporary handler installed above, and that
    // handler stays installed for the rest of the request.
    restore_error_handler();
    trigger_error('Encoding not supported', E_USER_ERROR);
}
3330
3331    /**
3332     * Lossless (character-wise) conversion of HTML to ASCII
3333     * @param $str UTF-8 string to be converted to ASCII
3334     * @returns ASCII encoded string with non-ASCII character entity-ized
3335     * @warning Adapted from MediaWiki, claiming fair use: this is a common
3336     *       algorithm. If you disagree with this license fudgery,
3337     *       implement it yourself.
3338     * @note Uses decimal numeric entities since they are best supported.
3339     * @note This is a DUMB function: it has no concept of keeping
3340     *       character entities that the projected character encoding
3341     *       can allow. We could possibly implement a smart version
3342     *       but that would require it to also know which Unicode
3343     *       codepoints the charset supported (not an easy task).
     * @note Sort of like cleanUTF8(), but it assumes that $str is
     *       well-formed UTF-8
3346     */
public static function convertToASCIIDumbLossless($str) {
    // Walks $str byte by byte: ASCII bytes are copied through, and each
    // multi-byte sequence is folded into one code point and emitted as a
    // decimal numeric entity ("&#NNN;"). The input is expected to be
    // well-formed UTF-8; no validation is performed.
    $pending   = 0;   // continuation bytes still expected
    $codepoint = 0;   // code point accumulated so far
    $out       = '';
    $length    = strlen($str);

    for ($pos = 0; $pos < $length; $pos++) {
        $byte = ord($str[$pos]);

        if ($byte < 0x80) {
            // 0xxx xxxx: plain ASCII
            $out .= $str[$pos];
            $pending = 0;
            continue;
        }
        if ($byte >= 0xF0) {
            // 1111 0xxx: lead byte of a four-byte sequence
            $codepoint = $byte & 0x07;
            $pending = 3;
            continue;
        }
        if ($byte >= 0xE0) {
            // 1110 xxxx: lead byte of a three-byte sequence
            $codepoint = $byte & 0x0F;
            $pending = 2;
            continue;
        }
        if ($byte >= 0xC0) {
            // 110x xxxx: lead byte of a two-byte sequence
            $codepoint = $byte & 0x1F;
            $pending = 1;
            continue;
        }

        // 10xx xxxx: continuation byte, append its six payload bits
        $codepoint = ($codepoint << 6) + ($byte & 0x3F);
        $pending--;
        if ($pending <= 0) {
            $out .= '&#' . $codepoint . ';';
        }
    }

    return $out;
}
3377
3378    /**
3379     * This expensive function tests whether or not a given character
3380     * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
3381     * fail this test, and require special processing. Variable width
3382     * encodings shouldn't ever fail.
3383     *
3384     * @param string $encoding Encoding name to test, as per iconv format
3385     * @param bool $bypass Whether or not to bypass the precompiled arrays.
3386     * @return Array of UTF-8 characters to their corresponding ASCII,
3387     *      which can be used to "undo" any overzealous iconv action.
3388     */
public static function testEncodingSupportsASCII($encoding, $bypass = false) {
    // Builds (or returns a precompiled/cached) map of
    // UTF-8 character => ASCII character for the printable ASCII characters
    // that $encoding does not round-trip through iconv(). Returns false
    // when iconv() cannot convert to $encoding at all.
    static $encodings = array();
    if (!$bypass) {
        if (isset($encodings[$encoding])) return $encodings[$encoding];
        $lenc = strtolower($encoding);
        switch ($lenc) {
            case 'shift_jis':
                return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
            case 'johab':
                return array("\xE2\x82\xA9" => '\\');
        }
        if (strpos($lenc, 'iso-8859-') === 0) return array();
    }
    $ret = array();
    set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
    if (iconv('UTF-8', $encoding, 'a') === false) {
        // Unknown or unusable encoding: put the caller's error handler back
        // before bailing out, so the temporary handler is not left
        // installed.
        restore_error_handler();
        return false;
    }
    for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
        $c = chr($i); // UTF-8 char
        $r = iconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
        if (
            $r === '' ||
            // This line is needed for iconv implementations that do not
            // omit characters that do not exist in the target character set
            ($r === $c && iconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
        ) {
            // Reverse engineer: what's the UTF-8 equiv of this byte
            // sequence? This assumes that there's no variable width
            // encoding that doesn't support ASCII.
            $utf8 = iconv($encoding, 'UTF-8//IGNORE', $c);
            // A failed (false) or empty reverse conversion is useless as a
            // translation key: false would be stored under key 0 and ''
            // under the empty key, and callers feed this map to strtr().
            if (is_string($utf8) && $utf8 !== '') {
                $ret[$utf8] = $c;
            }
        }
    }
    restore_error_handler();
    $encodings[$encoding] = $ret;
    return $ret;
}
3424
3425
3426}
3427
3428
3429
3430
3431
3432/**
3433 * Object that provides entity lookup table from entity name to character
3434 */
class HTMLPurifier_EntityLookup {

    /**
     * Associative array mapping an entity name to the character it
     * represents. Unset until setup() has run.
     */
    public $table;

    /**
     * Loads the lookup table by unserializing the contents of a file.
     * @param $file Path of the serialized table; any false-y value selects
     *        the entities.ser file below HTMLPURIFIER_PREFIX.
     * @note The serialized contents are versioned, but were generated
     *       using the maintenance script generate-entity-file.php
     * @warning This is not in constructor to help enforce the Singleton
     */
    public function setup($file = false) {
        $path = $file
            ? $file
            : HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
        $serialized = file_get_contents($path);
        $this->table = unserialize($serialized);
    }

    /**
     * Returns the shared lookup object, creating and loading it on first use.
     * @param $prototype Optional custom lookup object; when truthy it
     *        replaces the shared object and is returned.
     */
    public static function instance($prototype = false) {
        // no references, since PHP doesn't copy unless modified
        static $instance = null;
        if ($prototype) {
            $instance = $prototype;
            return $instance;
        }
        if (!$instance) {
            $fresh = new HTMLPurifier_EntityLookup();
            $fresh->setup();
            $instance = $fresh;
        }
        return $instance;
    }

}
3472
3473
3474
3475
3476
3477// if want to implement error collecting here, we'll need to use some sort
3478// of global data (probably trigger_error) because it's impossible to pass
3479// $config or $context to the callback functions.
3480
3481/**
 * Handles referencing and dereferencing character entities
3483 */
class HTMLPurifier_EntityParser
{

    /**
     * Entity lookup table object; fetched lazily from
     * HTMLPurifier_EntityLookup::instance() the first time a named,
     * non-special entity has to be resolved.
     */
    protected $_entity_lookup;

    /**
     * Callback regex string for parsing entities. The trailing semicolon
     * is optional. Capture groups: 1 = hex digits, 2 = decimal digits
     * (leading zeros skipped), 3 = entity name.
     */
    protected $_substituteEntitiesRegex =
'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
//     1. hex             2. dec      3. string (XML style)


    /**
     * Decimal to parsed string conversion table for special entities.
     */
    protected $_special_dec2str =
            array(
                    34 => '"',
                    38 => '&',
                    39 => "'",
                    60 => '<',
                    62 => '>'
            );

    /**
     * Stripped entity names to decimal conversion table for special entities.
     */
    protected $_special_ent2dec =
            array(
                    'quot' => 34,
                    'amp'  => 38,
                    'lt'   => 60,
                    'gt'   => 62
            );

    /**
     * Substitutes non-special entities with their parsed equivalents,
     * leaving the special ones (the keys of the two tables above) as
     * written. Meant to be run once, before everything else.
     *
     * @param $string String to have non-special entities parsed.
     * @returns Parsed string.
     */
    public function substituteNonSpecialEntities($string) {
        // it will try to detect missing semicolons, but don't rely on it
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
            );
    }

    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
     *
     * @param $matches  PCRE matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @returns Replacement string: the decoded character(s), or the
     *          original entity text when it is special or unknown.
     */
    protected function nonSpecialEntityCallback($matches) {
        // replaces all but big five
        $entity = $matches[0];
        $is_num = (isset($entity[1]) && $entity[1] === '#');
        if ($is_num) {
            $is_hex = (isset($entity[2]) && $entity[2] === 'x');
            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];

            // abort for special characters
            if (isset($this->_special_dec2str[$code]))  return $entity;

            return HTMLPurifier_Encoder::unichr($code);
        }

        $name = $matches[3];
        if (isset($this->_special_ent2dec[$name])) return $entity;
        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$name])) {
            return $this->_entity_lookup->table[$name];
        }
        // unknown name: leave the text untouched
        return $entity;
    }

    /**
     * Substitutes only special entities with their parsed equivalents.
     *
     * @note We try to avoid calling this function because otherwise, it
     * would have to be called a lot (for every parsed section).
     *
     * @param $string String to have special entities parsed.
     * @returns Parsed string.
     */
    public function substituteSpecialEntities($string) {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string);
    }

    /**
     * Callback function for substituteSpecialEntities() that does the work.
     *
     * This callback has same syntax as nonSpecialEntityCallback().
     *
     * @param $matches  PCRE-style matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @returns Replacement string: the literal character for a special
     *          entity, otherwise the original entity text.
     */
    protected function specialEntityCallback($matches) {
        $entity = $matches[0];
        $is_num = (isset($entity[1]) && $entity[1] === '#');
        if ($is_num) {
            $is_hex = (isset($entity[2]) && $entity[2] === 'x');
            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
            return isset($this->_special_dec2str[$int]) ?
                $this->_special_dec2str[$int] :
                $entity;
        }

        $name = $matches[3];
        if (!isset($this->_special_ent2dec[$name])) {
            return $entity;
        }
        // _special_ent2dec only yields the decimal code point; it has to be
        // mapped through _special_dec2str to obtain the character itself.
        // Returning the code directly would turn "&amp;" into "38".
        return $this->_special_dec2str[$this->_special_ent2dec[$name]];
    }

}
3617
3618
3619
3620
3621
3622/**
3623 * Error collection class that enables HTML Purifier to report HTML
3624 * problems back to the user
3625 */
class HTMLPurifier_ErrorCollector
{

    /**
     * Keys of each raw error record. They are deliberately numeric so a
     * record can be unpacked with list().
     */
    const LINENO   = 0;
    const SEVERITY = 1;
    const MESSAGE  = 2;
    const CHILDREN = 3;

    /** Alias (by reference) of $_stacks[0]: the flat list of raw records. */
    protected $errors;
    /** Alias (by reference) of $_stacks[0]: where send() appends records. */
    protected $_current;
    protected $_stacks = array(array());
    /** The 'Locale' object taken from the context; supplies message text. */
    protected $locale;
    /** HTMLPurifier_Generator created by getHTMLFormatted() for escaping. */
    protected $generator;
    protected $context;

    /**
     * Error structs indexed as [line][column]; key -1 holds the single
     * struct used for errors without an integer line/column.
     */
    protected $lines = array();

    public function __construct($context) {
        $this->context   = $context;
        $this->locale    =& $context->get('Locale');
        // both properties alias the same underlying array
        $this->_current  =& $this->_stacks[0];
        $this->errors    =& $this->_stacks[0];
    }

    /**
     * Sends an error message to the collector for later use
     * @param $severity int Error severity, PHP error style (don't use E_USER_)
     * @param $msg string Error message text
     * @note Any further arguments are passed to the locale as message
     *       substitutions, keyed from 1 upwards.
     */
    public function send($severity, $msg) {

        // Drop $severity, then $msg, keeping the remaining arguments under
        // their 1-based keys.
        $args = func_get_args();
        $args = array_slice($args, 1);
        unset($args[0]);

        // Location: taken from the current token when there is one,
        // otherwise from the context's own line/column entries.
        $token = $this->context->get('CurrentToken', true);
        if ($token) {
            $line = $token->line;
            $col  = $token->col;
        } else {
            $line = $this->context->get('CurrentLine', true);
            $col  = $this->context->get('CurrentCol',  true);
        }
        $attr = $this->context->get('CurrentAttr', true);

        // perform special substitutions, also add custom parameters
        $subst = array();
        if ($token !== null) {
            $args['CurrentToken'] = $token;
        }
        if ($attr !== null) {
            $subst['$CurrentAttr.Name'] = $attr;
            if (isset($token->attr[$attr])) {
                $subst['$CurrentAttr.Value'] = $token->attr[$attr];
            }
        }

        $msg = empty($args)
            ? $this->locale->getMessage($msg)
            : $this->locale->formatMessage($msg, $args);

        if (!empty($subst)) {
            $msg = strtr($msg, $subst);
        }

        // Raw record (numerically indexed), appended to the flat list.
        $this->_current[] = array(
            self::LINENO   => $line,
            self::SEVERITY => $severity,
            self::MESSAGE  => $msg,
            self::CHILDREN => array()
        );


        // Structured bookkeeping below.

        // Top-level structs are always TOKEN type; the value is a copy of
        // the token when one is available and stays unset otherwise.
        $fresh = new HTMLPurifier_ErrorStruct();
        $fresh->type = HTMLPurifier_ErrorStruct::TOKEN;
        if ($token) {
            $fresh->value = clone $token;
        }

        if (is_int($line) && is_int($col)) {
            if (!isset($this->lines[$line][$col])) {
                $this->lines[$line][$col] = $fresh;
            }
            $struct = $this->lines[$line][$col];
            // These ksorts may present a performance problem
            ksort($this->lines[$line], SORT_NUMERIC);
        } else {
            if (!isset($this->lines[-1])) {
                $this->lines[-1] = $fresh;
            }
            $struct = $this->lines[-1];
        }
        ksort($this->lines, SORT_NUMERIC);

        // Now, check if we need to operate on a lower structure
        if (!empty($attr)) {
            $struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr);
            if (!$struct->value) {
                $struct->value = array($attr, 'PUT VALUE HERE');
            }
        }
        // NOTE(review): $cssprop is never assigned in this method, so this
        // branch cannot run as written.
        if (!empty($cssprop)) {
            $struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop);
            if (!$struct->value) {
                // if we tokenize CSS this might be a little more difficult to do
                $struct->value = array($cssprop, 'PUT VALUE HERE');
            }
        }

        // Ok, structs are all setup, now time to register the error
        $struct->addError($severity, $msg);
    }

    /**
     * Retrieves raw error data for custom formatter to use
     * @return List of arrays in format of array(line of error,
     *         error severity, error message,
     *         recursive sub-errors array)
     */
    public function getRaw() {
        return $this->errors;
    }

    /**
     * Default HTML formatting implementation for error messages
     * @param $config Configuration object handed to the generator that
     *        escapes the messages
     * @param $errors Errors array; only consulted to decide whether the
     *        "no errors" paragraph is returned. Defaults to the collected
     *        raw records.
     * @return HTML string: a <ul> of rendered errors, or a <p> with the
     *         locale's 'ErrorCollector: No errors' message.
     */
    public function getHTMLFormatted($config, $errors = null) {
        $items = array();

        $this->generator = new HTMLPurifier_Generator($config, $this->context);
        if ($errors === null) {
            $errors = $this->errors;
        }

        // Positioned errors first, in line/column order ...
        foreach ($this->lines as $line => $columns) {
            if ($line == -1) {
                continue;
            }
            foreach ($columns as $col => $struct) {
                $this->_renderStruct($items, $struct, $line, $col);
            }
        }
        // ... then the errors that carry no position.
        if (isset($this->lines[-1])) {
            $this->_renderStruct($items, $this->lines[-1]);
        }

        if (empty($errors)) {
            return '<p>' . $this->locale->getMessage('ErrorCollector: No errors') . '</p>';
        }
        return '<ul><li>' . implode('</li><li>', $items) . '</li></ul>';

    }

    /**
     * Appends one HTML fragment per error of $struct and of its descendant
     * structs to $ret, walking the tree with an explicit stack.
     * @param $ret Output list, modified in place
     * @param $struct Error struct to start from
     * @param $line Line number for the location label, or null
     * @param $col Column number for the location label, or null
     */
    private function _renderStruct(&$ret, $struct, $line = null, $col = null) {
        $stack = array($struct);
        $context_stack = array(array());
        while ($current = array_pop($stack)) {
            $context = array_pop($context_stack);

            foreach ($current->errors as $record) {
                $severity = $record[0];
                $msg      = $record[1];

                // W3C uses an icon to indicate the severity of the error.
                $label = $this->locale->getErrorName($severity);

                if ($line !== null && $col !== null) {
                    $where = '<em class="location">Line ' . $line . ', Column ' . $col . ': </em> ';
                } else {
                    $where = '<em class="location">End of Document: </em> ';
                }

                // A marker for the offending column could be added here;
                // extremely long lines would need clipping.
                $ret[] = '<div>'
                    . '<span class="error e' . $severity . '"><strong>' . $label . '</strong></span> '
                    . $where
                    . '<strong class="description">' . $this->generator->escape($msg) . '</strong> '
                    . '</div>';
            }

            foreach ($current->children as $type => $group) {
                $context[] = $current;
                $stack = array_merge($stack, array_reverse($group, true));
                // one context entry per child pushed for this group
                $remaining = count($group);
                while ($remaining > 0) {
                    $context_stack[] = $context;
                    $remaining--;
                }
            }
        }
    }

}
3827
3828
3829
3830
3831
3832/**
3833 * Records errors for particular segments of an HTML document such as tokens,
3834 * attributes or CSS properties. They can contain error structs (which apply
3835 * to components of what they represent), but their main purpose is to hold
3836 * errors applying to whatever struct is being used.
3837 */
class HTMLPurifier_ErrorStruct
{

    /**
     * Struct types, also used as the first key of $children. Note that
     * top-level structures are automatically token-level.
     */
    const TOKEN     = 0;
    const ATTR      = 1;
    const CSSPROP   = 2;

    /**
     * Type of this struct (one of the constants above).
     */
    public $type;

    /**
     * The thing errors are being recorded for. Depending on $type:
     *  - TOKEN: Instance of HTMLPurifier_Token
     *  - ATTR: array('attr-name', 'value')
     *  - CSSPROP: array('prop-name', 'value')
     */
    public $value;

    /**
     * Errors registered for this structure, each stored as
     * array(severity, message).
     */
    public $errors = array();

    /**
     * Child ErrorStructs that are from this structure. For example, a TOKEN
     * ErrorStruct would contain ATTR ErrorStructs. This is a multi-dimensional
     * array in structure: [TYPE]['identifier']
     */
    public $children = array();

    /**
     * Returns the child struct stored under [$type][$id], creating it (with
     * its type set to $type) when it does not exist yet.
     */
    public function getChild($type, $id) {
        if (isset($this->children[$type][$id])) {
            return $this->children[$type][$id];
        }
        $child = new HTMLPurifier_ErrorStruct();
        $child->type = $type;
        $this->children[$type][$id] = $child;
        return $child;
    }

    /**
     * Records one error against this struct.
     */
    public function addError($severity, $message) {
        $entry = array($severity, $message);
        $this->errors[] = $entry;
    }

}
3888
3889
3890
3891
3892
3893/**
3894 * Global exception class for HTML Purifier; any exceptions we throw
3895 * are from here.
3896 */
class HTMLPurifier_Exception extends Exception
{
    // Deliberately empty: this subclass adds nothing to Exception and only
    // provides a distinct type for the library's own exceptions.
}
3901
3902
3903
3904
3905
3906/**
3907 * Represents a pre or post processing filter on HTML Purifier's output
3908 *
3909 * Sometimes, a little ad-hoc fixing of HTML has to be done before
 * it gets sent through HTML Purifier: you can use filters to achieve
3911 * this effect. For instance, YouTube videos can be preserved using
3912 * this manner. You could have used a decorator for this task, but
3913 * PHP's support for them is not terribly robust, so we're going
3914 * to just loop through the filters.
3915 *
3916 * Filters should be exited first in, last out. If there are three filters,
3917 * named 1, 2 and 3, the order of execution should go 1->preFilter,
3918 * 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
3919 * 1->postFilter.
3920 *
3921 * @note Methods are not declared abstract as it is perfectly legitimate
3922 *       for an implementation not to want anything to happen on a step
3923 */
3924
class HTMLPurifier_Filter
{

    /**
     * Name of the filter for identification purposes
     */
    public $name;

    /**
     * Hook applied to the HTML before HTML Purifier handles it. This base
     * implementation hands the input back unchanged.
     * @param $html HTML to pre-process
     * @param $config Not used by the base implementation
     * @param $context Not used by the base implementation
     * @return The (here: unmodified) HTML
     */
    public function preFilter($html, $config, $context) {
        $unchanged = $html;
        return $unchanged;
    }

    /**
     * Hook applied to the HTML after HTML Purifier has handled it. This
     * base implementation hands the input back unchanged.
     * @param $html HTML to post-process
     * @param $config Not used by the base implementation
     * @param $context Not used by the base implementation
     * @return The (here: unmodified) HTML
     */
    public function postFilter($html, $config, $context) {
        $unchanged = $html;
        return $unchanged;
    }

}
3948
3949
3950
3951
3952
3953/**
3954 * Generates HTML from tokens.
3955 * @todo Refactor interface so that configuration/context is determined
3956 *       upon instantiation, no need for messy generateFromTokens() calls
3957 * @todo Make some of the more internal functions protected, and have
3958 *       unit tests work around that
3959 */
3960class HTMLPurifier_Generator
3961{
3962
3963    /**
3964     * Whether or not generator should produce XML output
3965     */
3966    private $_xhtml = true;
3967
3968    /**
3969     * :HACK: Whether or not generator should comment the insides of <script> tags
3970     */
3971    private $_scriptFix = false;
3972
3973    /**
3974     * Cache of HTMLDefinition during HTML output to determine whether or
3975     * not attributes should be minimized.
3976     */
3977    private $_def;
3978
3979    /**
3980     * Cache of %Output.SortAttr
3981     */
3982    private $_sortAttr;
3983
3984    /**
3985     * Cache of %Output.FlashCompat
3986     */
3987    private $_flashCompat;
3988
3989    /**
3990     * Cache of %Output.FixInnerHTML
3991     */
3992    private $_innerHTMLFix;
3993
3994    /**
3995     * Stack for keeping track of object information when outputting IE
3996     * compatibility code.
3997     */
3998    private $_flashStack = array();
3999
4000    /**
4001     * Configuration for the generator
4002     */
4003    protected $config;
4004
4005    /**
4006     * @param $config Instance of HTMLPurifier_Config
4007     * @param $context Instance of HTMLPurifier_Context
4008     */
public function __construct($config, $context) {
    // Keep the configuration and cache the output directives read during
    // generation. $context is accepted but not used here.
    $this->config        = $config;
    $this->_scriptFix    = $config->get('Output.CommentScriptContents');
    $this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
    $this->_sortAttr     = $config->get('Output.SortAttr');
    $this->_flashCompat  = $config->get('Output.FlashCompat');

    // XML-style output is decided by the doctype of the HTML definition.
    $definition   = $config->getHTMLDefinition();
    $this->_def   = $definition;
    $this->_xhtml = $definition->doctype->xml;
}
4018
4019    /**
4020     * Generates HTML from an array of tokens.
4021     * @param $tokens Array of HTMLPurifier_Token
4022     * @param $config HTMLPurifier_Config object
4023     * @return Generated HTML
4024     */
public function generateFromTokens($tokens) {
    // Serializes a list of tokens to an HTML string; an empty/false-y list
    // gives ''. Optionally the result is run through Tidy and has its
    // newlines normalized, depending on configuration.
    if (!$tokens) return '';

    // Basic algorithm
    $html  = '';
    $size  = count($tokens);
    $index = 0;
    while ($index < $size) {
        $is_script_block =
            $this->_scriptFix
            && $tokens[$index]->name === 'script'
            && $index + 2 < $size
            && $tokens[$index + 2] instanceof HTMLPurifier_Token_End;
        if ($is_script_block) {
            // script special case
            // the contents of the script block must be ONE token
            // for this to work.
            $html .= $this->generateFromToken($tokens[$index]);
            $html .= $this->generateScriptFromToken($tokens[$index + 1]);
            $index += 2;
        }
        // regular token (or the end tag that closes a script block)
        $html .= $this->generateFromToken($tokens[$index]);
        $index++;
    }

    // Tidy cleanup
    if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
        $tidy_options = array(
           'indent'=> true,
           'output-xhtml' => $this->_xhtml,
           'show-body-only' => true,
           'indent-spaces' => 2,
           'wrap' => 68,
        );
        $tidy = new Tidy();
        $tidy->parseString($html, $tidy_options, 'utf8');
        $tidy->cleanRepair();
        $html = (string) $tidy; // explicit cast necessary
    }

    // Normalize newlines to system defined value
    if ($this->config->get('Core.NormalizeNewlines')) {
        $newline = $this->config->get('Output.Newline');
        if ($newline === null) {
            $newline = PHP_EOL;
        }
        if ($newline !== "\n") {
            $html = str_replace("\n", $newline, $html);
        }
    }
    return $html;
}
4064
4065    /**
4066     * Generates HTML from a single token.
4067     * @param $token HTMLPurifier_Token object.
4068     * @return Generated HTML
4069     */
4070    public function generateFromToken($token) {
4071        if (!$token instanceof HTMLPurifier_Token) {
4072            trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
4073            return '';
4074
4075        } elseif ($token instanceof HTMLPurifier_Token_Start) {
4076            $attr = $this->generateAttributes($token->attr, $token->name);
4077            if ($this->_flashCompat) {
4078                if ($token->name == "object") {
4079                    $flash = new stdclass();
4080                    $flash->attr = $token->attr;
4081                    $flash->param = array();
4082                    $this->_flashStack[] = $flash;
4083                }
4084            }
4085            return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
4086
4087        } elseif ($token instanceof HTMLPurifier_Token_End) {
4088            $_extra = '';
4089            if ($this->_flashCompat) {
4090                if ($token->name == "object" && !empty($this->_flashStack)) {
4091                    // doesn't do anything for now
4092                }
4093            }
4094            return $_extra . '</' . $token->name . '>';
4095
4096        } elseif ($token instanceof HTMLPurifier_Token_Empty) {
4097            if ($this->_flashCompat && $token->name == "param" && !empty($this->_flashStack)) {
4098                $this->_flashStack[count($this->_flashStack)-1]->param[$token->attr['name']] = $token->attr['value'];
4099            }
4100            $attr = $this->generateAttributes($token->attr, $token->name);
4101             return '<' . $token->name . ($attr ? ' ' : '') . $attr .
4102                ( $this->_xhtml ? ' /': '' ) // <br /> v. <br>
4103                . '>';
4104
4105        } elseif ($token instanceof HTMLPurifier_Token_Text) {
4106            return $this->escape($token->data, ENT_NOQUOTES);
4107
4108        } elseif ($token instanceof HTMLPurifier_Token_Comment) {
4109            return '<!--' . $token->data . '-->';
4110        } else {
4111            return '';
4112
4113        }
4114    }
4115
4116    /**
4117     * Special case processor for the contents of script tags
4118     * @warning This runs into problems if there's already a literal
4119     *          --> somewhere inside the script contents.
4120     */
4121    public function generateScriptFromToken($token) {
4122        if (!$token instanceof HTMLPurifier_Token_Text) return $this->generateFromToken($token);
4123        // Thanks <http://lachy.id.au/log/2005/05/script-comments>
4124        $data = preg_replace('#//\s*$#', '', $token->data);
4125        return '<!--//--><![CDATA[//><!--' . "\n" . trim($data) . "\n" . '//--><!]]>';
4126    }
4127
4128    /**
4129     * Generates attribute declarations from attribute array.
4130     * @note This does not include the leading or trailing space.
4131     * @param $assoc_array_of_attributes Attribute array
4132     * @param $element Name of element attributes are for, used to check
4133     *        attribute minimization.
     * @return Generated HTML fragment for insertion.
4135     */
4136    public function generateAttributes($assoc_array_of_attributes, $element = false) {
4137        $html = '';
4138        if ($this->_sortAttr) ksort($assoc_array_of_attributes);
4139        foreach ($assoc_array_of_attributes as $key => $value) {
4140            if (!$this->_xhtml) {
4141                // Remove namespaced attributes
4142                if (strpos($key, ':') !== false) continue;
4143                // Check if we should minimize the attribute: val="val" -> val
4144                if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
4145                    $html .= $key . ' ';
4146                    continue;
4147                }
4148            }
4149            // Workaround for Internet Explorer innerHTML bug.
4150            // Essentially, Internet Explorer, when calculating
4151            // innerHTML, omits quotes if there are no instances of
4152            // angled brackets, quotes or spaces.  However, when parsing
4153            // HTML (for example, when you assign to innerHTML), it
4154            // treats backticks as quotes.  Thus,
4155            //      <img alt="``" />
4156            // becomes
4157            //      <img alt=`` />
4158            // becomes
4159            //      <img alt='' />
4160            // Fortunately, all we need to do is trigger an appropriate
4161            // quoting style, which we do by adding an extra space.
4162            // This also is consistent with the W3C spec, which states
4163            // that user agents may ignore leading or trailing
4164            // whitespace (in fact, most don't, at least for attributes
4165            // like alt, but an extra space at the end is barely
4166            // noticeable).  Still, we have a configuration knob for
4167            // this, since this transformation is not necesary if you
4168            // don't process user input with innerHTML or you don't plan
4169            // on supporting Internet Explorer.
4170            if ($this->_innerHTMLFix) {
4171                if (strpos($value, '`') !== false) {
4172                    // check if correct quoting style would not already be
4173                    // triggered
4174                    if (strcspn($value, '"\' <>') === strlen($value)) {
4175                        // protect!
4176                        $value .= ' ';
4177                    }
4178                }
4179            }
4180            $html .= $key.'="'.$this->escape($value).'" ';
4181        }
4182        return rtrim($html);
4183    }
4184
4185    /**
4186     * Escapes raw text data.
4187     * @todo This really ought to be protected, but until we have a facility
4188     *       for properly generating HTML here w/o using tokens, it stays
4189     *       public.
4190     * @param $string String data to escape for HTML.
4191     * @param $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
4192     *               permissible for non-attribute output.
4193     * @return String escaped data.
4194     */
4195    public function escape($string, $quote = null) {
4196        // Workaround for APC bug on Mac Leopard reported by sidepodcast
4197        // http://htmlpurifier.org/phorum/read.php?3,4823,4846
4198        if ($quote === null) $quote = ENT_COMPAT;
4199        return htmlspecialchars($string, $quote, 'UTF-8');
4200    }
4201
4202}
4203
4204
4205
4206
4207
4208/**
4209 * Definition of the purified HTML that describes allowed children,
4210 * attributes, and many other things.
4211 *
4212 * Conventions:
4213 *
4214 * All member variables that are prefixed with info
4215 * (including the main $info array) are used by HTML Purifier internals
4216 * and should not be directly edited when customizing the HTMLDefinition.
4217 * They can usually be set via configuration directives or custom
4218 * modules.
4219 *
4220 * On the other hand, member variables without the info prefix are used
4221 * internally by the HTMLDefinition and MUST NOT be used by other HTML
4222 * Purifier internals. Many of them, however, are public, and may be
4223 * edited by userspace code to tweak the behavior of HTMLDefinition.
4224 *
4225 * @note This class is inspected by Printer_HTMLDefinition; please
4226 *       update that class if things here change.
4227 *
4228 * @warning Directives that change this object's structure must be in
4229 *          the HTML or Attr namespace!
4230 */
class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
{

    // FULLY-PUBLIC VARIABLES ---------------------------------------------

    /**
     * Associative array of element names to HTMLPurifier_ElementDef
     */
    public $info = array();

    /**
     * Associative array of global attribute name to attribute definition.
     */
    public $info_global_attr = array();

    /**
     * String name of parent element HTML will be going into.
     */
    public $info_parent = 'div';

    /**
     * Definition for parent element, allows parent element to be a
     * tag that's not allowed inside the HTML fragment.
     */
    public $info_parent_def;

    /**
     * String name of element used to wrap inline elements in block context
     * @note This is rarely used except for BLOCKQUOTEs in strict mode
     */
    public $info_block_wrapper = 'p';

    /**
     * Associative array of deprecated tag name to HTMLPurifier_TagTransform
     */
    public $info_tag_transform = array();

    /**
     * Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
     */
    public $info_attr_transform_pre = array();

    /**
     * Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
     */
    public $info_attr_transform_post = array();

    /**
     * Nested lookup array of content set name (Block, Inline) to
     * element name to whether or not it belongs in that content set.
     */
    public $info_content_sets = array();

    /**
     * Indexed list of HTMLPurifier_Injector to be used.
     */
    public $info_injector = array();

    /**
     * Doctype object
     */
    public $doctype;



    // RAW CUSTOMIZATION STUFF --------------------------------------------

    /**
     * Adds a custom attribute to a pre-existing element
     * @note This is strictly convenience, and does not have a corresponding
     *       method in HTMLPurifier_HTMLModule
     * @param $element_name String element name to add attribute to
     * @param $attr_name String name of attribute
     * @param $def Attribute definition, can be string or object, see
     *             HTMLPurifier_AttrTypes for details
     */
    public function addAttribute($element_name, $attr_name, $def) {
        $module = $this->getAnonymousModule();
        // reuse the anonymous module's entry for this element if there
        // is one, otherwise start from a blank element
        if (!isset($module->info[$element_name])) {
            $element = $module->addBlankElement($element_name);
        } else {
            $element = $module->info[$element_name];
        }
        $element->attr[$attr_name] = $def;
    }

    /**
     * Adds a custom element to your HTML definition
     * @note See HTMLPurifier_HTMLModule::addElement for detailed
     *       parameter and return value descriptions.
     */
    public function addElement($element_name, $type, $contents, $attr_collections, $attributes = array()) {
        $module = $this->getAnonymousModule();
        // assume that if the user is calling this, the element
        // is safe. This may not be a good idea
        $element = $module->addElement($element_name, $type, $contents, $attr_collections, $attributes);
        return $element;
    }

    /**
     * Adds a blank element to your HTML definition, for overriding
     * existing behavior
     * @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
     *       parameter and return value descriptions.
     */
    public function addBlankElement($element_name) {
        $module  = $this->getAnonymousModule();
        $element = $module->addBlankElement($element_name);
        return $element;
    }

    /**
     * Retrieves a reference to the anonymous module, so you can
     * bust out advanced features without having to make your own
     * module.
     * @note The module is created on first use and named 'Anonymous'.
     * @return HTMLPurifier_HTMLModule instance
     */
    public function getAnonymousModule() {
        if (!$this->_anonModule) {
            $this->_anonModule = new HTMLPurifier_HTMLModule();
            $this->_anonModule->name = 'Anonymous';
        }
        return $this->_anonModule;
    }

    /**
     * Lazily created module holding the customizations made through the
     * methods above; handed to the manager and unset in processModules().
     */
    private $_anonModule;


    // PUBLIC BUT INTERNAL VARIABLES --------------------------------------

    /**
     * Type string of this definition
     */
    public $type = 'HTML';

    /**
     * Instance of HTMLPurifier_HTMLModuleManager; unset at the end of
     * doSetup().
     */
    public $manager;

    /**
     * Performs low-cost, preliminary initialization.
     */
    public function __construct() {
        $this->manager = new HTMLPurifier_HTMLModuleManager();
    }

    /**
     * Builds the definition: collects the module data, applies the
     * configuration restrictions, then discards the manager and the
     * content_model / content_model_type fields of every element.
     * @param $config Instance of HTMLPurifier_Config
     */
    protected function doSetup($config) {
        $this->processModules($config);
        $this->setupConfigStuff($config);
        unset($this->manager);

        // cleanup some of the element definitions
        foreach ($this->info as $k => $v) {
            unset($this->info[$k]->content_model);
            unset($this->info[$k]->content_model_type);
        }
    }

    /**
     * Extract out the information from the manager
     * @param $config Instance of HTMLPurifier_Config
     */
    protected function processModules($config) {

        if ($this->_anonModule) {
            // for user specific changes
            // this is late-loaded so we don't have to deal with PHP4
            // reference wonky-ness
            $this->manager->addModule($this->_anonModule);
            unset($this->_anonModule);
        }

        $this->manager->setup($config);
        $this->doctype = $this->manager->doctype;

        // Merge the per-module maps in module order. A value of false
        // removes whatever is registered under that key so far; any
        // other value replaces it.
        foreach ($this->manager->modules as $module) {
            foreach($module->info_tag_transform as $k => $v) {
                if ($v === false) unset($this->info_tag_transform[$k]);
                else $this->info_tag_transform[$k] = $v;
            }
            foreach($module->info_attr_transform_pre as $k => $v) {
                if ($v === false) unset($this->info_attr_transform_pre[$k]);
                else $this->info_attr_transform_pre[$k] = $v;
            }
            foreach($module->info_attr_transform_post as $k => $v) {
                if ($v === false) unset($this->info_attr_transform_post[$k]);
                else $this->info_attr_transform_post[$k] = $v;
            }
            foreach ($module->info_injector as $k => $v) {
                if ($v === false) unset($this->info_injector[$k]);
                else $this->info_injector[$k] = $v;
            }
        }

        $this->info = $this->manager->getElements();
        $this->info_content_sets = $this->manager->contentSets->lookup;

    }

    /**
     * Sets up stuff based on config. We need a better way of doing this.
     *
     * In order: block wrapper (HTML.BlockWrapper), parent element
     * (HTML.Parent), allowed elements and attributes
     * (HTML.AllowedElements / HTML.AllowedAttributes, or HTML.Allowed
     * when neither of those is an array), forbidden elements and
     * attributes (HTML.ForbiddenElements / HTML.ForbiddenAttributes),
     * and finally removal of injectors whose checkNeeded() does not
     * return false. Configuration problems are reported through
     * trigger_error().
     *
     * @param $config Instance of HTMLPurifier_Config
     */
    protected function setupConfigStuff($config) {

        $block_wrapper = $config->get('HTML.BlockWrapper');
        if (isset($this->info_content_sets['Block'][$block_wrapper])) {
            $this->info_block_wrapper = $block_wrapper;
        } else {
            trigger_error('Cannot use non-block element as block wrapper',
                E_USER_ERROR);
        }

        $parent = $config->get('HTML.Parent');
        $def = $this->manager->getElement($parent, true);
        if ($def) {
            $this->info_parent = $parent;
            $this->info_parent_def = $def;
        } else {
            trigger_error('Cannot use unrecognized element as parent',
                E_USER_ERROR);
            // keep the current info_parent and load its definition
            $this->info_parent_def = $this->manager->getElement($this->info_parent, true);
        }

        // support template text
        $support = "(for information on implementing this, see the ".
                   "support forums) ";

        // setup allowed elements -----------------------------------------

        $allowed_elements = $config->get('HTML.AllowedElements');
        $allowed_attributes = $config->get('HTML.AllowedAttributes'); // retrieve early

        // HTML.Allowed is only consulted when neither of the two
        // dedicated directives holds an array
        if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
            $allowed = $config->get('HTML.Allowed');
            if (is_string($allowed)) {
                list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
            }
        }

        if (is_array($allowed_elements)) {
            foreach ($this->info as $name => $d) {
                if(!isset($allowed_elements[$name])) unset($this->info[$name]);
                // tick the name off; whatever remains afterwards was
                // requested but is not a known element
                unset($allowed_elements[$name]);
            }
            // emit errors
            foreach ($allowed_elements as $element => $d) {
                $element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
                trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
            }
        }

        // setup allowed attributes ---------------------------------------

        // Entries are removed from this copy as they are matched, so the
        // leftovers are the requested attributes that matched nothing.
        $allowed_attributes_mutable = $allowed_attributes; // by copy!
        if (is_array($allowed_attributes)) {

            // This actually doesn't do anything, since we went away from
            // global attributes. It's possible that userland code uses
            // it, but HTMLModuleManager doesn't!
            foreach ($this->info_global_attr as $attr => $x) {
                $keys = array($attr, "*@$attr", "*.$attr");
                $delete = true;
                foreach ($keys as $key) {
                    if ($delete && isset($allowed_attributes[$key])) {
                        $delete = false;
                    }
                    if (isset($allowed_attributes_mutable[$key])) {
                        unset($allowed_attributes_mutable[$key]);
                    }
                }
                if ($delete) unset($this->info_global_attr[$attr]);
            }

            foreach ($this->info as $tag => $info) {
                foreach ($info->attr as $attr => $x) {
                    // every spelling under which this attribute may have
                    // been allowed
                    $keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
                    $delete = true;
                    foreach ($keys as $key) {
                        if ($delete && isset($allowed_attributes[$key])) {
                            $delete = false;
                        }
                        if (isset($allowed_attributes_mutable[$key])) {
                            unset($allowed_attributes_mutable[$key]);
                        }
                    }
                    if ($delete) {
                        if ($this->info[$tag]->attr[$attr]->required) {
                            trigger_error("Required attribute '$attr' in element '$tag' was not allowed, which means '$tag' will not be allowed either", E_USER_WARNING);
                        }
                        unset($this->info[$tag]->attr[$attr]);
                    }
                }
            }
            // emit errors
            foreach ($allowed_attributes_mutable as $elattr => $d) {
                $bits = preg_split('/[.@]/', $elattr, 2);
                $c = count($bits);
                switch ($c) {
                    case 2:
                        if ($bits[0] !== '*') {
                            $element = htmlspecialchars($bits[0]);
                            $attribute = htmlspecialchars($bits[1]);
                            if (!isset($this->info[$element])) {
                                // Explicit level: trigger_error() defaults
                                // to E_USER_NOTICE, which made this the
                                // only diagnostic in this method that was
                                // not raised as a warning.
                                trigger_error("Cannot allow attribute '$attribute' if element '$element' is not allowed/supported $support",
                                    E_USER_WARNING);
                            } else {
                                trigger_error("Attribute '$attribute' in element '$element' not supported $support",
                                    E_USER_WARNING);
                            }
                            break;
                        }
                        // otherwise fall through
                    case 1:
                        $attribute = htmlspecialchars($bits[0]);
                        trigger_error("Global attribute '$attribute' is not ".
                            "supported in any elements $support",
                            E_USER_WARNING);
                        break;
                }
            }

        }

        // setup forbidden elements ---------------------------------------

        $forbidden_elements   = $config->get('HTML.ForbiddenElements');
        $forbidden_attributes = $config->get('HTML.ForbiddenAttributes');

        foreach ($this->info as $tag => $info) {
            if (isset($forbidden_elements[$tag])) {
                unset($this->info[$tag]);
                continue;
            }
            foreach ($info->attr as $attr => $x) {
                if (
                    isset($forbidden_attributes["$tag@$attr"]) ||
                    isset($forbidden_attributes["*@$attr"]) ||
                    isset($forbidden_attributes[$attr])
                ) {
                    unset($this->info[$tag]->attr[$attr]);
                    continue;
                } // this segment might get removed eventually
                elseif (isset($forbidden_attributes["$tag.$attr"])) {
                    // $tag.$attr are not user supplied, so no worries!
                    trigger_error("Error with $tag.$attr: tag.attr syntax not supported for HTML.ForbiddenAttributes; use tag@attr instead", E_USER_WARNING);
                }
            }
        }
        // warn about the unsupported "*.attr" spelling
        foreach ($forbidden_attributes as $key => $v) {
            if (strlen($key) < 2) continue;
            if ($key[0] != '*') continue;
            if ($key[1] == '.') {
                trigger_error("Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead", E_USER_WARNING);
            }
        }

        // setup injectors -----------------------------------------------------
        foreach ($this->info_injector as $i => $injector) {
            if ($injector->checkNeeded($config) !== false) {
                // remove injector that does not have its required
                // elements/attributes present, and is thus not needed.
                unset($this->info_injector[$i]);
            }
        }
    }

    /**
     * Parses a TinyMCE-flavored Allowed Elements and Attributes list into
     * separate lists for processing. Format is element[attr1|attr2],element2...
     * Entries may be separated by commas or newlines; spaces and tabs
     * are ignored. The element "*" contributes attributes only.
     * @warning Although it's largely drawn from TinyMCE's implementation,
     *      it is different, and you'll probably have to modify your lists
     * @param $list String list to parse
     * @return array($allowed_elements, $allowed_attributes), both lookup
     *         arrays with true values; attribute keys have the form
     *         "element.attr"
     * @todo Give this its own class, probably static interface
     */
    public function parseTinyMCEAllowedList($list) {

        $list = str_replace(array(' ', "\t"), '', $list);

        $elements = array();
        $attributes = array();

        $chunks = preg_split('/(,|[\n\r]+)/', $list);
        foreach ($chunks as $chunk) {
            if (empty($chunk)) continue;
            // remove TinyMCE element control characters
            // (a chunk with no "[", or with "[" as its very first
            // character, is taken as a bare element name)
            if (!strpos($chunk, '[')) {
                $element = $chunk;
                $attr = false;
            } else {
                list($element, $attr) = explode('[', $chunk);
            }
            if ($element !== '*') $elements[$element] = true;
            if (!$attr) continue;
            // Remove the trailing ], but only if it is really there:
            // stripping unconditionally cut the last letter off the
            // final attribute when the bracket was missing.
            if (substr($attr, -1) === ']') {
                $attr = substr($attr, 0, strlen($attr) - 1);
            }
            $attr = explode('|', $attr);
            foreach ($attr as $key) {
                $attributes["$element.$key"] = true;
            }
        }

        return array($elements, $attributes);

    }


}
4629
4630
4631
4632
4633
4634/**
4635 * Represents an XHTML 1.1 module, with information on elements, tags
4636 * and attributes.
4637 * @note Even though this is technically XHTML 1.1, it is also used for
4638 *       regular HTML parsing. We are using modulization as a convenient
4639 *       way to represent the internals of HTMLDefinition, and our
4640 *       implementation is by no means conforming and does not directly
4641 *       use the normative DTDs or XML schemas.
4642 * @note The public variables in a module should almost directly
4643 *       correspond to the variables in HTMLPurifier_HTMLDefinition.
4644 *       However, the prefix info carries no special meaning in these
4645 *       objects (include it anyway if that's the correspondence though).
4646 * @todo Consider making some member functions protected
4647 */
4648
4649class HTMLPurifier_HTMLModule
4650{
4651
4652    // -- Overloadable ----------------------------------------------------
4653
4654    /**
4655     * Short unique string identifier of the module
4656     */
4657    public $name;
4658
4659    /**
4660     * Informally, a list of elements this module changes. Not used in
4661     * any significant way.
4662     */
4663    public $elements = array();
4664
4665    /**
4666     * Associative array of element names to element definitions.
4667     * Some definitions may be incomplete, to be merged in later
4668     * with the full definition.
4669     */
4670    public $info = array();
4671
4672    /**
4673     * Associative array of content set names to content set additions.
4674     * This is commonly used to, say, add an A element to the Inline
4675     * content set. This corresponds to an internal variable $content_sets
4676     * and NOT info_content_sets member variable of HTMLDefinition.
4677     */
4678    public $content_sets = array();
4679
4680    /**
4681     * Associative array of attribute collection names to attribute
4682     * collection additions. More rarely used for adding attributes to
4683     * the global collections. Example is the StyleAttribute module adding
4684     * the style attribute to the Core. Corresponds to HTMLDefinition's
4685     * attr_collections->info, since the object's data is only info,
4686     * with extra behavior associated with it.
4687     */
4688    public $attr_collections = array();
4689
4690    /**
4691     * Associative array of deprecated tag name to HTMLPurifier_TagTransform
4692     */
4693    public $info_tag_transform = array();
4694
4695    /**
4696     * List of HTMLPurifier_AttrTransform to be performed before validation.
4697     */
4698    public $info_attr_transform_pre = array();
4699
4700    /**
4701     * List of HTMLPurifier_AttrTransform to be performed after validation.
4702     */
4703    public $info_attr_transform_post = array();
4704
4705    /**
4706     * List of HTMLPurifier_Injector to be performed during well-formedness fixing.
     * An injector will only be invoked if all of its pre-requisites are met;
4708     * if an injector fails setup, there will be no error; it will simply be
4709     * silently disabled.
4710     */
4711    public $info_injector = array();
4712
4713    /**
4714     * Boolean flag that indicates whether or not getChildDef is implemented.
4715     * For optimization reasons: may save a call to a function. Be sure
4716     * to set it if you do implement getChildDef(), otherwise it will have
4717     * no effect!
4718     */
4719    public $defines_child_def = false;
4720