<?php

/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts it into
 * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer.  Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @note
 * We use tokens rather than create a DOM representation because a DOM:
 *
 * @par
 *  -# Requires more processing and memory to create,
 *  -# Is not streamable, and
 *  -# Has the entire document structure (html and body not needed).
 *
 * @par
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     * @type bool
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser used by parseData() and normalize(); created in the
     * constructor.
     * @type HTMLPurifier_EntityParser
     */
    private $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @note When the deprecated prototype form is used there is no
     *       configuration object to consult, so no line-number tracking
     *       requirement is imposed on the prototype.
     *
     * @param HTMLPurifier_Config $config Configuration object. Passing a
     *        lexer prototype (object or recognized name) instead is
     *        deprecated and raises an E_USER_WARNING.
     * @return HTMLPurifier_Lexer
     * @throws HTMLPurifier_Exception If the lexer name is not recognized, or
     *         if line-number tracking is required but the chosen lexer does
     *         not support it.
     */
    public static function create($config)
    {
        if ($config instanceof HTMLPurifier_Config) {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        } else {
            // Deprecated calling convention: the argument is the prototype
            // itself, not a config. It has no get() method, so it must not
            // be queried for tracking directives.
            $lexer = $config;
            $needs_tracking = false;
            trigger_error(
                "Passing a prototype to
                HTMLPurifier_Lexer::create() is deprecated, please instead
                use %Core.LexerImpl",
                E_USER_WARNING
            );
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                // auto-detection algorithm
                if ($needs_tracking) {
                    // only DirectLex tracks line numbers
                    $lexer = 'DirectLex';
                } elseif (class_exists('DOMDocument', false) &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    // non-scalars (e.g. arrays) cannot be passed to
                    // htmlspecialchars(); report their type instead
                    $shown = is_scalar($lexer) ? (string)$lexer : gettype($lexer);
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($shown)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Creates the entity parser used by the entity-handling helpers.
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     * @type array
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Parses entities in text-node character data.
     *
     * Shorthand for parseData() with $is_attr set to false.
     *
     * @param string $string Character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseText($string, $config)
    {
        return $this->parseData($string, false, $config);
    }

    /**
     * Parses entities in attribute-value character data.
     *
     * Shorthand for parseData() with $is_attr set to true.
     *
     * @param string $string Character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseAttr($string, $config)
    {
        return $this->parseData($string, true, $config);
    }

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones. The common entities listed in
     * $_special_entity2str are handled with a plain string translation; the
     * entity parser is only consulted when other ampersands remain.
     *
     * @param string $string String character data to be parsed.
     * @param bool $is_attr Whether the data is an attribute value (true) or
     *        text (false); selects the entity parser method used when
     *        %Core.LegacyEntityDecoder is off.
     * @param HTMLPurifier_Config $config Only read when uncommon entities
     *        are present.
     * @return string Parsed character data.
     */
    public function parseData($string, $is_attr, $config)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped: those followed by
        // a space, and one at the very end of the string
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string;
        } // abort if no entities
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        // every remaining candidate amp is accounted for by a decoded &amp;
        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        if ($config->get('Core.LegacyEntityDecoder')) {
            $string = $this->_entity_parser->substituteSpecialEntities($string);
        } else {
            if ($is_attr) {
                $string = $this->_entity_parser->substituteAttrEntities($string);
            } else {
                $string = $this->_entity_parser->substituteTextEntities($string);
            }
        }
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; the class
     * documentation states that the supertype defines no implementation.
     *
     * @param string $string HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[] array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     *
     * Empty sections (<![CDATA[]]>) are matched too and replaced with an
     * empty string, so they cannot cause the match to run on to a later
     * "]]>" and swallow the markup in between.
     *
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.*?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     *
     * As with escapeCDATA(), an empty section body is matched as well.
     *
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.*?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special Internet Explorer conditional comments should be removed.
     * @param string $string HTML string to process.
     * @return string HTML with conditional comments removed.
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
            '',
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() that does the work.
     *
     * @warning This is only meant to be used as the preg callback of
     *          escapeCDATA() and escapeCommentedCDATA(); calling it
     *          directly is not recommended.
     * @param array $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @return string Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     * @param string $html HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", (string)$html);
            $html = str_replace("\r", "\n", (string)$html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            // strict comparison: both operands are strings and must not be
            // compared numerically
            if ($e && $new_html !== $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        if ($config->get('Core.LegacyEntityDecoder')) {
            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
        }

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        $hidden_elements = $config->get('Core.HiddenElements');
        if ($config->get('Core.AggressivelyRemoveScript') &&
            !($config->get('HTML.Trusted') || !$config->get('Core.RemoveScriptContents')
            || empty($hidden_elements["script"]))) {
            // "s" lets "." cross newlines so multi-line scripts are removed too
            $html = preg_replace('#<script[^>]*>.*?</script>#is', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its body element when one is found outside of a comment; otherwise
     * the input is returned unchanged.
     *
     * @param string $html HTML fragment or document.
     * @return string Body contents, or $html itself.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
        if ($result) {
            // Make sure it's not in a comment: the last comment opener
            // before <body> must be followed by a comment closer
            $comment_start = strrpos($matches[1], '<!--');
            $comment_end   = strrpos($matches[1], '-->');
            if ($comment_start === false ||
                ($comment_end !== false && $comment_end > $comment_start)) {
                return $matches[2];
            }
        }
        return $html;
    }
}

// vim: et sw=4 sts=4
