1<?php
2
3/**
4 * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
5 * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts.
6 *
7 * @note
8 *    Recent changes to PHP's DOM extension have resulted in some fatal
9 *    error conditions with the original version of PH5P. Pending changes,
 *    this lexer will punt to DirectLex if DOM throws an exception.
11 */
12
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex {

    /**
     * Tokenizes an HTML string by parsing it with the HTML5 class.
     *
     * The input is passed through normalize() and wrapHTML(), parsed, and
     * the first <div> inside the first <body> of the first <html> element
     * of the resulting document is handed to tokenizeDOM(). If parsing
     * raises a DOMException, the exception is registered in the context
     * under 'PH5PError' and the ORIGINAL (un-normalized) HTML is tokenized
     * by HTMLPurifier_Lexer_DirectLex instead.
     *
     * @param string $html    HTML to tokenize
     * @param mixed  $config  configuration, passed through to the helpers
     * @param mixed  $context context; receives 'PH5PError' on fallback
     * @return mixed the token array filled by tokenizeDOM(), or the result
     *               of DirectLex's tokenizeHTML() when the fallback is used
     */
    public function tokenizeHTML($html, $config, $context) {
        $prepared = $this->wrapHTML(
            $this->normalize($html, $config, $context),
            $config,
            $context
        );

        try {
            $html5 = new HTML5($prepared);
            $document = $html5->save();
        } catch (DOMException $error) {
            // The parser gave up. Keep the exception around so callers can
            // detect the fallback, then let DirectLex process the input
            // exactly as it was given to us.
            $fallback = new HTMLPurifier_Lexer_DirectLex();
            $context->register('PH5PError', $error);
            return $fallback->tokenizeHTML($html, $config, $context);
        }

        // Walk down html > body > div, taking the first match at each level.
        $htmlNode = $document->getElementsByTagName('html')->item(0);
        $bodyNode = $htmlNode->getElementsByTagName('body')->item(0);
        $divNode  = $bodyNode->getElementsByTagName('div')->item(0);

        $tokens = array();
        $this->tokenizeDOM($divNode, $tokens);
        return $tokens;
    }

}
37
38/*
39
40Copyright 2007 Jeroen van der Meer <http://jero.net/>
41
42Permission is hereby granted, free of charge, to any person obtaining a
43copy of this software and associated documentation files (the
44"Software"), to deal in the Software without restriction, including
45without limitation the rights to use, copy, modify, merge, publish,
46distribute, sublicense, and/or sell copies of the Software, and to
47permit persons to whom the Software is furnished to do so, subject to
48the following conditions:
49
50The above copyright notice and this permission notice shall be included
51in all copies or substantial portions of the Software.
52
53THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
54OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
55MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
56IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
57CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
58TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
59SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
60
61*/
62
63class HTML5 {
    /** Input string being tokenized (set once in the constructor). */
    private $data;
    /** Index into $data of the current character; starts at -1 and is
        incremented before each character is read. */
    private $char;
    /** Length of $data; a position equal to this value is treated as end of input. */
    private $EOF;
    /** Name of the current tokenizer state; the constructor keeps calling
        the method named "<state>State" until this becomes null. */
    private $state;
    /** HTML5TreeConstructer instance; save() returns whatever its save() returns. */
    private $tree;
    /** Tag token currently being built (array with 'name', 'type' and,
        for start tags, 'attr'). */
    private $token;
    /** Current content model flag: one of the PCDATA/RCDATA/CDATA/PLAINTEXT constants. */
    private $content_model;
    /** Escape flag consulted by the data state while in RCDATA/CDATA. */
    private $escape = false;
72    private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
73    'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
74    'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
75    'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
76    'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
77    'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
78    'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
79    'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
80    'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
81    'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
82    'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
83    'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
84    'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
85    'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
86    'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
87    'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
88    'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
89    'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
90    'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
91    'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
92    'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
93    'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
94    'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
95    'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
96    'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
97    'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
98    'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
99    'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
100    'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
101    'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
102    'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
103    'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
104    'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
105    'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
106    'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
107    'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
108    'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
109    'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
110    'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
111    'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
112    'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
113    'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
114
    // Content model flags (values of $content_model). The data state only
    // treats '&' as an entity start in PCDATA/RCDATA, and emits the whole
    // remaining input as text in PLAINTEXT.
    const PCDATA    = 0;
    const RCDATA    = 1;
    const CDATA     = 2;
    const PLAINTEXT = 3;

    // Token types, stored under the 'type' key of emitted tokens.
    // CHARACTR is a character (text) token.
    // NOTE(review): DOCTYPE, COMMENT and EOF are not used in the part of
    // the class visible here; presumably set by later states - confirm.
    const DOCTYPE  = 0;
    const STARTTAG = 1;
    const ENDTAG   = 2;
    const COMMENT  = 3;
    const CHARACTR = 4;
    const EOF      = 5;
126
127    public function __construct($data) {
128
129        $this->data = $data;
130        $this->char = -1;
131        $this->EOF  = strlen($data);
132        $this->tree = new HTML5TreeConstructer;
133        $this->content_model = self::PCDATA;
134
135        $this->state = 'data';
136
137        while($this->state !== null) {
138            $this->{$this->state.'State'}();
139        }
140    }
141
142    public function save() {
143        return $this->tree->save();
144    }
145
146    private function char() {
147        return ($this->char < $this->EOF)
148            ? $this->data[$this->char]
149            : false;
150    }
151
152    private function character($s, $l = 0) {
153        if($s + $l < $this->EOF) {
154            if($l === 0) {
155                return $this->data[$s];
156            } else {
157                return substr($this->data, $s, $l);
158            }
159        }
160    }
161
162    private function characters($char_class, $start) {
163        return preg_replace('#^(['.$char_class.']+).*#s', '\\1', substr($this->data, $start));
164    }
165
166    private function dataState() {
167        // Consume the next input character
168        $this->char++;
169        $char = $this->char();
170
171        if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
172            /* U+0026 AMPERSAND (&)
173            When the content model flag is set to one of the PCDATA or RCDATA
174            states: switch to the entity data state. Otherwise: treat it as per
175            the "anything else"    entry below. */
176            $this->state = 'entityData';
177
178        } elseif($char === '-') {
179            /* If the content model flag is set to either the RCDATA state or
180            the CDATA state, and the escape flag is false, and there are at
181            least three characters before this one in the input stream, and the
182            last four characters in the input stream, including this one, are
183            U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
184            and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
185            if(($this->content_model === self::RCDATA || $this->content_model ===
186            self::CDATA) && $this->escape === false &&
187            $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--') {
188                $this->escape = true;
189            }
190
191            /* In any case, emit the input character as a character token. Stay
192            in the data state. */
193            $this->emitToken(array(
194                'type' => self::CHARACTR,
195                'data' => $char
196            ));
197
198        /* U+003C LESS-THAN SIGN (<) */
199        } elseif($char === '<' && ($this->content_model === self::PCDATA ||
200        (($this->content_model === self::RCDATA ||
201        $this->content_model === self::CDATA) && $this->escape === false))) {
202            /* When the content model flag is set to the PCDATA state: switch
203            to the tag open state.
204
205            When the content model flag is set to either the RCDATA state or
206            the CDATA state and the escape flag is false: switch to the tag
207            open state.
208
209            Otherwise: treat it as per the "anything else" entry below. */
210            $this->state = 'tagOpen';
211
212        /* U+003E GREATER-THAN SIGN (>) */
213        } elseif($char === '>') {
214            /* If the content model flag is set to either the RCDATA state or
215            the CDATA state, and the escape flag is true, and the last three
216            characters in the input stream including this one are U+002D
217            HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
218            set the escape flag to false. */
219            if(($this->content_model === self::RCDATA ||
220            $this->content_model === self::CDATA) && $this->escape === true &&
221            $this->character($this->char, 3) === '-->') {
222                $this->escape = false;
223            }
224
225            /* In any case, emit the input character as a character token.
226            Stay in the data state. */
227            $this->emitToken(array(
228                'type' => self::CHARACTR,
229                'data' => $char
230            ));
231
232        } elseif($this->char === $this->EOF) {
233            /* EOF
234            Emit an end-of-file token. */
235            $this->EOF();
236
237        } elseif($this->content_model === self::PLAINTEXT) {
238            /* When the content model flag is set to the PLAINTEXT state
239            THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
240            the text and emit it as a character token. */
241            $this->emitToken(array(
242                'type' => self::CHARACTR,
243                'data' => substr($this->data, $this->char)
244            ));
245
246            $this->EOF();
247
248        } else {
249            /* Anything else
250            THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
251            otherwise would also be treated as a character token and emit it
252            as a single character token. Stay in the data state. */
253            $len  = strcspn($this->data, '<&', $this->char);
254            $char = substr($this->data, $this->char, $len);
255            $this->char += $len - 1;
256
257            $this->emitToken(array(
258                'type' => self::CHARACTR,
259                'data' => $char
260            ));
261
262            $this->state = 'data';
263        }
264    }
265
266    private function entityDataState() {
267        // Attempt to consume an entity.
268        $entity = $this->entity();
269
270        // If nothing is returned, emit a U+0026 AMPERSAND character token.
271        // Otherwise, emit the character token that was returned.
272        $char = (!$entity) ? '&' : $entity;
273        $this->emitToken(array(
274            'type' => self::CHARACTR,
275            'data' => $char
276        ));
277
278        // Finally, switch to the data state.
279        $this->state = 'data';
280    }
281
282    private function tagOpenState() {
283        switch($this->content_model) {
284            case self::RCDATA:
285            case self::CDATA:
286                /* If the next input character is a U+002F SOLIDUS (/) character,
287                consume it and switch to the close tag open state. If the next
288                input character is not a U+002F SOLIDUS (/) character, emit a
289                U+003C LESS-THAN SIGN character token and switch to the data
290                state to process the next input character. */
291                if($this->character($this->char + 1) === '/') {
292                    $this->char++;
293                    $this->state = 'closeTagOpen';
294
295                } else {
296                    $this->emitToken(array(
297                        'type' => self::CHARACTR,
298                        'data' => '<'
299                    ));
300
301                    $this->state = 'data';
302                }
303            break;
304
305            case self::PCDATA:
306                // If the content model flag is set to the PCDATA state
307                // Consume the next input character:
308                $this->char++;
309                $char = $this->char();
310
311                if($char === '!') {
312                    /* U+0021 EXCLAMATION MARK (!)
313                    Switch to the markup declaration open state. */
314                    $this->state = 'markupDeclarationOpen';
315
316                } elseif($char === '/') {
317                    /* U+002F SOLIDUS (/)
318                    Switch to the close tag open state. */
319                    $this->state = 'closeTagOpen';
320
321                } elseif(preg_match('/^[A-Za-z]$/', $char)) {
322                    /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
323                    Create a new start tag token, set its tag name to the lowercase
324                    version of the input character (add 0x0020 to the character's code
325                    point), then switch to the tag name state. (Don't emit the token
326                    yet; further details will be filled in before it is emitted.) */
327                    $this->token = array(
328                        'name'  => strtolower($char),
329                        'type'  => self::STARTTAG,
330                        'attr'  => array()
331                    );
332
333                    $this->state = 'tagName';
334
335                } elseif($char === '>') {
336                    /* U+003E GREATER-THAN SIGN (>)
337                    Parse error. Emit a U+003C LESS-THAN SIGN character token and a
338                    U+003E GREATER-THAN SIGN character token. Switch to the data state. */
339                    $this->emitToken(array(
340                        'type' => self::CHARACTR,
341                        'data' => '<>'
342                    ));
343
344                    $this->state = 'data';
345
346                } elseif($char === '?') {
347                    /* U+003F QUESTION MARK (?)
348                    Parse error. Switch to the bogus comment state. */
349                    $this->state = 'bogusComment';
350
351                } else {
352                    /* Anything else
353                    Parse error. Emit a U+003C LESS-THAN SIGN character token and
354                    reconsume the current input character in the data state. */
355                    $this->emitToken(array(
356                        'type' => self::CHARACTR,
357                        'data' => '<'
358                    ));
359
360                    $this->char--;
361                    $this->state = 'data';
362                }
363            break;
364        }
365    }
366
367    private function closeTagOpenState() {
368        $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
369        $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
370
371        if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
372        (!$the_same || ($the_same && (!preg_match('/[\t\n\x0b\x0c >\/]/',
373        $this->character($this->char + 1 + strlen($next_node))) || $this->EOF === $this->char)))) {
374            /* If the content model flag is set to the RCDATA or CDATA states then
375            examine the next few characters. If they do not match the tag name of
376            the last start tag token emitted (case insensitively), or if they do but
377            they are not immediately followed by one of the following characters:
378                * U+0009 CHARACTER TABULATION
379                * U+000A LINE FEED (LF)
380                * U+000B LINE TABULATION
381                * U+000C FORM FEED (FF)
382                * U+0020 SPACE
383                * U+003E GREATER-THAN SIGN (>)
384                * U+002F SOLIDUS (/)
385                * EOF
386            ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
387            token, a U+002F SOLIDUS character token, and switch to the data state
388            to process the next input character. */
389            $this->emitToken(array(
390                'type' => self::CHARACTR,
391                'data' => '</'
392            ));
393
394            $this->state = 'data';
395
396        } else {
397            /* Otherwise, if the content model flag is set to the PCDATA state,
398            or if the next few characters do match that tag name, consume the
399            next input character: */
400            $this->char++;
401            $char = $this->char();
402
403            if(preg_match('/^[A-Za-z]$/', $char)) {
404                /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
405                Create a new end tag token, set its tag name to the lowercase version
406                of the input character (add 0x0020 to the character's code point), then
407                switch to the tag name state. (Don't emit the token yet; further details
408                will be filled in before it is emitted.) */
409                $this->token = array(
410                    'name'  => strtolower($char),
411                    'type'  => self::ENDTAG
412                );
413
414                $this->state = 'tagName';
415
416            } elseif($char === '>') {
417                /* U+003E GREATER-THAN SIGN (>)
418                Parse error. Switch to the data state. */
419                $this->state = 'data';
420
421            } elseif($this->char === $this->EOF) {
422                /* EOF
423                Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
424                SOLIDUS character token. Reconsume the EOF character in the data state. */
425                $this->emitToken(array(
426                    'type' => self::CHARACTR,
427                    'data' => '</'
428                ));
429
430                $this->char--;
431                $this->state = 'data';
432
433            } else {
434                /* Parse error. Switch to the bogus comment state. */
435                $this->state = 'bogusComment';
436            }
437        }
438    }
439
440    private function tagNameState() {
441        // Consume the next input character:
442        $this->char++;
443        $char = $this->character($this->char);
444
445        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
446            /* U+0009 CHARACTER TABULATION
447            U+000A LINE FEED (LF)
448            U+000B LINE TABULATION
449            U+000C FORM FEED (FF)
450            U+0020 SPACE
451            Switch to the before attribute name state. */
452            $this->state = 'beforeAttributeName';
453
454        } elseif($char === '>') {
455            /* U+003E GREATER-THAN SIGN (>)
456            Emit the current tag token. Switch to the data state. */
457            $this->emitToken($this->token);
458            $this->state = 'data';
459
460        } elseif($this->char === $this->EOF) {
461            /* EOF
462            Parse error. Emit the current tag token. Reconsume the EOF
463            character in the data state. */
464            $this->emitToken($this->token);
465
466            $this->char--;
467            $this->state = 'data';
468
469        } elseif($char === '/') {
470            /* U+002F SOLIDUS (/)
471            Parse error unless this is a permitted slash. Switch to the before
472            attribute name state. */
473            $this->state = 'beforeAttributeName';
474
475        } else {
476            /* Anything else
477            Append the current input character to the current tag token's tag name.
478            Stay in the tag name state. */
479            $this->token['name'] .= strtolower($char);
480            $this->state = 'tagName';
481        }
482    }
483
484    private function beforeAttributeNameState() {
485        // Consume the next input character:
486        $this->char++;
487        $char = $this->character($this->char);
488
489        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
490            /* U+0009 CHARACTER TABULATION
491            U+000A LINE FEED (LF)
492            U+000B LINE TABULATION
493            U+000C FORM FEED (FF)
494            U+0020 SPACE
495            Stay in the before attribute name state. */
496            $this->state = 'beforeAttributeName';
497
498        } elseif($char === '>') {
499            /* U+003E GREATER-THAN SIGN (>)
500            Emit the current tag token. Switch to the data state. */
501            $this->emitToken($this->token);
502            $this->state = 'data';
503
504        } elseif($char === '/') {
505            /* U+002F SOLIDUS (/)
506            Parse error unless this is a permitted slash. Stay in the before
507            attribute name state. */
508            $this->state = 'beforeAttributeName';
509
510        } elseif($this->char === $this->EOF) {
511            /* EOF
512            Parse error. Emit the current tag token. Reconsume the EOF
513            character in the data state. */
514            $this->emitToken($this->token);
515
516            $this->char--;
517            $this->state = 'data';
518
519        } else {
520            /* Anything else
521            Start a new attribute in the current tag token. Set that attribute's
522            name to the current input character, and its value to the empty string.
523            Switch to the attribute name state. */
524            $this->token['attr'][] = array(
525                'name'  => strtolower($char),
526                'value' => null
527            );
528
529            $this->state = 'attributeName';
530        }
531    }
532
533    private function attributeNameState() {
534        // Consume the next input character:
535        $this->char++;
536        $char = $this->character($this->char);
537
538        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
539            /* U+0009 CHARACTER TABULATION
540            U+000A LINE FEED (LF)
541            U+000B LINE TABULATION
542            U+000C FORM FEED (FF)
543            U+0020 SPACE
544            Stay in the before attribute name state. */
545            $this->state = 'afterAttributeName';
546
547        } elseif($char === '=') {
548            /* U+003D EQUALS SIGN (=)
549            Switch to the before attribute value state. */
550            $this->state = 'beforeAttributeValue';
551
552        } elseif($char === '>') {
553            /* U+003E GREATER-THAN SIGN (>)
554            Emit the current tag token. Switch to the data state. */
555            $this->emitToken($this->token);
556            $this->state = 'data';
557
558        } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
559            /* U+002F SOLIDUS (/)
560            Parse error unless this is a permitted slash. Switch to the before
561            attribute name state. */
562            $this->state = 'beforeAttributeName';
563
564        } elseif($this->char === $this->EOF) {
565            /* EOF
566            Parse error. Emit the current tag token. Reconsume the EOF
567            character in the data state. */
568            $this->emitToken($this->token);
569
570            $this->char--;
571            $this->state = 'data';
572
573        } else {
574            /* Anything else
575            Append the current input character to the current attribute's name.
576            Stay in the attribute name state. */
577            $last = count($this->token['attr']) - 1;
578            $this->token['attr'][$last]['name'] .= strtolower($char);
579
580            $this->state = 'attributeName';
581        }
582    }
583
584    private function afterAttributeNameState() {
585        // Consume the next input character:
586        $this->char++;
587        $char = $this->character($this->char);
588
589        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
590            /* U+0009 CHARACTER TABULATION
591            U+000A LINE FEED (LF)
592            U+000B LINE TABULATION
593            U+000C FORM FEED (FF)
594            U+0020 SPACE
595            Stay in the after attribute name state. */
596            $this->state = 'afterAttributeName';
597
598        } elseif($char === '=') {
599            /* U+003D EQUALS SIGN (=)
600            Switch to the before attribute value state. */
601            $this->state = 'beforeAttributeValue';
602
603        } elseif($char === '>') {
604            /* U+003E GREATER-THAN SIGN (>)
605            Emit the current tag token. Switch to the data state. */
606            $this->emitToken($this->token);
607            $this->state = 'data';
608
609        } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
610            /* U+002F SOLIDUS (/)
611            Parse error unless this is a permitted slash. Switch to the
612            before attribute name state. */
613            $this->state = 'beforeAttributeName';
614
615        } elseif($this->char === $this->EOF) {
616            /* EOF
617            Parse error. Emit the current tag token. Reconsume the EOF
618            character in the data state. */
619            $this->emitToken($this->token);
620
621            $this->char--;
622            $this->state = 'data';
623
624        } else {
625            /* Anything else
626            Start a new attribute in the current tag token. Set that attribute's
627            name to the current input character, and its value to the empty string.
628            Switch to the attribute name state. */
629            $this->token['attr'][] = array(
630                'name'  => strtolower($char),
631                'value' => null
632            );
633
634            $this->state = 'attributeName';
635        }
636    }
637
638    private function beforeAttributeValueState() {
639        // Consume the next input character:
640        $this->char++;
641        $char = $this->character($this->char);
642
643        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
644            /* U+0009 CHARACTER TABULATION
645            U+000A LINE FEED (LF)
646            U+000B LINE TABULATION
647            U+000C FORM FEED (FF)
648            U+0020 SPACE
649            Stay in the before attribute value state. */
650            $this->state = 'beforeAttributeValue';
651
652        } elseif($char === '"') {
653            /* U+0022 QUOTATION MARK (")
654            Switch to the attribute value (double-quoted) state. */
655            $this->state = 'attributeValueDoubleQuoted';
656
657        } elseif($char === '&') {
658            /* U+0026 AMPERSAND (&)
659            Switch to the attribute value (unquoted) state and reconsume
660            this input character. */
661            $this->char--;
662            $this->state = 'attributeValueUnquoted';
663
664        } elseif($char === '\'') {
665            /* U+0027 APOSTROPHE (')
666            Switch to the attribute value (single-quoted) state. */
667            $this->state = 'attributeValueSingleQuoted';
668
669        } elseif($char === '>') {
670            /* U+003E GREATER-THAN SIGN (>)
671            Emit the current tag token. Switch to the data state. */
672            $this->emitToken($this->token);
673            $this->state = 'data';
674
675        } else {
676            /* Anything else
677            Append the current input character to the current attribute's value.
678            Switch to the attribute value (unquoted) state. */
679            $last = count($this->token['attr']) - 1;
680            $this->token['attr'][$last]['value'] .= $char;
681
682            $this->state = 'attributeValueUnquoted';
683        }
684    }
685
686    private function attributeValueDoubleQuotedState() {
687        // Consume the next input character:
688        $this->char++;
689        $char = $this->character($this->char);
690
691        if($char === '"') {
692            /* U+0022 QUOTATION MARK (")
693            Switch to the before attribute name state. */
694            $this->state = 'beforeAttributeName';
695
696        } elseif($char === '&') {
697            /* U+0026 AMPERSAND (&)
698            Switch to the entity in attribute value state. */
699            $this->entityInAttributeValueState('double');
700
701        } elseif($this->char === $this->EOF) {
702            /* EOF
703            Parse error. Emit the current tag token. Reconsume the character
704            in the data state. */
705            $this->emitToken($this->token);
706
707            $this->char--;
708            $this->state = 'data';
709
710        } else {
711            /* Anything else
712            Append the current input character to the current attribute's value.
713            Stay in the attribute value (double-quoted) state. */
714            $last = count($this->token['attr']) - 1;
715            $this->token['attr'][$last]['value'] .= $char;
716
717            $this->state = 'attributeValueDoubleQuoted';
718        }
719    }
720
721    private function attributeValueSingleQuotedState() {
722        // Consume the next input character:
723        $this->char++;
724        $char = $this->character($this->char);
725
726        if($char === '\'') {
727            /* U+0022 QUOTATION MARK (')
728            Switch to the before attribute name state. */
729            $this->state = 'beforeAttributeName';
730
731        } elseif($char === '&') {
732            /* U+0026 AMPERSAND (&)
733            Switch to the entity in attribute value state. */
734            $this->entityInAttributeValueState('single');
735
736        } elseif($this->char === $this->EOF) {
737            /* EOF
738            Parse error. Emit the current tag token. Reconsume the character
739            in the data state. */
740            $this->emitToken($this->token);
741
742            $this->char--;
743            $this->state = 'data';
744
745        } else {
746            /* Anything else
747            Append the current input character to the current attribute's value.
748            Stay in the attribute value (single-quoted) state. */
749            $last = count($this->token['attr']) - 1;
750            $this->token['attr'][$last]['value'] .= $char;
751
752            $this->state = 'attributeValueSingleQuoted';
753        }
754    }
755
756    private function attributeValueUnquotedState() {
757        // Consume the next input character:
758        $this->char++;
759        $char = $this->character($this->char);
760
761        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
762            /* U+0009 CHARACTER TABULATION
763            U+000A LINE FEED (LF)
764            U+000B LINE TABULATION
765            U+000C FORM FEED (FF)
766            U+0020 SPACE
767            Switch to the before attribute name state. */
768            $this->state = 'beforeAttributeName';
769
770        } elseif($char === '&') {
771            /* U+0026 AMPERSAND (&)
772            Switch to the entity in attribute value state. */
773            $this->entityInAttributeValueState();
774
775        } elseif($char === '>') {
776            /* U+003E GREATER-THAN SIGN (>)
777            Emit the current tag token. Switch to the data state. */
778            $this->emitToken($this->token);
779            $this->state = 'data';
780
781        } else {
782            /* Anything else
783            Append the current input character to the current attribute's value.
784            Stay in the attribute value (unquoted) state. */
785            $last = count($this->token['attr']) - 1;
786            $this->token['attr'][$last]['value'] .= $char;
787
788            $this->state = 'attributeValueUnquoted';
789        }
790    }
791
792    private function entityInAttributeValueState() {
793        // Attempt to consume an entity.
794        $entity = $this->entity();
795
796        // If nothing is returned, append a U+0026 AMPERSAND character to the
797        // current attribute's value. Otherwise, emit the character token that
798        // was returned.
799        $char = (!$entity)
800            ? '&'
801            : $entity;
802
803        $last = count($this->token['attr']) - 1;
804        $this->token['attr'][$last]['value'] .= $char;
805    }
806
807    private function bogusCommentState() {
808        /* Consume every character up to the first U+003E GREATER-THAN SIGN
809        character (>) or the end of the file (EOF), whichever comes first. Emit
810        a comment token whose data is the concatenation of all the characters
811        starting from and including the character that caused the state machine
812        to switch into the bogus comment state, up to and including the last
813        consumed character before the U+003E character, if any, or up to the
814        end of the file otherwise. (If the comment was started by the end of
815        the file (EOF), the token is empty.) */
816        $data = $this->characters('^>', $this->char);
817        $this->emitToken(array(
818            'data' => $data,
819            'type' => self::COMMENT
820        ));
821
822        $this->char += strlen($data);
823
824        /* Switch to the data state. */
825        $this->state = 'data';
826
827        /* If the end of the file was reached, reconsume the EOF character. */
828        if($this->char === $this->EOF) {
829            $this->char = $this->EOF - 1;
830        }
831    }
832
833    private function markupDeclarationOpenState() {
834        /* If the next two characters are both U+002D HYPHEN-MINUS (-)
835        characters, consume those two characters, create a comment token whose
836        data is the empty string, and switch to the comment state. */
837        if($this->character($this->char + 1, 2) === '--') {
838            $this->char += 2;
839            $this->state = 'comment';
840            $this->token = array(
841                'data' => null,
842                'type' => self::COMMENT
843            );
844
845        /* Otherwise if the next seven chacacters are a case-insensitive match
846        for the word "DOCTYPE", then consume those characters and switch to the
847        DOCTYPE state. */
848        } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') {
849            $this->char += 7;
850            $this->state = 'doctype';
851
852        /* Otherwise, is is a parse error. Switch to the bogus comment state.
853        The next character that is consumed, if any, is the first character
854        that will be in the comment. */
855        } else {
856            $this->char++;
857            $this->state = 'bogusComment';
858        }
859    }
860
861    private function commentState() {
862        /* Consume the next input character: */
863        $this->char++;
864        $char = $this->char();
865
866        /* U+002D HYPHEN-MINUS (-) */
867        if($char === '-') {
868            /* Switch to the comment dash state  */
869            $this->state = 'commentDash';
870
871        /* EOF */
872        } elseif($this->char === $this->EOF) {
873            /* Parse error. Emit the comment token. Reconsume the EOF character
874            in the data state. */
875            $this->emitToken($this->token);
876            $this->char--;
877            $this->state = 'data';
878
879        /* Anything else */
880        } else {
881            /* Append the input character to the comment token's data. Stay in
882            the comment state. */
883            $this->token['data'] .= $char;
884        }
885    }
886
887    private function commentDashState() {
888        /* Consume the next input character: */
889        $this->char++;
890        $char = $this->char();
891
892        /* U+002D HYPHEN-MINUS (-) */
893        if($char === '-') {
894            /* Switch to the comment end state  */
895            $this->state = 'commentEnd';
896
897        /* EOF */
898        } elseif($this->char === $this->EOF) {
899            /* Parse error. Emit the comment token. Reconsume the EOF character
900            in the data state. */
901            $this->emitToken($this->token);
902            $this->char--;
903            $this->state = 'data';
904
905        /* Anything else */
906        } else {
907            /* Append a U+002D HYPHEN-MINUS (-) character and the input
908            character to the comment token's data. Switch to the comment state. */
909            $this->token['data'] .= '-'.$char;
910            $this->state = 'comment';
911        }
912    }
913
914    private function commentEndState() {
915        /* Consume the next input character: */
916        $this->char++;
917        $char = $this->char();
918
919        if($char === '>') {
920            $this->emitToken($this->token);
921            $this->state = 'data';
922
923        } elseif($char === '-') {
924            $this->token['data'] .= '-';
925
926        } elseif($this->char === $this->EOF) {
927            $this->emitToken($this->token);
928            $this->char--;
929            $this->state = 'data';
930
931        } else {
932            $this->token['data'] .= '--'.$char;
933            $this->state = 'comment';
934        }
935    }
936
937    private function doctypeState() {
938        /* Consume the next input character: */
939        $this->char++;
940        $char = $this->char();
941
942        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
943            $this->state = 'beforeDoctypeName';
944
945        } else {
946            $this->char--;
947            $this->state = 'beforeDoctypeName';
948        }
949    }
950
951    private function beforeDoctypeNameState() {
952        /* Consume the next input character: */
953        $this->char++;
954        $char = $this->char();
955
956        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
957            // Stay in the before DOCTYPE name state.
958
959        } elseif(preg_match('/^[a-z]$/', $char)) {
960            $this->token = array(
961                'name' => strtoupper($char),
962                'type' => self::DOCTYPE,
963                'error' => true
964            );
965
966            $this->state = 'doctypeName';
967
968        } elseif($char === '>') {
969            $this->emitToken(array(
970                'name' => null,
971                'type' => self::DOCTYPE,
972                'error' => true
973            ));
974
975            $this->state = 'data';
976
977        } elseif($this->char === $this->EOF) {
978            $this->emitToken(array(
979                'name' => null,
980                'type' => self::DOCTYPE,
981                'error' => true
982            ));
983
984            $this->char--;
985            $this->state = 'data';
986
987        } else {
988            $this->token = array(
989                'name' => $char,
990                'type' => self::DOCTYPE,
991                'error' => true
992            );
993
994            $this->state = 'doctypeName';
995        }
996    }
997
998    private function doctypeNameState() {
999        /* Consume the next input character: */
1000        $this->char++;
1001        $char = $this->char();
1002
1003        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1004            $this->state = 'AfterDoctypeName';
1005
1006        } elseif($char === '>') {
1007            $this->emitToken($this->token);
1008            $this->state = 'data';
1009
1010        } elseif(preg_match('/^[a-z]$/', $char)) {
1011            $this->token['name'] .= strtoupper($char);
1012
1013        } elseif($this->char === $this->EOF) {
1014            $this->emitToken($this->token);
1015            $this->char--;
1016            $this->state = 'data';
1017
1018        } else {
1019            $this->token['name'] .= $char;
1020        }
1021
1022        $this->token['error'] = ($this->token['name'] === 'HTML')
1023            ? false
1024            : true;
1025    }
1026
1027    private function afterDoctypeNameState() {
1028        /* Consume the next input character: */
1029        $this->char++;
1030        $char = $this->char();
1031
1032        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1033            // Stay in the DOCTYPE name state.
1034
1035        } elseif($char === '>') {
1036            $this->emitToken($this->token);
1037            $this->state = 'data';
1038
1039        } elseif($this->char === $this->EOF) {
1040            $this->emitToken($this->token);
1041            $this->char--;
1042            $this->state = 'data';
1043
1044        } else {
1045            $this->token['error'] = true;
1046            $this->state = 'bogusDoctype';
1047        }
1048    }
1049
1050    private function bogusDoctypeState() {
1051        /* Consume the next input character: */
1052        $this->char++;
1053        $char = $this->char();
1054
1055        if($char === '>') {
1056            $this->emitToken($this->token);
1057            $this->state = 'data';
1058
1059        } elseif($this->char === $this->EOF) {
1060            $this->emitToken($this->token);
1061            $this->char--;
1062            $this->state = 'data';
1063
1064        } else {
1065            // Stay in the bogus DOCTYPE state.
1066        }
1067    }
1068
1069    private function entity() {
1070        $start = $this->char;
1071
1072        // This section defines how to consume an entity. This definition is
1073        // used when parsing entities in text and in attributes.
1074
1075        // The behaviour depends on the identity of the next character (the
1076        // one immediately after the U+0026 AMPERSAND character):
1077
1078        switch($this->character($this->char + 1)) {
1079            // U+0023 NUMBER SIGN (#)
1080            case '#':
1081
1082                // The behaviour further depends on the character after the
1083                // U+0023 NUMBER SIGN:
1084                switch($this->character($this->char + 1)) {
1085                    // U+0078 LATIN SMALL LETTER X
1086                    // U+0058 LATIN CAPITAL LETTER X
1087                    case 'x':
1088                    case 'X':
1089                        // Follow the steps below, but using the range of
1090                        // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1091                        // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1092                        // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1093                        // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1094                        // words, 0-9, A-F, a-f).
1095                        $char = 1;
1096                        $char_class = '0-9A-Fa-f';
1097                    break;
1098
1099                    // Anything else
1100                    default:
1101                        // Follow the steps below, but using the range of
1102                        // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1103                        // NINE (i.e. just 0-9).
1104                        $char = 0;
1105                        $char_class = '0-9';
1106                    break;
1107                }
1108
1109                // Consume as many characters as match the range of characters
1110                // given above.
1111                $this->char++;
1112                $e_name = $this->characters($char_class, $this->char + $char + 1);
1113                $entity = $this->character($start, $this->char);
1114                $cond = strlen($e_name) > 0;
1115
1116                // The rest of the parsing happens bellow.
1117            break;
1118
1119            // Anything else
1120            default:
1121                // Consume the maximum number of characters possible, with the
1122                // consumed characters case-sensitively matching one of the
1123                // identifiers in the first column of the entities table.
1124                $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1125                $len = strlen($e_name);
1126
1127                for($c = 1; $c <= $len; $c++) {
1128                    $id = substr($e_name, 0, $c);
1129                    $this->char++;
1130
1131                    if(in_array($id, $this->entities)) {
1132                        if ($e_name[$c-1] !== ';') {
1133                            if ($c < $len && $e_name[$c] == ';') {
1134                                $this->char++; // consume extra semicolon
1135                            }
1136                        }
1137                        $entity = $id;
1138                        break;
1139                    }
1140                }
1141
1142                $cond = isset($entity);
1143                // The rest of the parsing happens bellow.
1144            break;
1145        }
1146
1147        if(!$cond) {
1148            // If no match can be made, then this is a parse error. No
1149            // characters are consumed, and nothing is returned.
1150            $this->char = $start;
1151            return false;
1152        }
1153
1154        // Return a character token for the character corresponding to the
1155        // entity name (as given by the second column of the entities table).
1156        return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
1157    }
1158
1159    private function emitToken($token) {
1160        $emit = $this->tree->emitToken($token);
1161
1162        if(is_int($emit)) {
1163            $this->content_model = $emit;
1164
1165        } elseif($token['type'] === self::ENDTAG) {
1166            $this->content_model = self::PCDATA;
1167        }
1168    }
1169
1170    private function EOF() {
1171        $this->state = null;
1172        $this->tree->emitToken(array(
1173            'type' => self::EOF
1174        ));
1175    }
1176}
1177
1178class HTML5TreeConstructer {
1179    public $stack = array();
1180
1181    private $phase;
1182    private $mode;
1183    private $dom;
1184    private $foster_parent = null;
1185    private $a_formatting  = array();
1186
1187    private $head_pointer = null;
1188    private $form_pointer = null;
1189
1190    private $scoping = array('button','caption','html','marquee','object','table','td','th');
1191    private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
1192    private $special = array('address','area','base','basefont','bgsound',
1193    'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
1194    'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
1195    'h6','head','hr','iframe','image','img','input','isindex','li','link',
1196    'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
1197    'option','p','param','plaintext','pre','script','select','spacer','style',
1198    'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
1199
1200    // The different phases.
1201    const INIT_PHASE = 0;
1202    const ROOT_PHASE = 1;
1203    const MAIN_PHASE = 2;
1204    const END_PHASE  = 3;
1205
1206    // The different insertion modes for the main phase.
1207    const BEFOR_HEAD = 0;
1208    const IN_HEAD    = 1;
1209    const AFTER_HEAD = 2;
1210    const IN_BODY    = 3;
1211    const IN_TABLE   = 4;
1212    const IN_CAPTION = 5;
1213    const IN_CGROUP  = 6;
1214    const IN_TBODY   = 7;
1215    const IN_ROW     = 8;
1216    const IN_CELL    = 9;
1217    const IN_SELECT  = 10;
1218    const AFTER_BODY = 11;
1219    const IN_FRAME   = 12;
1220    const AFTR_FRAME = 13;
1221
1222    // The different types of elements.
1223    const SPECIAL    = 0;
1224    const SCOPING    = 1;
1225    const FORMATTING = 2;
1226    const PHRASING   = 3;
1227
1228    const MARKER     = 0;
1229
1230    public function __construct() {
1231        $this->phase = self::INIT_PHASE;
1232        $this->mode = self::BEFOR_HEAD;
1233        $this->dom = new DOMDocument;
1234
1235        $this->dom->encoding = 'UTF-8';
1236        $this->dom->preserveWhiteSpace = true;
1237        $this->dom->substituteEntities = true;
1238        $this->dom->strictErrorChecking = false;
1239    }
1240
1241    // Process tag tokens
1242    public function emitToken($token) {
1243        switch($this->phase) {
1244            case self::INIT_PHASE: return $this->initPhase($token); break;
1245            case self::ROOT_PHASE: return $this->rootElementPhase($token); break;
1246            case self::MAIN_PHASE: return $this->mainPhase($token); break;
1247            case self::END_PHASE : return $this->trailingEndPhase($token); break;
1248        }
1249    }
1250
1251    private function initPhase($token) {
1252        /* Initially, the tree construction stage must handle each token
1253        emitted from the tokenisation stage as follows: */
1254
1255        /* A DOCTYPE token that is marked as being in error
1256        A comment token
1257        A start tag token
1258        An end tag token
1259        A character token that is not one of one of U+0009 CHARACTER TABULATION,
1260            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1261            or U+0020 SPACE
1262        An end-of-file token */
1263        if((isset($token['error']) && $token['error']) ||
1264        $token['type'] === HTML5::COMMENT ||
1265        $token['type'] === HTML5::STARTTAG ||
1266        $token['type'] === HTML5::ENDTAG ||
1267        $token['type'] === HTML5::EOF ||
1268        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
1269        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) {
1270            /* This specification does not define how to handle this case. In
1271            particular, user agents may ignore the entirety of this specification
1272            altogether for such documents, and instead invoke special parse modes
1273            with a greater emphasis on backwards compatibility. */
1274
1275            $this->phase = self::ROOT_PHASE;
1276            return $this->rootElementPhase($token);
1277
1278        /* A DOCTYPE token marked as being correct */
1279        } elseif(isset($token['error']) && !$token['error']) {
1280            /* Append a DocumentType node to the Document  node, with the name
1281            attribute set to the name given in the DOCTYPE token (which will be
1282            "HTML"), and the other attributes specific to DocumentType objects
1283            set to null, empty lists, or the empty string as appropriate. */
1284            $doctype = new DOMDocumentType(null, null, 'HTML');
1285
1286            /* Then, switch to the root element phase of the tree construction
1287            stage. */
1288            $this->phase = self::ROOT_PHASE;
1289
1290        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1291        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1292        or U+0020 SPACE */
1293        } elseif(isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/',
1294        $token['data'])) {
1295            /* Append that character  to the Document node. */
1296            $text = $this->dom->createTextNode($token['data']);
1297            $this->dom->appendChild($text);
1298        }
1299    }
1300
1301    private function rootElementPhase($token) {
1302        /* After the initial phase, as each token is emitted from the tokenisation
1303        stage, it must be processed as described in this section. */
1304
1305        /* A DOCTYPE token */
1306        if($token['type'] === HTML5::DOCTYPE) {
1307            // Parse error. Ignore the token.
1308
1309        /* A comment token */
1310        } elseif($token['type'] === HTML5::COMMENT) {
1311            /* Append a Comment node to the Document object with the data
1312            attribute set to the data given in the comment token. */
1313            $comment = $this->dom->createComment($token['data']);
1314            $this->dom->appendChild($comment);
1315
1316        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1317        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1318        or U+0020 SPACE */
1319        } elseif($token['type'] === HTML5::CHARACTR &&
1320        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1321            /* Append that character  to the Document node. */
1322            $text = $this->dom->createTextNode($token['data']);
1323            $this->dom->appendChild($text);
1324
1325        /* A character token that is not one of U+0009 CHARACTER TABULATION,
1326            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
1327            (FF), or U+0020 SPACE
1328        A start tag token
1329        An end tag token
1330        An end-of-file token */
1331        } elseif(($token['type'] === HTML5::CHARACTR &&
1332        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
1333        $token['type'] === HTML5::STARTTAG ||
1334        $token['type'] === HTML5::ENDTAG ||
1335        $token['type'] === HTML5::EOF) {
1336            /* Create an HTMLElement node with the tag name html, in the HTML
1337            namespace. Append it to the Document object. Switch to the main
1338            phase and reprocess the current token. */
1339            $html = $this->dom->createElement('html');
1340            $this->dom->appendChild($html);
1341            $this->stack[] = $html;
1342
1343            $this->phase = self::MAIN_PHASE;
1344            return $this->mainPhase($token);
1345        }
1346    }
1347
1348    private function mainPhase($token) {
1349        /* Tokens in the main phase must be handled as follows: */
1350
1351        /* A DOCTYPE token */
1352        if($token['type'] === HTML5::DOCTYPE) {
1353            // Parse error. Ignore the token.
1354
1355        /* A start tag token with the tag name "html" */
1356        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
1357            /* If this start tag token was not the first start tag token, then
1358            it is a parse error. */
1359
1360            /* For each attribute on the token, check to see if the attribute
1361            is already present on the top element of the stack of open elements.
1362            If it is not, add the attribute and its corresponding value to that
1363            element. */
1364            foreach($token['attr'] as $attr) {
1365                if(!$this->stack[0]->hasAttribute($attr['name'])) {
1366                    $this->stack[0]->setAttribute($attr['name'], $attr['value']);
1367                }
1368            }
1369
1370        /* An end-of-file token */
1371        } elseif($token['type'] === HTML5::EOF) {
1372            /* Generate implied end tags. */
1373            $this->generateImpliedEndTags();
1374
1375        /* Anything else. */
1376        } else {
1377            /* Depends on the insertion mode: */
1378            switch($this->mode) {
1379                case self::BEFOR_HEAD: return $this->beforeHead($token); break;
1380                case self::IN_HEAD:    return $this->inHead($token); break;
1381                case self::AFTER_HEAD: return $this->afterHead($token); break;
1382                case self::IN_BODY:    return $this->inBody($token); break;
1383                case self::IN_TABLE:   return $this->inTable($token); break;
1384                case self::IN_CAPTION: return $this->inCaption($token); break;
1385                case self::IN_CGROUP:  return $this->inColumnGroup($token); break;
1386                case self::IN_TBODY:   return $this->inTableBody($token); break;
1387                case self::IN_ROW:     return $this->inRow($token); break;
1388                case self::IN_CELL:    return $this->inCell($token); break;
1389                case self::IN_SELECT:  return $this->inSelect($token); break;
1390                case self::AFTER_BODY: return $this->afterBody($token); break;
1391                case self::IN_FRAME:   return $this->inFrameset($token); break;
1392                case self::AFTR_FRAME: return $this->afterFrameset($token); break;
1393                case self::END_PHASE:  return $this->trailingEndPhase($token); break;
1394            }
1395        }
1396    }
1397
1398    private function beforeHead($token) {
1399        /* Handle the token as follows: */
1400
1401        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1402        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1403        or U+0020 SPACE */
1404        if($token['type'] === HTML5::CHARACTR &&
1405        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1406            /* Append the character to the current node. */
1407            $this->insertText($token['data']);
1408
1409        /* A comment token */
1410        } elseif($token['type'] === HTML5::COMMENT) {
1411            /* Append a Comment node to the current node with the data attribute
1412            set to the data given in the comment token. */
1413            $this->insertComment($token['data']);
1414
1415        /* A start tag token with the tag name "head" */
1416        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
1417            /* Create an element for the token, append the new element to the
1418            current node and push it onto the stack of open elements. */
1419            $element = $this->insertElement($token);
1420
1421            /* Set the head element pointer to this new element node. */
1422            $this->head_pointer = $element;
1423
1424            /* Change the insertion mode to "in head". */
1425            $this->mode = self::IN_HEAD;
1426
1427        /* A start tag token whose tag name is one of: "base", "link", "meta",
1428        "script", "style", "title". Or an end tag with the tag name "html".
1429        Or a character token that is not one of U+0009 CHARACTER TABULATION,
1430        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1431        or U+0020 SPACE. Or any other start tag token */
1432        } elseif($token['type'] === HTML5::STARTTAG ||
1433        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
1434        ($token['type'] === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]$/',
1435        $token['data']))) {
1436            /* Act as if a start tag token with the tag name "head" and no
1437            attributes had been seen, then reprocess the current token. */
1438            $this->beforeHead(array(
1439                'name' => 'head',
1440                'type' => HTML5::STARTTAG,
1441                'attr' => array()
1442            ));
1443
1444            return $this->inHead($token);
1445
1446        /* Any other end tag */
1447        } elseif($token['type'] === HTML5::ENDTAG) {
1448            /* Parse error. Ignore the token. */
1449        }
1450    }
1451
1452    private function inHead($token) {
1453        /* Handle the token as follows: */
1454
1455        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1456        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1457        or U+0020 SPACE.
1458
1459        THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
1460        or script element, append the character to the current node regardless
1461        of its content. */
1462        if(($token['type'] === HTML5::CHARACTR &&
1463        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
1464        $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
1465        array('title', 'style', 'script')))) {
1466            /* Append the character to the current node. */
1467            $this->insertText($token['data']);
1468
1469        /* A comment token */
1470        } elseif($token['type'] === HTML5::COMMENT) {
1471            /* Append a Comment node to the current node with the data attribute
1472            set to the data given in the comment token. */
1473            $this->insertComment($token['data']);
1474
1475        } elseif($token['type'] === HTML5::ENDTAG &&
1476        in_array($token['name'], array('title', 'style', 'script'))) {
1477            array_pop($this->stack);
1478            return HTML5::PCDATA;
1479
1480        /* A start tag with the tag name "title" */
1481        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
1482            /* Create an element for the token and append the new element to the
1483            node pointed to by the head element pointer, or, if that is null
1484            (innerHTML case), to the current node. */
1485            if($this->head_pointer !== null) {
1486                $element = $this->insertElement($token, false);
1487                $this->head_pointer->appendChild($element);
1488
1489            } else {
1490                $element = $this->insertElement($token);
1491            }
1492
1493            /* Switch the tokeniser's content model flag  to the RCDATA state. */
1494            return HTML5::RCDATA;
1495
1496        /* A start tag with the tag name "style" */
1497        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
1498            /* Create an element for the token and append the new element to the
1499            node pointed to by the head element pointer, or, if that is null
1500            (innerHTML case), to the current node. */
1501            if($this->head_pointer !== null) {
1502                $element = $this->insertElement($token, false);
1503                $this->head_pointer->appendChild($element);
1504
1505            } else {
1506                $this->insertElement($token);
1507            }
1508
1509            /* Switch the tokeniser's content model flag  to the CDATA state. */
1510            return HTML5::CDATA;
1511
1512        /* A start tag with the tag name "script" */
1513        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
1514            /* Create an element for the token. */
1515            $element = $this->insertElement($token, false);
1516            $this->head_pointer->appendChild($element);
1517
1518            /* Switch the tokeniser's content model flag  to the CDATA state. */
1519            return HTML5::CDATA;
1520
1521        /* A start tag with the tag name "base", "link", or "meta" */
1522        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
1523        array('base', 'link', 'meta'))) {
1524            /* Create an element for the token and append the new element to the
1525            node pointed to by the head element pointer, or, if that is null
1526            (innerHTML case), to the current node. */
1527            if($this->head_pointer !== null) {
1528                $element = $this->insertElement($token, false);
1529                $this->head_pointer->appendChild($element);
1530                array_pop($this->stack);
1531
1532            } else {
1533                $this->insertElement($token);
1534            }
1535
1536        /* An end tag with the tag name "head" */
1537        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
1538            /* If the current node is a head element, pop the current node off
1539            the stack of open elements. */
1540            if($this->head_pointer->isSameNode(end($this->stack))) {
1541                array_pop($this->stack);
1542
1543            /* Otherwise, this is a parse error. */
1544            } else {
1545                // k
1546            }
1547
1548            /* Change the insertion mode to "after head". */
1549            $this->mode = self::AFTER_HEAD;
1550
1551        /* A start tag with the tag name "head" or an end tag except "html". */
1552        } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
1553        ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
1554            // Parse error. Ignore the token.
1555
1556        /* Anything else */
1557        } else {
1558            /* If the current node is a head element, act as if an end tag
1559            token with the tag name "head" had been seen. */
1560            if($this->head_pointer->isSameNode(end($this->stack))) {
1561                $this->inHead(array(
1562                    'name' => 'head',
1563                    'type' => HTML5::ENDTAG
1564                ));
1565
1566            /* Otherwise, change the insertion mode to "after head". */
1567            } else {
1568                $this->mode = self::AFTER_HEAD;
1569            }
1570
1571            /* Then, reprocess the current token. */
1572            return $this->afterHead($token);
1573        }
1574    }
1575
1576    private function afterHead($token) {
1577        /* Handle the token as follows: */
1578
1579        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1580        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1581        or U+0020 SPACE */
1582        if($token['type'] === HTML5::CHARACTR &&
1583        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1584            /* Append the character to the current node. */
1585            $this->insertText($token['data']);
1586
1587        /* A comment token */
1588        } elseif($token['type'] === HTML5::COMMENT) {
1589            /* Append a Comment node to the current node with the data attribute
1590            set to the data given in the comment token. */
1591            $this->insertComment($token['data']);
1592
1593        /* A start tag token with the tag name "body" */
1594        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
1595            /* Insert a body element for the token. */
1596            $this->insertElement($token);
1597
1598            /* Change the insertion mode to "in body". */
1599            $this->mode = self::IN_BODY;
1600
1601        /* A start tag token with the tag name "frameset" */
1602        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
1603            /* Insert a frameset element for the token. */
1604            $this->insertElement($token);
1605
1606            /* Change the insertion mode to "in frameset". */
1607            $this->mode = self::IN_FRAME;
1608
1609        /* A start tag token whose tag name is one of: "base", "link", "meta",
1610        "script", "style", "title" */
1611        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
1612        array('base', 'link', 'meta', 'script', 'style', 'title'))) {
1613            /* Parse error. Switch the insertion mode back to "in head" and
1614            reprocess the token. */
1615            $this->mode = self::IN_HEAD;
1616            return $this->inHead($token);
1617
1618        /* Anything else */
1619        } else {
1620            /* Act as if a start tag token with the tag name "body" and no
1621            attributes had been seen, and then reprocess the current token. */
1622            $this->afterHead(array(
1623                'name' => 'body',
1624                'type' => HTML5::STARTTAG,
1625                'attr' => array()
1626            ));
1627
1628            return $this->inBody($token);
1629        }
1630    }
1631
1632    private function inBody($token) {
1633        /* Handle the token as follows: */
1634
1635        switch($token['type']) {
1636            /* A character token */
1637            case HTML5::CHARACTR:
1638                /* Reconstruct the active formatting elements, if any. */
1639                $this->reconstructActiveFormattingElements();
1640
1641                /* Append the token's character to the current node. */
1642                $this->insertText($token['data']);
1643            break;
1644
1645            /* A comment token */
1646            case HTML5::COMMENT:
1647                /* Append a Comment node to the current node with the data
1648                attribute set to the data given in the comment token. */
1649                $this->insertComment($token['data']);
1650            break;
1651
1652            case HTML5::STARTTAG:
1653            switch($token['name']) {
1654                /* A start tag token whose tag name is one of: "script",
1655                "style" */
1656                case 'script': case 'style':
1657                    /* Process the token as if the insertion mode had been "in
1658                    head". */
1659                    return $this->inHead($token);
1660                break;
1661
1662                /* A start tag token whose tag name is one of: "base", "link",
1663                "meta", "title" */
1664                case 'base': case 'link': case 'meta': case 'title':
1665                    /* Parse error. Process the token as if the insertion mode
1666                    had    been "in head". */
1667                    return $this->inHead($token);
1668                break;
1669
1670                /* A start tag token with the tag name "body" */
1671                case 'body':
1672                    /* Parse error. If the second element on the stack of open
1673                    elements is not a body element, or, if the stack of open
1674                    elements has only one node on it, then ignore the token.
1675                    (innerHTML case) */
1676                    if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1677                        // Ignore
1678
1679                    /* Otherwise, for each attribute on the token, check to see
1680                    if the attribute is already present on the body element (the
1681                    second element)    on the stack of open elements. If it is not,
1682                    add the attribute and its corresponding value to that
1683                    element. */
1684                    } else {
1685                        foreach($token['attr'] as $attr) {
1686                            if(!$this->stack[1]->hasAttribute($attr['name'])) {
1687                                $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1688                            }
1689                        }
1690                    }
1691                break;
1692
1693                /* A start tag whose tag name is one of: "address",
1694                "blockquote", "center", "dir", "div", "dl", "fieldset",
1695                "listing", "menu", "ol", "p", "ul" */
1696                case 'address': case 'blockquote': case 'center': case 'dir':
1697                case 'div': case 'dl': case 'fieldset': case 'listing':
1698                case 'menu': case 'ol': case 'p': case 'ul':
1699                    /* If the stack of open elements has a p element in scope,
1700                    then act as if an end tag with the tag name p had been
1701                    seen. */
1702                    if($this->elementInScope('p')) {
1703                        $this->emitToken(array(
1704                            'name' => 'p',
1705                            'type' => HTML5::ENDTAG
1706                        ));
1707                    }
1708
1709                    /* Insert an HTML element for the token. */
1710                    $this->insertElement($token);
1711                break;
1712
1713                /* A start tag whose tag name is "form" */
1714                case 'form':
1715                    /* If the form element pointer is not null, ignore the
1716                    token with a parse error. */
1717                    if($this->form_pointer !== null) {
1718                        // Ignore.
1719
1720                    /* Otherwise: */
1721                    } else {
1722                        /* If the stack of open elements has a p element in
1723                        scope, then act as if an end tag with the tag name p
1724                        had been seen. */
1725                        if($this->elementInScope('p')) {
1726                            $this->emitToken(array(
1727                                'name' => 'p',
1728                                'type' => HTML5::ENDTAG
1729                            ));
1730                        }
1731
1732                        /* Insert an HTML element for the token, and set the
1733                        form element pointer to point to the element created. */
1734                        $element = $this->insertElement($token);
1735                        $this->form_pointer = $element;
1736                    }
1737                break;
1738
1739                /* A start tag whose tag name is "li", "dd" or "dt" */
1740                case 'li': case 'dd': case 'dt':
1741                    /* If the stack of open elements has a p  element in scope,
1742                    then act as if an end tag with the tag name p had been
1743                    seen. */
1744                    if($this->elementInScope('p')) {
1745                        $this->emitToken(array(
1746                            'name' => 'p',
1747                            'type' => HTML5::ENDTAG
1748                        ));
1749                    }
1750
1751                    $stack_length = count($this->stack) - 1;
1752
1753                    for($n = $stack_length; 0 <= $n; $n--) {
1754                        /* 1. Initialise node to be the current node (the
1755                        bottommost node of the stack). */
1756                        $stop = false;
1757                        $node = $this->stack[$n];
1758                        $cat  = $this->getElementCategory($node->tagName);
1759
1760                        /* 2. If node is an li, dd or dt element, then pop all
1761                        the    nodes from the current node up to node, including
1762                        node, then stop this algorithm. */
1763                        if($token['name'] === $node->tagName ||    ($token['name'] !== 'li'
1764                        && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1765                            for($x = $stack_length; $x >= $n ; $x--) {
1766                                array_pop($this->stack);
1767                            }
1768
1769                            break;
1770                        }
1771
1772                        /* 3. If node is not in the formatting category, and is
1773                        not    in the phrasing category, and is not an address or
1774                        div element, then stop this algorithm. */
1775                        if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1776                        $node->tagName !== 'address' && $node->tagName !== 'div') {
1777                            break;
1778                        }
1779                    }
1780
1781                    /* Finally, insert an HTML element with the same tag
1782                    name as the    token's. */
1783                    $this->insertElement($token);
1784                break;
1785
1786                /* A start tag token whose tag name is "plaintext" */
1787                case 'plaintext':
1788                    /* If the stack of open elements has a p  element in scope,
1789                    then act as if an end tag with the tag name p had been
1790                    seen. */
1791                    if($this->elementInScope('p')) {
1792                        $this->emitToken(array(
1793                            'name' => 'p',
1794                            'type' => HTML5::ENDTAG
1795                        ));
1796                    }
1797
1798                    /* Insert an HTML element for the token. */
1799                    $this->insertElement($token);
1800
1801                    return HTML5::PLAINTEXT;
1802                break;
1803
1804                /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1805                "h5", "h6" */
1806                case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1807                    /* If the stack of open elements has a p  element in scope,
1808                    then act as if an end tag with the tag name p had been seen. */
1809                    if($this->elementInScope('p')) {
1810                        $this->emitToken(array(
1811                            'name' => 'p',
1812                            'type' => HTML5::ENDTAG
1813                        ));
1814                    }
1815
1816                    /* If the stack of open elements has in scope an element whose
1817                    tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1818                    this is a parse error; pop elements from the stack until an
1819                    element with one of those tag names has been popped from the
1820                    stack. */
1821                    while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1822                        array_pop($this->stack);
1823                    }
1824
1825                    /* Insert an HTML element for the token. */
1826                    $this->insertElement($token);
1827                break;
1828
1829                /* A start tag whose tag name is "a" */
1830                case 'a':
1831                    /* If the list of active formatting elements contains
1832                    an element whose tag name is "a" between the end of the
1833                    list and the last marker on the list (or the start of
1834                    the list if there is no marker on the list), then this
1835                    is a parse error; act as if an end tag with the tag name
1836                    "a" had been seen, then remove that element from the list
1837                    of active formatting elements and the stack of open
1838                    elements if the end tag didn't already remove it (it
1839                    might not have if the element is not in table scope). */
1840                    $leng = count($this->a_formatting);
1841
1842                    for($n = $leng - 1; $n >= 0; $n--) {
1843                        if($this->a_formatting[$n] === self::MARKER) {
1844                            break;
1845
1846                        } elseif($this->a_formatting[$n]->nodeName === 'a') {
1847                            $this->emitToken(array(
1848                                'name' => 'a',
1849                                'type' => HTML5::ENDTAG
1850                            ));
1851                            break;
1852                        }
1853                    }
1854
1855                    /* Reconstruct the active formatting elements, if any. */
1856                    $this->reconstructActiveFormattingElements();
1857
1858                    /* Insert an HTML element for the token. */
1859                    $el = $this->insertElement($token);
1860
1861                    /* Add that element to the list of active formatting
1862                    elements. */
1863                    $this->a_formatting[] = $el;
1864                break;
1865
1866                /* A start tag whose tag name is one of: "b", "big", "em", "font",
1867                "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1868                case 'b': case 'big': case 'em': case 'font': case 'i':
1869                case 'nobr': case 's': case 'small': case 'strike':
1870                case 'strong': case 'tt': case 'u':
1871                    /* Reconstruct the active formatting elements, if any. */
1872                    $this->reconstructActiveFormattingElements();
1873
1874                    /* Insert an HTML element for the token. */
1875                    $el = $this->insertElement($token);
1876
1877                    /* Add that element to the list of active formatting
1878                    elements. */
1879                    $this->a_formatting[] = $el;
1880                break;
1881
1882                /* A start tag token whose tag name is "button" */
1883                case 'button':
1884                    /* If the stack of open elements has a button element in scope,
1885                    then this is a parse error; act as if an end tag with the tag
1886                    name "button" had been seen, then reprocess the token. (We don't
1887                    do that. Unnecessary.) */
1888                    if($this->elementInScope('button')) {
1889                        $this->inBody(array(
1890                            'name' => 'button',
1891                            'type' => HTML5::ENDTAG
1892                        ));
1893                    }
1894
1895                    /* Reconstruct the active formatting elements, if any. */
1896                    $this->reconstructActiveFormattingElements();
1897
1898                    /* Insert an HTML element for the token. */
1899                    $this->insertElement($token);
1900
1901                    /* Insert a marker at the end of the list of active
1902                    formatting elements. */
1903                    $this->a_formatting[] = self::MARKER;
1904                break;
1905
1906                /* A start tag token whose tag name is one of: "marquee", "object" */
1907                case 'marquee': case 'object':
1908                    /* Reconstruct the active formatting elements, if any. */
1909                    $this->reconstructActiveFormattingElements();
1910
1911                    /* Insert an HTML element for the token. */
1912                    $this->insertElement($token);
1913
1914                    /* Insert a marker at the end of the list of active
1915                    formatting elements. */
1916                    $this->a_formatting[] = self::MARKER;
1917                break;
1918
1919                /* A start tag token whose tag name is "xmp" */
1920                case 'xmp':
1921                    /* Reconstruct the active formatting elements, if any. */
1922                    $this->reconstructActiveFormattingElements();
1923
1924                    /* Insert an HTML element for the token. */
1925                    $this->insertElement($token);
1926
1927                    /* Switch the content model flag to the CDATA state. */
1928                    return HTML5::CDATA;
1929                break;
1930
1931                /* A start tag whose tag name is "table" */
1932                case 'table':
1933                    /* If the stack of open elements has a p element in scope,
1934                    then act as if an end tag with the tag name p had been seen. */
1935                    if($this->elementInScope('p')) {
1936                        $this->emitToken(array(
1937                            'name' => 'p',
1938                            'type' => HTML5::ENDTAG
1939                        ));
1940                    }
1941
1942                    /* Insert an HTML element for the token. */
1943                    $this->insertElement($token);
1944
1945                    /* Change the insertion mode to "in table". */
1946                    $this->mode = self::IN_TABLE;
1947                break;
1948
1949                /* A start tag whose tag name is one of: "area", "basefont",
1950                "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1951                case 'area': case 'basefont': case 'bgsound': case 'br':
1952                case 'embed': case 'img': case 'param': case 'spacer':
1953                case 'wbr':
1954                    /* Reconstruct the active formatting elements, if any. */
1955                    $this->reconstructActiveFormattingElements();
1956
1957                    /* Insert an HTML element for the token. */
1958                    $this->insertElement($token);
1959
1960                    /* Immediately pop the current node off the stack of open elements. */
1961                    array_pop($this->stack);
1962                break;
1963
1964                /* A start tag whose tag name is "hr" */
1965                case 'hr':
1966                    /* If the stack of open elements has a p element in scope,
1967                    then act as if an end tag with the tag name p had been seen. */
1968                    if($this->elementInScope('p')) {
1969                        $this->emitToken(array(
1970                            'name' => 'p',
1971                            'type' => HTML5::ENDTAG
1972                        ));
1973                    }
1974
1975                    /* Insert an HTML element for the token. */
1976                    $this->insertElement($token);
1977
1978                    /* Immediately pop the current node off the stack of open elements. */
1979                    array_pop($this->stack);
1980                break;
1981
1982                /* A start tag whose tag name is "image" */
1983                case 'image':
1984                    /* Parse error. Change the token's tag name to "img" and
1985                    reprocess it. (Don't ask.) */
1986                    $token['name'] = 'img';
1987                    return $this->inBody($token);
1988                break;
1989
1990                /* A start tag whose tag name is "input" */
1991                case 'input':
1992                    /* Reconstruct the active formatting elements, if any. */
1993                    $this->reconstructActiveFormattingElements();
1994
1995                    /* Insert an input element for the token. */
1996                    $element = $this->insertElement($token, false);
1997
1998                    /* If the form element pointer is not null, then associate the
1999                    input element with the form element pointed to by the form
2000                    element pointer. */
2001                    $this->form_pointer !== null
2002                        ? $this->form_pointer->appendChild($element)
2003                        : end($this->stack)->appendChild($element);
2004
2005                    /* Pop that input element off the stack of open elements. */
2006                    array_pop($this->stack);
2007                break;
2008
2009                /* A start tag whose tag name is "isindex" */
2010                case 'isindex':
2011                    /* Parse error. */
2012                    // w/e
2013
2014                    /* If the form element pointer is not null,
2015                    then ignore the token. */
2016                    if($this->form_pointer === null) {
2017                        /* Act as if a start tag token with the tag name "form" had
2018                        been seen. */
2019                        $this->inBody(array(
2020                            'name' => 'body',
2021                            'type' => HTML5::STARTTAG,
2022                            'attr' => array()
2023                        ));
2024
2025                        /* Act as if a start tag token with the tag name "hr" had
2026                        been seen. */
2027                        $this->inBody(array(
2028                            'name' => 'hr',
2029                            'type' => HTML5::STARTTAG,
2030                            'attr' => array()
2031                        ));
2032
2033                        /* Act as if a start tag token with the tag name "p" had
2034                        been seen. */
2035                        $this->inBody(array(
2036                            'name' => 'p',
2037                            'type' => HTML5::STARTTAG,
2038                            'attr' => array()
2039                        ));
2040
2041                        /* Act as if a start tag token with the tag name "label"
2042                        had been seen. */
2043                        $this->inBody(array(
2044                            'name' => 'label',
2045                            'type' => HTML5::STARTTAG,
2046                            'attr' => array()
2047                        ));
2048
2049                        /* Act as if a stream of character tokens had been seen. */
2050                        $this->insertText('This is a searchable index. '.
2051                        'Insert your search keywords here: ');
2052
2053                        /* Act as if a start tag token with the tag name "input"
2054                        had been seen, with all the attributes from the "isindex"
2055                        token, except with the "name" attribute set to the value
2056                        "isindex" (ignoring any explicit "name" attribute). */
2057                        $attr = $token['attr'];
2058                        $attr[] = array('name' => 'name', 'value' => 'isindex');
2059
2060                        $this->inBody(array(
2061                            'name' => 'input',
2062                            'type' => HTML5::STARTTAG,
2063                            'attr' => $attr
2064                        ));
2065
2066                        /* Act as if a stream of character tokens had been seen
2067                        (see below for what they should say). */
2068                        $this->insertText('This is a searchable index. '.
2069                        'Insert your search keywords here: ');
2070
2071                        /* Act as if an end tag token with the tag name "label"
2072                        had been seen. */
2073                        $this->inBody(array(
2074                            'name' => 'label',
2075                            'type' => HTML5::ENDTAG
2076                        ));
2077
2078                        /* Act as if an end tag token with the tag name "p" had
2079                        been seen. */
2080                        $this->inBody(array(
2081                            'name' => 'p',
2082                            'type' => HTML5::ENDTAG
2083                        ));
2084
2085                        /* Act as if a start tag token with the tag name "hr" had
2086                        been seen. */
2087                        $this->inBody(array(
2088                            'name' => 'hr',
2089                            'type' => HTML5::ENDTAG
2090                        ));
2091
2092                        /* Act as if an end tag token with the tag name "form" had
2093                        been seen. */
2094                        $this->inBody(array(
2095                            'name' => 'form',
2096                            'type' => HTML5::ENDTAG
2097                        ));
2098                    }
2099                break;
2100
2101                /* A start tag whose tag name is "textarea" */
2102                case 'textarea':
2103                    $this->insertElement($token);
2104
2105                    /* Switch the tokeniser's content model flag to the
2106                    RCDATA state. */
2107                    return HTML5::RCDATA;
2108                break;
2109
2110                /* A start tag whose tag name is one of: "iframe", "noembed",
2111                "noframes" */
2112                case 'iframe': case 'noembed': case 'noframes':
2113                    $this->insertElement($token);
2114
2115                    /* Switch the tokeniser's content model flag to the CDATA state. */
2116                    return HTML5::CDATA;
2117                break;
2118
2119                /* A start tag whose tag name is "select" */
2120                case 'select':
2121                    /* Reconstruct the active formatting elements, if any. */
2122                    $this->reconstructActiveFormattingElements();
2123
2124                    /* Insert an HTML element for the token. */
2125                    $this->insertElement($token);
2126
2127                    /* Change the insertion mode to "in select". */
2128                    $this->mode = self::IN_SELECT;
2129                break;
2130
2131                /* A start or end tag whose tag name is one of: "caption", "col",
2132                "colgroup", "frame", "frameset", "head", "option", "optgroup",
2133                "tbody", "td", "tfoot", "th", "thead", "tr". */
2134                case 'caption': case 'col': case 'colgroup': case 'frame':
2135                case 'frameset': case 'head': case 'option': case 'optgroup':
2136                case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2137                case 'tr':
2138                    // Parse error. Ignore the token.
2139                break;
2140
2141                /* A start or end tag whose tag name is one of: "event-source",
2142                "section", "nav", "article", "aside", "header", "footer",
2143                "datagrid", "command" */
2144                case 'event-source': case 'section': case 'nav': case 'article':
2145                case 'aside': case 'header': case 'footer': case 'datagrid':
2146                case 'command':
2147                    // Work in progress!
2148                break;
2149
2150                /* A start tag token not covered by the previous entries */
2151                default:
2152                    /* Reconstruct the active formatting elements, if any. */
2153                    $this->reconstructActiveFormattingElements();
2154
2155                    $this->insertElement($token, true, true);
2156                break;
2157            }
2158            break;
2159
2160            case HTML5::ENDTAG:
2161            switch($token['name']) {
2162                /* An end tag with the tag name "body" */
2163                case 'body':
2164                    /* If the second element in the stack of open elements is
2165                    not a body element, this is a parse error. Ignore the token.
2166                    (innerHTML case) */
2167                    if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2168                        // Ignore.
2169
2170                    /* If the current node is not the body element, then this
2171                    is a parse error. */
2172                    } elseif(end($this->stack)->nodeName !== 'body') {
2173                        // Parse error.
2174                    }
2175
2176                    /* Change the insertion mode to "after body". */
2177                    $this->mode = self::AFTER_BODY;
2178                break;
2179
2180                /* An end tag with the tag name "html" */
2181                case 'html':
2182                    /* Act as if an end tag with tag name "body" had been seen,
2183                    then, if that token wasn't ignored, reprocess the current
2184                    token. */
2185                    $this->inBody(array(
2186                        'name' => 'body',
2187                        'type' => HTML5::ENDTAG
2188                    ));
2189
2190                    return $this->afterBody($token);
2191                break;
2192
2193                /* An end tag whose tag name is one of: "address", "blockquote",
2194                "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2195                "ol", "pre", "ul" */
2196                case 'address': case 'blockquote': case 'center': case 'dir':
2197                case 'div': case 'dl': case 'fieldset': case 'listing':
2198                case 'menu': case 'ol': case 'pre': case 'ul':
2199                    /* If the stack of open elements has an element in scope
2200                    with the same tag name as that of the token, then generate
2201                    implied end tags. */
2202                    if($this->elementInScope($token['name'])) {
2203                        $this->generateImpliedEndTags();
2204
2205                        /* Now, if the current node is not an element with
2206                        the same tag name as that of the token, then this
2207                        is a parse error. */
2208                        // w/e
2209
2210                        /* If the stack of open elements has an element in
2211                        scope with the same tag name as that of the token,
2212                        then pop elements from this stack until an element
2213                        with that tag name has been popped from the stack. */
2214                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2215                            if($this->stack[$n]->nodeName === $token['name']) {
2216                                $n = -1;
2217                            }
2218
2219                            array_pop($this->stack);
2220                        }
2221                    }
2222                break;
2223
2224                /* An end tag whose tag name is "form" */
2225                case 'form':
2226                    /* If the stack of open elements has an element in scope
2227                    with the same tag name as that of the token, then generate
2228                    implied    end tags. */
2229                    if($this->elementInScope($token['name'])) {
2230                        $this->generateImpliedEndTags();
2231
2232                    }
2233
2234                    if(end($this->stack)->nodeName !== $token['name']) {
2235                        /* Now, if the current node is not an element with the
2236                        same tag name as that of the token, then this is a parse
2237                        error. */
2238                        // w/e
2239
2240                    } else {
2241                        /* Otherwise, if the current node is an element with
2242                        the same tag name as that of the token pop that element
2243                        from the stack. */
2244                        array_pop($this->stack);
2245                    }
2246
2247                    /* In any case, set the form element pointer to null. */
2248                    $this->form_pointer = null;
2249                break;
2250
2251                /* An end tag whose tag name is "p" */
2252                case 'p':
2253                    /* If the stack of open elements has a p element in scope,
2254                    then generate implied end tags, except for p elements. */
2255                    if($this->elementInScope('p')) {
2256                        $this->generateImpliedEndTags(array('p'));
2257
2258                        /* If the current node is not a p element, then this is
2259                        a parse error. */
2260                        // k
2261
2262                        /* If the stack of open elements has a p element in
2263                        scope, then pop elements from this stack until the stack
2264                        no longer has a p element in scope. */
2265                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2266                            if($this->elementInScope('p')) {
2267                                array_pop($this->stack);
2268
2269                            } else {
2270                                break;
2271                            }
2272                        }
2273                    }
2274                break;
2275
2276                /* An end tag whose tag name is "dd", "dt", or "li" */
2277                case 'dd': case 'dt': case 'li':
2278                    /* If the stack of open elements has an element in scope
2279                    whose tag name matches the tag name of the token, then
2280                    generate implied end tags, except for elements with the
2281                    same tag name as the token. */
2282                    if($this->elementInScope($token['name'])) {
2283                        $this->generateImpliedEndTags(array($token['name']));
2284
2285                        /* If the current node is not an element with the same
2286                        tag name as the token, then this is a parse error. */
2287                        // w/e
2288
2289                        /* If the stack of open elements has an element in scope
2290                        whose tag name matches the tag name of the token, then
2291                        pop elements from this stack until an element with that
2292                        tag name has been popped from the stack. */
2293                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2294                            if($this->stack[$n]->nodeName === $token['name']) {
2295                                $n = -1;
2296                            }
2297
2298                            array_pop($this->stack);
2299                        }
2300                    }
2301                break;
2302
2303                /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2304                "h5", "h6" */
2305                case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2306                    $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2307
2308                    /* If the stack of open elements has in scope an element whose
2309                    tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2310                    generate implied end tags. */
2311                    if($this->elementInScope($elements)) {
2312                        $this->generateImpliedEndTags();
2313
2314                        /* Now, if the current node is not an element with the same
2315                        tag name as that of the token, then this is a parse error. */
2316                        // w/e
2317
2318                        /* If the stack of open elements has in scope an element
2319                        whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2320                        "h6", then pop elements from the stack until an element
2321                        with one of those tag names has been popped from the stack. */
2322                        while($this->elementInScope($elements)) {
2323                            array_pop($this->stack);
2324                        }
2325                    }
2326                break;
2327
2328                /* An end tag whose tag name is one of: "a", "b", "big", "em",
2329                "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2330                case 'a': case 'b': case 'big': case 'em': case 'font':
2331                case 'i': case 'nobr': case 's': case 'small': case 'strike':
2332                case 'strong': case 'tt': case 'u':
2333                    /* 1. Let the formatting element be the last element in
2334                    the list of active formatting elements that:
2335                        * is between the end of the list and the last scope
2336                        marker in the list, if any, or the start of the list
2337                        otherwise, and
2338                        * has the same tag name as the token.
2339                    */
2340                    while(true) {
2341                        for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2342                            if($this->a_formatting[$a] === self::MARKER) {
2343                                break;
2344
2345                            } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2346                                $formatting_element = $this->a_formatting[$a];
2347                                $in_stack = in_array($formatting_element, $this->stack, true);
2348                                $fe_af_pos = $a;
2349                                break;
2350                            }
2351                        }
2352
2353                        /* If there is no such node, or, if that node is
2354                        also in the stack of open elements but the element
2355                        is not in scope, then this is a parse error. Abort
2356                        these steps. The token is ignored. */
2357                        if(!isset($formatting_element) || ($in_stack &&
2358                        !$this->elementInScope($token['name']))) {
2359                            break;
2360
2361                        /* Otherwise, if there is such a node, but that node
2362                        is not in the stack of open elements, then this is a
2363                        parse error; remove the element from the list, and
2364                        abort these steps. */
2365                        } elseif(isset($formatting_element) && !$in_stack) {
2366                            unset($this->a_formatting[$fe_af_pos]);
2367                            $this->a_formatting = array_merge($this->a_formatting);
2368                            break;
2369                        }
2370
2371                        /* 2. Let the furthest block be the topmost node in the
2372                        stack of open elements that is lower in the stack
2373                        than the formatting element, and is not an element in
2374                        the phrasing or formatting categories. There might
2375                        not be one. */
2376                        $fe_s_pos = array_search($formatting_element, $this->stack, true);
2377                        $length = count($this->stack);
2378
2379                        for($s = $fe_s_pos + 1; $s < $length; $s++) {
2380                            $category = $this->getElementCategory($this->stack[$s]->nodeName);
2381
2382                            if($category !== self::PHRASING && $category !== self::FORMATTING) {
2383                                $furthest_block = $this->stack[$s];
2384                            }
2385                        }
2386
2387                        /* 3. If there is no furthest block, then the UA must
2388                        skip the subsequent steps and instead just pop all
2389                        the nodes from the bottom of the stack of open
2390                        elements, from the current node up to the formatting
2391                        element, and remove the formatting element from the
2392                        list of active formatting elements. */
2393                        if(!isset($furthest_block)) {
2394                            for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2395                                array_pop($this->stack);
2396                            }
2397
2398                            unset($this->a_formatting[$fe_af_pos]);
2399                            $this->a_formatting = array_merge($this->a_formatting);
2400                            break;
2401                        }
2402
2403                        /* 4. Let the common ancestor be the element
2404                        immediately above the formatting element in the stack
2405                        of open elements. */
2406                        $common_ancestor = $this->stack[$fe_s_pos - 1];
2407
2408                        /* 5. If the furthest block has a parent node, then
2409                        remove the furthest block from its parent node. */
2410                        if($furthest_block->parentNode !== null) {
2411                            $furthest_block->parentNode->removeChild($furthest_block);
2412                        }
2413
2414                        /* 6. Let a bookmark note the position of the
2415                        formatting element in the list of active formatting
2416                        elements relative to the elements on either side
2417                        of it in the list. */
2418                        $bookmark = $fe_af_pos;
2419
2420                        /* 7. Let node and last node  be the furthest block.
2421                        Follow these steps: */
2422                        $node = $furthest_block;
2423                        $last_node = $furthest_block;
2424
2425                        while(true) {
2426                            for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2427                                /* 7.1 Let node be the element immediately
2428                                prior to node in the stack of open elements. */
2429                                $node = $this->stack[$n];
2430
2431                                /* 7.2 If node is not in the list of active
2432                                formatting elements, then remove node from
2433                                the stack of open elements and then go back
2434                                to step 1. */
2435                                if(!in_array($node, $this->a_formatting, true)) {
2436                                    unset($this->stack[$n]);
2437                                    $this->stack = array_merge($this->stack);
2438
2439                                } else {
2440                                    break;
2441                                }
2442                            }
2443
2444                            /* 7.3 Otherwise, if node is the formatting
2445                            element, then go to the next step in the overall
2446                            algorithm. */
2447                            if($node === $formatting_element) {
2448                                break;
2449
2450                            /* 7.4 Otherwise, if last node is the furthest
2451                            block, then move the aforementioned bookmark to
2452                            be immediately after the node in the list of
2453                            active formatting elements. */
2454                            } elseif($last_node === $furthest_block) {
2455                                $bookmark = array_search($node, $this->a_formatting, true) + 1;
2456                            }
2457
2458                            /* 7.5 If node has any children, perform a
2459                            shallow clone of node, replace the entry for
2460                            node in the list of active formatting elements
2461                            with an entry for the clone, replace the entry
2462                            for node in the stack of open elements with an
2463                            entry for the clone, and let node be the clone. */
2464                            if($node->hasChildNodes()) {
2465                                $clone = $node->cloneNode();
2466                                $s_pos = array_search($node, $this->stack, true);
2467                                $a_pos = array_search($node, $this->a_formatting, true);
2468
2469                                $this->stack[$s_pos] = $clone;
2470                                $this->a_formatting[$a_pos] = $clone;
2471                                $node = $clone;
2472                            }
2473
2474                            /* 7.6 Insert last node into node, first removing
2475                            it from its previous parent node if any. */
2476                            if($last_node->parentNode !== null) {
2477                                $last_node->parentNode->removeChild($last_node);
2478                            }
2479
2480                            $node->appendChild($last_node);
2481
2482                            /* 7.7 Let last node be node. */
2483                            $last_node = $node;
2484                        }
2485
2486                        /* 8. Insert whatever last node ended up being in
2487                        the previous step into the common ancestor node,
2488                        first removing it from its previous parent node if
2489                        any. */
2490                        if($last_node->parentNode !== null) {
2491                            $last_node->parentNode->removeChild($last_node);
2492                        }
2493
2494                        $common_ancestor->appendChild($last_node);
2495
2496                        /* 9. Perform a shallow clone of the formatting
2497                        element. */
2498                        $clone = $formatting_element->cloneNode();
2499
2500                        /* 10. Take all of the child nodes of the furthest
2501                        block and append them to the clone created in the
2502                        last step. */
2503                        while($furthest_block->hasChildNodes()) {
2504                            $child = $furthest_block->firstChild;
2505                            $furthest_block->removeChild($child);
2506                            $clone->appendChild($child);
2507                        }
2508
2509                        /* 11. Append that clone to the furthest block. */
2510                        $furthest_block->appendChild($clone);
2511
2512                        /* 12. Remove the formatting element from the list
2513                        of active formatting elements, and insert the clone
2514                        into the list of active formatting elements at the
2515                        position of the aforementioned bookmark. */
2516                        $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2517                        unset($this->a_formatting[$fe_af_pos]);
2518                        $this->a_formatting = array_merge($this->a_formatting);
2519
2520                        $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2521                        $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2522                        $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2523
2524                        /* 13. Remove the formatting element from the stack
2525                        of open elements, and insert the clone into the stack
2526                        of open elements immediately after (i.e. in a more
2527                        deeply nested position than) the position of the
2528                        furthest block in that stack. */
2529                        $fe_s_pos = array_search($formatting_element, $this->stack, true);
2530                        $fb_s_pos = array_search($furthest_block, $this->stack, true);
2531                        unset($this->stack[$fe_s_pos]);
2532
2533                        $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2534                        $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2535                        $this->stack = array_merge($s_part1, array($clone), $s_part2);
2536
2537                        /* 14. Jump back to step 1 in this series of steps. */
2538                        unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2539                    }
2540                break;
2541
2542                /* An end tag token whose tag name is one of: "button",
2543                "marquee", "object" */
2544                case 'button': case 'marquee': case 'object':
2545                    /* If the stack of open elements has an element in scope whose
2546                    tag name matches the tag name of the token, then generate implied
2547                    tags. */
2548                    if($this->elementInScope($token['name'])) {
2549                        $this->generateImpliedEndTags();
2550
2551                        /* Now, if the current node is not an element with the same
2552                        tag name as the token, then this is a parse error. */
2553                        // k
2554
2555                        /* Now, if the stack of open elements has an element in scope
2556                        whose tag name matches the tag name of the token, then pop
2557                        elements from the stack until that element has been popped from
2558                        the stack, and clear the list of active formatting elements up
2559                        to the last marker. */
2560                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2561                            if($this->stack[$n]->nodeName === $token['name']) {
2562                                $n = -1;
2563                            }
2564
2565                            array_pop($this->stack);
2566                        }
2567
2568                        $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2569
2570                        for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2571                            array_pop($this->a_formatting);
2572                        }
2573                    }
2574                break;
2575
2576                /* Or an end tag whose tag name is one of: "area", "basefont",
2577                "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2578                "input", "isindex", "noembed", "noframes", "param", "select",
2579                "spacer", "table", "textarea", "wbr" */
2580                case 'area': case 'basefont': case 'bgsound': case 'br':
2581                case 'embed': case 'hr': case 'iframe': case 'image':
2582                case 'img': case 'input': case 'isindex': case 'noembed':
2583                case 'noframes': case 'param': case 'select': case 'spacer':
2584                case 'table': case 'textarea': case 'wbr':
2585                    // Parse error. Ignore the token.
2586                break;
2587
2588                /* An end tag token not covered by the previous entries */
2589                default:
2590                    for($n = count($this->stack) - 1; $n >= 0; $n--) {
2591                        /* Initialise node to be the current node (the bottommost
2592                        node of the stack). */
2593                        $node = end($this->stack);
2594
2595                        /* If node has the same tag name as the end tag token,
2596                        then: */
2597                        if($token['name'] === $node->nodeName) {
2598                            /* Generate implied end tags. */
2599                            $this->generateImpliedEndTags();
2600
2601                            /* If the tag name of the end tag token does not
2602                            match the tag name of the current node, this is a
2603                            parse error. */
2604                            // k
2605
2606                            /* Pop all the nodes from the current node up to
2607                            node, including node, then stop this algorithm. */
2608                            for($x = count($this->stack) - $n; $x >= $n; $x--) {
2609                                array_pop($this->stack);
2610                            }
2611
2612                        } else {
2613                            $category = $this->getElementCategory($node);
2614
2615                            if($category !== self::SPECIAL && $category !== self::SCOPING) {
2616                                /* Otherwise, if node is in neither the formatting
2617                                category nor the phrasing category, then this is a
2618                                parse error. Stop this algorithm. The end tag token
2619                                is ignored. */
2620                                return false;
2621                            }
2622                        }
2623                    }
2624                break;
2625            }
2626            break;
2627        }
2628    }
2629
2630    private function inTable($token) {
2631        $clear = array('html', 'table');
2632
2633        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
2634        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2635        or U+0020 SPACE */
2636        if($token['type'] === HTML5::CHARACTR &&
2637        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
2638            /* Append the character to the current node. */
2639            $text = $this->dom->createTextNode($token['data']);
2640            end($this->stack)->appendChild($text);
2641
2642        /* A comment token */
2643        } elseif($token['type'] === HTML5::COMMENT) {
2644            /* Append a Comment node to the current node with the data
2645            attribute set to the data given in the comment token. */
2646            $comment = $this->dom->createComment($token['data']);
2647            end($this->stack)->appendChild($comment);
2648
2649        /* A start tag whose tag name is "caption" */
2650        } elseif($token['type'] === HTML5::STARTTAG &&
2651        $token['name'] === 'caption') {
2652            /* Clear the stack back to a table context. */
2653            $this->clearStackToTableContext($clear);
2654
2655            /* Insert a marker at the end of the list of active
2656            formatting elements. */
2657            $this->a_formatting[] = self::MARKER;
2658
2659            /* Insert an HTML element for the token, then switch the
2660            insertion mode to "in caption". */
2661            $this->insertElement($token);
2662            $this->mode = self::IN_CAPTION;
2663
2664        /* A start tag whose tag name is "colgroup" */
2665        } elseif($token['type'] === HTML5::STARTTAG &&
2666        $token['name'] === 'colgroup') {
2667            /* Clear the stack back to a table context. */
2668            $this->clearStackToTableContext($clear);
2669
2670            /* Insert an HTML element for the token, then switch the
2671            insertion mode to "in column group". */
2672            $this->insertElement($token);
2673            $this->mode = self::IN_CGROUP;
2674
2675        /* A start tag whose tag name is "col" */
2676        } elseif($token['type'] === HTML5::STARTTAG &&
2677        $token['name'] === 'col') {
2678            $this->inTable(array(
2679                'name' => 'colgroup',
2680                'type' => HTML5::STARTTAG,
2681                'attr' => array()
2682            ));
2683
2684            $this->inColumnGroup($token);
2685
2686        /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
2687        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
2688        array('tbody', 'tfoot', 'thead'))) {
2689            /* Clear the stack back to a table context. */
2690            $this->clearStackToTableContext($clear);
2691
2692            /* Insert an HTML element for the token, then switch the insertion
2693            mode to "in table body". */
2694            $this->insertElement($token);
2695            $this->mode = self::IN_TBODY;
2696
2697        /* A start tag whose tag name is one of: "td", "th", "tr" */
2698        } elseif($token['type'] === HTML5::STARTTAG &&
2699        in_array($token['name'], array('td', 'th', 'tr'))) {
2700            /* Act as if a start tag token with the tag name "tbody" had been
2701            seen, then reprocess the current token. */
2702            $this->inTable(array(
2703                'name' => 'tbody',
2704                'type' => HTML5::STARTTAG,
2705                'attr' => array()
2706            ));
2707
2708            return $this->inTableBody($token);
2709
2710        /* A start tag whose tag name is "table" */
2711        } elseif($token['type'] === HTML5::STARTTAG &&
2712        $token['name'] === 'table') {
2713            /* Parse error. Act as if an end tag token with the tag name "table"
2714            had been seen, then, if that token wasn't ignored, reprocess the
2715            current token. */
2716            $this->inTable(array(
2717                'name' => 'table',
2718                'type' => HTML5::ENDTAG
2719            ));
2720
2721            return $this->mainPhase($token);
2722
2723        /* An end tag whose tag name is "table" */
2724        } elseif($token['type'] === HTML5::ENDTAG &&
2725        $token['name'] === 'table') {
2726            /* If the stack of open elements does not have an element in table
2727            scope with the same tag name as the token, this is a parse error.
2728            Ignore the token. (innerHTML case) */
2729            if(!$this->elementInScope($token['name'], true)) {
2730                return false;
2731
2732            /* Otherwise: */
2733            } else {
2734                /* Generate implied end tags. */
2735                $this->generateImpliedEndTags();
2736
2737                /* Now, if the current node is not a table element, then this
2738                is a parse error. */
2739                // w/e
2740
2741                /* Pop elements from this stack until a table element has been
2742                popped from the stack. */
2743                while(true) {
2744                    $current = end($this->stack)->nodeName;
2745                    array_pop($this->stack);
2746
2747                    if($current === 'table') {
2748                        break;
2749                    }
2750                }
2751
2752                /* Reset the insertion mode appropriately. */
2753                $this->resetInsertionMode();
2754            }
2755
2756        /* An end tag whose tag name is one of: "body", "caption", "col",
2757        "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
2758        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
2759        array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
2760        'tfoot', 'th', 'thead', 'tr'))) {
2761            // Parse error. Ignore the token.
2762
2763        /* Anything else */
2764        } else {
2765            /* Parse error. Process the token as if the insertion mode was "in
2766            body", with the following exception: */
2767
2768            /* If the current node is a table, tbody, tfoot, thead, or tr
2769            element, then, whenever a node would be inserted into the current
2770            node, it must instead be inserted into the foster parent element. */
2771            if(in_array(end($this->stack)->nodeName,
2772            array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
2773                /* The foster parent element is the parent element of the last
2774                table element in the stack of open elements, if there is a
2775                table element and it has such a parent element. If there is no
2776                table element in the stack of open elements (innerHTML case),
2777                then the foster parent element is the first element in the
2778                stack of open elements (the html  element). Otherwise, if there
2779                is a table element in the stack of open elements, but the last
2780                table element in the stack of open elements has no parent, or
2781                its parent node is not an element, then the foster parent
2782                element is the element before the last table element in the
2783                stack of open elements. */
2784                for($n = count($this->stack) - 1; $n >= 0; $n--) {
2785                    if($this->stack[$n]->nodeName === 'table') {
2786                        $table = $this->stack[$n];
2787                        break;
2788                    }
2789                }
2790
2791                if(isset($table) && $table->parentNode !== null) {
2792                    $this->foster_parent = $table->parentNode;
2793
2794                } elseif(!isset($table)) {
2795                    $this->foster_parent = $this->stack[0];
2796
2797                } elseif(isset($table) && ($table->parentNode === null ||
2798                $table->parentNode->nodeType !== XML_ELEMENT_NODE)) {
2799                    $this->foster_parent = $this->stack[$n - 1];
2800                }
2801            }
2802
2803            $this->inBody($token);
2804        }
2805    }
2806
2807    private function inCaption($token) {
2808        /* An end tag whose tag name is "caption" */
2809        if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
2810            /* If the stack of open elements does not have an element in table
2811            scope with the same tag name as the token, this is a parse error.
2812            Ignore the token. (innerHTML case) */
2813            if(!$this->elementInScope($token['name'], true)) {
2814                // Ignore
2815
2816            /* Otherwise: */
2817            } else {
2818                /* Generate implied end tags. */
2819                $this->generateImpliedEndTags();
2820
2821                /* Now, if the current node is not a caption element, then this
2822                is a parse error. */
2823                // w/e
2824
2825                /* Pop elements from this stack until a caption element has
2826                been popped from the stack. */
2827                while(true) {
2828                    $node = end($this->stack)->nodeName;
2829                    array_pop($this->stack);
2830
2831                    if($node === 'caption') {
2832                        break;
2833                    }
2834                }
2835
2836                /* Clear the list of active formatting elements up to the last
2837                marker. */
2838                $this->clearTheActiveFormattingElementsUpToTheLastMarker();
2839
2840                /* Switch the insertion mode to "in table". */
2841                $this->mode = self::IN_TABLE;
2842            }
2843
2844        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
2845        "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
2846        name is "table" */
2847        } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
2848        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
2849        'thead', 'tr'))) || ($token['type'] === HTML5::ENDTAG &&
2850        $token['name'] === 'table')) {
2851            /* Parse error. Act as if an end tag with the tag name "caption"
2852            had been seen, then, if that token wasn't ignored, reprocess the
2853            current token. */
2854            $this->inCaption(array(
2855                'name' => 'caption',
2856                'type' => HTML5::ENDTAG
2857            ));
2858
2859            return $this->inTable($token);
2860
2861        /* An end tag whose tag name is one of: "body", "col", "colgroup",
2862        "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
2863        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
2864        array('body', 'col', 'colgroup', 'html', 'tbody', 'tfoot', 'th',
2865        'thead', 'tr'))) {
2866            // Parse error. Ignore the token.
2867
2868        /* Anything else */
2869        } else {
2870            /* Process the token as if the insertion mode was "in body". */
2871            $this->inBody($token);
2872        }
2873    }
2874
2875    private function inColumnGroup($token) {
2876        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
2877        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2878        or U+0020 SPACE */
2879        if($token['type'] === HTML5::CHARACTR &&
2880        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
2881            /* Append the character to the current node. */
2882            $text = $this->dom->createTextNode($token['data']);
2883            end($this->stack)->appendChild($text);
2884
2885        /* A comment token */
2886        } elseif($token['type'] === HTML5::COMMENT) {
2887            /* Append a Comment node to the current node with the data
2888            attribute set to the data given in the comment token. */
2889            $comment = $this->dom->createComment($token['data']);
2890            end($this->stack)->appendChild($comment);
2891
2892        /* A start tag whose tag name is "col" */
2893        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
2894            /* Insert a col element for the token. Immediately pop the current
2895            node off the stack of open elements. */
2896            $this->insertElement($token);
2897            array_pop($this->stack);
2898
2899        /* An end tag whose tag name is "colgroup" */
2900        } elseif($token['type'] === HTML5::ENDTAG &&
2901        $token['name'] === 'colgroup') {
2902            /* If the current node is the root html element, then this is a
2903            parse error, ignore the token. (innerHTML case) */
2904            if(end($this->stack)->nodeName === 'html') {
2905                // Ignore
2906
2907            /* Otherwise, pop the current node (which will be a colgroup
2908            element) from the stack of open elements. Switch the insertion
2909            mode to "in table". */
2910            } else {
2911                array_pop($this->stack);
2912                $this->mode = self::IN_TABLE;
2913            }
2914
2915        /* An end tag whose tag name is "col" */
2916        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
2917            /* Parse error. Ignore the token. */
2918
2919        /* Anything else */
2920        } else {
2921            /* Act as if an end tag with the tag name "colgroup" had been seen,
2922            and then, if that token wasn't ignored, reprocess the current token. */
2923            $this->inColumnGroup(array(
2924                'name' => 'colgroup',
2925                'type' => HTML5::ENDTAG
2926            ));
2927
2928            return $this->inTable($token);
2929        }
2930    }
2931
2932    private function inTableBody($token) {
2933        $clear = array('tbody', 'tfoot', 'thead', 'html');
2934
2935        /* A start tag whose tag name is "tr" */
2936        if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
2937            /* Clear the stack back to a table body context. */
2938            $this->clearStackToTableContext($clear);
2939
2940            /* Insert a tr element for the token, then switch the insertion
2941            mode to "in row". */
2942            $this->insertElement($token);
2943            $this->mode = self::IN_ROW;
2944
2945        /* A start tag whose tag name is one of: "th", "td" */
2946        } elseif($token['type'] === HTML5::STARTTAG &&
2947        ($token['name'] === 'th' ||    $token['name'] === 'td')) {
2948            /* Parse error. Act as if a start tag with the tag name "tr" had
2949            been seen, then reprocess the current token. */
2950            $this->inTableBody(array(
2951                'name' => 'tr',
2952                'type' => HTML5::STARTTAG,
2953                'attr' => array()
2954            ));
2955
2956            return $this->inRow($token);
2957
2958        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
2959        } elseif($token['type'] === HTML5::ENDTAG &&
2960        in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
2961            /* If the stack of open elements does not have an element in table
2962            scope with the same tag name as the token, this is a parse error.
2963            Ignore the token. */
2964            if(!$this->elementInScope($token['name'], true)) {
2965                // Ignore
2966
2967            /* Otherwise: */
2968            } else {
2969                /* Clear the stack back to a table body context. */
2970                $this->clearStackToTableContext($clear);
2971
2972                /* Pop the current node from the stack of open elements. Switch
2973                the insertion mode to "in table". */
2974                array_pop($this->stack);
2975                $this->mode = self::IN_TABLE;
2976            }
2977
2978        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
2979        "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
2980        } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
2981        array('caption', 'col', 'colgroup', 'tbody', 'tfoor', 'thead'))) ||
2982        ($token['type'] === HTML5::STARTTAG && $token['name'] === 'table')) {
2983            /* If the stack of open elements does not have a tbody, thead, or
2984            tfoot element in table scope, this is a parse error. Ignore the
2985            token. (innerHTML case) */
2986            if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
2987                // Ignore.
2988
2989            /* Otherwise: */
2990            } else {
2991                /* Clear the stack back to a table body context. */
2992                $this->clearStackToTableContext($clear);
2993
2994                /* Act as if an end tag with the same tag name as the current
2995                node ("tbody", "tfoot", or "thead") had been seen, then
2996                reprocess the current token. */
2997                $this->inTableBody(array(
2998                    'name' => end($this->stack)->nodeName,
2999                    'type' => HTML5::ENDTAG
3000                ));
3001
3002                return $this->mainPhase($token);
3003            }
3004
3005        /* An end tag whose tag name is one of: "body", "caption", "col",
3006        "colgroup", "html", "td", "th", "tr" */
3007        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
3008        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
3009            /* Parse error. Ignore the token. */
3010
3011        /* Anything else */
3012        } else {
3013            /* Process the token as if the insertion mode was "in table". */
3014            $this->inTable($token);
3015        }
3016    }
3017
3018    private function inRow($token) {
3019        $clear = array('tr', 'html');
3020
3021        /* A start tag whose tag name is one of: "th", "td" */
3022        if($token['type'] === HTML5::STARTTAG &&
3023        ($token['name'] === 'th' || $token['name'] === 'td')) {
3024            /* Clear the stack back to a table row context. */
3025            $this->clearStackToTableContext($clear);
3026
3027            /* Insert an HTML element for the token, then switch the insertion
3028            mode to "in cell". */
3029            $this->insertElement($token);
3030            $this->mode = self::IN_CELL;
3031
3032            /* Insert a marker at the end of the list of active formatting
3033            elements. */
3034            $this->a_formatting[] = self::MARKER;
3035
3036        /* An end tag whose tag name is "tr" */
3037        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
3038            /* If the stack of open elements does not have an element in table
3039            scope with the same tag name as the token, this is a parse error.
3040            Ignore the token. (innerHTML case) */
3041            if(!$this->elementInScope($token['name'], true)) {
3042                // Ignore.
3043
3044            /* Otherwise: */
3045            } else {
3046                /* Clear the stack back to a table row context. */
3047                $this->clearStackToTableContext($clear);
3048
3049                /* Pop the current node (which will be a tr element) from the
3050                stack of open elements. Switch the insertion mode to "in table
3051                body". */
3052                array_pop($this->stack);
3053                $this->mode = self::IN_TBODY;
3054            }
3055
3056        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3057        "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
3058        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
3059        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) {
3060            /* Act as if an end tag with the tag name "tr" had been seen, then,
3061            if that token wasn't ignored, reprocess the current token. */
3062            $this->inRow(array(
3063                'name' => 'tr',
3064                'type' => HTML5::ENDTAG
3065            ));
3066
3067            return $this->inCell($token);
3068
3069        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
3070        } elseif($token['type'] === HTML5::ENDTAG &&
3071        in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
3072            /* If the stack of open elements does not have an element in table
3073            scope with the same tag name as the token, this is a parse error.
3074            Ignore the token. */
3075            if(!$this->elementInScope($token['name'], true)) {
3076                // Ignore.
3077
3078            /* Otherwise: */
3079            } else {
3080                /* Otherwise, act as if an end tag with the tag name "tr" had
3081                been seen, then reprocess the current token. */
3082                $this->inRow(array(
3083                    'name' => 'tr',
3084                    'type' => HTML5::ENDTAG
3085                ));
3086
3087                return $this->inCell($token);
3088            }
3089
3090        /* An end tag whose tag name is one of: "body", "caption", "col",
3091        "colgroup", "html", "td", "th" */
3092        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
3093        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
3094            /* Parse error. Ignore the token. */
3095
3096        /* Anything else */
3097        } else {
3098            /* Process the token as if the insertion mode was "in table". */
3099            $this->inTable($token);
3100        }
3101    }
3102
3103    private function inCell($token) {
3104        /* An end tag whose tag name is one of: "td", "th" */
3105        if($token['type'] === HTML5::ENDTAG &&
3106        ($token['name'] === 'td' || $token['name'] === 'th')) {
3107            /* If the stack of open elements does not have an element in table
3108            scope with the same tag name as that of the token, then this is a
3109            parse error and the token must be ignored. */
3110            if(!$this->elementInScope($token['name'], true)) {
3111                // Ignore.
3112
3113            /* Otherwise: */
3114            } else {
3115                /* Generate implied end tags, except for elements with the same
3116                tag name as the token. */
3117                $this->generateImpliedEndTags(array($token['name']));
3118
3119                /* Now, if the current node is not an element with the same tag
3120                name as the token, then this is a parse error. */
3121                // k
3122
3123                /* Pop elements from this stack until an element with the same
3124                tag name as the token has been popped from the stack. */
3125                while(true) {
3126                    $node = end($this->stack)->nodeName;
3127                    array_pop($this->stack);
3128
3129                    if($node === $token['name']) {
3130                        break;
3131                    }
3132                }
3133
3134                /* Clear the list of active formatting elements up to the last
3135                marker. */
3136                $this->clearTheActiveFormattingElementsUpToTheLastMarker();
3137
3138                /* Switch the insertion mode to "in row". (The current node
3139                will be a tr element at this point.) */
3140                $this->mode = self::IN_ROW;
3141            }
3142
3143        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3144        "tbody", "td", "tfoot", "th", "thead", "tr" */
3145        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
3146        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
3147        'thead', 'tr'))) {
3148            /* If the stack of open elements does not have a td or th element
3149            in table scope, then this is a parse error; ignore the token.
3150            (innerHTML case) */
3151            if(!$this->elementInScope(array('td', 'th'), true)) {
3152                // Ignore.
3153
3154            /* Otherwise, close the cell (see below) and reprocess the current
3155            token. */
3156            } else {
3157                $this->closeCell();
3158                return $this->inRow($token);
3159            }
3160
3161        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3162        "tbody", "td", "tfoot", "th", "thead", "tr" */
3163        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
3164        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
3165        'thead', 'tr'))) {
3166            /* If the stack of open elements does not have a td or th element
3167            in table scope, then this is a parse error; ignore the token.
3168            (innerHTML case) */
3169            if(!$this->elementInScope(array('td', 'th'), true)) {
3170                // Ignore.
3171
3172            /* Otherwise, close the cell (see below) and reprocess the current
3173            token. */
3174            } else {
3175                $this->closeCell();
3176                return $this->inRow($token);
3177            }
3178
3179        /* An end tag whose tag name is one of: "body", "caption", "col",
3180        "colgroup", "html" */
3181        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
3182        array('body', 'caption', 'col', 'colgroup', 'html'))) {
3183            /* Parse error. Ignore the token. */
3184
3185        /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
3186        "thead", "tr" */
3187        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
3188        array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
3189            /* If the stack of open elements does not have an element in table
3190            scope with the same tag name as that of the token (which can only
3191            happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
3192            then this is a parse error and the token must be ignored. */
3193            if(!$this->elementInScope($token['name'], true)) {
3194                // Ignore.
3195
3196            /* Otherwise, close the cell (see below) and reprocess the current
3197            token. */
3198            } else {
3199                $this->closeCell();
3200                return $this->inRow($token);
3201            }
3202
3203        /* Anything else */
3204        } else {
3205            /* Process the token as if the insertion mode was "in body". */
3206            $this->inBody($token);
3207        }
3208    }
3209
3210    private function inSelect($token) {
3211        /* Handle the token as follows: */
3212
3213        /* A character token */
3214        if($token['type'] === HTML5::CHARACTR) {
3215            /* Append the token's character to the current node. */
3216            $this->insertText($token['data']);
3217
3218        /* A comment token */
3219        } elseif($token['type'] === HTML5::COMMENT) {
3220            /* Append a Comment node to the current node with the data
3221            attribute set to the data given in the comment token. */
3222            $this->insertComment($token['data']);
3223
3224        /* A start tag token whose tag name is "option" */
3225        } elseif($token['type'] === HTML5::STARTTAG &&
3226        $token['name'] === 'option') {
3227            /* If the current node is an option element, act as if an end tag
3228            with the tag name "option" had been seen. */
3229            if(end($this->stack)->nodeName === 'option') {
3230                $this->inSelect(array(
3231                    'name' => 'option',
3232                    'type' => HTML5::ENDTAG
3233                ));
3234            }
3235
3236            /* Insert an HTML element for the token. */
3237            $this->insertElement($token);
3238
3239        /* A start tag token whose tag name is "optgroup" */
3240        } elseif($token['type'] === HTML5::STARTTAG &&
3241        $token['name'] === 'optgroup') {
3242            /* If the current node is an option element, act as if an end tag
3243            with the tag name "option" had been seen. */
3244            if(end($this->stack)->nodeName === 'option') {
3245                $this->inSelect(array(
3246                    'name' => 'option',
3247                    'type' => HTML5::ENDTAG
3248                ));
3249            }
3250
3251            /* If the current node is an optgroup element, act as if an end tag
3252            with the tag name "optgroup" had been seen. */
3253            if(end($this->stack)->nodeName === 'optgroup') {
3254                $this->inSelect(array(
3255                    'name' => 'optgroup',
3256                    'type' => HTML5::ENDTAG
3257                ));
3258            }
3259
3260            /* Insert an HTML element for the token. */
3261            $this->insertElement($token);
3262
3263        /* An end tag token whose tag name is "optgroup" */
3264        } elseif($token['type'] === HTML5::ENDTAG &&
3265        $token['name'] === 'optgroup') {
3266            /* First, if the current node is an option element, and the node
3267            immediately before it in the stack of open elements is an optgroup
3268            element, then act as if an end tag with the tag name "option" had
3269            been seen. */
3270            $elements_in_stack = count($this->stack);
3271
3272            if($this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
3273            $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
3274                $this->inSelect(array(
3275                    'name' => 'option',
3276                    'type' => HTML5::ENDTAG
3277                ));
3278            }
3279
3280            /* If the current node is an optgroup element, then pop that node
3281            from the stack of open elements. Otherwise, this is a parse error,
3282            ignore the token. */
3283            if($this->stack[$elements_in_stack - 1] === 'optgroup') {
3284                array_pop($this->stack);
3285            }
3286
3287        /* An end tag token whose tag name is "option" */
3288        } elseif($token['type'] === HTML5::ENDTAG &&
3289        $token['name'] === 'option') {
3290            /* If the current node is an option element, then pop that node
3291            from the stack of open elements. Otherwise, this is a parse error,
3292            ignore the token. */
3293            if(end($this->stack)->nodeName === 'option') {
3294                array_pop($this->stack);
3295            }
3296
3297        /* An end tag whose tag name is "select" */
3298        } elseif($token['type'] === HTML5::ENDTAG &&
3299        $token['name'] === 'select') {
3300            /* If the stack of open elements does not have an element in table
3301            scope with the same tag name as the token, this is a parse error.
3302            Ignore the token. (innerHTML case) */
3303            if(!$this->elementInScope($token['name'], true)) {
3304                // w/e
3305
3306            /* Otherwise: */
3307            } else {
3308                /* Pop elements from the stack of open elements until a select
3309                element has been popped from the stack. */
3310                while(true) {
3311                    $current = end($this->stack)->nodeName;
3312                    array_pop($this->stack);
3313
3314                    if($current === 'select') {
3315                        break;
3316                    }
3317                }
3318
3319                /* Reset the insertion mode appropriately. */
3320                $this->resetInsertionMode();
3321            }
3322
3323        /* A start tag whose tag name is "select" */
3324        } elseif($token['name'] === 'select' &&
3325        $token['type'] === HTML5::STARTTAG) {
3326            /* Parse error. Act as if the token had been an end tag with the
3327            tag name "select" instead. */
3328            $this->inSelect(array(
3329                'name' => 'select',
3330                'type' => HTML5::ENDTAG
3331            ));
3332
3333        /* An end tag whose tag name is one of: "caption", "table", "tbody",
3334        "tfoot", "thead", "tr", "td", "th" */
3335        } elseif(in_array($token['name'], array('caption', 'table', 'tbody',
3336        'tfoot', 'thead', 'tr', 'td', 'th')) && $token['type'] === HTML5::ENDTAG) {
3337            /* Parse error. */
3338            // w/e
3339
3340            /* If the stack of open elements has an element in table scope with
3341            the same tag name as that of the token, then act as if an end tag
3342            with the tag name "select" had been seen, and reprocess the token.
3343            Otherwise, ignore the token. */
3344            if($this->elementInScope($token['name'], true)) {
3345                $this->inSelect(array(
3346                    'name' => 'select',
3347                    'type' => HTML5::ENDTAG
3348                ));
3349
3350                $this->mainPhase($token);
3351            }
3352
3353        /* Anything else */
3354        } else {
3355            /* Parse error. Ignore the token. */
3356        }
3357    }
3358
3359    private function afterBody($token) {
3360        /* Handle the token as follows: */
3361
3362        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3363        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3364        or U+0020 SPACE */
3365        if($token['type'] === HTML5::CHARACTR &&
3366        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
3367            /* Process the token as it would be processed if the insertion mode
3368            was "in body". */
3369            $this->inBody($token);
3370
3371        /* A comment token */
3372        } elseif($token['type'] === HTML5::COMMENT) {
3373            /* Append a Comment node to the first element in the stack of open
3374            elements (the html element), with the data attribute set to the
3375            data given in the comment token. */
3376            $comment = $this->dom->createComment($token['data']);
3377            $this->stack[0]->appendChild($comment);
3378
3379        /* An end tag with the tag name "html" */
3380        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') {
3381            /* If the parser was originally created in order to handle the
3382            setting of an element's innerHTML attribute, this is a parse error;
3383            ignore the token. (The element will be an html element in this
3384            case.) (innerHTML case) */
3385
3386            /* Otherwise, switch to the trailing end phase. */
3387            $this->phase = self::END_PHASE;
3388
3389        /* Anything else */
3390        } else {
3391            /* Parse error. Set the insertion mode to "in body" and reprocess
3392            the token. */
3393            $this->mode = self::IN_BODY;
3394            return $this->inBody($token);
3395        }
3396    }
3397
3398    private function inFrameset($token) {
3399        /* Handle the token as follows: */
3400
3401        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3402        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3403        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
3404        if($token['type'] === HTML5::CHARACTR &&
3405        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
3406            /* Append the character to the current node. */
3407            $this->insertText($token['data']);
3408
3409        /* A comment token */
3410        } elseif($token['type'] === HTML5::COMMENT) {
3411            /* Append a Comment node to the current node with the data
3412            attribute set to the data given in the comment token. */
3413            $this->insertComment($token['data']);
3414
3415        /* A start tag with the tag name "frameset" */
3416        } elseif($token['name'] === 'frameset' &&
3417        $token['type'] === HTML5::STARTTAG) {
3418            $this->insertElement($token);
3419
3420        /* An end tag with the tag name "frameset" */
3421        } elseif($token['name'] === 'frameset' &&
3422        $token['type'] === HTML5::ENDTAG) {
3423            /* If the current node is the root html element, then this is a
3424            parse error; ignore the token. (innerHTML case) */
3425            if(end($this->stack)->nodeName === 'html') {
3426                // Ignore
3427
3428            } else {
3429                /* Otherwise, pop the current node from the stack of open
3430                elements. */
3431                array_pop($this->stack);
3432
3433                /* If the parser was not originally created in order to handle
3434                the setting of an element's innerHTML attribute (innerHTML case),
3435                and the current node is no longer a frameset element, then change
3436                the insertion mode to "after frameset". */
3437                $this->mode = self::AFTR_FRAME;
3438            }
3439
3440        /* A start tag with the tag name "frame" */
3441        } elseif($token['name'] === 'frame' &&
3442        $token['type'] === HTML5::STARTTAG) {
3443            /* Insert an HTML element for the token. */
3444            $this->insertElement($token);
3445
3446            /* Immediately pop the current node off the stack of open elements. */
3447            array_pop($this->stack);
3448
3449        /* A start tag with the tag name "noframes" */
3450        } elseif($token['name'] === 'noframes' &&
3451        $token['type'] === HTML5::STARTTAG) {
3452            /* Process the token as if the insertion mode had been "in body". */
3453            $this->inBody($token);
3454
3455        /* Anything else */
3456        } else {
3457            /* Parse error. Ignore the token. */
3458        }
3459    }
3460
3461    private function afterFrameset($token) {
3462        /* Handle the token as follows: */
3463
3464        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3465        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3466        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
3467        if($token['type'] === HTML5::CHARACTR &&
3468        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
3469            /* Append the character to the current node. */
3470            $this->insertText($token['data']);
3471
3472        /* A comment token */
3473        } elseif($token['type'] === HTML5::COMMENT) {
3474            /* Append a Comment node to the current node with the data
3475            attribute set to the data given in the comment token. */
3476            $this->insertComment($token['data']);
3477
3478        /* An end tag with the tag name "html" */
3479        } elseif($token['name'] === 'html' &&
3480        $token['type'] === HTML5::ENDTAG) {
3481            /* Switch to the trailing end phase. */
3482            $this->phase = self::END_PHASE;
3483
3484        /* A start tag with the tag name "noframes" */
3485        } elseif($token['name'] === 'noframes' &&
3486        $token['type'] === HTML5::STARTTAG) {
3487            /* Process the token as if the insertion mode had been "in body". */
3488            $this->inBody($token);
3489
3490        /* Anything else */
3491        } else {
3492            /* Parse error. Ignore the token. */
3493        }
3494    }
3495
3496    private function trailingEndPhase($token) {
3497        /* After the main phase, as each token is emitted from the tokenisation
3498        stage, it must be processed as described in this section. */
3499
3500        /* A DOCTYPE token */
3501        if($token['type'] === HTML5::DOCTYPE) {
3502            // Parse error. Ignore the token.
3503
3504        /* A comment token */
3505        } elseif($token['type'] === HTML5::COMMENT) {
3506            /* Append a Comment node to the Document object with the data
3507            attribute set to the data given in the comment token. */
3508            $comment = $this->dom->createComment($token['data']);
3509            $this->dom->appendChild($comment);
3510
3511        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3512        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3513        or U+0020 SPACE */
3514        } elseif($token['type'] === HTML5::CHARACTR &&
3515        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
3516            /* Process the token as it would be processed in the main phase. */
3517            $this->mainPhase($token);
3518
3519        /* A character token that is not one of U+0009 CHARACTER TABULATION,
3520        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3521        or U+0020 SPACE. Or a start tag token. Or an end tag token. */
3522        } elseif(($token['type'] === HTML5::CHARACTR &&
3523        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
3524        $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) {
3525            /* Parse error. Switch back to the main phase and reprocess the
3526            token. */
3527            $this->phase = self::MAIN_PHASE;
3528            return $this->mainPhase($token);
3529
3530        /* An end-of-file token */
3531        } elseif($token['type'] === HTML5::EOF) {
3532            /* OMG DONE!! */
3533        }
3534    }
3535
3536    private function insertElement($token, $append = true, $check = false) {
3537        // Proprietary workaround for libxml2's limitations with tag names
3538        if ($check) {
3539            // Slightly modified HTML5 tag-name modification,
3540            // removing anything that's not an ASCII letter, digit, or hyphen
3541            $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']);
3542            // Remove leading hyphens and numbers
3543            $token['name'] = ltrim($token['name'], '-0..9');
3544            // In theory, this should ever be needed, but just in case
3545            if ($token['name'] === '') $token['name'] = 'span'; // arbitrary generic choice
3546        }
3547
3548        $el = $this->dom->createElement($token['name']);
3549
3550        foreach($token['attr'] as $attr) {
3551            if(!$el->hasAttribute($attr['name'])) {
3552                $el->setAttribute($attr['name'], $attr['value']);
3553            }
3554        }
3555
3556        $this->appendToRealParent($el);
3557        $this->stack[] = $el;
3558
3559        return $el;
3560    }
3561
3562    private function insertText($data) {
3563        $text = $this->dom->createTextNode($data);
3564        $this->appendToRealParent($text);
3565    }
3566
3567    private function insertComment($data) {
3568        $comment = $this->dom->createComment($data);
3569        $this->appendToRealParent($comment);
3570    }
3571
3572    private function appendToRealParent($node) {
3573        if($this->foster_parent === null) {
3574            end($this->stack)->appendChild($node);
3575
3576        } elseif($this->foster_parent !== null) {
3577            /* If the foster parent element is the parent element of the
3578            last table element in the stack of open elements, then the new
3579            node must be inserted immediately before the last table element
3580            in the stack of open elements in the foster parent element;
3581            otherwise, the new node must be appended to the foster parent
3582            element. */
3583            for($n = count($this->stack) - 1; $n >= 0; $n--) {
3584                if($this->stack[$n]->nodeName === 'table' &&
3585                $this->stack[$n]->parentNode !== null) {
3586                    $table = $this->stack[$n];
3587                    break;
3588                }
3589            }
3590
3591            if(isset($table) && $this->foster_parent->isSameNode($table->parentNode))
3592                $this->foster_parent->insertBefore($node, $table);
3593            else
3594                $this->foster_parent->appendChild($node);
3595
3596            $this->foster_parent = null;
3597        }
3598    }
3599
3600    private function elementInScope($el, $table = false) {
3601        if(is_array($el)) {
3602            foreach($el as $element) {
3603                if($this->elementInScope($element, $table)) {
3604                    return true;
3605                }
3606            }
3607
3608            return false;
3609        }
3610
3611        $leng = count($this->stack);
3612
3613        for($n = 0; $n < $leng; $n++) {
3614            /* 1. Initialise node to be the current node (the bottommost node of
3615            the stack). */
3616            $node = $this->stack[$leng - 1 - $n];
3617
3618            if($node->tagName === $el) {
3619                /* 2. If node is the target node, terminate in a match state. */
3620                return true;
3621
3622            } elseif($node->tagName === 'table') {
3623                /* 3. Otherwise, if node is a table element, terminate in a failure
3624                state. */
3625                return false;
3626
3627            } elseif($table === true && in_array($node->tagName, array('caption', 'td',
3628            'th', 'button', 'marquee', 'object'))) {
3629                /* 4. Otherwise, if the algorithm is the "has an element in scope"
3630                variant (rather than the "has an element in table scope" variant),
3631                and node is one of the following, terminate in a failure state. */
3632                return false;
3633
3634            } elseif($node === $node->ownerDocument->documentElement) {
3635                /* 5. Otherwise, if node is an html element (root element), terminate
3636                in a failure state. (This can only happen if the node is the topmost
3637                node of the    stack of open elements, and prevents the next step from
3638                being invoked if there are no more elements in the stack.) */
3639                return false;
3640            }
3641
3642            /* Otherwise, set node to the previous entry in the stack of open
3643            elements and return to step 2. (This will never fail, since the loop
3644            will always terminate in the previous step if the top of the stack
3645            is reached.) */
3646        }
3647    }
3648
3649    private function reconstructActiveFormattingElements() {
3650        /* 1. If there are no entries in the list of active formatting elements,
3651        then there is nothing to reconstruct; stop this algorithm. */
3652        $formatting_elements = count($this->a_formatting);
3653
3654        if($formatting_elements === 0) {
3655            return false;
3656        }
3657
3658        /* 3. Let entry be the last (most recently added) element in the list
3659        of active formatting elements. */
3660        $entry = end($this->a_formatting);
3661
3662        /* 2. If the last (most recently added) entry in the list of active
3663        formatting elements is a marker, or if it is an element that is in the
3664        stack of open elements, then there is nothing to reconstruct; stop this
3665        algorithm. */
3666        if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
3667            return false;
3668        }
3669
3670        for($a = $formatting_elements - 1; $a >= 0; true) {
3671            /* 4. If there are no entries before entry in the list of active
3672            formatting elements, then jump to step 8. */
3673            if($a === 0) {
3674                $step_seven = false;
3675                break;
3676            }
3677
3678            /* 5. Let entry be the entry one earlier than entry in the list of
3679            active formatting elements. */
3680            $a--;
3681            $entry = $this->a_formatting[$a];
3682
3683            /* 6. If entry is neither a marker nor an element that is also in
3684            thetack of open elements, go to step 4. */
3685            if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
3686                break;
3687            }
3688        }
3689
3690        while(true) {
3691            /* 7. Let entry be the element one later than entry in the list of
3692            active formatting elements. */
3693            if(isset($step_seven) && $step_seven === true) {
3694                $a++;
3695                $entry = $this->a_formatting[$a];
3696            }
3697
3698            /* 8. Perform a shallow clone of the element entry to obtain clone. */
3699            $clone = $entry->cloneNode();
3700
3701            /* 9. Append clone to the current node and push it onto the stack
3702            of open elements  so that it is the new current node. */
3703            end($this->stack)->appendChild($clone);
3704            $this->stack[] = $clone;
3705
3706            /* 10. Replace the entry for entry in the list with an entry for
3707            clone. */
3708            $this->a_formatting[$a] = $clone;
3709
3710            /* 11. If the entry for clone in the list of active formatting
3711            elements is not the last entry in the list, return to step 7. */
3712            if(end($this->a_formatting) !== $clone) {
3713                $step_seven = true;
3714            } else {
3715                break;
3716            }
3717        }
3718    }
3719
3720    private function clearTheActiveFormattingElementsUpToTheLastMarker() {
3721        /* When the steps below require the UA to clear the list of active
3722        formatting elements up to the last marker, the UA must perform the
3723        following steps: */
3724
3725        while(true) {
3726            /* 1. Let entry be the last (most recently added) entry in the list
3727            of active formatting elements. */
3728            $entry = end($this->a_formatting);
3729
3730            /* 2. Remove entry from the list of active formatting elements. */
3731            array_pop($this->a_formatting);
3732
3733            /* 3. If entry was a marker, then stop the algorithm at this point.
3734            The list has been cleared up to the last marker. */
3735            if($entry === self::MARKER) {
3736                break;
3737            }
3738        }
3739    }
3740
3741    private function generateImpliedEndTags($exclude = array()) {
3742        /* When the steps below require the UA to generate implied end tags,
3743        then, if the current node is a dd element, a dt element, an li element,
3744        a p element, a td element, a th  element, or a tr element, the UA must
3745        act as if an end tag with the respective tag name had been seen and
3746        then generate implied end tags again. */
3747        $node = end($this->stack);
3748        $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);
3749
3750        while(in_array(end($this->stack)->nodeName, $elements)) {
3751            array_pop($this->stack);
3752        }
3753    }
3754
3755    private function getElementCategory($node) {
3756        $name = $node->tagName;
3757        if(in_array($name, $this->special))
3758            return self::SPECIAL;
3759
3760        elseif(in_array($name, $this->scoping))
3761            return self::SCOPING;
3762
3763        elseif(in_array($name, $this->formatting))
3764            return self::FORMATTING;
3765
3766        else
3767            return self::PHRASING;
3768    }
3769
3770    private function clearStackToTableContext($elements) {
3771        /* When the steps above require the UA to clear the stack back to a
3772        table context, it means that the UA must, while the current node is not
3773        a table element or an html element, pop elements from the stack of open
3774        elements. If this causes any elements to be popped from the stack, then
3775        this is a parse error. */
3776        while(true) {
3777            $node = end($this->stack)->nodeName;
3778
3779            if(in_array($node, $elements)) {
3780                break;
3781            } else {
3782                array_pop($this->stack);
3783            }
3784        }
3785    }
3786
3787    private function resetInsertionMode() {
3788        /* 1. Let last be false. */
3789        $last = false;
3790        $leng = count($this->stack);
3791
3792        for($n = $leng - 1; $n >= 0; $n--) {
3793            /* 2. Let node be the last node in the stack of open elements. */
3794            $node = $this->stack[$n];
3795
3796            /* 3. If node is the first node in the stack of open elements, then
3797            set last to true. If the element whose innerHTML  attribute is being
3798            set is neither a td  element nor a th element, then set node to the
3799            element whose innerHTML  attribute is being set. (innerHTML  case) */
3800            if($this->stack[0]->isSameNode($node)) {
3801                $last = true;
3802            }
3803
3804            /* 4. If node is a select element, then switch the insertion mode to
3805            "in select" and abort these steps. (innerHTML case) */
3806            if($node->nodeName === 'select') {
3807                $this->mode = self::IN_SELECT;
3808                break;
3809
3810            /* 5. If node is a td or th element, then switch the insertion mode
3811            to "in cell" and abort these steps. */
3812            } elseif($node->nodeName === 'td' || $node->nodeName === 'th') {
3813                $this->mode = self::IN_CELL;
3814                break;
3815
3816            /* 6. If node is a tr element, then switch the insertion mode to
3817            "in    row" and abort these steps. */
3818            } elseif($node->nodeName === 'tr') {
3819                $this->mode = self::IN_ROW;
3820                break;
3821
3822            /* 7. If node is a tbody, thead, or tfoot element, then switch the
3823            insertion mode to "in table body" and abort these steps. */
3824            } elseif(in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) {
3825                $this->mode = self::IN_TBODY;
3826                break;
3827
3828            /* 8. If node is a caption element, then switch the insertion mode
3829            to "in caption" and abort these steps. */
3830            } elseif($node->nodeName === 'caption') {
3831                $this->mode = self::IN_CAPTION;
3832                break;
3833
3834            /* 9. If node is a colgroup element, then switch the insertion mode
3835            to "in column group" and abort these steps. (innerHTML case) */
3836            } elseif($node->nodeName === 'colgroup') {
3837                $this->mode = self::IN_CGROUP;
3838                break;
3839
3840            /* 10. If node is a table element, then switch the insertion mode
3841            to "in table" and abort these steps. */
3842            } elseif($node->nodeName === 'table') {
3843                $this->mode = self::IN_TABLE;
3844                break;
3845
3846            /* 11. If node is a head element, then switch the insertion mode
3847            to "in body" ("in body"! not "in head"!) and abort these steps.
3848            (innerHTML case) */
3849            } elseif($node->nodeName === 'head') {
3850                $this->mode = self::IN_BODY;
3851                break;
3852
3853            /* 12. If node is a body element, then switch the insertion mode to
3854            "in body" and abort these steps. */
3855            } elseif($node->nodeName === 'body') {
3856                $this->mode = self::IN_BODY;
3857                break;
3858
3859            /* 13. If node is a frameset element, then switch the insertion
3860            mode to "in frameset" and abort these steps. (innerHTML case) */
3861            } elseif($node->nodeName === 'frameset') {
3862                $this->mode = self::IN_FRAME;
3863                break;
3864
3865            /* 14. If node is an html element, then: if the head element
3866            pointer is null, switch the insertion mode to "before head",
3867            otherwise, switch the insertion mode to "after head". In either
3868            case, abort these steps. (innerHTML case) */
3869            } elseif($node->nodeName === 'html') {
3870                $this->mode = ($this->head_pointer === null)
3871                    ? self::BEFOR_HEAD
3872                    : self::AFTER_HEAD;
3873
3874                break;
3875
3876            /* 15. If last is true, then set the insertion mode to "in body"
3877            and    abort these steps. (innerHTML case) */
3878            } elseif($last) {
3879                $this->mode = self::IN_BODY;
3880                break;
3881            }
3882        }
3883    }
3884
3885    private function closeCell() {
3886        /* If the stack of open elements has a td or th element in table scope,
3887        then act as if an end tag token with that tag name had been seen. */
3888        foreach(array('td', 'th') as $cell) {
3889            if($this->elementInScope($cell, true)) {
3890                $this->inCell(array(
3891                    'name' => $cell,
3892                    'type' => HTML5::ENDTAG
3893                ));
3894
3895                break;
3896            }
3897        }
3898    }
3899
3900    public function save() {
3901        return $this->dom;
3902    }
3903}
3904?>
3905