1<?php
/**
 * Parser for prepared XHTML source: builds a DOM tree from the markup,
 * resolves the document base URL, collects CSS and hands the tree over to
 * the box-creation routines.
 */
class ParserXHTML extends Parser {
  /**
   * Parse the given XHTML text and build the box structure for it.
   *
   * If the markup cannot be parsed, the 'cannot_parse.html' template is sent
   * to the output, the failure is written to the error log and the script
   * terminates.
   *
   * @param string $html      XHTML source passed to TreeBuilder::build()
   * @param object &$pipeline pipeline object; supplies the base URL stack, the current
   *                          CSS object and the output driver
   * @param object &$media    media description, or null to skip the media setup step
   * @return mixed reference to the value returned by create_pdf_box()
   */
  function &process($html, &$pipeline, &$media) {
    // Run the XML parser on the XHTML we've prepared
    $dom_tree = TreeBuilder::build($html);

    // Check if the parser returned a valid document
    if (is_null($dom_tree)) {
      readfile(HTML2PS_DIR.'templates/cannot_parse.html');
      error_log(sprintf("Cannot parse document: %s", $pipeline->get_base_url()));
      die("HTML2PS Error");
    }

    /**
     * Detect the base URI for this document.
     *
     * According to the HTML 4.01 p. 12.4.1:
     * User agents must calculate the base URI according to the following precedences (highest priority to lowest):
     *
     * 1. The base URI is set by the BASE element.
     * 2. The base URI is given by meta data discovered during a protocol interaction, such as an HTTP header (see [RFC2616]).
     * 3. By default, the base URI is that of the current document. Not all HTML documents have a base URI (e.g., a valid HTML document may appear in an email and may not be designated by a URI). Such HTML documents are considered erroneous if they contain relative URIs and rely on a default base URI.
     */

    /**
     * Check if a BASE element is present; use its first occurrence
     */
    $this->_scan_base($dom_tree, $pipeline);

    /**
     * @todo fall back to the protocol metadata
     */

    /**
     * Parse STYLE / LINK nodes containing CSS references and definitions.
     * This should be done here, as the document body may include a STYLE node
     * (this violates the HTML standard, but appears rather often on the Web)
     */
    $css =& $pipeline->get_current_css();
    $css->scan_styles($dom_tree, $pipeline);

    if (!is_null($media)) {
      // Set up media size and margins
      $pipeline->get_page_media(1, $media);
      $pipeline->output_driver->update_media($media);
      $pipeline->_setupScales($media);
    }

    $body =& traverse_dom_tree_pdf($dom_tree);
    $box =& create_pdf_box($body, $pipeline);

    return $box;
  }

  /**
   * Depth-first search for the first BASE element at or below $root.
   *
   * When a BASE element is found, the URL on top of the pipeline's base URL
   * stack is replaced with the (guessed) value of its 'href' attribute and
   * the search stops: the positive result is propagated through every
   * recursion level, so BASE elements appearing later in the document are
   * never applied.
   *
   * @param object &$root     DOM node to start scanning from
   * @param object &$pipeline pipeline object owning the base URL stack
   * @return bool true if a BASE element was found and applied, false otherwise
   */
  function _scan_base(&$root, &$pipeline) {
    $node_type = $root->node_type();
    $is_element = ($node_type == XML_ELEMENT_NODE);

    // Only element and document nodes are inspected; any other node type
    // cannot be a BASE element and is not descended into.
    if (!$is_element && $node_type != XML_DOCUMENT_NODE) {
      return false;
    }

    if ($is_element && $root->tagname() === 'base') {
      /**
       * See HTML 4.01 p 12.4
       * href - this attribute specifies an absolute URI that acts as the base URI for resolving relative URIs.
       *
       * At this moment the pipeline object has the current document URI on the top of the stack;
       * we should replace it with the value of the 'href' attribute of the BASE tag.
       *
       * To handle (possibly) incorrect values, we use the 'guess_url' function; in this case
       * if the 'href' attribute contains an absolute value (as it SHOULD), it will be used;
       * if it is missing or is relative, we'll get a more or less usable value based on the current
       * document URI.
       */
      $new_url = $pipeline->guess_url($root->get_attribute('href'));
      $pipeline->pop_base_url();
      $pipeline->push_base_url($new_url);

      return true;
    }

    // Non-BASE elements and the document node: scan the children in order.
    $child = $root->first_child();
    while ($child) {
      if ($this->_scan_base($child, $pipeline)) {
        // Report success upwards. Returning without a value here would yield
        // null, and the callers above would carry on scanning later siblings,
        // letting a subsequent BASE element override the first one.
        return true;
      }
      $child = $child->next_sibling();
    }

    return false;
  }
}
92?>