<?php
/**
 * Parser that turns prepared XHTML text into the root box of the layout tree.
 *
 * The work is done in process(); _scan_base() is a helper that applies the
 * document's BASE element (if any) to the pipeline's base URL stack.
 */
class ParserXHTML extends Parser {
  /**
   * Builds a DOM tree from $html, applies BASE / CSS / media settings to the
   * pipeline and creates the box tree for the document.
   *
   * If the document cannot be parsed, the 'cannot_parse.html' template is
   * sent to the output, the failure is logged together with the pipeline's
   * base URL, and the script is terminated.
   *
   * @param string $html     XHTML source to be parsed.
   * @param object $pipeline Pipeline object; its base URL stack, current CSS
   *                         and output driver are updated by this call.
   * @param object $media    Media description, or null to skip media setup.
   *
   * @return mixed Reference to the value returned by create_pdf_box().
   */
  function &process($html, &$pipeline, &$media) {
    // Run the XML parser on the XHTML we've prepared
    $dom_tree = TreeBuilder::build($html);

    // Check if parser returned valid document
    if (is_null($dom_tree)) {
      readfile(HTML2PS_DIR.'templates/cannot_parse.html');
      error_log(sprintf("Cannot parse document: %s", $pipeline->get_base_url()));
      die("HTML2PS Error");
    }

    /**
     * Detect the base URI for this document.
     *
     * According to the HTML 4.01 p. 12.4.1:
     * User agents must calculate the base URI according to the following
     * precedences (highest priority to lowest):
     *
     * 1. The base URI is set by the BASE element.
     * 2. The base URI is given by meta data discovered during a protocol
     *    interaction, such as an HTTP header (see [RFC2616]).
     * 3. By default, the base URI is that of the current document. Not all
     *    HTML documents have a base URI (e.g., a valid HTML document may
     *    appear in an email and may not be designated by a URI). Such HTML
     *    documents are considered erroneous if they contain relative URIs
     *    and rely on a default base URI.
     */

    /**
     * Check if BASE element present; use its first occurrence
     */
    $this->_scan_base($dom_tree, $pipeline);

    /**
     * @todo fall back to the protocol metadata
     */

    /**
     * Parse STYLE / LINK nodes containing CSS references and definitions.
     * This should be done here, as the document body may include STYLE nodes
     * (this violates the HTML standard, but rather often appears on the Web).
     */
    $css =& $pipeline->get_current_css();
    $css->scan_styles($dom_tree, $pipeline);

    if (!is_null($media)) {
      // Setup media size and margins
      $pipeline->get_page_media(1, $media);
      $pipeline->output_driver->update_media($media);
      $pipeline->_setupScales($media);
    }

    $body =& traverse_dom_tree_pdf($dom_tree);
    $box  =& create_pdf_box($body, $pipeline);

    return $box;
  }

  /**
   * Depth-first search for the first BASE element under $root; when found,
   * the base URL on top of the pipeline's stack is replaced with the URL
   * derived from the element's 'href' attribute.
   *
   * The search stops at the first BASE element in document order: a positive
   * result is propagated through every recursion level, so BASE elements
   * appearing later in the document never override the first one.
   *
   * @param object $root     DOM node to start scanning from.
   * @param object $pipeline Pipeline whose base URL is to be updated.
   *
   * @return bool true if a BASE element was found and applied, false otherwise.
   */
  function _scan_base(&$root, &$pipeline) {
    switch ($root->node_type()) {
    case XML_ELEMENT_NODE:
      if ($root->tagname() === 'base') {
        /**
         * See HTML 4.01 p 12.4
         * href - this attribute specifies an absolute URI that acts as the
         * base URI for resolving relative URIs.
         *
         * At this moment the pipeline object has the current document URI on
         * the top of the stack; we should replace it with the value of the
         * 'href' attribute of the BASE tag.
         *
         * To handle (possibly) incorrect values, we use the 'guess_url'
         * function; in this case if the 'href' attribute contains an absolute
         * value (as it SHOULD), it will be used; if it is missing or is
         * relative, we'll get a more or less usable value based on the
         * current document URI.
         */
        $new_url = $pipeline->guess_url($root->get_attribute('href'));
        $pipeline->pop_base_url();
        $pipeline->push_base_url($new_url);

        return true;
      }

      // Not a BASE element: intentionally fall through and scan its children.
    case XML_DOCUMENT_NODE:
      $child = $root->first_child();
      while ($child) {
        if ($this->_scan_base($child, $pipeline)) {
          // Propagate success explicitly. A bare "return" would yield NULL,
          // which callers higher in the recursion treat as "not found" and
          // keep scanning, letting a later BASE element replace the first.
          return true;
        }
        $child = $child->next_sibling();
      }

      return false;
    }

    return false;
  }
}
?>