1<?php
2// $Header: /cvsroot/html2ps/xhtml.utils.inc.php,v 1.35 2007/03/15 18:37:36 Konstantin Exp $
3
/**
 * Rewrites open-form occurrences of an empty element (e.g. <br>, <img src="x">)
 * into self-closing form (<br/>, <img src="x"/>).
 *
 * Occurrences that already end with "/>" are left untouched. The tag name is
 * matched case-insensitively and is inserted into the pattern verbatim, so it
 * must not contain regular expression metacharacters.
 *
 * @param String $tag         element name to look for (e.g. "br")
 * @param String $sample_html HTML code to process
 * @return String HTML code with the given element self-closed
 */
function close_tag($tag, $sample_html) {
  // Group 1 is "<tag" plus an optional attribute section that must start with
  // whitespace (so "base" does not match "<basefont>"). The lookbehind rejects
  // tags whose last character before ">" is "/", i.e. already closed ones.
  // Unlike a "last char is not / or >" character class, this also accepts a
  // tag followed only by whitespace, such as "<br >".
  return preg_replace("#(<{$tag}(?:\s[^>]*)?)(?<!/)>#si","\\1/>",$sample_html);
}
7
/**
 * Expands a minimized (value-less) attribute into name="name" form,
 * e.g. <input checked> becomes <input checked="checked">.
 *
 * The attribute is matched case-insensitively and must be preceded by
 * whitespace inside a tag and followed by whitespace, ">" or "/>".
 *
 * @param String $attr attribute name (inserted into the pattern verbatim)
 * @param String $html HTML code to process
 * @return String updated HTML code
 */
function make_attr_value($attr, $html) {
  // \1 is everything from "<" up to the whitespace before the attribute,
  // \2 is whatever terminated the bare attribute name
  $bare_attribute = "#(<[^>]*\s){$attr}(\s|>|/>)#si";
  $expanded_form  = "\\1{$attr}=\"{$attr}\"\\2";

  return preg_replace($bare_attribute, $expanded_form, $html);
}
11
12
/**
 * Builds a delimiter-less regular expression fragment matching an opening
 * tag with the given name, with or without attributes. The fragment contains
 * one capturing group (the attribute section).
 *
 * @param String $tag tag name (inserted verbatim)
 * @return String regular expression fragment
 */
function mk_open_tag_regexp($tag) {
  $fragment = "<\s*{$tag}(\s+[^>]*)?>";
  return $fragment;
}
/**
 * Builds a delimiter-less regular expression fragment matching a closing tag
 * with the given name, tolerating whitespace around the slash and the name.
 * The fragment contains no capturing groups.
 *
 * @param String $tag tag name (inserted verbatim)
 * @return String regular expression fragment
 */
function mk_close_tag_regexp($tag) {
  $fragment = "<\s*/\s*{$tag}\s*>";
  return $fragment;
}
15
/**
 * Guarantees that the document is wrapped in exactly one <html>...</html>
 * pair and that nothing precedes the opening tag or follows the closing tag.
 *
 * preg_replace returns NULL when PCRE fails (for example when a backtrack
 * limit is hit on a large document). Every replacement below is therefore
 * applied only when it succeeded; on failure the text is kept as it was
 * instead of being replaced by NULL.
 *
 * @param String $html source HTML code
 * @return String HTML code with a single html element as its outermost wrapper
 */
function process_html($html) {
  $open  = mk_open_tag_regexp("html");
  $close = mk_close_tag_regexp("html");

  // No opening tag at all: add one in front of the document
  if (!preg_match("#{$open}#is",$html)) {
    $html = "<html>".$html;
  }

  // If there is more than one <html> tag in the page text, keep a single
  // attribute-less <html> followed by the content found between the first
  // and the second tag; repeat until only one opening tag is left.
  // (\2 is the text between the tags: \1 and \3 are the attribute groups
  // embedded in $open.)
  while (preg_match("#{$open}(.*?){$open}#is", $html)) {
    $collapsed = preg_replace("#{$open}(.*?){$open}#is", "<html>\\2", $html);
    if (is_null($collapsed)) {
      // PCRE failure: stop instead of looping on / returning NULL
      break;
    }
    $html = $collapsed;
  }

  // No closing tag: append one
  if (!preg_match("#{$close}#is", $html)) {
    $html = $html."</html>";
  }

  // Cut off all data before and after the 'html' element; unless we do it,
  // the XML parser will die violently
  $trimmers = array(
    array("#.*({$open})#is",    "\\1"),
    array("#^.*<html#is",       "<html"),
    array("#</html\s*>.*$#is",  "</html>")
  );
  foreach ($trimmers as $trimmer) {
    $trimmed = preg_replace($trimmer[0], $trimmer[1], $html);
    if (!is_null($trimmed)) {
      $html = $trimmed;
    }
  }

  return $html;
}
51
/**
 * Makes sure the document has a complete <head>...</head> section.
 *
 * - No opening <head>: everything found between the opening <html> tag and
 *   the opening <body> tag is wrapped into a new head element.
 * - Opening <head> without a closing one: </head> is inserted in front of
 *   the opening <body> tag or, if there is none, in front of </html>.
 *
 * @param String $html source HTML code
 * @return String updated HTML code
 */
function process_head($html) {
  $head_open  = mk_open_tag_regexp("head");
  $head_close = mk_close_tag_regexp("head");
  $html_open  = mk_open_tag_regexp("html");
  $html_close = mk_close_tag_regexp("html");
  $body_open  = mk_open_tag_regexp("body");

  if (!preg_match("#{$head_open}#is",$html)) {
    // \1 = whole <html ...> tag, \3 = text in between, \4 = whole <body ...> tag
    // (\2 is the attribute group nested inside the html tag pattern)
    return preg_replace("#({$html_open})(.*)({$body_open})#is","\\1<head>\\3</head>\\4",$html);
  }

  if (preg_match("#{$head_close}#is",$html)) {
    // Head section is already complete
    return $html;
  }

  // Unclosed head: close it right before the body or, failing that, before </html>
  $anchor = preg_match("#{$body_open}#is",$html) ? $body_open : $html_close;
  return preg_replace("#({$anchor})#is","</head>\\1",$html);
}
70
/**
 * Makes sure the document has a complete <body>...</body> section and that
 * no stray content sits outside of it.
 *
 * - A missing <body> is inserted after </head> or, if there is no </head>,
 *   after the opening <html> tag.
 * - A missing </body> is inserted in front of </html>.
 * - Content found between </head> and <body> is moved to the start of the
 *   body; content found between </body> and </html> is moved to its end.
 *
 * @param String $html source HTML code
 * @return String updated HTML code
 */
function process_body($html) {
  $body_open  = mk_open_tag_regexp("body");
  $body_close = mk_close_tag_regexp("body");
  $html_open  = mk_open_tag_regexp("html");
  $html_close = mk_close_tag_regexp("html");
  $head_close = mk_close_tag_regexp("head");

  if (!preg_match("#{$body_open}#is",$html)) {
    // Open the body right after </head> when present, else after <html>
    $anchor = preg_match("#{$head_close}#is",$html) ? $head_close : $html_open;
    $html = preg_replace("#({$anchor})#is","\\1<body>",$html);
  }

  if (!preg_match("#{$body_close}#is",$html)) {
    $html = preg_replace("#({$html_close})#is","</body>\\1",$html);
  }

  // Data between </head> and <body> is moved behind the <body> tag
  $html = preg_replace("#({$head_close})(.+)({$body_open})#is","\\1\\3\\2",$html);

  // Data between </body> and </html> is moved in front of </body>
  $html = preg_replace("#({$body_close})(.+)({$html_close})#is","\\2\\1\\3",$html);

  return $html;
}
96
// Hmmm. Maybe we should just write a SAX parser in PHP? ;-)
/**
 * Balances the tags of an HTML fragment so that every opening tag has a
 * matching closing tag and tags are properly nested.
 *
 * The text is scanned tag by tag while a stack of currently open tag names is
 * maintained (index 0 is the most recently opened tag):
 *  - self-closing tags are copied as is;
 *  - opening tags are copied and pushed on the stack;
 *  - a closing tag matching the top of the stack is copied and popped;
 *  - a closing tag matching a deeper stack entry closes all tags opened after
 *    it, closes itself and then re-opens the "run-off" tags (without their
 *    attributes), unless it is one of the structural tags listed in
 *    $no_reopen_tags, in which case the run-off tags stay closed;
 *  - such an implicit close is never performed across an open 'table';
 *  - a closing tag with no opening counterpart is dropped.
 * Tags still open at the end of the text are closed.
 *
 * @param String $html source HTML code
 * @return String HTML code with balanced tags
 */
function fix_tags($html) {
  $result = "";
  $tag_stack = array();

  // Tags which, once closed, must not cause the run-off tags to be re-opened
  $no_reopen_tags = array("tr","td","table","marquee","body","html");

  // these corrections simplify the regexp used to parse tags
  // remove whitespaces before '/' and between '/' and '>' in autoclosing tags
  $html = preg_replace("#\s*/\s*>#is","/>",$html);
  // remove whitespaces between '<', '/' and first tag letter in closing tags
  $html = preg_replace("#<\s*/\s*#is","</",$html);
  // remove whitespaces between '<' and first tag letter
  $html = preg_replace("#<\s+#is","<",$html);

  // Groups: 1 - text before the tag, 2 - the whole tag,
  // 3 - autoclosing tag name, 4 - opening tag name, 5 - closing tag name
  while (preg_match("#(.*?)(<([a-z\d]+)[^>]*/>|<([a-z\d]+)[^>]*(?<!/)>|</([a-z\d]+)[^>]*>)#is",$html,$matches)) {
    $result .= $matches[1];
    $html = substr($html, strlen($matches[0]));

    if (isset($matches[5])) {
      // Closing tag
      $tag = $matches[5];

      // Strict search: position 0 (top of the stack) and "not found" (false)
      // must never be confused, and the stack may be empty at this point
      $tag_pos = array_search($tag, $tag_stack, true);

      if ($tag_pos === 0) {
        // Matched the last opening tag (normal state)
        // Just pop opening tag from the stack
        array_shift($tag_stack);
        $result .= $matches[2];
      } elseif ($tag_pos !== false) {
        // Corresponding opening tag exists on the stack (somewhere deep).
        // We should never close a 'table' this way: proceed only when no
        // table is open or the nearest open table is not above our tag.
        $table_pos = array_search('table', $tag_stack, true);

        if ($table_pos === false || $table_pos >= $tag_pos) {
          // Insert a set of closing tags for all non-matching tags
          for ($i = 0; $i < $tag_pos; $i++) {
            $result .= "</{$tag_stack[$i]}> ";
          }

          // close current tag
          $result .= "</{$tag}> ";
          // remove it from the stack
          array_splice($tag_stack, $tag_pos, 1);

          if (!in_array($tag, $no_reopen_tags, true)) {
            // not a "critical" tag: reopen "run-off" tags, outermost first;
            // they stay on the stack
            for ($i = $tag_pos - 1; $i >= 0; $i--) {
              $result .= "<{$tag_stack[$i]}> ";
            }
          } else {
            // "critical" tag: run-off tags stay closed, forget them
            array_splice($tag_stack, 0, $tag_pos);
          }
        }
      }
      // else: no such tag on the stack; the closing tag is dropped simply
      // by not adding it to the result
    } elseif (isset($matches[4])) {
      // Opening tag
      array_unshift($tag_stack, $matches[4]);
      $result .= $matches[2];
    } else {
      // Autoclosing tag; do nothing specific
      $result .= $matches[2];
    }
  }

  // Whatever is left contains no more tags; keep it instead of losing it.
  // (substr may yield false on old PHP versions, which concatenates as "")
  $result .= $html;

  // Close all tags left
  while (count($tag_stack) > 0) {
    $result .= "</".array_shift($tag_stack).">";
  }

  return $result;
}
179
/**
 * Adds single quotes around unquoted attribute values; values which already
 * have quotes are not touched.
 *
 * A value counts as unquoted when it directly follows "=" and consists of
 * characters other than quotes, CR, LF, space and ">", and is terminated by
 * CR, LF, space or ">".
 *
 * @param String $html source HTML code
 * @return String HTML code with quoted attribute values
 */
function quote_attrs($html) {
  // Same expression twice (only the delimiters differ): one to detect
  // remaining unquoted values, one to rewrite them
  $detect  = "!(<[^>]*)\s([^=>]+)=([^'\"\r\n >]+)([\r\n >])!si";
  $rewrite = "#(<[^>]*)\s([^=>]+)=([^'\"\r\n >]+)([\r\n >])#si";
  $quoted  = "\\1 \\2='\\3'\\4";

  // A single replacement pass may leave unquoted values behind (the greedy
  // tag prefix swallows the earlier attributes of a tag), so keep going
  // until nothing matches anymore
  for (;;) {
    if (!preg_match($detect, $html, $matches)) {
      break;
    }
    $html = preg_replace($rewrite, $quoted, $html);
  }

  return $html;
}
189
/**
 * Escapes the content of a single attribute value: "<" and ">" become
 * &lt; and &gt;, then the value is passed through
 * process_character_references() and escape_amp() (both defined elsewhere).
 *
 * @param String $html raw attribute value
 * @return String escaped attribute value
 */
function escape_attr_value_entities($html) {
  // Replacements are applied in array order: "<" first, then ">"
  $html = str_replace(array("<", ">"), array("&lt;", "&gt;"), $html);

  // Replace all character references by their decimal codes
  // NOTE(review): relies on the helper modifying its argument in place,
  // as its return value is not used - confirm against its definition
  process_character_references($html);

  return escape_amp($html);
}
199
/**
 * Updates attribute values: if there are any unescaped <, > or & symbols inside a quoted attribute value,
 * replaces them with the corresponding entity (via escape_attr_value_entities). Also note that & should
 * not be escaped if it is already a part of an entity reference.
 *
 * The regular expression starts with a greedy "(.*)", so each match is the LAST attribute remaining
 * in the text. Attributes are therefore processed from right to left: the matched attribute and
 * everything after it are moved to the already-processed tail, and the search continues in what
 * precedes it. (Taking the text after the match as the remainder, as a left-to-right scan would,
 * leaves nothing to match and escapes only a single attribute per document.)
 *
 * Whitespace character preceding the attribute name is normalized to a single space.
 *
 * @param String $html source HTML code
 * @return String updated HTML code
 */
function escape_attrs_entities($html) {
  // Already processed part of the document (everything to the right of the unprocessed text)
  $tail = "";

  // Regular expression may be described as follows:
  // ^(.*) - as much leading text as possible
  // (<[^>]*) - something starting with < (i.e. tag name and, probably, some attribute name/value pairs)
  // \s([^\s=>]+)= - space after "something", followed by attribute name (anything except spaces, = and > signs)
  // (['\"])([^\4]*?)\4 - quoted attribute value (@todo won't work with escaped quotes inside value, by the way)
  // (.*)$ - the rest of the text
  while (preg_match("#^(.*)(<[^>]*)\s([^\s=>]+)=(['\"])([^\\4]*?)\\4(.*)$#si", $html, $matches)) {
    $new_value = escape_attr_value_entities($matches[5]);

    $tail = " ".$matches[3]."=".$matches[4].$new_value.$matches[4].$matches[6].$tail;

    // Continue with the text preceding this attribute; it is strictly shorter, so the loop terminates
    $html = $matches[1].$matches[2];
  }

  return $html.$tail;
}
224
/**
 * Inserts a space after the closing quote of an attribute value whenever the
 * quote is directly followed by a non-whitespace character, e.g.
 * <a href="x"class="y"> becomes <a href="x" class="y" >.
 * Double-quoted values are handled first, then single-quoted ones.
 *
 * @param String $html HTML code; modified in place
 */
function fix_attrs_spaces(&$html) {
  // pattern => replacement, processed in this order
  $rules = array(
    "#(<[^>]*)\s([^\s=>]+)=\"([^\"]*?)\"([^\s])#si" => "\\1 \\2=\"\\3\" \\4",
    "#(<[^>]*)\s([^\s=>]+)='([^']*?)'([^\s])#si"     => "\\1 \\2='\\3' \\4"
  );

  foreach ($rules as $pattern => $replacement) {
    // One pass may not reach every attribute of a tag; repeat until stable
    while (preg_match($pattern, $html)) {
      $html = preg_replace($pattern, $replacement, $html);
    }
  }
}
234
/**
 * Rebuilds a single tag with a cleaned-up attribute list.
 *
 * The tag is split into its name and raw attribute text; attributes are then
 * read one by one (double-quoted, single-quoted or unquoted word values).
 * Attribute names are lowercased, only the first occurrence of a name is
 * kept, unparseable chunks are skipped, and names rejected by
 * HTML2PS_XMLUtils::valid_attribute_name() are dropped. Values containing a
 * double quote are written in single quotes, all others in double quotes.
 *
 * @param String $tag text containing a tag ("<...>" or "<.../>")
 * @return String|null the rebuilt tag; nothing (NULL) if $tag contains no tag
 */
function fix_attrs_tag($tag) {
  // Split into "<", inner content and the terminator; the self-closing form
  // is tried first so that "/>" ends up in the suffix
  if (!preg_match("#(<)(.*?)(/\s*>)#is",$tag, $parts) &&
      !preg_match("#(<)(.*?)(>)#is",$tag, $parts)) {
    return;
  }

  $prefix  = $parts[1];
  $content = $parts[2];
  $suffix  = $parts[3];

  // A strange tag (no leading word) yields an empty name and no attributes
  $tagname   = "";
  $raw_attrs = "";
  if (preg_match("#^\s*(\w+)\s*(.*)\s*/\s*\$#is", $content, $pieces) ||
      preg_match("#^\s*(\w+)\s*(.*)\$#is", $content, $pieces)) {
    $tagname   = $pieces[1];
    $raw_attrs = isset($pieces[2]) ? $pieces[2] : "";
  }

  // Recognized attribute forms, tried in this order; each captures
  // name (1), value (2) and the remaining text (3)
  $attr_forms = array(
    "#^\s*(\w+?)\s*=\s*\"(.*?)\"(.*)$#is",
    "#^\s*(\w+?)\s*=\s*'(.*?)'(.*)$#is",
    "#^\s*(\w+?)=(\w+)(.*)$#is"
  );

  $attrs = array();
  while (!empty($raw_attrs)) {
    $recognized = false;

    foreach ($attr_forms as $form) {
      if (preg_match($form, $raw_attrs, $pieces)) {
        $name = strtolower($pieces[1]);

        // First definition of an attribute wins
        if (!isset($attrs[$name])) {
          $attrs[$name] = $pieces[2];
        }

        $raw_attrs  = $pieces[3];
        $recognized = true;
        break;
      }
    }

    if ($recognized) {
      continue;
    }

    if (preg_match("#^\s*\S+\s+(.*)$#is",$raw_attrs,$pieces)) {
      // Just junk at the beginning; skip till the first space
      $raw_attrs = $pieces[1];
    } else {
      // Nothing parseable is left
      $raw_attrs = "";
    }
  }

  $str = "";
  foreach ($attrs as $key => $value) {
    // In theory, if garbage has been found inside the attrs section, we
    // could get an invalid attribute name here; just ignore it in this case
    if (!HTML2PS_XMLUtils::valid_attribute_name($key)) {
      continue;
    }

    $quote = (strpos($value,'"') !== false) ? "'" : "\"";
    $str .= " ".$key."=".$quote.$value.$quote;
  }

  return $prefix.$tagname.$str.$suffix;
}
312
/**
 * Runs fix_attrs_tag() over every tag whose "<" is not directly followed by
 * "/" (i.e. everything but closing tags), leaving the text in between as is.
 *
 * @param String $html source HTML code
 * @return String HTML code with normalized tag attributes
 */
function fix_attrs($html) {
  $processed = "";
  $next_tag  = "#^(.*?)(<[^/].*?>)#is";

  while (preg_match($next_tag, $html, $found)) {
    // $found[1] - text preceding the tag, $found[2] - the tag itself
    $html       = substr($html, strlen($found[0]));
    $processed .= $found[1].fix_attrs_tag($found[2]);
  }

  // Whatever remains contains no more matching tags
  return $processed.$html;
}
323
/**
 * Normalizes closing tags to the bare "</name>" form, removing whitespace
 * after "</" and anything between the tag name and ">" (on the same line,
 * as "." does not match newlines here).
 *
 * @param String $html source HTML code
 * @return String HTML code with normalized closing tags
 */
function fix_closing_tags($html) {
  $closing_tag = "#</\s*(\w+).*?>#";
  $normalized  = "</\\1>";

  return preg_replace($closing_tag, $normalized, $html);
}
327
/**
 * Replaces the two supported page break markers - the "<?page-break>"
 * pseudo-instruction and the "<!--NewPage-->" comment - with a <pagebreak/>
 * element. Matching is case-sensitive.
 *
 * @param String $html HTML code; modified in place
 */
function process_pagebreak_commands(&$html) {
  $break_markers = "#<\?page-break>|<!--NewPage-->#";
  $html = preg_replace($break_markers, "<pagebreak/>", $html);
}
331
/**
 * Light-weight preparation of a document that is already XHTML: only page
 * break markers, styles, character references, comments, tag case and
 * scripts are processed; no structural repairs are attempted.
 *
 * Apart from process_pagebreak_commands(), the helpers used here are defined
 * outside of this file section.
 *
 * @param String $html source XHTML code
 * @return String processed XHTML code
 */
function xhtml2xhtml($html) {
  // Page break markers -> <pagebreak/>
  process_pagebreak_commands($html);

  // Take STYLE tags out of the document and keep them aside;
  // they are put back at the very end
  $styles = process_style($html);

  // Convert HTML character references to their Unicode analogues
  process_character_references($html);

  remove_comments($html);

  // Convert all tags to lower case (regular tags first, then self-closing ones)
  $html = lowercase_closing_tags(lowercase_tags($html));

  // Remove SCRIPT tags, then re-insert the styles stored above
  return insert_styles(process_script($html), $styles);
}
356
/**
 * Converts arbitrary (possibly malformed) HTML into XHTML by applying the
 * clean-up steps below in a fixed order. The order matters: later steps rely
 * on the normalizations done by earlier ones.
 *
 * Helpers not found in this file section (process_script, process_style,
 * process_p, escape_amp, process_tables, ...) are defined elsewhere.
 *
 * @param String $html source HTML code
 * @return String XHTML code
 */
function html2xhtml($html) {
  process_pagebreak_commands($html);

  // Remove SCRIPT tags from the page being processed, as script content may
  // confuse the html-parsing utilities used further on
  $html = process_script($html);

  // STYLE tags are taken out for the same reason and kept aside;
  // they are added back at the very end
  $styles = process_style($html);

  // Convert HTML character references to their Unicode analogues
  process_character_references($html);

  remove_comments($html);

  // Attribute syntax: spacing, quoting, entity escaping
  fix_attrs_spaces($html);
  $html = quote_attrs($html);
  $html = escape_attrs_entities($html);

  // Tag names to lower case (regular tags first, then self-closing ones)
  $html = lowercase_closing_tags(lowercase_tags($html));

  $html = fix_closing_tags($html);

  // Elements which never have content: turn <tag ...> into <tag .../>
  $empty_elements = array(
    "area", "base", "basefont", "br", "col", "embed", "frame",
    "hr", "img", "input", "isindex", "link", "meta", "param"
  );
  foreach ($empty_elements as $element) {
    $html = close_tag($element, $html);
  }

  // Minimized attributes: turn 'name' into 'name="name"'
  $minimized_attrs = array(
    "checked", "compact", "declare", "defer", "disabled", "ismap", "multiple",
    "nohref", "noresize", "noshade", "nowrap", "readonly", "selected"
  );
  foreach ($minimized_attrs as $attribute) {
    $html = make_attr_value($attribute, $html);
  }

  // Document skeleton: html, then body, then head
  $html = process_html($html);
  $html = process_body($html);

  $html = process_head($html);
  $html = process_p($html);

  $html = escape_amp($html);
  $html = escape_lt($html);
  $html = escape_gt($html);

  $html = escape_textarea_content($html);

  // These helpers receive the document and 0 as arguments and their return
  // values are not used
  process_tables($html,0);

  process_lists($html,0);
  process_deflists($html,0);
  process_selects($html,0);

  // Final tag balancing and attribute clean-up
  $html = fix_tags($html);
  $html = fix_attrs($html);

  return insert_styles($html, $styles);
}
436
/**
 * Escapes markup-significant characters inside the content of every
 * <textarea> element: "<" becomes &#60;, ">" becomes &#62; and every "&"
 * which does not start a numeric character reference ("&#...") becomes &#38;.
 *
 * @param String $html source HTML code
 * @return String HTML code with escaped textarea content
 */
function escape_textarea_content($html) {
  // Ungreedy (U) matching: group 1 - attributes, group 2 - content.
  // Offsets are captured so that the content can be replaced in place.
  preg_match_all('#<textarea(.*)>(.*)<\s*/\s*textarea\s*>#Uis', $html, $matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER);

  // Why cycle from the last to the first match?
  // It keeps the offsets of the not yet processed matches valid,
  // as escaped content may differ from the original content in length
  for ($i = count($matches)-1; $i>=0; $i--) {
    $match_content = $matches[$i][2][0];
    $match_offset  = $matches[$i][2][1];

    // "<" and ">" first; the references they produce start with "&#" and
    // are therefore left alone by the ampersand step
    $escaped_content = str_replace(array('<', '>'), array('&#60;', '&#62;'), $match_content);

    // A lookahead (instead of consuming the next character) makes sure that
    // consecutive ampersands and an ampersand at the very end are escaped too
    $escaped_content = preg_replace('/&(?!#)/', '&#38;', $escaped_content);

    $html = substr_replace($html, $escaped_content, $match_offset, strlen($match_content));
  }

  return $html;
}
456
/**
 * Converts the names of opening and closing tags to lower case. Only names
 * directly followed by whitespace or ">" are handled here; attributes and
 * text are left untouched.
 *
 * NOTE(review): the character class uses the range "A-z", which also covers
 * the punctuation between "Z" and "a" (e.g. "_"); possibly "A-Z" was meant.
 *
 * @param String $html source HTML code
 * @return String HTML code with lowercased tag names
 */
function lowercase_tags($html) {
  $converted = "";
  $tag_start = "#^(.*?)(</?)([a-zA-z0-9]+)([\s>])#is";

  while (preg_match($tag_start, $html, $found)) {
    list($consumed, $leading_text, $bracket, $name, $terminator) = $found;

    // Move extracted part to the result...
    $converted .= $leading_text.$bracket.strtolower($name).$terminator;
    // ...and drop it from the text still to be processed
    $html = substr($html, strlen($consumed));
  }

  return $converted.$html;
}
469
/**
 * Converts the names of attribute-less self-closing tags (such as <BR/> or
 * <HR />) to lower case; despite the function name, the pattern matches
 * "<name/>" rather than "</name>".
 *
 * NOTE(review): the character class uses the range "A-z", which also covers
 * the punctuation between "Z" and "a" (e.g. "_"); possibly "A-Z" was meant.
 *
 * @param String $html source HTML code
 * @return String HTML code with lowercased self-closing tag names
 */
function lowercase_closing_tags($html) {
  $converted   = "";
  $self_closed = "#^(.*?)(<)([a-zA-z0-9]+)(\s*/\s*>)#is";

  while (preg_match($self_closed, $html, $found)) {
    list($consumed, $leading_text, $bracket, $name, $terminator) = $found;

    // Move extracted part to the result...
    $converted .= $leading_text.$bracket.strtolower($name).$terminator;
    // ...and drop it from the text still to be processed
    $html = substr($html, strlen($consumed));
  }

  return $converted.$html;
}
482
483?>