<?php
// $Header: /cvsroot/html2ps/xhtml.entities.inc.php,v 1.11 2006/12/24 14:42:44 Konstantin Exp $

/**
 * preg_replace_callback() helper: converts one matched hexadecimal character
 * reference into its decimal form.
 *
 * A named function is used instead of a closure because this file was written
 * to stay compatible with old PHP versions.
 *
 * @param array $matches Match data; $matches[1] holds the hexadecimal digits.
 * @return string Decimal numeric character reference, e.g. "&#65;".
 */
function _hex_character_reference_to_decimal($matches) {
  return '&#' . hexdec($matches[1]) . ';';
}

/**
 * Rewrites symbolic and hexadecimal character references as decimal numeric
 * character references. The string is modified in place.
 *
 * Symbolic names and their codes are read from the global $g_html_entities
 * map (entity name => numeric code).
 *
 * @param string $html Markup to process; passed by reference.
 * @return void
 */
function process_character_references(&$html) {
  // Process symbolic character references
  global $g_html_entities;

  if (is_array($g_html_entities)) {
    foreach ($g_html_entities as $entity => $code) {
      $html = str_replace("&{$entity};", "&#{$code};", $html);

      // Some pages contain symbolic references without the terminating
      // semicolon. Convert those damaged entities to numeric character
      // references as well.
      //
      // [\s<] is used as the entity name terminator so that longer entity
      // names sharing this prefix are not broken up: only references followed
      // by whitespace or by the start of a tag are accepted.
      $pattern = '/&' . preg_quote($entity, '/') . '([\s<])/';
      $html = preg_replace($pattern, "&#{$code};\\1", $html);
    }
  }

  // Process hexadecimal character references in a single pass. The match is
  // case-insensitive because the 'x' marker and the digits may use either
  // case. References with more than four hexadecimal digits are left as is.
  $html = preg_replace_callback(
    '/&#x([[:xdigit:]]{1,4});/i',
    '_hex_character_reference_to_decimal',
    $html
  );
}

/**
 * Escapes every ampersand that does not start a numeric character reference
 * and adds the missing ';' to unterminated numeric references.
 *
 * Symbolic references are expected to have been converted to numeric ones
 * before this function is called; any that remain get their '&' escaped.
 *
 * @param string $html Markup to process.
 * @return string Processed markup.
 */
function escape_amp($html) {
  // Escape all ampersands not followed by a '#' sign. The numeric reference
  // &#38; is used so that the passes below leave it alone.
  $html = preg_replace('/&(?!#)/si', '&#38;', $html);

  // Complete all numeric character references not terminated with ';'
  $html = preg_replace('/&#(\d+)(?![\d;])/si', "&#\\1;", $html);

  // Escape all ampersands followed by '#' and a NON-DIGIT symbol: they were
  // not covered by the conversions above and are not character references.
  // The &#38; inserted by the first pass starts with "&#3" and is therefore
  // not touched again.
  $html = preg_replace('/&(?!#\d)/si', '&#38;', $html);

  return $html;
}

/**
 * Escapes '<' characters that do not look like the start of a tag, and any
 * '<' found inside an already opened tag, as &#60;.
 *
 * @param string $html Markup to process.
 * @return string Processed markup.
 */
function escape_lt($html) {
  // Why are loops needed here?
  // A single global replace is not enough for input such as "<<<a>": the
  // pattern consumes TWO symbols (the '<' and the symbol following it), so
  // the second '<' cannot start a new match during the same pass. The search
  // is repeated until nothing is left to replace. Every pass removes at least
  // one literal '<', so the loops terminate.
  $stray_lt = '#<(\s*[^!/a-zA-Z])#si';
  while (preg_match($stray_lt, $html)) {
    $html = preg_replace($stray_lt, "&#60;\\1", $html);
  }

  // A '<' that appears before the closing '>' of a previously opened tag
  $nested_lt = '#(<[^>]*?)<#si';
  while (preg_match($nested_lt, $html)) {
    $html = preg_replace($nested_lt, "\\1&#60;", $html);
  }

  return $html;
}

/**
 * Escapes '>' characters that do not look like the end of a tag as &#62;.
 *
 * @param string $html Markup to process.
 * @return string Processed markup.
 */
function escape_gt($html) {
  // A '>' preceded (optionally through whitespace) by a symbol that is not
  // whitespace, a digit, a letter, a quote, '/', '=' or '-'. Whitespace
  // between that symbol and the '>' is dropped by the replacement.
  $html = preg_replace("#([^\s\da-zA-Z'\"/=-])\s*>#si", "\\1&#62;", $html);

  // A '>' that follows another '>' with no '<' in between. Every pass removes
  // at least one literal '>', so the loop terminates.
  $stray_gt = '#(>[^<]*?)>#si';
  while (preg_match($stray_gt, $html)) {
    $html = preg_replace($stray_gt, "\\1&#62;", $html);
  }

  return $html;
}