<?php

/**
 * This is the HTML pseudo-parser for the Yadis library.
 *
 * PHP versions 4 and 5
 *
 * LICENSE: See the COPYING file included in this distribution.
 *
 * @package OpenID
 * @author JanRain, Inc. <openid@janrain.com>
 * @copyright 2005-2008 Janrain, Inc.
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache
 */

/**
 * This class is responsible for scanning an HTML string to find META
 * tags and their attributes. This is used by the Yadis discovery
 * process. This class must be instantiated to be used, because the
 * constructor compiles the raw regex fragments below into delimited
 * patterns.
 *
 * The scanning is regex based ("pseudo-parser"); it does not build a
 * DOM.
 *
 * @package OpenID
 */
class Auth_Yadis_ParseHTML
{
    /**
     * PCRE modifiers appended to every pattern built by this class:
     * "s" (dot matches newline) and "i" (case-insensitive).
     *
     * @access private
     */
    public $_re_flags = "si";

    /**
     * Matches content removed before scanning: HTML comments, CDATA
     * sections and script elements. Holds the bare expression until
     * the constructor wraps it in delimiters and flags.
     *
     * @access private
     */
    public $_removed_re = '<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>';

    /**
     * sprintf() template for a tag pattern. The placeholders are, in
     * order: closing-slash expression, tag name expression and
     * self-closing expression.
     *
     * @access private
     */
    public $_tag_expr = '<%s%s(?:\s.*?)?%s>';

    /**
     * Matches one name=value attribute; group 1 is the name, group 2
     * the (possibly quoted) value. Holds the bare expression until
     * the constructor wraps it in delimiters and flags.
     *
     * @access private
     */
    public $_attr_find = '\b([-\w]+)=(".*?"|\'.*?\'|.+?)[\/\s>]';

    /**
     * Map of entity name => replacement character.
     *
     * Declared explicitly so that the constructor does not create a
     * dynamic property (deprecated as of PHP 8.2).
     *
     * @access private
     */
    public $_entity_replacements = [
        'amp' => '&',
        'lt' => '<',
        'gt' => '>',
        'quot' => '"',
    ];

    /**
     * Undelimited expression matching any entity named in
     * $_entity_replacements, e.g. "&(amp|lt|gt|quot);". It is not
     * referenced by the methods of this class.
     *
     * @access private
     */
    public $_ent_replace = '';

    /**
     * Compile the raw expressions into delimited patterns and build
     * the entity expression.
     */
    public function __construct()
    {
        $this->_attr_find = sprintf(
            "/%s/%s",
            $this->_attr_find,
            $this->_re_flags
        );

        $this->_removed_re = sprintf(
            "/%s/%s",
            $this->_removed_re,
            $this->_re_flags
        );

        $this->_entity_replacements = [
            'amp' => '&',
            'lt' => '<',
            'gt' => '>',
            'quot' => '"',
        ];

        // The alternation has to list the entity *names* (the array
        // keys); imploding the values would produce "&(&|<|>|\");".
        $this->_ent_replace = sprintf(
            "&(%s);",
            implode("|", array_keys($this->_entity_replacements))
        );
    }

    /**
     * Strip single and double quotes off of a string, if they are
     * present.
     *
     * @access private
     * @param string $str The original string
     * @return string $new_str The new string with leading and
     * trailing quotes removed
     */
    public function removeQuotes($str)
    {
        // The "s" modifier lets "." match newlines: the attribute
        // pattern is compiled with "s" too, so a quoted value may span
        // several lines and must still lose its quotes.
        $patterns = ['/^"(.*)"$/s', "/^'(.*)'$/s"];

        foreach ($patterns as $pattern) {
            $matches = [];
            if (preg_match($pattern, $str, $matches)) {
                return $matches[1];
            }
        }

        return $str;
    }

    /**
     * Create a regular expression that will match an opening
     * or closing tag from a set of names.
     *
     * @access private
     * @param mixed $tag_names Tag names to match; a single name or an
     * array of names (joined into a non-capturing alternation). The
     * names are inserted into the pattern as-is.
     * @param mixed $close false/0 = no, true/1 = yes, other = maybe
     * @param mixed $self_close false/0 = no, true/1 = yes, other = maybe
     * @return string $regex A regular expression string to be used
     * in, say, preg_match.
     */
    public function tagPattern($tag_names, $close, $self_close)
    {
        if (is_array($tag_names)) {
            $tag_names = '(?:' . implode('|', $tag_names) . ')';
        }

        // Loose comparison is intentional: true and 1 both mean
        // "required", any other truthy value means "optional".
        if ($close) {
            $close = '\/' . (($close == 1) ? '' : '?');
        } else {
            $close = '';
        }

        if ($self_close) {
            $self_close = '(?:\/\s*)' . (($self_close == 1) ? '' : '?');
        } else {
            $self_close = '';
        }

        $expr = sprintf($this->_tag_expr, $close, $tag_names, $self_close);

        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Given an HTML document string, this finds all the META tags in
     * the document, provided they are found in the
     * <HTML><HEAD>...</HEAD> section of the document. The <HTML> tag
     * may be missing.
     *
     * Comments, CDATA sections and script elements are removed before
     * scanning. The head section is taken to end at the first of:
     * </head>, </html>, or any opening/closing body-level tag (body,
     * frameset, frame, p, div, table, span, a).
     *
     * @access private
     * @param string $html_string An HTML document string
     * @return array $tag_list Array of tags; each tag is an array of
     * attribute -> value. Attribute names are lower-cased and values
     * are unquoted and entity-decoded. Empty if no usable head
     * section or META tag is found.
     */
    public function getMetaTags($html_string)
    {
        $html_string = preg_replace($this->_removed_re, "", $html_string);

        // preg_replace() returns null when PCRE fails (for example on
        // hitting the backtrack limit); there is nothing to scan then.
        if ($html_string === null) {
            return [];
        }

        // Index: 0 = <html>, 1 = <head>, 2 = </head>, 3 = </html>,
        // 4 = first body-level tag (opening or closing).
        $key_tags = [
            $this->tagPattern('html', false, false),
            $this->tagPattern('head', false, false),
            $this->tagPattern('head', true, false),
            $this->tagPattern('html', true, false),
            $this->tagPattern(
                [
                    'body', 'frameset', 'frame', 'p', 'div',
                    'table', 'span', 'a'
                ],
                'maybe',
                'maybe'
            )
        ];

        // Byte offset of the first match of each pattern, or null.
        $key_tags_pos = [];
        foreach ($key_tags as $pat) {
            $matches = [];
            preg_match($pat, $html_string, $matches, PREG_OFFSET_CAPTURE);
            $key_tags_pos[] = $matches ? $matches[0][1] : null;
        }

        // no opening head tag
        if (is_null($key_tags_pos[1])) {
            return [];
        }

        // the effective </head> is the min of the following
        if (is_null($key_tags_pos[2])) {
            $key_tags_pos[2] = strlen($html_string);
        }
        foreach ([$key_tags_pos[3], $key_tags_pos[4]] as $pos) {
            if (!is_null($pos) && $pos < $key_tags_pos[2]) {
                $key_tags_pos[2] = $pos;
            }
        }

        // closing head tag comes before opening head tag
        if ($key_tags_pos[1] > $key_tags_pos[2]) {
            return [];
        }

        // if there is an opening html tag, make sure the opening head tag
        // comes after it
        if (!is_null($key_tags_pos[0]) && $key_tags_pos[1] < $key_tags_pos[0]) {
            return [];
        }

        // Restrict scanning to the head section.
        $html_string = substr(
            $html_string,
            $key_tags_pos[1],
            $key_tags_pos[2] - $key_tags_pos[1]
        );

        $link_matches = [];
        if (!preg_match_all(
            $this->tagPattern('meta', false, 'maybe'),
            $html_string,
            $link_matches
        )) {
            return [];
        }

        $link_data = [];
        foreach ($link_matches[0] as $link) {
            $attr_matches = [];
            preg_match_all($this->_attr_find, $link, $attr_matches);

            $link_attrs = [];
            foreach ($attr_matches[1] as $index => $name) {
                $value = html_entity_decode(
                    $this->removeQuotes($attr_matches[2][$index])
                );

                // A repeated attribute name keeps its last value.
                $link_attrs[strtolower($name)] = $value;
            }
            $link_data[] = $link_attrs;
        }

        return $link_data;
    }

    /**
     * Looks for a META tag with an "http-equiv" attribute whose value
     * is one of ("x-xrds-location", "x-yadis-location"), ignoring
     * case. If such a META tag is found, its "content" attribute
     * value is returned.
     *
     * @param string $html_string An HTML document in string format
     * @return mixed $content The "content" attribute value of the
     * first matching META tag that has one, or null if no such tag
     * was found.
     */
    public function getHTTPEquiv($html_string)
    {
        $wanted = ['x-xrds-location', 'x-yadis-location'];

        foreach ($this->getMetaTags($html_string) as $tag) {
            if (array_key_exists('http-equiv', $tag) &&
                array_key_exists('content', $tag) &&
                in_array(strtolower($tag['http-equiv']), $wanted, true)) {
                return $tag['content'];
            }
        }

        return null;
    }
}