<?php

/**
 * This module implements a VERY limited parser that finds <link> tags
 * in the head of HTML or XHTML documents and parses out their
 * attributes according to the OpenID spec. It is a liberal parser,
 * but it requires these things from the data in order to work:
 *
 * - There must be an open <html> tag
 *
 * - There must be an open <head> tag inside of the <html> tag
 *
 * - Only <link>s that are found inside of the <head> tag are parsed
 *   (this is by design)
 *
 * - The parser follows the OpenID specification in resolving the
 *   attributes of the link tags. This means that the attributes DO
 *   NOT get resolved as they would by an XML or HTML parser. In
 *   particular, only certain entities get replaced, and href
 *   attributes do not get resolved relative to a base URL.
 *
 * From http://openid.net/specs.bml:
 *
 * - The openid.server URL MUST be an absolute URL. OpenID consumers
 *   MUST NOT attempt to resolve relative URLs.
 *
 * - The openid.server URL MUST NOT include entities other than &amp;,
 *   &lt;, &gt;, and &quot;.
 *
 * The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds
 * of quoting are allowed for attributes.
 *
 * The parser deals with invalid markup in these ways:
 *
 * - Tag names are not case-sensitive
 *
 * - The <html> tag is accepted even when it is not at the top level
 *
 * - The <head> tag is accepted even when it is not a direct child of
 *   the <html> tag, but a <html> tag must be an ancestor of the
 *   <head> tag
 *
 * - <link> tags are accepted even when they are not direct children
 *   of the <head> tag, but a <head> tag must be an ancestor of the
 *   <link> tag
 *
 * - If there is no closing tag for an open <html> or <head> tag, the
 *   remainder of the document is viewed as being inside of the
 *   tag. If there is no closing tag for a <link> tag, the link tag is
 *   treated as a short tag. Exceptions to this rule are that <html>
 *   closes <html> and <body> or <head> closes <head>
 *
 * - Attributes of the <link> tag are not required to be quoted.
 *
 * - In the case of duplicated attribute names, the attribute coming
 *   last in the tag will be the value returned.
 *
 * - Any text that does not parse as an attribute within a link tag
 *   will be ignored. (e.g. <link pumpkin rel='openid.server' /> will
 *   ignore pumpkin)
 *
 * - If there are more than one <html> or <head> tag, the parser only
 *   looks inside of the first one.
 *
 * - The contents of <script> tags are ignored entirely, except
 *   unclosed <script> tags. Unclosed <script> tags are ignored.
 *
 * - Any other invalid markup is ignored, including unclosed SGML
 *   comments and unclosed <![CDATA[blocks.
 *
 * PHP versions 4 and 5
 *
 * LICENSE: See the COPYING file included in this distribution.
 *
 * @access private
 * @package OpenID
 * @author JanRain, Inc. <openid@janrain.com>
 * @copyright 2005-2008 Janrain, Inc.
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache
 */

/**
 * Require Auth_OpenID::arrayGet().
 */
require_once "Auth/OpenID.php";

class Auth_OpenID_Parse {

    /**
     * Specify some flags for use with regex matching.
     */
    var $_re_flags = "si";

    /**
     * Stuff to remove before we start looking for tags
     */
    var $_removed_re =
           "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";

    /**
     * Starts with the tag name at a word boundary, where the tag name
     * is not a namespace
     */
    var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*)(?:<\/?%s\s*>|\Z))";

    /**
     * Pattern for one name=value attribute; the value may be double
     * quoted, single quoted or bare. The constructor wraps it in
     * delimiters and flags.
     */
    var $_attr_find = '\b(\w+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';

    /**
     * sprintf() templates for the start of a tag and for a closing
     * (or self-closing) tag; %s receives the tag name.
     */
    var $_open_tag_expr = "<%s\b";
    var $_close_tag_expr = "<((\/%s\b)|(%s[^>\/]*\/))>";

    /**
     * Complete regular expression matching a <link> tag. Built by the
     * constructor; null until then.
     */
    var $_link_find = null;

    /**
     * Map of entity name => replacement character, filled in by the
     * constructor.
     */
    var $_entity_replacements = array();

    /**
     * Pattern fragment matching any of the supported entities. Built
     * by the constructor; null until then.
     */
    var $_ent_replace = null;

    /**
     * Build the regular expressions and the entity table used by the
     * parser.
     *
     * The method is idempotent: a second call (for instance through
     * the legacy constructor name) is a no-op, so the patterns are
     * never wrapped in delimiters twice.
     */
    function __construct()
    {
        if ($this->_link_find !== null) {
            return;
        }

        $this->_link_find = sprintf("/<link\b(?!:)([^>]*)(?!<)>/%s",
                                    $this->_re_flags);

        $this->_entity_replacements = array(
                                            'amp' => '&',
                                            'lt' => '<',
                                            'gt' => '>',
                                            'quot' => '"'
                                            );

        $this->_attr_find = sprintf("/%s/%s",
                                    $this->_attr_find,
                                    $this->_re_flags);

        $this->_removed_re = sprintf("/%s/%s",
                                     $this->_removed_re,
                                     $this->_re_flags);

        // The alternation has to list the entity NAMES (the array
        // keys), not the characters they are replaced with.
        $this->_ent_replace =
            sprintf("&(%s);", implode("|",
                                      array_keys($this->_entity_replacements)));
    }

    /**
     * Legacy (PHP 4 style) constructor name.
     *
     * PHP 8 no longer treats a method named after its class as a
     * constructor, so the real work lives in __construct(). This
     * method is kept so that existing code calling it explicitly
     * keeps working.
     */
    function Auth_OpenID_Parse()
    {
        $this->__construct();
    }

    /**
     * Returns a regular expression that will match a given tag in an
     * SGML string.
     *
     * @param string $tag_name The name of the tag to match
     * @param array $close_tags Optional names of other tags that
     * also terminate the tag's contents
     * @return string A delimited regular expression including flags
     */
    function tagMatcher($tag_name, $close_tags = null)
    {
        $expr = $this->_tag_expr;

        if ($close_tags) {
            $options = implode("|", array_merge(array($tag_name), $close_tags));
            $closer = sprintf("(?:%s)", $options);
        } else {
            $closer = $tag_name;
        }

        $expr = sprintf($expr, $tag_name, $closer);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Returns a regular expression matching the start of an opening
     * tag with the given name.
     *
     * @param string $tag_name The name of the tag
     * @return string A delimited regular expression including flags
     */
    function openTag($tag_name)
    {
        $expr = sprintf($this->_open_tag_expr, $tag_name);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Returns a regular expression matching either a closing tag or
     * a self-closing tag with the given name.
     *
     * @param string $tag_name The name of the tag
     * @return string A delimited regular expression including flags
     */
    function closeTag($tag_name)
    {
        $expr = sprintf($this->_close_tag_expr, $tag_name, $tag_name);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Find where the first <html> tag starts.
     *
     * @param string $s The text to search
     * @return mixed The byte offset of the tag, or false if there is
     * none
     */
    function htmlBegin($s)
    {
        $matches = array();
        $result = preg_match($this->openTag('html'), $s,
                             $matches, PREG_OFFSET_CAPTURE);
        if ($result === false || !$matches) {
            return false;
        }
        // Return the offset of the first match.
        return $matches[0][1];
    }

    /**
     * Find where the first closing (or self-closing) html tag starts.
     *
     * @param string $s The text to search
     * @return mixed The byte offset of the tag, or false if there is
     * none
     */
    function htmlEnd($s)
    {
        $matches = array();
        $result = preg_match($this->closeTag('html'), $s,
                             $matches, PREG_OFFSET_CAPTURE);
        if ($result === false || !$matches) {
            return false;
        }
        // Return the offset of the whole match. Element 0 is the
        // complete match; the last element is a capture group that
        // starts one byte later, inside the tag.
        return $matches[0][1];
    }

    /**
     * Returns the regular expression used to locate the <head>
     * element; its contents end at a head, body or html tag or at
     * the end of the text.
     *
     * @return string A delimited regular expression including flags
     */
    function headFind()
    {
        return $this->tagMatcher('head', array('body', 'html'));
    }

    /**
     * Replace the supported entities (see $_entity_replacements)
     * with the characters they stand for.
     *
     * The replacement is done in a single pass so that the output of
     * one replacement is never decoded a second time: "&amp;lt;"
     * becomes "&lt;", not "<".
     *
     * @param string $str The text to decode
     * @return string The decoded text
     */
    function replaceEntities($str)
    {
        $pairs = array();
        foreach ($this->_entity_replacements as $old => $new) {
            $pairs['&' . $old . ';'] = $new;
        }

        if (!$pairs) {
            return $str;
        }

        return strtr($str, $pairs);
    }

    /**
     * Strip one pair of matching double or single quotes surrounding
     * a value. Values that are not quoted are returned unchanged.
     *
     * @param string $str The possibly quoted value
     * @return string The value without its surrounding quotes
     */
    function removeQuotes($str)
    {
        $matches = array();
        // The "s" modifier lets "." match newlines, so quoted values
        // spanning several lines are unquoted as well.
        $double = '/^"(.*)"$/s';
        $single = "/^\'(.*)\'$/s";

        if (preg_match($double, $str, $matches)) {
            return $matches[1];
        } else if (preg_match($single, $str, $matches)) {
            return $matches[1];
        } else {
            return $str;
        }
    }

    /**
     * Match a delimited regular expression against a text, using the
     * mbstring regex functions when they are available and PCRE
     * otherwise.
     *
     * @param string $regexp A pattern of the form produced by
     * tagMatcher(): one delimiter character, the expression, one
     * delimiter character, then $this->_re_flags
     * @param string $text The text to search
     * @param array &$match Receives the match; element 0 is the
     * complete match
     * @return mixed A true value when the pattern matched, a false
     * value otherwise
     */
    function match($regexp, $text, &$match)
    {
        if (is_callable('mb_ereg_search_init')) {
            // Strip the delimiters and the trailing flags.
            $pattern = substr($regexp, 1,
                              strlen($regexp) - 2 - strlen($this->_re_flags));

            // "pr" spells out the mbstring defaults ("." matches
            // newlines, Ruby syntax). The case-insensitivity flag is
            // lost when the delimiters are stripped, so it has to be
            // passed explicitly; tag names are not case-sensitive.
            $options = 'pr';
            if (strpos($this->_re_flags, 'i') !== false) {
                $options .= 'i';
            }

            if (mb_ereg_search_init($text)) {
                if (!mb_ereg_search($pattern, $options)) {
                    return false;
                }
                $match = mb_ereg_search_getregs();
                return true;
            }
            // mbstring refused the text; fall through to PCRE.
        }

        return preg_match($regexp, $text, $match);
    }

    /**
     * Find all link tags in a string representing a HTML document and
     * return a list of their attributes.
     *
     * pcre.backtrack_limit is lifted while parsing and restored
     * before returning.
     *
     * @todo This is quite inefficient and may fail with the default
     *       pcre.backtrack_limit of 100000 in PHP 5.2, if $html is big.
     *       It should rather use stripos (in PHP5) or strpos()+strtoupper()
     *       in PHP4 to manage this.
     *
     * @param string $html The text to parse
     * @return array $list An array of arrays of attributes, one for each
     * link tag
     */
    function parseLinkAttrs($html)
    {
        // Workaround to prevent PREG_BACKTRACK_LIMIT_ERROR. It is
        // applied before the first regular expression runs.
        $old_btlimit = ini_set('pcre.backtrack_limit', '-1');

        $link_data = $this->_extractLinkAttrs($html);

        // ini_set() returns false when it could not change the
        // setting; there is nothing to restore in that case.
        if ($old_btlimit !== false) {
            ini_set('pcre.backtrack_limit', $old_btlimit);
        }

        return $link_data;
    }

    /**
     * Do the actual work of parseLinkAttrs(): strip ignored markup,
     * isolate the <html> and <head> sections and collect the
     * attributes of every <link> tag found in the head.
     *
     * @access private
     * @param string $html The text to parse
     * @return array An array of arrays of attributes, one for each
     * link tag
     */
    function _extractLinkAttrs($html)
    {
        $stripped = preg_replace($this->_removed_re, "", $html);
        if ($stripped === null) {
            // preg_replace() reported an error.
            return array();
        }

        $html_begin = $this->htmlBegin($stripped);
        if ($html_begin === false) {
            return array();
        }

        $html_end = $this->htmlEnd($stripped);
        if ($html_end === false) {
            // No closing tag: the rest of the text is inside <html>.
            $html_end = strlen($stripped);
        }

        $stripped = substr($stripped, $html_begin,
                           $html_end - $html_begin);

        // Try to find the <HEAD> tag.
        $head_match = array();
        if (!$this->match($this->headFind(), $stripped, $head_match)) {
            return array();
        }

        $link_matches = array();
        if (!preg_match_all($this->_link_find, $head_match[0],
                            $link_matches)) {
            return array();
        }

        $link_data = array();
        foreach ($link_matches[0] as $link) {
            $attr_matches = array();
            preg_match_all($this->_attr_find, $link, $attr_matches);

            $link_attrs = array();
            foreach ($attr_matches[0] as $index => $full_match) {
                $name = $attr_matches[1][$index];
                $value = $this->replaceEntities(
                              $this->removeQuotes($attr_matches[2][$index]));

                // A later duplicate overwrites an earlier one.
                $link_attrs[strtolower($name)] = $value;
            }
            $link_data[] = $link_attrs;
        }

        return $link_data;
    }

    /**
     * Does $target_rel appear among the whitespace-separated values
     * of a rel attribute? The values of the attribute are lowercased
     * before comparing; $target_rel is used as given.
     *
     * @param string $rel_attr The value of a rel attribute
     * @param string $target_rel The relationship to look for
     * @return int 1 if it is present, 0 otherwise
     */
    function relMatches($rel_attr, $target_rel)
    {
        // XXX: TESTME
        $target_rel = (string)$target_rel;
        $rels = preg_split("/\s+/", trim($rel_attr));
        foreach ($rels as $rel) {
            // Strict comparison: with "==" numeric-looking strings
            // such as "1e1" and "10" would compare equal.
            if (strtolower($rel) === $target_rel) {
                return 1;
            }
        }

        return 0;
    }

    /**
     * Does this link have target_rel as a relationship?
     *
     * @param array $link_attrs The attributes of one link tag
     * @param string $target_rel The relationship to look for
     * @return bool
     */
    function linkHasRel($link_attrs, $target_rel)
    {
        // XXX: TESTME
        $rel_attr = Auth_OpenID::arrayGet($link_attrs, 'rel', null);
        return ($rel_attr && $this->relMatches($rel_attr,
                                               $target_rel));
    }

    /**
     * Filter the list of link attributes on whether it has
     * target_rel as a relationship.
     *
     * @param array $link_attrs_list A list of attribute arrays
     * @param string $target_rel The relationship to look for
     * @return array The matching attribute arrays, in their original
     * order
     */
    function findLinksRel($link_attrs_list, $target_rel)
    {
        // XXX: TESTME
        $result = array();
        foreach ($link_attrs_list as $attr) {
            if ($this->linkHasRel($attr, $target_rel)) {
                $result[] = $attr;
            }
        }

        return $result;
    }

    /**
     * Return the value of the href attribute for the first link tag
     * in the list that has target_rel as a relationship.
     *
     * @param array $link_attrs_list A list of attribute arrays
     * @param string $target_rel The relationship to look for
     * @return mixed The href value, or null if no link matches or
     * the first matching link has no href
     */
    function findFirstHref($link_attrs_list, $target_rel)
    {
        // XXX: TESTME
        $matches = $this->findLinksRel($link_attrs_list,
                                       $target_rel);
        if (!$matches) {
            return null;
        }
        $first = $matches[0];
        return Auth_OpenID::arrayGet($first, 'href', null);
    }
}

/**
 * Extract the server and delegate URLs from the link tags of an HTML
 * document.
 *
 * @param string $html_text The document to parse
 * @param string $server_rel The rel value identifying the server link
 * @param string $delegate_rel The rel value identifying the delegate
 * link
 * @return mixed false if no server link was found, otherwise
 * array($delegate_url, $server_url) where $delegate_url may be null
 */
function Auth_OpenID_legacy_discover($html_text, $server_rel,
                                     $delegate_rel)
{
    $p = new Auth_OpenID_Parse();

    $link_attrs = $p->parseLinkAttrs($html_text);

    $server_url = $p->findFirstHref($link_attrs,
                                    $server_rel);

    if ($server_url === null) {
        return false;
    } else {
        $delegate_url = $p->findFirstHref($link_attrs,
                                          $delegate_rel);
        return array($delegate_url, $server_url);
    }
}