1<?php
2
3/**
4 * This module implements a VERY limited parser that finds <link> tags
5 * in the head of HTML or XHTML documents and parses out their
6 * attributes according to the OpenID spec. It is a liberal parser,
7 * but it requires these things from the data in order to work:
8 *
9 * - There must be an open <html> tag
10 *
11 * - There must be an open <head> tag inside of the <html> tag
12 *
13 * - Only <link>s that are found inside of the <head> tag are parsed
14 *   (this is by design)
15 *
16 * - The parser follows the OpenID specification in resolving the
17 *   attributes of the link tags. This means that the attributes DO
18 *   NOT get resolved as they would by an XML or HTML parser. In
19 *   particular, only certain entities get replaced, and href
20 *   attributes do not get resolved relative to a base URL.
21 *
22 * From http://openid.net/specs.bml:
23 *
24 * - The openid.server URL MUST be an absolute URL. OpenID consumers
25 *   MUST NOT attempt to resolve relative URLs.
26 *
27 * - The openid.server URL MUST NOT include entities other than &amp;,
28 *   &lt;, &gt;, and &quot;.
29 *
30 * The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds
31 * of quoting are allowed for attributes.
32 *
33 * The parser deals with invalid markup in these ways:
34 *
35 * - Tag names are not case-sensitive
36 *
37 * - The <html> tag is accepted even when it is not at the top level
38 *
39 * - The <head> tag is accepted even when it is not a direct child of
40 *   the <html> tag, but a <html> tag must be an ancestor of the
41 *   <head> tag
42 *
43 * - <link> tags are accepted even when they are not direct children
44 *   of the <head> tag, but a <head> tag must be an ancestor of the
45 *   <link> tag
46 *
47 * - If there is no closing tag for an open <html> or <head> tag, the
48 *   remainder of the document is viewed as being inside of the
49 *   tag. If there is no closing tag for a <link> tag, the link tag is
50 *   treated as a short tag. Exceptions to this rule are that <html>
51 *   closes <html> and <body> or <head> closes <head>
52 *
53 * - Attributes of the <link> tag are not required to be quoted.
54 *
55 * - In the case of duplicated attribute names, the attribute coming
56 *   last in the tag will be the value returned.
57 *
58 * - Any text that does not parse as an attribute within a link tag
59 *   will be ignored. (e.g. <link pumpkin rel='openid.server' /> will
60 *   ignore pumpkin)
61 *
 * - If there is more than one <html> or <head> tag, the parser only
 *   looks inside of the first one.
64 *
65 * - The contents of <script> tags are ignored entirely, except
66 *   unclosed <script> tags. Unclosed <script> tags are ignored.
67 *
68 * - Any other invalid markup is ignored, including unclosed SGML
69 *   comments and unclosed <![CDATA[blocks.
70 *
71 * PHP versions 4 and 5
72 *
73 * LICENSE: See the COPYING file included in this distribution.
74 *
75 * @access private
76 * @package OpenID
77 * @author JanRain, Inc. <openid@janrain.com>
78 * @copyright 2005-2008 Janrain, Inc.
79 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache
80 */
81
82/**
83 * Require Auth_OpenID::arrayGet().
84 */
85require_once "Auth/OpenID.php";
86
/**
 * Liberal, regex-based extractor for the <link> tags found in the head
 * of an HTML or XHTML document.
 *
 * Typical use: call parseLinkAttrs() on the document text, then
 * findFirstHref() on the result to get the href of the first link
 * carrying a given rel value.
 */
class Auth_OpenID_Parse {

    /**
     * PCRE modifiers appended to the patterns built by this class:
     * "s" lets "." match newlines and "i" ignores case.
     */
    public $_re_flags = "si";

    /**
     * Markup removed before any tag lookup: SGML comments, CDATA
     * blocks and closed <script> elements. Holds the bare expression
     * until the constructor wraps it in delimiters and flags.
     */
    public $_removed_re =
           "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";

    /**
     * Template for matching an element: the tag name at a word
     * boundary (and not a namespace prefix), its attributes, and its
     * content up to the first closer or the end of the input.
     *
     * The content group is lazy on purpose. A greedy group always
     * reaches the end of the input, which makes the closer list
     * meaningless and lets markup after the closing tag (for example
     * <link> tags in the body) leak into the match.
     */
    public $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*?)(?:<\/?%s\s*>|\Z))";

    /**
     * Matches one name=value attribute; the value may be double
     * quoted, single quoted or unquoted. Holds the bare expression
     * until the constructor wraps it in delimiters and flags.
     */
    public $_attr_find = '\b(\w+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';

    /** Template matching the start of an opening tag. */
    public $_open_tag_expr = "<%s\b";

    /** Template matching a closing tag or a self-closed tag. */
    public $_close_tag_expr = "<((\/%s\b)|(%s[^>\/]*\/))>";

    /** Complete pattern matching a single <link> tag; set by the constructor. */
    public $_link_find;

    /** Map of entity name => replacement character; set by the constructor. */
    public $_entity_replacements;

    /**
     * Bare expression (no delimiters) matching any supported entity,
     * capturing the entity name; set by the constructor.
     */
    public $_ent_replace;

    /**
     * Builds the complete patterns from the bare expressions above.
     */
    public function __construct()
    {
        $this->_link_find = sprintf("/<link\b(?!:)([^>]*)(?!<)>/%s",
                                    $this->_re_flags);

        $this->_entity_replacements = [
            'amp' => '&',
            'lt' => '<',
            'gt' => '>',
            'quot' => '"',
        ];

        $this->_attr_find = sprintf("/%s/%s",
                                    $this->_attr_find,
                                    $this->_re_flags);

        $this->_removed_re = sprintf("/%s/%s",
                                     $this->_removed_re,
                                     $this->_re_flags);

        // The alternation must list the entity NAMES (the array keys);
        // the values are the characters they decode to.
        $this->_ent_replace =
            sprintf("&(%s);", implode("|",
                                      array_keys($this->_entity_replacements)));
    }

    /**
     * Returns a regular expression that will match a given tag in an
     * SGML string.
     *
     * @param string $tag_name Name of the tag to match
     * @param array|null $close_tags Names of other tags that also end
     * the element, in addition to $tag_name itself
     * @return string A complete pattern (delimiters and flags included)
     */
    public function tagMatcher($tag_name, $close_tags = null)
    {
        if ($close_tags) {
            $options = implode("|", array_merge([$tag_name], $close_tags));
            $closer = sprintf("(?:%s)", $options);
        } else {
            $closer = $tag_name;
        }

        $expr = sprintf($this->_tag_expr, $tag_name, $closer);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Returns a pattern matching the start of an opening $tag_name tag.
     *
     * @param string $tag_name
     * @return string
     */
    public function openTag($tag_name)
    {
        $expr = sprintf($this->_open_tag_expr, $tag_name);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Returns a pattern matching a closing $tag_name tag or a
     * self-closed $tag_name tag.
     *
     * @param string $tag_name
     * @return string
     */
    public function closeTag($tag_name)
    {
        $expr = sprintf($this->_close_tag_expr, $tag_name, $tag_name);
        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Finds where the first opening <html> tag starts.
     *
     * @param string $s Text to search
     * @return int|false Byte offset of the tag, or false when absent
     */
    public function htmlBegin($s)
    {
        $matches = [];
        $result = preg_match($this->openTag('html'), $s,
                             $matches, PREG_OFFSET_CAPTURE);
        if ($result === false || !$matches) {
            return false;
        }
        // Offset of the whole match, i.e. of the "<".
        return $matches[0][1];
    }

    /**
     * Finds the first closing (or self-closed) <html> tag.
     *
     * @param string $s Text to search
     * @return int|false Offset recorded for the last capture group of
     * the first match, or false when there is no match. Both
     * alternatives of the close-tag pattern start capturing right
     * after the "<", so this is one byte past the start of the tag.
     */
    public function htmlEnd($s)
    {
        $matches = [];
        $result = preg_match($this->closeTag('html'), $s,
                             $matches, PREG_OFFSET_CAPTURE);
        if ($result === false || !$matches) {
            return false;
        }
        // Offset of the last capture group, not of the whole match.
        return $matches[count($matches) - 1][1];
    }

    /**
     * Returns the pattern used to locate the <head> element; a
     * <body> or <html> tag also ends it.
     *
     * @return string
     */
    public function headFind()
    {
        return $this->tagMatcher('head', ['body', 'html']);
    }

    /**
     * Decodes the supported entities (&amp;, &lt;, &gt;, &quot;).
     *
     * Decoding happens in a single pass so that already-decoded text
     * is never decoded again: "&amp;lt;" yields "&lt;", not "<".
     *
     * @param string $str
     * @return string
     */
    public function replaceEntities($str)
    {
        $replacements = $this->_entity_replacements;

        return preg_replace_callback(
            sprintf("/%s/", $this->_ent_replace),
            function ($found) use ($replacements) {
                return $replacements[$found[1]];
            },
            $str
        );
    }

    /**
     * Strips one pair of matching surrounding quotes (single or
     * double) from $str. Anything else is returned unchanged.
     *
     * Done with string functions rather than a regex so that quoted
     * values containing newlines are unquoted as well.
     *
     * @param string $str
     * @return string
     */
    public function removeQuotes($str)
    {
        if (is_string($str) && strlen($str) >= 2) {
            $first = $str[0];
            $last = $str[strlen($str) - 1];
            if (($first === '"' || $first === "'") && $first === $last) {
                return substr($str, 1, strlen($str) - 2);
            }
        }

        return $str;
    }

    /**
     * Boolean wrapper around preg_match().
     *
     * @param string $regexp Complete pattern
     * @param string $text Subject
     * @param array $match Receives the match groups
     * @return bool True when the pattern matched
     */
    public function match($regexp, $text, &$match)
    {
        if (preg_match($regexp, $text, $match)) {
           return true;
        }
        return false;
    }

    /**
     * Find all link tags in a string representing a HTML document and
     * return a list of their attributes.
     *
     * pcre.backtrack_limit is lifted for the duration of the call
     * (including the initial comment/script stripping) and restored
     * afterwards, provided the previous value could be read.
     *
     * @param string $html The text to parse
     * @return array $list An array of arrays of attributes, one for each
     * link tag
     */
    public function parseLinkAttrs($html)
    {
        // Workaround to prevent PREG_BACKTRACK_LIMIT_ERROR:
        $old_btlimit = ini_set('pcre.backtrack_limit', '-1');

        $link_data = $this->_collectLinkAttrs($html);

        // ini_set() returns false on failure; writing that back would
        // corrupt the setting instead of restoring it.
        if ($old_btlimit !== false) {
            ini_set('pcre.backtrack_limit', $old_btlimit);
        }

        return $link_data;
    }

    /**
     * Does the actual extraction for parseLinkAttrs().
     *
     * @param string $html The text to parse
     * @return array One attribute map (lower-cased name => decoded
     * value) per <link> tag found inside the head
     */
    private function _collectLinkAttrs($html)
    {
        $stripped = preg_replace($this->_removed_re, "", $html);
        if ($stripped === null) {
            // PCRE error while stripping; nothing can be parsed.
            return [];
        }

        $html_begin = $this->htmlBegin($stripped);
        if ($html_begin === false) {
            return [];
        }

        // Look for the end of the html element only after its start, so
        // a stray closing tag earlier in the document cannot produce a
        // negative length.
        $stripped = substr($stripped, $html_begin);
        $html_end = $this->htmlEnd($stripped);
        if ($html_end !== false) {
            $stripped = substr($stripped, 0, $html_end);
        }

        // Try to find the <HEAD> tag.
        $head_match = [];
        if (!$this->match($this->headFind(), $stripped, $head_match)) {
            return [];
        }

        $link_matches = [];
        if (!preg_match_all($this->_link_find, $head_match[0],
                            $link_matches)) {
            return [];
        }

        $link_data = [];
        foreach ($link_matches[0] as $link) {
            $attr_matches = [];
            preg_match_all($this->_attr_find, $link, $attr_matches);

            $link_attrs = [];
            foreach ($attr_matches[1] as $index => $name) {
                $value = $this->replaceEntities(
                              $this->removeQuotes($attr_matches[2][$index]));

                // Later duplicates overwrite earlier ones.
                $link_attrs[strtolower($name)] = $value;
            }
            $link_data[] = $link_attrs;
        }

        return $link_data;
    }

    /**
     * Tells whether $target_rel is one of the whitespace-separated
     * tokens of $rel_attr. Tokens are lower-cased before comparing;
     * $target_rel is used as given.
     *
     * @param string $rel_attr Value of a rel attribute
     * @param string $target_rel Relationship to look for
     * @return int 1 when found, 0 otherwise
     */
    public function relMatches($rel_attr, $target_rel)
    {
        $wanted = (string) $target_rel;

        $rels = preg_split("/\s+/", trim($rel_attr));
        foreach ($rels as $rel) {
            // Strict comparison: with == PHP compares numeric-looking
            // strings as numbers, so "1e3" would equal "1000".
            if (strtolower($rel) === $wanted) {
                return 1;
            }
        }

        return 0;
    }

    /**
     * Tells whether a link's attributes include $target_rel as a
     * relationship.
     *
     * @param array $link_attrs Attribute map of one link tag
     * @param string $target_rel Relationship to look for
     * @return bool
     */
    public function linkHasRel($link_attrs, $target_rel)
    {
        $rel_attr = Auth_OpenID::arrayGet($link_attrs, 'rel', null);
        return ($rel_attr && $this->relMatches($rel_attr,
                                               $target_rel));
    }

    /**
     * Filters a list of link attribute maps, keeping those that have
     * $target_rel as a relationship.
     *
     * @param array $link_attrs_list List of attribute maps
     * @param string $target_rel Relationship to look for
     * @return array The matching attribute maps, in input order
     */
    public function findLinksRel($link_attrs_list, $target_rel)
    {
        $result = [];
        foreach ($link_attrs_list as $attr) {
            if ($this->linkHasRel($attr, $target_rel)) {
                $result[] = $attr;
            }
        }

        return $result;
    }

    /**
     * Returns the href attribute of the first link in the list that
     * has $target_rel as a relationship.
     *
     * @param array $link_attrs_list List of attribute maps
     * @param string $target_rel Relationship to look for
     * @return mixed The href value, or null when no link matches
     */
    public function findFirstHref($link_attrs_list, $target_rel)
    {
        $matches = $this->findLinksRel($link_attrs_list,
                                       $target_rel);
        if (!$matches) {
            return null;
        }
        $first = $matches[0];
        return Auth_OpenID::arrayGet($first, 'href', null);
    }
}
355
/**
 * Performs HTML-based discovery on a document: looks up the href of
 * the first <link> carrying $server_rel and, when one exists, the href
 * of the first <link> carrying $delegate_rel.
 *
 * @param string $html_text The document to inspect
 * @param string $server_rel rel value identifying the server link
 * @param string $delegate_rel rel value identifying the delegate link
 * @return array|false A two-element list [delegate href, server href]
 * (the delegate href is whatever findFirstHref() yields, null when no
 * link matches), or false when no server link was found
 */
function Auth_OpenID_legacy_discover($html_text, $server_rel,
                                     $delegate_rel)
{
    $parser = new Auth_OpenID_Parse();
    $links = $parser->parseLinkAttrs($html_text);

    $server = $parser->findFirstHref($links, $server_rel);
    if ($server === null) {
        // Without a server link there is nothing to report.
        return false;
    }

    $delegate = $parser->findFirstHref($links, $delegate_rel);

    return [$delegate, $server];
}
374
375