1<?php
2
3/**
4 * SimplePie
5 *
6 * A PHP-Based RSS and Atom Feed Framework.
7 * Takes the hard work out of managing a complete RSS/Atom solution.
8 *
9 * Copyright (c) 2004-2022, Ryan Parman, Sam Sneddon, Ryan McCue, and contributors
10 * All rights reserved.
11 *
12 * Redistribution and use in source and binary forms, with or without modification, are
13 * permitted provided that the following conditions are met:
14 *
15 * 	* Redistributions of source code must retain the above copyright notice, this list of
16 * 	  conditions and the following disclaimer.
17 *
18 * 	* Redistributions in binary form must reproduce the above copyright notice, this list
19 * 	  of conditions and the following disclaimer in the documentation and/or other materials
20 * 	  provided with the distribution.
21 *
22 * 	* Neither the name of the SimplePie Team nor the names of its contributors may be used
23 * 	  to endorse or promote products derived from this software without specific prior
24 * 	  written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
27 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
28 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
29 * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
31 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
32 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
33 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
35 *
36 * @package SimplePie
37 * @copyright 2004-2016 Ryan Parman, Sam Sneddon, Ryan McCue
38 * @author Ryan Parman
39 * @author Sam Sneddon
40 * @author Ryan McCue
41 * @link http://simplepie.org/ SimplePie
42 * @license http://www.opensource.org/licenses/bsd-license.php BSD License
43 */
44
45namespace SimplePie\HTTP;
46
/**
 * HTTP Response Parser
 *
 * A small state machine that splits a raw HTTP response into its version,
 * status code, reason phrase, headers and body. Each STATE_* constant (except
 * STATE_EMIT and STATE_ERROR) is the name of the method that handles that
 * state; parse() keeps dispatching to the current state's method until the
 * machine emits, errors out, or the pointer reaches the end of the input.
 *
 * @package SimplePie
 * @subpackage HTTP
 */
class Parser
{
    /**
     * HTTP Version
     *
     * @var float
     */
    public $http_version = 0.0;

    /**
     * Status code
     *
     * @var int
     */
    public $status_code = 0;

    /**
     * Reason phrase
     *
     * @var string
     */
    public $reason = '';

    /**
     * Key/value pairs of the headers (keys are lower-cased header names)
     *
     * @var array
     */
    public $headers = [];

    /**
     * Body of the response
     *
     * @var string
     */
    public $body = '';

    private const STATE_HTTP_VERSION = 'http_version';

    private const STATE_STATUS = 'status';

    private const STATE_REASON = 'reason';

    private const STATE_NEW_LINE = 'new_line';

    private const STATE_BODY = 'body';

    private const STATE_NAME = 'name';

    private const STATE_VALUE = 'value';

    private const STATE_VALUE_CHAR = 'value_char';

    private const STATE_QUOTE = 'quote';

    private const STATE_QUOTE_ESCAPED = 'quote_escaped';

    private const STATE_QUOTE_CHAR = 'quote_char';

    private const STATE_CHUNKED = 'chunked';

    private const STATE_EMIT = 'emit';

    private const STATE_ERROR = false;

    /**
     * Current state of the state machine
     *
     * @var self::STATE_*
     */
    protected $state = self::STATE_HTTP_VERSION;

    /**
     * Input data
     *
     * @var string
     */
    protected $data = '';

    /**
     * Input data length (to avoid calling strlen() every time this is needed)
     *
     * @var int
     */
    protected $data_length = 0;

    /**
     * Current position of the pointer
     *
     * @var int
     */
    protected $position = 0;

    /**
     * Name of the header currently being parsed
     *
     * @var string
     */
    protected $name = '';

    /**
     * Value of the header currently being parsed
     *
     * @var string
     */
    protected $value = '';

    /**
     * Create an instance of the class with the input data
     *
     * @param string $data Input data
     */
    public function __construct($data)
    {
        $this->data = $data;
        $this->data_length = strlen($this->data);
    }

    /**
     * Parse the input data
     *
     * On success the public properties hold the parsed response. On failure
     * they are all reset to their empty defaults.
     *
     * @return bool true on success, false on failure
     */
    public function parse()
    {
        // The state value doubles as the name of the handler method. The loop
        // stops on STATE_ERROR (false), on STATE_EMIT, or when input runs out.
        while ($this->state && $this->state !== self::STATE_EMIT && $this->has_data()) {
            $state = $this->state;
            $this->$state();
        }
        $this->data = '';

        // STATE_BODY also counts as success: the headers ended exactly at the
        // end of the input, so body() was never invoked and the body is empty.
        if ($this->state === self::STATE_EMIT || $this->state === self::STATE_BODY) {
            return true;
        }

        // Reset to the declared defaults (http_version is documented as float).
        $this->http_version = 0.0;
        $this->status_code = 0;
        $this->reason = '';
        $this->headers = [];
        $this->body = '';
        return false;
    }

    /**
     * Check whether there is data beyond the pointer
     *
     * @return bool true if there is further data, false if not
     */
    protected function has_data()
    {
        return $this->position < $this->data_length;
    }

    /**
     * See if the next character is LWS
     *
     * A tab or a space counts as LWS, and so does a line feed that is
     * immediately followed by a tab or a space (a folded header line).
     * The caller must ensure the pointer is inside the input.
     *
     * @return bool true if the next character is LWS, false if not
     */
    protected function is_linear_whitespace()
    {
        $current = $this->data[$this->position];
        if ($current === "\x09" || $current === "\x20") {
            return true;
        }

        if ($current !== "\x0A" || !isset($this->data[$this->position + 1])) {
            return false;
        }

        $next = $this->data[$this->position + 1];
        return $next === "\x09" || $next === "\x20";
    }

    /**
     * Parse the HTTP version
     *
     * Requires the input to contain at least one line feed and to start with
     * "HTTP/" (case-insensitive); a version with more than one dot is an error.
     */
    protected function http_version()
    {
        if (strpos($this->data, "\x0A") === false || strtoupper(substr($this->data, 0, 5)) !== 'HTTP/') {
            $this->state = self::STATE_ERROR;
            return;
        }

        $len = strspn($this->data, '0123456789.', 5);
        $this->http_version = substr($this->data, 5, $len);
        $this->position += 5 + $len;

        if (substr_count($this->http_version, '.') > 1) {
            $this->state = self::STATE_ERROR;
            return;
        }

        $this->http_version = (float) $this->http_version;
        // Skip the whitespace separating the version from the status code.
        $this->position += strspn($this->data, "\x09\x20", $this->position);
        $this->state = self::STATE_STATUS;
    }

    /**
     * Parse the status code
     *
     * At least one digit must be present at the pointer, otherwise the
     * machine moves to the error state.
     */
    protected function status()
    {
        $len = strspn($this->data, '0123456789', $this->position);
        if ($len === 0) {
            $this->state = self::STATE_ERROR;
            return;
        }

        $this->status_code = (int) substr($this->data, $this->position, $len);
        $this->position += $len;
        $this->state = self::STATE_REASON;
    }

    /**
     * Parse the reason phrase
     *
     * Takes everything up to the next line feed, strips surrounding tabs,
     * carriage returns and spaces, and moves the pointer past the line feed.
     */
    protected function reason()
    {
        $len = strcspn($this->data, "\x0A", $this->position);
        $this->reason = trim(substr($this->data, $this->position, $len), "\x09\x0D\x20");
        $this->position += $len + 1;
        $this->state = self::STATE_NEW_LINE;
    }

    /**
     * Deal with a new line, shifting data around as needed
     *
     * Stores the header collected so far (if any), then decides whether the
     * next line is the blank line that starts the body or another header.
     */
    protected function new_line()
    {
        $this->value = trim($this->value, "\x0D\x20");
        if ($this->name !== '' && $this->value !== '') {
            $this->name = strtolower($this->name);
            // We should only use the last Content-Type header. c.f. issue #1
            if (isset($this->headers[$this->name]) && $this->name !== 'content-type') {
                // Repeated headers are folded into a comma-separated list.
                $this->headers[$this->name] .= ', ' . $this->value;
            } else {
                $this->headers[$this->name] = $this->value;
            }
        }
        $this->name = '';
        $this->value = '';

        // Look at the two bytes at the pointer. Taking substr() of the single
        // character at the pointer could never match CRLF.
        if (substr($this->data, $this->position, 2) === "\x0D\x0A") {
            $this->position += 2;
            $this->state = self::STATE_BODY;
        } elseif ($this->data[$this->position] === "\x0A") {
            $this->position++;
            $this->state = self::STATE_BODY;
        } else {
            $this->state = self::STATE_NAME;
        }
    }

    /**
     * Parse a header name
     *
     * Reads up to the next colon or line feed. A line feed first means the
     * line has no colon, so it is handed back to new_line() with the pointer
     * left on the line feed. Running out of input is an error.
     */
    protected function name()
    {
        $len = strcspn($this->data, "\x0A:", $this->position);
        if (!isset($this->data[$this->position + $len])) {
            $this->state = self::STATE_ERROR;
            return;
        }

        if ($this->data[$this->position + $len] === "\x0A") {
            $this->position += $len;
            $this->state = self::STATE_NEW_LINE;
            return;
        }

        $this->name = substr($this->data, $this->position, $len);
        // Step over the name and the colon that terminated it.
        $this->position += $len + 1;
        $this->state = self::STATE_VALUE;
    }

    /**
     * Parse LWS, replacing consecutive LWS characters with a single space
     *
     * The state is left unchanged, so the calling state (value or quote)
     * resumes afterwards.
     */
    protected function linear_whitespace()
    {
        do {
            if (substr($this->data, $this->position, 2) === "\x0D\x0A") {
                $this->position += 2;
            } elseif ($this->data[$this->position] === "\x0A") {
                $this->position++;
            }
            $this->position += strspn($this->data, "\x09\x20", $this->position);
        } while ($this->has_data() && $this->is_linear_whitespace());
        $this->value .= "\x20";
    }

    /**
     * See what state to move to while within non-quoted header values
     */
    protected function value()
    {
        if ($this->is_linear_whitespace()) {
            $this->linear_whitespace();
            return;
        }

        switch ($this->data[$this->position]) {
            case '"':
                $this->position++;
                // Workaround for ETags: we have to include the quotes as
                // part of the tag.
                if (strtolower($this->name) === 'etag') {
                    $this->value .= '"';
                    $this->state = self::STATE_VALUE_CHAR;
                } else {
                    $this->state = self::STATE_QUOTE;
                }
                break;

            case "\x0A":
                $this->position++;
                $this->state = self::STATE_NEW_LINE;
                break;

            default:
                $this->state = self::STATE_VALUE_CHAR;
                break;
        }
    }

    /**
     * Parse a header value while outside quotes
     *
     * Copies everything up to the next tab, space, line feed or double quote.
     */
    protected function value_char()
    {
        $len = strcspn($this->data, "\x09\x20\x0A\"", $this->position);
        $this->value .= substr($this->data, $this->position, $len);
        $this->position += $len;
        $this->state = self::STATE_VALUE;
    }

    /**
     * See what state to move to while within quoted header values
     */
    protected function quote()
    {
        if ($this->is_linear_whitespace()) {
            $this->linear_whitespace();
            return;
        }

        switch ($this->data[$this->position]) {
            case '"':
                // Closing quote: back to the unquoted part of the value.
                $this->position++;
                $this->state = self::STATE_VALUE;
                break;

            case "\x0A":
                $this->position++;
                $this->state = self::STATE_NEW_LINE;
                break;

            case '\\':
                $this->position++;
                $this->state = self::STATE_QUOTE_ESCAPED;
                break;

            default:
                $this->state = self::STATE_QUOTE_CHAR;
                break;
        }
    }

    /**
     * Parse a header value while within quotes
     *
     * Copies everything up to the next tab, space, line feed, double quote
     * or backslash.
     */
    protected function quote_char()
    {
        $len = strcspn($this->data, "\x09\x20\x0A\"\\", $this->position);
        $this->value .= substr($this->data, $this->position, $len);
        $this->position += $len;
        // Stay inside the quoted string. Returning to the unquoted value
        // state here would copy a following backslash literally and would
        // treat the closing quote as an opening one.
        $this->state = self::STATE_QUOTE;
    }

    /**
     * Parse an escaped character within quotes
     *
     * The character following the backslash is taken literally.
     */
    protected function quote_escaped()
    {
        $this->value .= $this->data[$this->position];
        $this->position++;
        $this->state = self::STATE_QUOTE;
    }

    /**
     * Parse the body
     *
     * Everything from the pointer onwards is the body. If a non-empty
     * Transfer-Encoding header was seen it is removed from the headers and
     * the body is handed to chunked() for decoding.
     */
    protected function body()
    {
        $this->body = substr($this->data, $this->position);
        if (!empty($this->headers['transfer-encoding'])) {
            unset($this->headers['transfer-encoding']);
            $this->state = self::STATE_CHUNKED;
        } else {
            $this->state = self::STATE_EMIT;
        }
    }

    /**
     * Parse a "Transfer-Encoding: chunked" body
     *
     * If the body does not look chunked it is left untouched. Otherwise the
     * chunks are concatenated into the body; trailer headers are ignored.
     */
    protected function chunked()
    {
        // Hex chunk size, optional chunk extensions, then CRLF.
        $chunk_header = '/^([0-9a-f]+)[^\r\n]*\r\n/i';

        if (!preg_match($chunk_header, trim($this->body))) {
            $this->state = self::STATE_EMIT;
            return;
        }

        $decoded = '';
        $encoded = $this->body;

        while (true) {
            if (!preg_match($chunk_header, $encoded, $matches)) {
                // Looks like it's not chunked after all
                $this->state = self::STATE_EMIT;
                return;
            }

            $length = hexdec(trim($matches[1]));
            if ($length === 0) {
                // Ignore trailer headers
                $this->state = self::STATE_EMIT;
                $this->body = $decoded;
                return;
            }

            $chunk_length = strlen($matches[0]);

            // The chunk size comes from the remote server. hexdec() returns a
            // float for values beyond PHP_INT_MAX, and a huge int would
            // overflow the offset arithmetic below; either way substr() would
            // be handed a float it cannot accept. A chunk can never be longer
            // than the data that is actually left, so clamp it to that.
            $length = (int) min($length, strlen($encoded) - $chunk_length);

            $decoded .= substr($encoded, $chunk_length, $length);
            // The extra 2 skips the CRLF that terminates the chunk data.
            $encoded = substr($encoded, $chunk_length + $length + 2);

            // BC for PHP < 8.0: substr() can return bool instead of string
            $encoded = ($encoded === false) ? '' : $encoded;

            if (trim($encoded) === '0' || $encoded === '') {
                $this->state = self::STATE_EMIT;
                $this->body = $decoded;
                return;
            }
        }
    }

    /**
     * Prepare headers (take care of proxies headers)
     *
     * Splits the raw headers on blank lines into at most $count parts and
     * keeps the last part, then strips a leading proxy
     * "200 Connection established" block if one is present.
     *
     * @param string  $headers Raw headers
     * @param integer $count   Redirection count. Default to 1.
     *
     * @return string
     */
    public static function prepareHeaders($headers, $count = 1)
    {
        $data = explode("\r\n\r\n", $headers, $count);
        $data = array_pop($data);

        foreach (['HTTP/1.0', 'HTTP/1.1'] as $version) {
            if (false !== stripos($data, $version . " 200 Connection established\r\n")) {
                $exploded = explode("\r\n\r\n", $data, 2);
                $data = end($exploded);
            }
        }

        return $data;
    }
}
507
// Register `SimplePie_HTTP_Parser` as an alias of this class
// (presumably the older, non-namespaced class name).
class_alias(Parser::class, 'SimplePie_HTTP_Parser');
509