1<?php
2
3declare(strict_types=1);
4/**
5 * SimplePie
6 *
7 * A PHP-Based RSS and Atom Feed Framework.
8 * Takes the hard work out of managing a complete RSS/Atom solution.
9 *
10 * Copyright (c) 2004-2022, Ryan Parman, Sam Sneddon, Ryan McCue, and contributors
11 * All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or without modification, are
14 * permitted provided that the following conditions are met:
15 *
16 * 	* Redistributions of source code must retain the above copyright notice, this list of
17 * 	  conditions and the following disclaimer.
18 *
19 * 	* Redistributions in binary form must reproduce the above copyright notice, this list
20 * 	  of conditions and the following disclaimer in the documentation and/or other materials
21 * 	  provided with the distribution.
22 *
23 * 	* Neither the name of the SimplePie Team nor the names of its contributors may be used
24 * 	  to endorse or promote products derived from this software without specific prior
25 * 	  written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
28 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
29 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
30 * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
32 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
34 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 *
37 * @package SimplePie
38 * @copyright 2004-2016 Ryan Parman, Sam Sneddon, Ryan McCue
39 * @author Ryan Parman
40 * @author Sam Sneddon
41 * @author Ryan McCue
42 * @link http://simplepie.org/ SimplePie
43 * @license http://www.opensource.org/licenses/bsd-license.php BSD License
44 */
45
46namespace SimplePie\HTTP;
47
48/**
49 * HTTP Response Parser
50 *
51 * @package SimplePie
52 * @subpackage HTTP
53 */
class Parser
{
    /**
     * HTTP version of the parsed response (0.0 until parsed or after a failed parse)
     *
     * @var float
     */
    public $http_version = 0.0;

    /**
     * Status code (0 until parsed or after a failed parse)
     *
     * @var int
     */
    public $status_code = 0;

    /**
     * Reason phrase
     *
     * @var string
     */
    public $reason = '';

    /**
     * Key/value pairs of the headers
     *
     * Keys are lower-cased header names. Repeated headers are joined with
     * ", ", except Content-Type where only the last occurrence is kept.
     *
     * @var array<string, string>
     */
    public $headers = [];

    /**
     * Body of the response
     *
     * @var string
     */
    public $body = '';

    // Each state value (except STATE_EMIT and STATE_ERROR) is also the name of
    // the method that handles it; parse() dispatches on it dynamically.

    private const STATE_HTTP_VERSION = 'http_version';

    private const STATE_STATUS = 'status';

    private const STATE_REASON = 'reason';

    private const STATE_NEW_LINE = 'new_line';

    private const STATE_BODY = 'body';

    private const STATE_NAME = 'name';

    private const STATE_VALUE = 'value';

    private const STATE_VALUE_CHAR = 'value_char';

    private const STATE_QUOTE = 'quote';

    private const STATE_QUOTE_ESCAPED = 'quote_escaped';

    private const STATE_QUOTE_CHAR = 'quote_char';

    private const STATE_CHUNKED = 'chunked';

    private const STATE_EMIT = 'emit';

    private const STATE_ERROR = false;

    /**
     * Current state of the state machine
     *
     * @var self::STATE_*
     */
    protected $state = self::STATE_HTTP_VERSION;

    /**
     * Input data
     *
     * @var string
     */
    protected $data = '';

    /**
     * Input data length (to avoid calling strlen() every time this is needed)
     *
     * @var int
     */
    protected $data_length = 0;

    /**
     * Current position of the pointer
     *
     * @var int
     */
    protected $position = 0;

    /**
     * Name of the header currently being parsed
     *
     * @var string
     */
    protected $name = '';

    /**
     * Value of the header currently being parsed
     *
     * @var string
     */
    protected $value = '';

    /**
     * Create an instance of the class with the input data
     *
     * @param string $data Input data (raw HTTP response: status line, headers, body)
     */
    public function __construct($data)
    {
        $this->data = $data;
        $this->data_length = strlen($this->data);
    }

    /**
     * Parse the input data
     *
     * Runs the state machine until it emits, fails, or runs out of input.
     * On failure every public result property is reset to its default.
     *
     * @return bool true on success, false on failure
     */
    public function parse()
    {
        while (
            $this->state !== self::STATE_ERROR
            && $this->state !== self::STATE_EMIT
            && $this->has_data()
        ) {
            $handler = $this->state;
            $this->$handler();
        }

        // The raw input is no longer needed once parsing has stopped.
        $this->data = '';

        // Ending in STATE_BODY means the headers were terminated but no body
        // bytes followed; that still counts as a complete response.
        if ($this->state === self::STATE_EMIT || $this->state === self::STATE_BODY) {
            return true;
        }

        // Reset to the documented defaults (http_version is a float, not a string).
        $this->http_version = 0.0;
        $this->status_code = 0;
        $this->reason = '';
        $this->headers = [];
        $this->body = '';
        return false;
    }

    /**
     * Check whether there is data beyond the pointer
     *
     * @return bool true if there is further data, false if not
     */
    protected function has_data()
    {
        return $this->position < $this->data_length;
    }

    /**
     * See if the next character is LWS
     *
     * A tab or space is LWS; a line feed is LWS only when the following
     * character is a tab or space (a folded header line).
     *
     * @return bool true if the next character is LWS, false if not
     */
    protected function is_linear_whitespace()
    {
        $char = $this->data[$this->position];
        if ($char === "\x09" || $char === "\x20") {
            return true;
        }

        if ($char !== "\x0A" || !isset($this->data[$this->position + 1])) {
            return false;
        }

        $next = $this->data[$this->position + 1];
        return $next === "\x09" || $next === "\x20";
    }

    /**
     * Parse the HTTP version
     *
     * Requires the input to contain at least one line feed and to start with
     * "HTTP/" (case-insensitive). A version with more than one dot is an error.
     */
    protected function http_version()
    {
        if (strpos($this->data, "\x0A") === false || strtoupper(substr($this->data, 0, 5)) !== 'HTTP/') {
            $this->state = self::STATE_ERROR;
            return;
        }

        $len = strspn($this->data, '0123456789.', 5);
        $version = substr($this->data, 5, $len);
        $this->position += 5 + $len;

        if (substr_count($version, '.') > 1) {
            $this->state = self::STATE_ERROR;
            return;
        }

        $this->http_version = (float) $version;
        // Skip the whitespace separating the version from the status code.
        $this->position += strspn($this->data, "\x09\x20", $this->position);
        $this->state = self::STATE_STATUS;
    }

    /**
     * Parse the status code
     *
     * At least one digit is required, otherwise the parse fails.
     */
    protected function status()
    {
        $len = strspn($this->data, '0123456789', $this->position);
        if ($len === 0) {
            $this->state = self::STATE_ERROR;
            return;
        }

        $this->status_code = (int) substr($this->data, $this->position, $len);
        $this->position += $len;
        $this->state = self::STATE_REASON;
    }

    /**
     * Parse the reason phrase
     *
     * Takes everything up to the next line feed, trimmed of tabs, carriage
     * returns and spaces, and moves the pointer past that line feed.
     */
    protected function reason()
    {
        $len = strcspn($this->data, "\x0A", $this->position);
        $this->reason = trim(substr($this->data, $this->position, $len), "\x09\x0D\x20");
        $this->position += $len + 1;
        $this->state = self::STATE_NEW_LINE;
    }

    /**
     * Deal with a new line, shifting data around as needed
     *
     * Stores the header collected so far (if any), then decides whether the
     * line that starts at the pointer is the blank line ending the headers
     * or the start of another header.
     */
    protected function new_line()
    {
        $this->value = trim($this->value, "\x0D\x20");
        if ($this->name !== '' && $this->value !== '') {
            $this->name = strtolower($this->name);
            // We should only use the last Content-Type header. c.f. issue #1
            if (isset($this->headers[$this->name]) && $this->name !== 'content-type') {
                $this->headers[$this->name] .= ', ' . $this->value;
            } else {
                $this->headers[$this->name] = $this->value;
            }
        }
        $this->name = '';
        $this->value = '';

        // Look at the two bytes at the pointer (not at a single character,
        // which could never equal CRLF) to detect the blank separator line.
        if (substr($this->data, $this->position, 2) === "\x0D\x0A") {
            $this->position += 2;
            $this->state = self::STATE_BODY;
        } elseif ($this->data[$this->position] === "\x0A") {
            $this->position++;
            $this->state = self::STATE_BODY;
        } else {
            $this->state = self::STATE_NAME;
        }
    }

    /**
     * Parse a header name
     *
     * Reads up to the next colon or line feed. A colon completes the name;
     * a line feed hands the line over to new_line(); reaching the end of the
     * input without either is an error.
     */
    protected function name()
    {
        $len = strcspn($this->data, "\x0A:", $this->position);
        $end = $this->position + $len;

        if (!isset($this->data[$end])) {
            $this->state = self::STATE_ERROR;
            return;
        }

        if ($this->data[$end] === "\x0A") {
            // Leave the pointer on the line feed; new_line() consumes it.
            $this->position = $end;
            $this->state = self::STATE_NEW_LINE;
            return;
        }

        $this->name = substr($this->data, $this->position, $len);
        $this->position = $end + 1;
        $this->state = self::STATE_VALUE;
    }

    /**
     * Parse LWS, replacing consecutive LWS characters with a single space
     */
    protected function linear_whitespace()
    {
        do {
            if (substr($this->data, $this->position, 2) === "\x0D\x0A") {
                $this->position += 2;
            } elseif ($this->data[$this->position] === "\x0A") {
                $this->position++;
            }
            $this->position += strspn($this->data, "\x09\x20", $this->position);
        } while ($this->has_data() && $this->is_linear_whitespace());
        $this->value .= "\x20";
    }

    /**
     * See what state to move to while within non-quoted header values
     */
    protected function value()
    {
        if ($this->is_linear_whitespace()) {
            $this->linear_whitespace();
            return;
        }

        switch ($this->data[$this->position]) {
            case '"':
                // Workaround for ETags: we have to include the quotes as
                // part of the tag.
                if (strtolower($this->name) === 'etag') {
                    $this->value .= '"';
                    $this->position++;
                    $this->state = self::STATE_VALUE_CHAR;
                    break;
                }
                $this->position++;
                $this->state = self::STATE_QUOTE;
                break;

            case "\x0A":
                $this->position++;
                $this->state = self::STATE_NEW_LINE;
                break;

            default:
                $this->state = self::STATE_VALUE_CHAR;
                break;
        }
    }

    /**
     * Parse a header value while outside quotes
     *
     * Copies everything up to the next tab, space, line feed or double quote.
     */
    protected function value_char()
    {
        $len = strcspn($this->data, "\x09\x20\x0A\"", $this->position);
        $this->value .= substr($this->data, $this->position, $len);
        $this->position += $len;
        $this->state = self::STATE_VALUE;
    }

    /**
     * See what state to move to while within quoted header values
     */
    protected function quote()
    {
        if ($this->is_linear_whitespace()) {
            $this->linear_whitespace();
            return;
        }

        switch ($this->data[$this->position]) {
            case '"':
                // Closing quote: back to unquoted value parsing.
                $this->position++;
                $this->state = self::STATE_VALUE;
                break;

            case "\x0A":
                // Unterminated quoted string: the line end still ends the header.
                $this->position++;
                $this->state = self::STATE_NEW_LINE;
                break;

            case '\\':
                $this->position++;
                $this->state = self::STATE_QUOTE_ESCAPED;
                break;

            default:
                $this->state = self::STATE_QUOTE_CHAR;
                break;
        }
    }

    /**
     * Parse a header value while within quotes
     *
     * Copies everything up to the next tab, space, line feed, double quote
     * or backslash, then stays in the quoted state so that the closing quote
     * and any backslash escapes are interpreted by quote().
     */
    protected function quote_char()
    {
        $len = strcspn($this->data, "\x09\x20\x0A\"\\", $this->position);
        $this->value .= substr($this->data, $this->position, $len);
        $this->position += $len;
        // Remain inside the quoted string; returning to STATE_VALUE here would
        // treat the closing quote as a new opening quote and skip escapes.
        $this->state = self::STATE_QUOTE;
    }

    /**
     * Parse an escaped character within quotes
     *
     * The character following the backslash is copied verbatim.
     */
    protected function quote_escaped()
    {
        $this->value .= $this->data[$this->position];
        $this->position++;
        $this->state = self::STATE_QUOTE;
    }

    /**
     * Parse the body
     *
     * Takes the rest of the input as the body. If a non-empty
     * Transfer-Encoding header was seen it is removed from the headers and
     * the body is handed to chunked() for decoding.
     */
    protected function body()
    {
        $this->body = substr($this->data, $this->position);
        if (!empty($this->headers['transfer-encoding'])) {
            unset($this->headers['transfer-encoding']);
            $this->state = self::STATE_CHUNKED;
        } else {
            $this->state = self::STATE_EMIT;
        }
    }

    /**
     * Parse a "Transfer-Encoding: chunked" body
     *
     * If the body does not look chunked it is left untouched. A final chunk
     * whose declared size reaches or exceeds the remaining data is accepted
     * with whatever data is available.
     */
    protected function chunked()
    {
        $chunk_header = '/^([0-9a-f]+)[^\r\n]*\r\n/i';

        if (!preg_match($chunk_header, trim($this->body))) {
            $this->state = self::STATE_EMIT;
            return;
        }

        $decoded = '';
        $encoded = $this->body;

        while (true) {
            if (!preg_match($chunk_header, $encoded, $matches)) {
                // Looks like it's not chunked after all
                $this->state = self::STATE_EMIT;
                return;
            }

            // hexdec() returns a float when the value does not fit in an int.
            $length = hexdec(trim($matches[1]));
            if ($length === 0) {
                // Ignore trailer headers
                $this->state = self::STATE_EMIT;
                $this->body = $decoded;
                return;
            }

            $header_length = strlen($matches[0]);
            $available = strlen($encoded) - $header_length;

            if ($length >= $available) {
                // The declared size covers everything that is left (truncated
                // response or an oversized size field). Take the remainder and
                // stop; this also keeps float/overflowing sizes away from
                // substr(), which would throw a TypeError under strict_types.
                $this->body = $decoded . (string) substr($encoded, $header_length);
                $this->state = self::STATE_EMIT;
                return;
            }

            $decoded .= substr($encoded, $header_length, $length);
            // Skip the chunk data and the CRLF that terminates it.
            $encoded = substr($encoded, $header_length + $length + 2);

            // BC for PHP < 8.0: substr() can return bool instead of string
            $encoded = ($encoded === false) ? '' : $encoded;

            if ($encoded === '' || trim($encoded) === '0') {
                $this->state = self::STATE_EMIT;
                $this->body = $decoded;
                return;
            }
        }
    }

    /**
     * Prepare headers (take care of proxies headers)
     *
     * Splits the raw headers on blank lines into at most $count parts and
     * keeps the last part. If that part contains a
     * "200 Connection established" status line (HTTP/1.0 or HTTP/1.1), the
     * first header block in it is dropped.
     *
     * @param string $headers Raw headers
     * @param int    $count   Redirection count. Default to 1.
     *
     * @return string
     */
    public static function prepareHeaders($headers, $count = 1)
    {
        $parts = explode("\r\n\r\n", $headers, $count);
        $data = array_pop($parts);

        foreach (['HTTP/1.0', 'HTTP/1.1'] as $version) {
            if (false !== stripos($data, $version . " 200 Connection established\r\n")) {
                $exploded = explode("\r\n\r\n", $data, 2);
                $data = end($exploded);
            }
        }

        return $data;
    }
}
508
// Register the un-namespaced name SimplePie_HTTP_Parser as an alias of this
// class (presumably for backward compatibility with older callers; confirm).
class_alias('SimplePie\HTTP\Parser', 'SimplePie_HTTP_Parser');
510