1<?php
2
3namespace Sabre\VObject\Parser;
4
5use Sabre\VObject\Component;
6use Sabre\VObject\Component\VCalendar;
7use Sabre\VObject\Component\VCard;
8use Sabre\VObject\Document;
9use Sabre\VObject\EofException;
10use Sabre\VObject\Node;
11use Sabre\VObject\ParseException;
12
13/**
14 * MimeDir parser.
15 *
16 * This class parses iCalendar 2.0 and vCard 2.1, 3.0 and 4.0 files. This
17 * parser will return one of the following two objects from the parse method:
18 *
19 * Sabre\VObject\Component\VCalendar
20 * Sabre\VObject\Component\VCard
21 *
22 * @copyright Copyright (C) fruux GmbH (https://fruux.com/)
23 * @author Evert Pot (http://evertpot.com/)
24 * @license http://sabre.io/license/ Modified BSD License
25 */
26class MimeDir extends Parser
27{
28    /**
29     * The input stream.
30     *
31     * @var resource
32     */
33    protected $input;
34
35    /**
36     * Root component.
37     *
38     * @var Component
39     */
40    protected $root;
41
    /**
     * By default all input will be assumed to be UTF-8.
     *
     * However, both iCalendar and vCard might be encoded using different
     * character sets. The character set is usually set in the mime-type.
     *
     * If this is the case, use setCharset() to specify that a different
     * encoding will be used. If this is set, the parser will automatically
     * convert all incoming data to UTF-8.
     *
     * @var string
     */
54    protected $charset = 'UTF-8';
55
56    /**
57     * The list of character sets we support when decoding.
58     *
59     * This would be a const expression but for now we need to support PHP 5.5
60     */
61    protected static $SUPPORTED_CHARSETS = [
62        'UTF-8',
63        'ISO-8859-1',
64        'Windows-1252',
65    ];
66
67    /**
68     * Parses an iCalendar or vCard file.
69     *
70     * Pass a stream or a string. If null is parsed, the existing buffer is
71     * used.
72     *
73     * @param string|resource|null $input
74     * @param int                  $options
75     *
76     * @return \Sabre\VObject\Document
77     */
78    public function parse($input = null, $options = 0)
79    {
80        $this->root = null;
81
82        if (!is_null($input)) {
83            $this->setInput($input);
84        }
85
86        if (0 !== $options) {
87            $this->options = $options;
88        }
89
90        $this->parseDocument();
91
92        return $this->root;
93    }
94
95    /**
96     * By default all input will be assumed to be UTF-8.
97     *
98     * However, both iCalendar and vCard might be encoded using different
99     * character sets. The character set is usually set in the mime-type.
100     *
101     * If this is the case, use setEncoding to specify that a different
102     * encoding will be used. If this is set, the parser will automatically
103     * convert all incoming data to UTF-8.
104     *
105     * @param string $charset
106     */
107    public function setCharset($charset)
108    {
109        if (!in_array($charset, self::$SUPPORTED_CHARSETS)) {
110            throw new \InvalidArgumentException('Unsupported encoding. (Supported encodings: '.implode(', ', self::$SUPPORTED_CHARSETS).')');
111        }
112        $this->charset = $charset;
113    }
114
115    /**
116     * Sets the input buffer. Must be a string or stream.
117     *
118     * @param resource|string $input
119     */
120    public function setInput($input)
121    {
122        // Resetting the parser
123        $this->lineIndex = 0;
124        $this->startLine = 0;
125
126        if (is_string($input)) {
127            // Converting to a stream.
128            $stream = fopen('php://temp', 'r+');
129            fwrite($stream, $input);
130            rewind($stream);
131            $this->input = $stream;
132        } elseif (is_resource($input)) {
133            $this->input = $input;
134        } else {
135            throw new \InvalidArgumentException('This parser can only read from strings or streams.');
136        }
137    }
138
139    /**
140     * Parses an entire document.
141     */
142    protected function parseDocument()
143    {
144        $line = $this->readLine();
145
146        // BOM is ZERO WIDTH NO-BREAK SPACE (U+FEFF).
147        // It's 0xEF 0xBB 0xBF in UTF-8 hex.
148        if (3 <= strlen($line)
149            && 0xef === ord($line[0])
150            && 0xbb === ord($line[1])
151            && 0xbf === ord($line[2])) {
152            $line = substr($line, 3);
153        }
154
155        switch (strtoupper($line)) {
156            case 'BEGIN:VCALENDAR':
157                $class = VCalendar::$componentMap['VCALENDAR'];
158                break;
159            case 'BEGIN:VCARD':
160                $class = VCard::$componentMap['VCARD'];
161                break;
162            default:
163                throw new ParseException('This parser only supports VCARD and VCALENDAR files');
164        }
165
166        $this->root = new $class([], false);
167
168        while (true) {
169            // Reading until we hit END:
170            $line = $this->readLine();
171            if ('END:' === strtoupper(substr($line, 0, 4))) {
172                break;
173            }
174            $result = $this->parseLine($line);
175            if ($result) {
176                $this->root->add($result);
177            }
178        }
179
180        $name = strtoupper(substr($line, 4));
181        if ($name !== $this->root->name) {
182            throw new ParseException('Invalid MimeDir file. expected: "END:'.$this->root->name.'" got: "END:'.$name.'"');
183        }
184    }
185
186    /**
187     * Parses a line, and if it hits a component, it will also attempt to parse
188     * the entire component.
189     *
190     * @param string $line Unfolded line
191     *
192     * @return Node
193     */
194    protected function parseLine($line)
195    {
196        // Start of a new component
197        if ('BEGIN:' === strtoupper(substr($line, 0, 6))) {
198            if (substr($line, 6) === $this->root->name) {
199                throw new ParseException('Invalid MimeDir file. Unexpected component: "'.$line.'" in document type '.$this->root->name);
200            }
201            $component = $this->root->createComponent(substr($line, 6), [], false);
202
203            while (true) {
204                // Reading until we hit END:
205                $line = $this->readLine();
206                if ('END:' === strtoupper(substr($line, 0, 4))) {
207                    break;
208                }
209                $result = $this->parseLine($line);
210                if ($result) {
211                    $component->add($result);
212                }
213            }
214
215            $name = strtoupper(substr($line, 4));
216            if ($name !== $component->name) {
217                throw new ParseException('Invalid MimeDir file. expected: "END:'.$component->name.'" got: "END:'.$name.'"');
218            }
219
220            return $component;
221        } else {
222            // Property reader
223            $property = $this->readProperty($line);
224            if (!$property) {
225                // Ignored line
226                return false;
227            }
228
229            return $property;
230        }
231    }
232
233    /**
234     * We need to look ahead 1 line every time to see if we need to 'unfold'
235     * the next line.
236     *
237     * If that was not the case, we store it here.
238     *
239     * @var string|null
240     */
241    protected $lineBuffer;
242
    /**
     * The real current line number.
     *
     * @var int
     */
246    protected $lineIndex = 0;
247
248    /**
249     * In the case of unfolded lines, this property holds the line number for
250     * the start of the line.
251     *
252     * @var int
253     */
254    protected $startLine = 0;
255
256    /**
257     * Contains a 'raw' representation of the current line.
258     *
259     * @var string
260     */
261    protected $rawLine;
262
263    /**
264     * Reads a single line from the buffer.
265     *
266     * This method strips any newlines and also takes care of unfolding.
267     *
268     * @throws \Sabre\VObject\EofException
269     *
270     * @return string
271     */
272    protected function readLine()
273    {
274        if (!\is_null($this->lineBuffer)) {
275            $rawLine = $this->lineBuffer;
276            $this->lineBuffer = null;
277        } else {
278            do {
279                $eof = \feof($this->input);
280
281                $rawLine = \fgets($this->input);
282
283                if ($eof || (\feof($this->input) && false === $rawLine)) {
284                    throw new EofException('End of document reached prematurely');
285                }
286                if (false === $rawLine) {
287                    throw new ParseException('Error reading from input stream');
288                }
289                $rawLine = \rtrim($rawLine, "\r\n");
290            } while ('' === $rawLine); // Skipping empty lines
291            ++$this->lineIndex;
292        }
293        $line = $rawLine;
294
295        $this->startLine = $this->lineIndex;
296
297        // Looking ahead for folded lines.
298        while (true) {
299            $nextLine = \rtrim(\fgets($this->input), "\r\n");
300            ++$this->lineIndex;
301            if (!$nextLine) {
302                break;
303            }
304            if ("\t" === $nextLine[0] || ' ' === $nextLine[0]) {
305                $curLine = \substr($nextLine, 1);
306                $line .= $curLine;
307                $rawLine .= "\n ".$curLine;
308            } else {
309                $this->lineBuffer = $nextLine;
310                break;
311            }
312        }
313        $this->rawLine = $rawLine;
314
315        return $line;
316    }
317
318    /**
319     * Reads a property or component from a line.
320     */
321    protected function readProperty($line)
322    {
323        if ($this->options & self::OPTION_FORGIVING) {
324            $propNameToken = 'A-Z0-9\-\._\\/';
325        } else {
326            $propNameToken = 'A-Z0-9\-\.';
327        }
328
329        $paramNameToken = 'A-Z0-9\-';
330        $safeChar = '^";:,';
331        $qSafeChar = '^"';
332
333        $regex = "/
334            ^(?P<name> [$propNameToken]+ ) (?=[;:])        # property name
335            |
336            (?<=:)(?P<propValue> .+)$                      # property value
337            |
338            ;(?P<paramName> [$paramNameToken]+) (?=[=;:])  # parameter name
339            |
340            (=|,)(?P<paramValue>                           # parameter value
341                (?: [$safeChar]*) |
342                \"(?: [$qSafeChar]+)\"
343            ) (?=[;:,])
344            /xi";
345
346        //echo $regex, "\n"; die();
347        preg_match_all($regex, $line, $matches, PREG_SET_ORDER);
348
349        $property = [
350            'name' => null,
351            'parameters' => [],
352            'value' => null,
353        ];
354
355        $lastParam = null;
356
357        /*
358         * Looping through all the tokens.
359         *
360         * Note that we are looping through them in reverse order, because if a
361         * sub-pattern matched, the subsequent named patterns will not show up
362         * in the result.
363         */
364        foreach ($matches as $match) {
365            if (isset($match['paramValue'])) {
366                if ($match['paramValue'] && '"' === $match['paramValue'][0]) {
367                    $value = substr($match['paramValue'], 1, -1);
368                } else {
369                    $value = $match['paramValue'];
370                }
371
372                $value = $this->unescapeParam($value);
373
374                if (is_null($lastParam)) {
375                    throw new ParseException('Invalid Mimedir file. Line starting at '.$this->startLine.' did not follow iCalendar/vCard conventions');
376                }
377                if (is_null($property['parameters'][$lastParam])) {
378                    $property['parameters'][$lastParam] = $value;
379                } elseif (is_array($property['parameters'][$lastParam])) {
380                    $property['parameters'][$lastParam][] = $value;
381                } else {
382                    $property['parameters'][$lastParam] = [
383                        $property['parameters'][$lastParam],
384                        $value,
385                    ];
386                }
387                continue;
388            }
389            if (isset($match['paramName'])) {
390                $lastParam = strtoupper($match['paramName']);
391                if (!isset($property['parameters'][$lastParam])) {
392                    $property['parameters'][$lastParam] = null;
393                }
394                continue;
395            }
396            if (isset($match['propValue'])) {
397                $property['value'] = $match['propValue'];
398                continue;
399            }
400            if (isset($match['name']) && $match['name']) {
401                $property['name'] = strtoupper($match['name']);
402                continue;
403            }
404
405            // @codeCoverageIgnoreStart
406            throw new \LogicException('This code should not be reachable');
407            // @codeCoverageIgnoreEnd
408        }
409
410        if (is_null($property['value'])) {
411            $property['value'] = '';
412        }
413        if (!$property['name']) {
414            if ($this->options & self::OPTION_IGNORE_INVALID_LINES) {
415                return false;
416            }
417            throw new ParseException('Invalid Mimedir file. Line starting at '.$this->startLine.' did not follow iCalendar/vCard conventions');
418        }
419
420        // vCard 2.1 states that parameters may appear without a name, and only
421        // a value. We can deduce the value based on its name.
422        //
423        // Our parser will get those as parameters without a value instead, so
424        // we're filtering these parameters out first.
425        $namedParameters = [];
426        $namelessParameters = [];
427
428        foreach ($property['parameters'] as $name => $value) {
429            if (!is_null($value)) {
430                $namedParameters[$name] = $value;
431            } else {
432                $namelessParameters[] = $name;
433            }
434        }
435
436        $propObj = $this->root->createProperty($property['name'], null, $namedParameters);
437
438        foreach ($namelessParameters as $namelessParameter) {
439            $propObj->add(null, $namelessParameter);
440        }
441
442        if ('QUOTED-PRINTABLE' === strtoupper($propObj['ENCODING'])) {
443            $propObj->setQuotedPrintableValue($this->extractQuotedPrintableValue());
444        } else {
445            $charset = $this->charset;
446            if (Document::VCARD21 === $this->root->getDocumentType() && isset($propObj['CHARSET'])) {
447                // vCard 2.1 allows the character set to be specified per property.
448                $charset = (string) $propObj['CHARSET'];
449            }
450            switch (strtolower($charset)) {
451                case 'utf-8':
452                    break;
453                case 'iso-8859-1':
454                    $property['value'] = utf8_encode($property['value']);
455                    break;
456                case 'windows-1252':
457                    $property['value'] = mb_convert_encoding($property['value'], 'UTF-8', $charset);
458                    break;
459                default:
460                    throw new ParseException('Unsupported CHARSET: '.$propObj['CHARSET']);
461            }
462            $propObj->setRawMimeDirValue($property['value']);
463        }
464
465        return $propObj;
466    }
467
468    /**
469     * Unescapes a property value.
470     *
471     * vCard 2.1 says:
472     *   * Semi-colons must be escaped in some property values, specifically
473     *     ADR, ORG and N.
474     *   * Semi-colons must be escaped in parameter values, because semi-colons
475     *     are also use to separate values.
476     *   * No mention of escaping backslashes with another backslash.
477     *   * newlines are not escaped either, instead QUOTED-PRINTABLE is used to
478     *     span values over more than 1 line.
479     *
480     * vCard 3.0 says:
481     *   * (rfc2425) Backslashes, newlines (\n or \N) and comma's must be
482     *     escaped, all time time.
483     *   * Comma's are used for delimiters in multiple values
484     *   * (rfc2426) Adds to to this that the semi-colon MUST also be escaped,
485     *     as in some properties semi-colon is used for separators.
486     *   * Properties using semi-colons: N, ADR, GEO, ORG
487     *   * Both ADR and N's individual parts may be broken up further with a
488     *     comma.
489     *   * Properties using commas: NICKNAME, CATEGORIES
490     *
491     * vCard 4.0 (rfc6350) says:
492     *   * Commas must be escaped.
493     *   * Semi-colons may be escaped, an unescaped semi-colon _may_ be a
494     *     delimiter, depending on the property.
495     *   * Backslashes must be escaped
496     *   * Newlines must be escaped as either \N or \n.
497     *   * Some compound properties may contain multiple parts themselves, so a
498     *     comma within a semi-colon delimited property may also be unescaped
499     *     to denote multiple parts _within_ the compound property.
500     *   * Text-properties using semi-colons: N, ADR, ORG, CLIENTPIDMAP.
501     *   * Text-properties using commas: NICKNAME, RELATED, CATEGORIES, PID.
502     *
503     * Even though the spec says that commas must always be escaped, the
504     * example for GEO in Section 6.5.2 seems to violate this.
505     *
506     * iCalendar 2.0 (rfc5545) says:
507     *   * Commas or semi-colons may be used as delimiters, depending on the
508     *     property.
509     *   * Commas, semi-colons, backslashes, newline (\N or \n) are always
510     *     escaped, unless they are delimiters.
511     *   * Colons shall not be escaped.
512     *   * Commas can be considered the 'default delimiter' and is described as
513     *     the delimiter in cases where the order of the multiple values is
514     *     insignificant.
515     *   * Semi-colons are described as the delimiter for 'structured values'.
516     *     They are specifically used in Semi-colons are used as a delimiter in
517     *     REQUEST-STATUS, RRULE, GEO and EXRULE. EXRULE is deprecated however.
518     *
519     * Now for the parameters
520     *
521     * If delimiter is not set (null) this method will just return a string.
522     * If it's a comma or a semi-colon the string will be split on those
523     * characters, and always return an array.
524     *
525     * @param string $input
526     * @param string $delimiter
527     *
528     * @return string|string[]
529     */
530    public static function unescapeValue($input, $delimiter = ';')
531    {
532        $regex = '#  (?: (\\\\ (?: \\\\ | N | n | ; | , ) )';
533        if ($delimiter) {
534            $regex .= ' | ('.$delimiter.')';
535        }
536        $regex .= ') #x';
537
538        $matches = preg_split($regex, $input, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
539
540        $resultArray = [];
541        $result = '';
542
543        foreach ($matches as $match) {
544            switch ($match) {
545                case '\\\\':
546                    $result .= '\\';
547                    break;
548                case '\N':
549                case '\n':
550                    $result .= "\n";
551                    break;
552                case '\;':
553                    $result .= ';';
554                    break;
555                case '\,':
556                    $result .= ',';
557                    break;
558                case $delimiter:
559                    $resultArray[] = $result;
560                    $result = '';
561                    break;
562                default:
563                    $result .= $match;
564                    break;
565            }
566        }
567
568        $resultArray[] = $result;
569
570        return $delimiter ? $resultArray : $result;
571    }
572
573    /**
574     * Unescapes a parameter value.
575     *
576     * vCard 2.1:
577     *   * Does not mention a mechanism for this. In addition, double quotes
578     *     are never used to wrap values.
579     *   * This means that parameters can simply not contain colons or
580     *     semi-colons.
581     *
582     * vCard 3.0 (rfc2425, rfc2426):
583     *   * Parameters _may_ be surrounded by double quotes.
584     *   * If this is not the case, semi-colon, colon and comma may simply not
585     *     occur (the comma used for multiple parameter values though).
586     *   * If it is surrounded by double-quotes, it may simply not contain
587     *     double-quotes.
588     *   * This means that a parameter can in no case encode double-quotes, or
589     *     newlines.
590     *
591     * vCard 4.0 (rfc6350)
592     *   * Behavior seems to be identical to vCard 3.0
593     *
594     * iCalendar 2.0 (rfc5545)
595     *   * Behavior seems to be identical to vCard 3.0
596     *
597     * Parameter escaping mechanism (rfc6868) :
598     *   * This rfc describes a new way to escape parameter values.
599     *   * New-line is encoded as ^n
600     *   * ^ is encoded as ^^.
601     *   * " is encoded as ^'
602     *
603     * @param string $input
604     */
605    private function unescapeParam($input)
606    {
607        return
608            preg_replace_callback(
609                '#(\^(\^|n|\'))#',
610                function ($matches) {
611                    switch ($matches[2]) {
612                        case 'n':
613                            return "\n";
614                        case '^':
615                            return '^';
616                        case '\'':
617                            return '"';
618
619                    // @codeCoverageIgnoreStart
620                    }
621                    // @codeCoverageIgnoreEnd
622                },
623                $input
624            );
625    }
626
627    /**
628     * Gets the full quoted printable value.
629     *
630     * We need a special method for this, because newlines have both a meaning
631     * in vCards, and in QuotedPrintable.
632     *
633     * This method does not do any decoding.
634     *
635     * @return string
636     */
637    private function extractQuotedPrintableValue()
638    {
639        // We need to parse the raw line again to get the start of the value.
640        //
641        // We are basically looking for the first colon (:), but we need to
642        // skip over the parameters first, as they may contain one.
643        $regex = '/^
644            (?: [^:])+ # Anything but a colon
645            (?: "[^"]")* # A parameter in double quotes
646            : # start of the value we really care about
647            (.*)$
648        /xs';
649
650        preg_match($regex, $this->rawLine, $matches);
651
652        $value = $matches[1];
653        // Removing the first whitespace character from every line. Kind of
654        // like unfolding, but we keep the newline.
655        $value = str_replace("\n ", "\n", $value);
656
657        // Microsoft products don't always correctly fold lines, they may be
658        // missing a whitespace. So if 'forgiving' is turned on, we will take
659        // those as well.
660        if ($this->options & self::OPTION_FORGIVING) {
661            while ('=' === substr($value, -1) && $this->lineBuffer) {
662                // Reading the line
663                $this->readLine();
664                // Grabbing the raw form
665                $value .= "\n".$this->rawLine;
666            }
667        }
668
669        return $value;
670    }
671}
672