1<?php
2
3namespace Sabre\VObject\Parser;
4
5use Sabre\VObject\Component;
6use Sabre\VObject\Component\VCalendar;
7use Sabre\VObject\Component\VCard;
8use Sabre\VObject\Document;
9use Sabre\VObject\EofException;
10use Sabre\VObject\ParseException;
11
/**
 * MimeDir parser.
 *
 * This class parses iCalendar 2.0 and vCard 2.1, 3.0 and 4.0 files. This
 * parser will return one of the following two objects from the parse method:
 *
 * Sabre\VObject\Component\VCalendar
 * Sabre\VObject\Component\VCard
 *
 * @copyright Copyright (C) fruux GmbH (https://fruux.com/)
 * @author Evert Pot (http://evertpot.com/)
 * @license http://sabre.io/license/ Modified BSD License
 */
class MimeDir extends Parser {

    /**
     * The input stream.
     *
     * @var resource
     */
    protected $input;

    /**
     * Root component.
     *
     * @var Component
     */
    protected $root;

    /**
     * By default all input will be assumed to be UTF-8.
     *
     * However, both iCalendar and vCard might be encoded using different
     * character sets. The character set is usually set in the mime-type.
     *
     * If this is the case, use setCharset to specify that a different
     * encoding will be used. If this is set, the parser will automatically
     * convert all incoming data to UTF-8.
     *
     * @var string
     */
    protected $charset = 'UTF-8';

    /**
     * The list of character sets we support when decoding.
     *
     * This would be a const expression but for now we need to support PHP 5.5
     *
     * @var string[]
     */
    protected static $SUPPORTED_CHARSETS = [
        'UTF-8',
        'ISO-8859-1',
        'Windows-1252',
    ];

    /**
     * Parses an iCalendar or vCard file.
     *
     * Pass a stream or a string. If null is passed, the existing buffer is
     * used.
     *
     * A non-zero $options value replaces the options currently set on the
     * parser; 0 leaves them untouched.
     *
     * @param string|resource|null $input
     * @param int $options
     *
     * @throws ParseException
     * @throws EofException
     *
     * @return Document
     */
    public function parse($input = null, $options = 0) {

        $this->root = null;

        if (!is_null($input)) {
            $this->setInput($input);
        }

        if (0 !== $options) {
            $this->options = $options;
        }

        $this->parseDocument();

        return $this->root;

    }

    /**
     * By default all input will be assumed to be UTF-8.
     *
     * However, both iCalendar and vCard might be encoded using different
     * character sets. The character set is usually set in the mime-type.
     *
     * If this is the case, use this method to specify that a different
     * encoding will be used. If this is set, the parser will automatically
     * convert all incoming data to UTF-8.
     *
     * @param string $charset One of the entries of self::$SUPPORTED_CHARSETS.
     *
     * @throws \InvalidArgumentException If the character set is not supported.
     *
     * @return void
     */
    public function setCharset($charset) {

        // Strict comparison: with loose comparison values such as true (and
        // 0 on PHP < 8) compare equal to the charset names and would slip
        // through the validation.
        if (!in_array($charset, self::$SUPPORTED_CHARSETS, true)) {
            throw new \InvalidArgumentException('Unsupported encoding. (Supported encodings: ' . implode(', ', self::$SUPPORTED_CHARSETS) . ')');
        }
        $this->charset = $charset;

    }

    /**
     * Sets the input buffer. Must be a string or stream.
     *
     * Line counters are always reset. The one-line look-ahead buffer is
     * dropped whenever the underlying stream changes, so that a line read
     * ahead from a previous input can not leak into the new one. When the
     * very same stream resource is passed again the look-ahead line is kept,
     * because it has already been consumed from that stream.
     *
     * @param resource|string $input
     *
     * @throws \InvalidArgumentException If $input is neither a string nor a
     *                                   resource.
     *
     * @return void
     */
    public function setInput($input) {

        // Resetting the parser
        $this->lineIndex = 0;
        $this->startLine = 0;

        if (is_string($input)) {
            // Converting to a stream.
            $stream = fopen('php://temp', 'r+');
            fwrite($stream, $input);
            rewind($stream);
        } elseif (is_resource($input)) {
            $stream = $input;
        } else {
            throw new \InvalidArgumentException('This parser can only read from strings or streams.');
        }

        if ($stream !== $this->input) {
            // The buffered line belongs to the previous stream.
            $this->lineBuffer = null;
        }
        $this->input = $stream;

    }

    /**
     * Parses an entire document.
     *
     * Reads the opening BEGIN line, instantiates the matching root object
     * and then adds every parsed line to it until the closing END line is
     * reached. The result is stored in $this->root.
     *
     * @throws ParseException If the document does not start with
     *                        BEGIN:VCALENDAR or BEGIN:VCARD, or if the END
     *                        line does not match the root component.
     * @throws EofException   If the input ends before the document does.
     *
     * @return void
     */
    protected function parseDocument() {

        $line = $this->readLine();

        // BOM is ZERO WIDTH NO-BREAK SPACE (U+FEFF).
        // It's 0xEF 0xBB 0xBF in UTF-8 hex.
        if (3 <= strlen($line)
            && ord($line[0]) === 0xef
            && ord($line[1]) === 0xbb
            && ord($line[2]) === 0xbf) {
            $line = substr($line, 3);
        }

        switch (strtoupper($line)) {
            case 'BEGIN:VCALENDAR' :
                $class = VCalendar::$componentMap['VCALENDAR'];
                break;
            case 'BEGIN:VCARD' :
                $class = VCard::$componentMap['VCARD'];
                break;
            default :
                throw new ParseException('This parser only supports VCARD and VCALENDAR files');
        }

        $this->root = new $class([], false);

        while (true) {

            // Reading until we hit END:
            $line = $this->readLine();
            if (strtoupper(substr($line, 0, 4)) === 'END:') {
                break;
            }
            $result = $this->parseLine($line);
            if ($result) {
                $this->root->add($result);
            }

        }

        // $line now holds the END line; its name must match the root.
        $name = strtoupper(substr($line, 4));
        if ($name !== $this->root->name) {
            throw new ParseException('Invalid MimeDir file. expected: "END:' . $this->root->name . '" got: "END:' . $name . '"');
        }

    }

    /**
     * Parses a line, and if it hits a component, it will also attempt to parse
     * the entire component.
     *
     * Nested components are handled by recursion: every line up to the
     * matching END line is parsed and added to the new component.
     *
     * @param string $line Unfolded line
     *
     * @throws ParseException If a component is closed by a non-matching END
     *                        line, or if a property line is invalid.
     * @throws EofException   If the input ends inside a component.
     *
     * @return mixed The component or property that was read, or false if the
     *               line was ignored.
     */
    protected function parseLine($line) {

        // Start of a new component
        if (strtoupper(substr($line, 0, 6)) === 'BEGIN:') {

            $component = $this->root->createComponent(substr($line, 6), [], false);

            while (true) {

                // Reading until we hit END:
                $line = $this->readLine();
                if (strtoupper(substr($line, 0, 4)) === 'END:') {
                    break;
                }
                $result = $this->parseLine($line);
                if ($result) {
                    $component->add($result);
                }

            }

            $name = strtoupper(substr($line, 4));
            if ($name !== $component->name) {
                throw new ParseException('Invalid MimeDir file. expected: "END:' . $component->name . '" got: "END:' . $name . '"');
            }

            return $component;

        }

        // Property reader
        $property = $this->readProperty($line);
        if (!$property) {
            // Ignored line
            return false;
        }
        return $property;

    }

    /**
     * We need to look ahead 1 line every time to see if we need to 'unfold'
     * the next line.
     *
     * If that was not the case, we store it here.
     *
     * @var null|string
     */
    protected $lineBuffer;

    /**
     * The real current line number.
     *
     * @var int
     */
    protected $lineIndex = 0;

    /**
     * In the case of unfolded lines, this property holds the line number for
     * the start of the line.
     *
     * @var int
     */
    protected $startLine = 0;

    /**
     * Contains a 'raw' representation of the current line.
     *
     * Folded continuation lines are kept in it, joined by "\n ".
     *
     * @var string
     */
    protected $rawLine;

    /**
     * Reads a single line from the buffer.
     *
     * This method strips any newlines and also takes care of unfolding.
     *
     * As a side effect $this->rawLine, $this->startLine, $this->lineIndex and
     * $this->lineBuffer are updated.
     *
     * @throws \Sabre\VObject\EofException
     * @throws \Sabre\VObject\ParseException If reading from the stream fails.
     *
     * @return string
     */
    protected function readLine() {

        if (!is_null($this->lineBuffer)) {
            // The previous call already fetched this line while looking ahead.
            $rawLine = $this->lineBuffer;
            $this->lineBuffer = null;
        } else {
            do {
                $eof = feof($this->input);

                $rawLine = fgets($this->input);

                if ($eof || (feof($this->input) && $rawLine === false)) {
                    throw new EofException('End of document reached prematurely');
                }
                if ($rawLine === false) {
                    throw new ParseException('Error reading from input stream');
                }
                $rawLine = rtrim($rawLine, "\r\n");
            } while ($rawLine === ''); // Skipping empty lines
            $this->lineIndex++;
        }
        $line = $rawLine;

        $this->startLine = $this->lineIndex;

        // Looking ahead for folded lines.
        while (true) {

            $nextLine = fgets($this->input);
            $this->lineIndex++;
            if (false === $nextLine) {
                // End of the stream (or read failure): nothing to unfold.
                break;
            }
            $nextLine = rtrim($nextLine, "\r\n");
            // Explicit comparison on purpose: a truthiness check would also
            // treat a line consisting of just "0" as empty and drop it.
            if ('' === $nextLine) {
                break;
            }
            if ($nextLine[0] === "\t" || $nextLine[0] === " ") {
                // Continuation of a folded line: strip the leading whitespace.
                $line .= substr($nextLine, 1);
                $rawLine .= "\n " . substr($nextLine, 1);
            } else {
                // Start of the next logical line; keep it for the next call.
                $this->lineBuffer = $nextLine;
                break;
            }

        }
        $this->rawLine = $rawLine;
        return $line;

    }

    /**
     * Reads a property from a line.
     *
     * The line is tokenized into a property name, parameters and a value.
     * The property object is created through the root document and its value
     * is converted to UTF-8 when a different character set is in effect.
     *
     * @param string $line Unfolded line
     *
     * @throws ParseException If the line is not a valid property line (and
     *                        OPTION_IGNORE_INVALID_LINES is not set), or if
     *                        it uses an unsupported character set.
     *
     * @return mixed The property object returned by the root document's
     *               createProperty(), or false if the line was ignored.
     */
    protected function readProperty($line) {

        if ($this->options & self::OPTION_FORGIVING) {
            $propNameToken = 'A-Z0-9\-\._\\/';
        } else {
            $propNameToken = 'A-Z0-9\-\.';
        }

        $paramNameToken = 'A-Z0-9\-';
        $safeChar = '^";:,';
        $qSafeChar = '^"';

        $regex = "/
            ^(?P<name> [$propNameToken]+ ) (?=[;:])        # property name
            |
            (?<=:)(?P<propValue> .+)$                      # property value
            |
            ;(?P<paramName> [$paramNameToken]+) (?=[=;:])  # parameter name
            |
            (=|,)(?P<paramValue>                           # parameter value
                (?: [$safeChar]*) |
                \"(?: [$qSafeChar]+)\"
            ) (?=[;:,])
            /xi";

        preg_match_all($regex, $line, $matches,  PREG_SET_ORDER);

        $property = [
            'name'       => null,
            'parameters' => [],
            'value'      => null
        ];

        $lastParam = null;

        /**
         * Looping through all the tokens.
         *
         * Note that the named groups are tested from the last one in the
         * pattern to the first one: if a sub-pattern matched, the named
         * groups after it do not show up in the result, while the ones
         * before it show up as empty strings.
         */
        foreach ($matches as $match) {

            if (isset($match['paramValue'])) {
                $value = $match['paramValue'];
                if ('' !== $value && $value[0] === '"') {
                    // Quoted value: strip the surrounding double quotes.
                    $value = substr($value, 1, -1);
                }

                $value = $this->unescapeParam($value);

                if (is_null($lastParam)) {
                    throw new ParseException('Invalid Mimedir file. Line starting at ' . $this->startLine . ' did not follow iCalendar/vCard conventions');
                }
                if (is_null($property['parameters'][$lastParam])) {
                    $property['parameters'][$lastParam] = $value;
                } elseif (is_array($property['parameters'][$lastParam])) {
                    $property['parameters'][$lastParam][] = $value;
                } else {
                    // Second value for this parameter: turn it into a list.
                    $property['parameters'][$lastParam] = [
                        $property['parameters'][$lastParam],
                        $value
                    ];
                }
                continue;
            }
            if (isset($match['paramName'])) {
                $lastParam = strtoupper($match['paramName']);
                if (!isset($property['parameters'][$lastParam])) {
                    $property['parameters'][$lastParam] = null;
                }
                continue;
            }
            if (isset($match['propValue'])) {
                $property['value'] = $match['propValue'];
                continue;
            }
            // Compared against '' rather than tested for truthiness, so that
            // a property named "0" does not fall through to the exception
            // below.
            if (isset($match['name']) && '' !== $match['name']) {
                $property['name'] = strtoupper($match['name']);
                continue;
            }

            // @codeCoverageIgnoreStart
            throw new \LogicException('This code should not be reachable');
            // @codeCoverageIgnoreEnd

        }

        if (is_null($property['value'])) {
            $property['value'] = '';
        }
        if (is_null($property['name'])) {
            if ($this->options & self::OPTION_IGNORE_INVALID_LINES) {
                return false;
            }
            throw new ParseException('Invalid Mimedir file. Line starting at ' . $this->startLine . ' did not follow iCalendar/vCard conventions');
        }

        // vCard 2.1 states that parameters may appear without a name, and only
        // a value. We can deduce the value based on its name.
        //
        // Our parser will get those as parameters without a value instead, so
        // we're filtering these parameters out first.
        $namedParameters = [];
        $namelessParameters = [];

        foreach ($property['parameters'] as $name => $value) {
            if (!is_null($value)) {
                $namedParameters[$name] = $value;
            } else {
                $namelessParameters[] = $name;
            }
        }

        $propObj = $this->root->createProperty($property['name'], null, $namedParameters);

        foreach ($namelessParameters as $namelessParameter) {
            $propObj->add(null, $namelessParameter);
        }

        // isset() first, so that null is never handed to strtoupper() when
        // the property has no ENCODING parameter.
        if (isset($propObj['ENCODING']) && strtoupper((string)$propObj['ENCODING']) === 'QUOTED-PRINTABLE') {
            $propObj->setQuotedPrintableValue($this->extractQuotedPrintableValue());
        } else {
            $charset = $this->charset;
            if ($this->root->getDocumentType() === Document::VCARD21 && isset($propObj['CHARSET'])) {
                // vCard 2.1 allows the character set to be specified per property.
                $charset = (string)$propObj['CHARSET'];
            }
            // Character set names are matched case-insensitively, files in
            // the wild contain e.g. CHARSET=utf-8.
            switch (strtolower($charset)) {
                case 'utf-8' :
                    break;
                case 'iso-8859-1' :
                    $property['value'] = mb_convert_encoding($property['value'], 'UTF-8', 'ISO-8859-1');
                    break;
                case 'windows-1252' :
                    $property['value'] = mb_convert_encoding($property['value'], 'UTF-8', 'Windows-1252');
                    break;
                default :
                    // Report the charset that was actually evaluated; it may
                    // originate from the parser default rather than from a
                    // CHARSET parameter.
                    throw new ParseException('Unsupported CHARSET: ' . $charset);
            }
            $propObj->setRawMimeDirValue($property['value']);
        }

        return $propObj;

    }

    /**
     * Unescapes a property value.
     *
     * vCard 2.1 says:
     *   * Semi-colons must be escaped in some property values, specifically
     *     ADR, ORG and N.
     *   * Semi-colons must be escaped in parameter values, because semi-colons
     *     are also used to separate values.
     *   * No mention of escaping backslashes with another backslash.
     *   * newlines are not escaped either, instead QUOTED-PRINTABLE is used to
     *     span values over more than 1 line.
     *
     * vCard 3.0 says:
     *   * (rfc2425) Backslashes, newlines (\n or \N) and commas must be
     *     escaped, all the time.
     *   * Commas are used as delimiters in multiple values
     *   * (rfc2426) Adds to this that the semi-colon MUST also be escaped,
     *     as in some properties semi-colon is used for separators.
     *   * Properties using semi-colons: N, ADR, GEO, ORG
     *   * Both ADR and N's individual parts may be broken up further with a
     *     comma.
     *   * Properties using commas: NICKNAME, CATEGORIES
     *
     * vCard 4.0 (rfc6350) says:
     *   * Commas must be escaped.
     *   * Semi-colons may be escaped, an unescaped semi-colon _may_ be a
     *     delimiter, depending on the property.
     *   * Backslashes must be escaped
     *   * Newlines must be escaped as either \N or \n.
     *   * Some compound properties may contain multiple parts themselves, so a
     *     comma within a semi-colon delimited property may also be unescaped
     *     to denote multiple parts _within_ the compound property.
     *   * Text-properties using semi-colons: N, ADR, ORG, CLIENTPIDMAP.
     *   * Text-properties using commas: NICKNAME, RELATED, CATEGORIES, PID.
     *
     * Even though the spec says that commas must always be escaped, the
     * example for GEO in Section 6.5.2 seems to violate this.
     *
     * iCalendar 2.0 (rfc5545) says:
     *   * Commas or semi-colons may be used as delimiters, depending on the
     *     property.
     *   * Commas, semi-colons, backslashes, newline (\N or \n) are always
     *     escaped, unless they are delimiters.
     *   * Colons shall not be escaped.
     *   * Commas can be considered the 'default delimiter' and is described as
     *     the delimiter in cases where the order of the multiple values is
     *     insignificant.
     *   * Semi-colons are described as the delimiter for 'structured values'.
     *     They are specifically used as a delimiter in REQUEST-STATUS, RRULE,
     *     GEO and EXRULE. EXRULE is deprecated however.
     *
     * Now for the parameters
     *
     * If delimiter is not set (null) this method will just return a string.
     * If it's a comma or a semi-colon the string will be split on those
     * characters, and always return an array.
     *
     * @param string $input
     * @param string $delimiter
     *
     * @return string|string[]
     */
    public static function unescapeValue($input, $delimiter = ';') {

        $regex = '#  (?: (\\\\ (?: \\\\ | N | n | ; | , ) )';
        if ($delimiter) {
            // Quoted, so that a delimiter is always matched literally and
            // can not alter the pattern.
            $regex .= ' | (' . preg_quote($delimiter, '#') . ')';
        }
        $regex .= ') #x';

        $matches = preg_split($regex, $input, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

        $resultArray = [];
        $result = '';

        foreach ($matches as $match) {

            switch ($match) {
                case '\\\\' :
                    $result .= '\\';
                    break;
                case '\N' :
                case '\n' :
                    $result .= "\n";
                    break;
                case '\;' :
                    $result .= ';';
                    break;
                case '\,' :
                    $result .= ',';
                    break;
                case $delimiter :
                    // Unescaped delimiter: the current part is complete.
                    $resultArray[] = $result;
                    $result = '';
                    break;
                default :
                    $result .= $match;
                    break;

            }

        }

        $resultArray[] = $result;
        return $delimiter ? $resultArray : $result;

    }

    /**
     * Unescapes a parameter value.
     *
     * vCard 2.1:
     *   * Does not mention a mechanism for this. In addition, double quotes
     *     are never used to wrap values.
     *   * This means that parameters can simply not contain colons or
     *     semi-colons.
     *
     * vCard 3.0 (rfc2425, rfc2426):
     *   * Parameters _may_ be surrounded by double quotes.
     *   * If this is not the case, semi-colon, colon and comma may simply not
     *     occur (the comma used for multiple parameter values though).
     *   * If it is surrounded by double-quotes, it may simply not contain
     *     double-quotes.
     *   * This means that a parameter can in no case encode double-quotes, or
     *     newlines.
     *
     * vCard 4.0 (rfc6350)
     *   * Behavior seems to be identical to vCard 3.0
     *
     * iCalendar 2.0 (rfc5545)
     *   * Behavior seems to be identical to vCard 3.0
     *
     * Parameter escaping mechanism (rfc6868) :
     *   * This rfc describes a new way to escape parameter values.
     *   * New-line is encoded as ^n
     *   * ^ is encoded as ^^.
     *   * " is encoded as ^'
     *
     * @param string $input
     *
     * @return string
     */
    private function unescapeParam($input) {

        return
            preg_replace_callback(
                '#(\^(\^|n|\'))#',
                function($matches) {
                    switch ($matches[2]) {
                        case 'n' :
                            return "\n";
                        case '^' :
                            return '^';
                        case '\'' :
                            return '"';

                    // @codeCoverageIgnoreStart
                    }
                    // @codeCoverageIgnoreEnd
                },
                $input
            );
    }

    /**
     * Gets the full quoted printable value.
     *
     * We need a special method for this, because newlines have both a meaning
     * in vCards, and in QuotedPrintable.
     *
     * The value is taken from $this->rawLine, i.e. from the line that was
     * read last. With OPTION_FORGIVING set, additional lines are read from
     * the input for as long as the value ends with a soft line break (=).
     *
     * This method does not do any decoding.
     *
     * @throws EofException If, in forgiving mode, the input ends while the
     *                      value still ends with a soft line break.
     *
     * @return string
     */
    private function extractQuotedPrintableValue() {

        // We need to parse the raw line again to get the start of the value.
        //
        // We are basically looking for the first colon (:), but we need to
        // skip over the parameters first, as they may contain one.
        //
        // Only a well-formed quoted parameter value (directly after = or ,
        // and directly followed by ; : or ,) is skipped as a unit. Any other
        // double quote is treated as an ordinary character, which means that
        // for such lines the value simply starts after the first colon.
        $regex = '/^
            (?:
                [^:"]++                        # anything but a colon or a double quote
                |
                (?<=[=,]) "[^"]*+" (?=[;:,])   # a complete double-quoted parameter value
                |
                "                              # a stray double quote, taken literally
            )++
            :                                  # start of the value we really care about
            (.*)$
        /xs';

        preg_match($regex, $this->rawLine, $matches);

        // A line without any colon has no value part.
        $value = isset($matches[1]) ? $matches[1] : '';
        // Removing the first whitespace character from every line. Kind of
        // like unfolding, but we keep the newline.
        $value = str_replace("\n ", "\n", $value);

        // Microsoft products don't always correctly fold lines, they may be
        // missing a whitespace. So if 'forgiving' is turned on, we will take
        // those as well.
        if ($this->options & self::OPTION_FORGIVING) {
            while (substr($value, -1) === '=') {
                // Reading the line
                $this->readLine();
                // Grabbing the raw form
                $value .= "\n" . $this->rawLine;
            }
        }

        return $value;

    }

}
697