<?php

namespace Sabre\VObject\Parser;

use Sabre\VObject\Component;
use Sabre\VObject\Component\VCalendar;
use Sabre\VObject\Component\VCard;
use Sabre\VObject\Document;
use Sabre\VObject\EofException;
use Sabre\VObject\Node;
use Sabre\VObject\ParseException;

/**
 * MimeDir parser.
 *
 * This class parses iCalendar 2.0 and vCard 2.1, 3.0 and 4.0 files. This
 * parser will return one of the following two objects from the parse method:
 *
 * Sabre\VObject\Component\VCalendar
 * Sabre\VObject\Component\VCard
 *
 * @copyright Copyright (C) fruux GmbH (https://fruux.com/)
 * @author Evert Pot (http://evertpot.com/)
 * @license http://sabre.io/license/ Modified BSD License
 */
class MimeDir extends Parser
{
    /**
     * The input stream.
     *
     * @var resource
     */
    protected $input;

    /**
     * Root component.
     *
     * @var Component
     */
    protected $root;

    /**
     * By default all input will be assumed to be UTF-8.
     *
     * However, both iCalendar and vCard might be encoded using different
     * character sets. The character set is usually set in the mime-type.
     *
     * If this is the case, use setCharset to specify that a different
     * encoding will be used. If this is set, the parser will automatically
     * convert all incoming data to UTF-8.
     *
     * @var string
     */
    protected $charset = 'UTF-8';

    /**
     * The list of character sets we support when decoding.
     *
     * This would be a const expression but for now we need to support PHP 5.5
     */
    protected static $SUPPORTED_CHARSETS = [
        'UTF-8',
        'ISO-8859-1',
        'Windows-1252',
    ];

    /**
     * Parses an iCalendar or vCard file.
     *
     * Pass a stream or a string. If null is passed, the existing buffer is
     * used.
     *
     * @param string|resource|null $input
     * @param int                  $options Parser option bit-field; 0 keeps
     *                                      the options that are already set
     *
     * @return \Sabre\VObject\Document
     */
    public function parse($input = null, $options = 0)
    {
        $this->root = null;

        if (!is_null($input)) {
            $this->setInput($input);
        }

        if (0 !== $options) {
            $this->options = $options;
        }

        $this->parseDocument();

        return $this->root;
    }

    /**
     * By default all input will be assumed to be UTF-8.
     *
     * However, both iCalendar and vCard might be encoded using different
     * character sets. The character set is usually set in the mime-type.
     *
     * If this is the case, use setCharset to specify that a different
     * encoding will be used. If this is set, the parser will automatically
     * convert all incoming data to UTF-8.
     *
     * @param string $charset One of the entries of self::$SUPPORTED_CHARSETS
     *
     * @throws \InvalidArgumentException when the charset is not supported
     */
    public function setCharset($charset)
    {
        if (!in_array($charset, self::$SUPPORTED_CHARSETS, true)) {
            throw new \InvalidArgumentException('Unsupported encoding. (Supported encodings: '.implode(', ', self::$SUPPORTED_CHARSETS).')');
        }
        $this->charset = $charset;
    }

    /**
     * Sets the input buffer. Must be a string or stream.
     *
     * Strings are copied into a php://temp stream so that the rest of the
     * parser only ever has to deal with a stream.
     *
     * @param resource|string $input
     *
     * @throws \InvalidArgumentException when $input is neither a string nor a resource
     */
    public function setInput($input)
    {
        // Resetting the parser
        $this->lineIndex = 0;
        $this->startLine = 0;

        if (is_string($input)) {
            // Converting to a stream.
            $stream = fopen('php://temp', 'r+');
            fwrite($stream, $input);
            rewind($stream);
            $this->input = $stream;
        } elseif (is_resource($input)) {
            $this->input = $input;
        } else {
            throw new \InvalidArgumentException('This parser can only read from strings or streams.');
        }
    }

    /**
     * Parses an entire document.
     *
     * The first line decides the document type (VCALENDAR or VCARD); every
     * following line is handed to parseLine() until the closing END: line is
     * found. The result is stored in $this->root.
     *
     * @throws ParseException when the document does not start with a
     *                        supported BEGIN: line or the END: line does not
     *                        match the root component
     */
    protected function parseDocument()
    {
        $line = $this->readLine();

        // BOM is ZERO WIDTH NO-BREAK SPACE (U+FEFF).
        // It's 0xEF 0xBB 0xBF in UTF-8 hex.
        if (3 <= strlen($line)
            && 0xef === ord($line[0])
            && 0xbb === ord($line[1])
            && 0xbf === ord($line[2])) {
            $line = substr($line, 3);
        }

        switch (strtoupper($line)) {
            case 'BEGIN:VCALENDAR':
                $class = VCalendar::$componentMap['VCALENDAR'];
                break;
            case 'BEGIN:VCARD':
                $class = VCard::$componentMap['VCARD'];
                break;
            default:
                throw new ParseException('This parser only supports VCARD and VCALENDAR files');
        }

        $this->root = new $class([], false);

        while (true) {
            // Reading until we hit END:
            $line = $this->readLine();
            if ('END:' === strtoupper(substr($line, 0, 4))) {
                break;
            }
            $result = $this->parseLine($line);
            if ($result) {
                $this->root->add($result);
            }
        }

        $name = strtoupper(substr($line, 4));
        if ($name !== $this->root->name) {
            throw new ParseException('Invalid MimeDir file. expected: "END:'.$this->root->name.'" got: "END:'.$name.'"');
        }
    }

    /**
     * Parses a line, and if it hits a component, it will also attempt to parse
     * the entire component.
     *
     * @param string $line Unfolded line
     *
     * @throws ParseException when a component of the root's own type is
     *                        nested, or when an END: line does not match its
     *                        BEGIN: line
     *
     * @return Node|false The parsed component or property, or false when the
     *                    line was ignored
     */
    protected function parseLine($line)
    {
        // Start of a new component
        if ('BEGIN:' === strtoupper(substr($line, 0, 6))) {
            // The BEGIN: and END: markers are matched case-insensitively, so
            // the component name is normalised the same way before it is
            // compared with the root name.
            if (strtoupper(substr($line, 6)) === $this->root->name) {
                throw new ParseException('Invalid MimeDir file. Unexpected component: "'.$line.'" in document type '.$this->root->name);
            }
            $component = $this->root->createComponent(substr($line, 6), [], false);

            while (true) {
                // Reading until we hit END:
                $line = $this->readLine();
                if ('END:' === strtoupper(substr($line, 0, 4))) {
                    break;
                }
                $result = $this->parseLine($line);
                if ($result) {
                    $component->add($result);
                }
            }

            $name = strtoupper(substr($line, 4));
            if ($name !== $component->name) {
                throw new ParseException('Invalid MimeDir file. expected: "END:'.$component->name.'" got: "END:'.$name.'"');
            }

            return $component;
        }

        // Property reader
        $property = $this->readProperty($line);
        if (!$property) {
            // Ignored line
            return false;
        }

        return $property;
    }

    /**
     * We need to look ahead 1 line every time to see if we need to 'unfold'
     * the next line.
     *
     * If that was not the case, we store it here.
     *
     * @var string|null
     */
    protected $lineBuffer;

    /**
     * The real current line number.
     *
     * @var int
     */
    protected $lineIndex = 0;

    /**
     * In the case of unfolded lines, this property holds the line number for
     * the start of the line.
     *
     * @var int
     */
    protected $startLine = 0;

    /**
     * Contains a 'raw' representation of the current line.
     *
     * @var string
     */
    protected $rawLine;

    /**
     * Reads a single line from the buffer.
     *
     * This method strips any newlines and also takes care of unfolding.
     *
     * The unfolded line is returned; the folded form (continuation lines
     * joined with "\n ") is kept in $this->rawLine.
     *
     * @throws \Sabre\VObject\EofException when the input ends before a line could be read
     * @throws ParseException              when reading from the stream fails
     *
     * @return string
     */
    protected function readLine()
    {
        if (!\is_null($this->lineBuffer)) {
            // A previous call already read this line while looking ahead.
            $rawLine = $this->lineBuffer;
            $this->lineBuffer = null;
        } else {
            do {
                $eof = \feof($this->input);

                $rawLine = \fgets($this->input);

                if ($eof || (\feof($this->input) && false === $rawLine)) {
                    throw new EofException('End of document reached prematurely');
                }
                if (false === $rawLine) {
                    throw new ParseException('Error reading from input stream');
                }
                $rawLine = \rtrim($rawLine, "\r\n");
            } while ('' === $rawLine); // Skipping empty lines
            ++$this->lineIndex;
        }
        $line = $rawLine;

        $this->startLine = $this->lineIndex;

        // Looking ahead for folded lines.
        while (true) {
            // fgets() returns false at the end of the stream; the cast turns
            // that into an empty string, which ends the look-ahead below.
            $nextLine = \rtrim((string) \fgets($this->input), "\r\n");
            ++$this->lineIndex;
            // An empty line is not buffered. Note that this is a truthiness
            // test, so a line consisting solely of "0" also ends the
            // look-ahead without being buffered.
            if (!$nextLine) {
                break;
            }
            if ("\t" === $nextLine[0] || ' ' === $nextLine[0]) {
                // Continuation line: drop the single folding character.
                $curLine = \substr($nextLine, 1);
                $line .= $curLine;
                $rawLine .= "\n ".$curLine;
            } else {
                $this->lineBuffer = $nextLine;
                break;
            }
        }
        $this->rawLine = $rawLine;

        return $line;
    }

    /**
     * Reads a property or component from a line.
     *
     * @param string $line Unfolded line
     *
     * @throws ParseException when the line is not a valid property line
     *                        (unless OPTION_IGNORE_INVALID_LINES is set) or
     *                        uses an unsupported CHARSET
     *
     * @return \Sabre\VObject\Property|false The property created through the
     *                                       root document, or false when an
     *                                       invalid line was ignored
     */
    protected function readProperty($line)
    {
        if ($this->options & self::OPTION_FORGIVING) {
            $propNameToken = 'A-Z0-9\-\._\\/';
        } else {
            $propNameToken = 'A-Z0-9\-\.';
        }

        $paramNameToken = 'A-Z0-9\-';
        $safeChar = '^";:,';
        $qSafeChar = '^"';

        $regex = "/
            ^(?P<name> [$propNameToken]+ ) (?=[;:])  # property name
            |
            (?<=:)(?P<propValue> .+)$                      # property value
            |
            ;(?P<paramName> [$paramNameToken]+) (?=[=;:])  # parameter name
            |
            (=|,)(?P<paramValue>                           # parameter value
                (?: [$safeChar]*) |
                \"(?: [$qSafeChar]+)\"
            ) (?=[;:,])
            /xi";

        preg_match_all($regex, $line, $matches, PREG_SET_ORDER);

        $property = [
            'name' => null,
            'parameters' => [],
            'value' => null,
        ];

        $lastParam = null;

        /*
         * Looping through all the tokens.
         *
         * The named groups are tested in the reverse of the order in which
         * they appear in the pattern: groups that come after the one that
         * matched are not present in the result, so the last group that is
         * set identifies the alternative that matched.
         */
        foreach ($matches as $match) {
            if (isset($match['paramValue'])) {
                if ($match['paramValue'] && '"' === $match['paramValue'][0]) {
                    // Stripping the surrounding double quotes.
                    $value = substr($match['paramValue'], 1, -1);
                } else {
                    $value = $match['paramValue'];
                }

                $value = $this->unescapeParam($value);

                if (is_null($lastParam)) {
                    throw new ParseException('Invalid Mimedir file. Line starting at '.$this->startLine.' did not follow iCalendar/vCard conventions');
                }
                if (is_null($property['parameters'][$lastParam])) {
                    $property['parameters'][$lastParam] = $value;
                } elseif (is_array($property['parameters'][$lastParam])) {
                    $property['parameters'][$lastParam][] = $value;
                } else {
                    // Second value for this parameter: promote to a list.
                    $property['parameters'][$lastParam] = [
                        $property['parameters'][$lastParam],
                        $value,
                    ];
                }
                continue;
            }
            if (isset($match['paramName'])) {
                $lastParam = strtoupper($match['paramName']);
                if (!isset($property['parameters'][$lastParam])) {
                    $property['parameters'][$lastParam] = null;
                }
                continue;
            }
            if (isset($match['propValue'])) {
                $property['value'] = $match['propValue'];
                continue;
            }
            // Compared against '' rather than tested for truthiness: a name
            // of "0" is a matched token too and must not fall through to the
            // exception below.
            if (isset($match['name']) && '' !== $match['name']) {
                $property['name'] = strtoupper($match['name']);
                continue;
            }

            // @codeCoverageIgnoreStart
            throw new \LogicException('This code should not be reachable');
            // @codeCoverageIgnoreEnd
        }

        if (is_null($property['value'])) {
            $property['value'] = '';
        }
        // Truthiness test: a missing name, and also the name "0", is treated
        // as a line that does not follow the conventions.
        if (!$property['name']) {
            if ($this->options & self::OPTION_IGNORE_INVALID_LINES) {
                return false;
            }
            throw new ParseException('Invalid Mimedir file. Line starting at '.$this->startLine.' did not follow iCalendar/vCard conventions');
        }

        // vCard 2.1 states that parameters may appear without a name, and only
        // a value. We can deduce the value based on its name.
        //
        // Our parser will get those as parameters without a value instead, so
        // we're filtering these parameters out first.
        $namedParameters = [];
        $namelessParameters = [];

        foreach ($property['parameters'] as $name => $value) {
            if (!is_null($value)) {
                $namedParameters[$name] = $value;
            } else {
                $namelessParameters[] = $name;
            }
        }

        $propObj = $this->root->createProperty($property['name'], null, $namedParameters);

        foreach ($namelessParameters as $namelessParameter) {
            $propObj->add(null, $namelessParameter);
        }

        // The isset() guard avoids handing null to strtoupper() for the
        // (common) properties that carry no ENCODING parameter.
        if (isset($propObj['ENCODING']) && 'QUOTED-PRINTABLE' === strtoupper($propObj['ENCODING'])) {
            $propObj->setQuotedPrintableValue($this->extractQuotedPrintableValue());
        } else {
            $charset = $this->charset;
            if (Document::VCARD21 === $this->root->getDocumentType() && isset($propObj['CHARSET'])) {
                // vCard 2.1 allows the character set to be specified per property.
                $charset = (string) $propObj['CHARSET'];
            }
            switch (strtolower($charset)) {
                case 'utf-8':
                    break;
                case 'iso-8859-1':
                    // utf8_encode() is deprecated as of PHP 8.2;
                    // mb_convert_encoding() performs the same conversion.
                    $property['value'] = mb_convert_encoding($property['value'], 'UTF-8', 'ISO-8859-1');
                    break;
                case 'windows-1252':
                    $property['value'] = mb_convert_encoding($property['value'], 'UTF-8', $charset);
                    break;
                default:
                    throw new ParseException('Unsupported CHARSET: '.$charset);
            }
            $propObj->setRawMimeDirValue($property['value']);
        }

        return $propObj;
    }

    /**
     * Unescapes a property value.
     *
     * vCard 2.1 says:
     *   * Semi-colons must be escaped in some property values, specifically
     *     ADR, ORG and N.
     *   * Semi-colons must be escaped in parameter values, because semi-colons
     *     are also used to separate values.
     *   * No mention of escaping backslashes with another backslash.
     *   * newlines are not escaped either, instead QUOTED-PRINTABLE is used to
     *     span values over more than 1 line.
     *
     * vCard 3.0 says:
     *   * (rfc2425) Backslashes, newlines (\n or \N) and comma's must be
     *     escaped, all the time.
     *   * Comma's are used for delimiters in multiple values
     *   * (rfc2426) Adds to this that the semi-colon MUST also be escaped,
     *     as in some properties semi-colon is used for separators.
     *   * Properties using semi-colons: N, ADR, GEO, ORG
     *   * Both ADR and N's individual parts may be broken up further with a
     *     comma.
     *   * Properties using commas: NICKNAME, CATEGORIES
     *
     * vCard 4.0 (rfc6350) says:
     *   * Commas must be escaped.
     *   * Semi-colons may be escaped, an unescaped semi-colon _may_ be a
     *     delimiter, depending on the property.
     *   * Backslashes must be escaped
     *   * Newlines must be escaped as either \N or \n.
     *   * Some compound properties may contain multiple parts themselves, so a
     *     comma within a semi-colon delimited property may also be unescaped
     *     to denote multiple parts _within_ the compound property.
     *   * Text-properties using semi-colons: N, ADR, ORG, CLIENTPIDMAP.
     *   * Text-properties using commas: NICKNAME, RELATED, CATEGORIES, PID.
     *
     * Even though the spec says that commas must always be escaped, the
     * example for GEO in Section 6.5.2 seems to violate this.
     *
     * iCalendar 2.0 (rfc5545) says:
     *   * Commas or semi-colons may be used as delimiters, depending on the
     *     property.
     *   * Commas, semi-colons, backslashes, newline (\N or \n) are always
     *     escaped, unless they are delimiters.
     *   * Colons shall not be escaped.
     *   * Commas can be considered the 'default delimiter' and is described as
     *     the delimiter in cases where the order of the multiple values is
     *     insignificant.
     *   * Semi-colons are described as the delimiter for 'structured values'.
     *     They are specifically used as a delimiter in REQUEST-STATUS, RRULE,
     *     GEO and EXRULE. EXRULE is deprecated however.
     *
     * If delimiter is not set (null) this method will just return a string.
     * If it's a comma or a semi-colon the string will be split on those
     * characters, and always return an array.
     *
     * @param string      $input
     * @param string|null $delimiter
     *
     * @return string|string[]
     */
    public static function unescapeValue($input, $delimiter = ';')
    {
        // Group 1 captures an escape sequence, group 2 (optional) an
        // unescaped delimiter. The delimiter is quoted so that it is always
        // matched literally.
        $regex = '#(?:(\\\\(?:\\\\|N|n|;|,))';
        if ($delimiter) {
            $regex .= '|('.preg_quote($delimiter, '#').')';
        }
        $regex .= ')#';

        $tokens = preg_split($regex, $input, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

        // Escape sequence => the text it stands for.
        $escapes = [
            '\\\\' => '\\',
            '\N' => "\n",
            '\n' => "\n",
            '\;' => ';',
            '\,' => ',',
        ];

        $resultArray = [];
        $result = '';

        foreach ($tokens as $token) {
            if (isset($escapes[$token])) {
                $result .= $escapes[$token];
            } elseif ($delimiter && $token === $delimiter) {
                // Unescaped delimiter: the current part is complete.
                $resultArray[] = $result;
                $result = '';
            } else {
                $result .= $token;
            }
        }

        $resultArray[] = $result;

        return $delimiter ? $resultArray : $result;
    }

    /**
     * Unescapes a parameter value.
     *
     * vCard 2.1:
     *   * Does not mention a mechanism for this. In addition, double quotes
     *     are never used to wrap values.
     *   * This means that parameters can simply not contain colons or
     *     semi-colons.
     *
     * vCard 3.0 (rfc2425, rfc2426):
     *   * Parameters _may_ be surrounded by double quotes.
     *   * If this is not the case, semi-colon, colon and comma may simply not
     *     occur (the comma used for multiple parameter values though).
     *   * If it is surrounded by double-quotes, it may simply not contain
     *     double-quotes.
     *   * This means that a parameter can in no case encode double-quotes, or
     *     newlines.
     *
     * vCard 4.0 (rfc6350)
     *   * Behavior seems to be identical to vCard 3.0
     *
     * iCalendar 2.0 (rfc5545)
     *   * Behavior seems to be identical to vCard 3.0
     *
     * Parameter escaping mechanism (rfc6868) :
     *   * This rfc describes a new way to escape parameter values.
     *   * New-line is encoded as ^n
     *   * ^ is encoded as ^^.
     *   * " is encoded as ^'
     *
     * @param string $input
     *
     * @return string
     */
    private function unescapeParam($input)
    {
        // strtr() with an array scans the input once from left to right and
        // never re-scans replaced text, so "^^n" becomes "^n" and not a
        // newline.
        return strtr($input, [
            '^n' => "\n",
            '^^' => '^',
            '^\'' => '"',
        ]);
    }

    /**
     * Gets the full quoted printable value.
     *
     * We need a special method for this, because newlines have both a meaning
     * in vCards, and in QuotedPrintable.
     *
     * This method does not do any decoding.
     *
     * @return string
     */
    private function extractQuotedPrintableValue()
    {
        // We need to parse the raw line again to get the start of the value.
        //
        // We are basically looking for the first colon (:), but we need to
        // skip over the parameters first, as they may contain one inside a
        // double-quoted value. The quoted alternative mirrors the way
        // readProperty() recognises quoted parameter values.
        $regex = '/^
            (?:
                [^:"]++                         # Anything but a colon or a quote
              | (?<=[=,]) "[^"]++" (?=[;:,])    # A parameter value in double quotes
            )++
            :                                   # start of the value we really care about
            (.*)$
            /xs';

        if (1 === preg_match($regex, $this->rawLine, $matches)) {
            $value = $matches[1];
        } else {
            // The line has a stray double quote in front of the value. Fall
            // back to everything after the first colon.
            $colonPos = strpos($this->rawLine, ':');
            $value = false === $colonPos ? '' : (string) substr($this->rawLine, $colonPos + 1);
        }

        // Removing the first whitespace character from every line. Kind of
        // like unfolding, but we keep the newline.
        $value = str_replace("\n ", "\n", $value);

        // Microsoft products don't always correctly fold lines, they may be
        // missing a whitespace. So if 'forgiving' is turned on, we will take
        // those as well.
        if ($this->options & self::OPTION_FORGIVING) {
            while ('=' === substr($value, -1) && $this->lineBuffer) {
                // Reading the line
                $this->readLine();
                // Grabbing the raw form
                $value .= "\n".$this->rawLine;
            }
        }

        return $value;
    }
}