<?php

namespace Sabre\VObject\Parser;

use Sabre\VObject\Component;
use Sabre\VObject\Component\VCalendar;
use Sabre\VObject\Component\VCard;
use Sabre\VObject\Document;
use Sabre\VObject\EofException;
use Sabre\VObject\ParseException;

/**
 * MimeDir parser.
 *
 * This class parses iCalendar 2.0 and vCard 2.1, 3.0 and 4.0 files. The
 * parse method returns one of the following two objects:
 *
 * Sabre\VObject\Component\VCalendar
 * Sabre\VObject\Component\VCard
 *
 * @copyright Copyright (C) fruux GmbH (https://fruux.com/)
 * @author Evert Pot (http://evertpot.com/)
 * @license http://sabre.io/license/ Modified BSD License
 */
class MimeDir extends Parser
{
    /**
     * The input stream.
     *
     * @var resource
     */
    protected $input;

    /**
     * Root component.
     *
     * @var Component
     */
    protected $root;

    /**
     * By default all input will be assumed to be UTF-8.
     *
     * However, both iCalendar and vCard might be encoded using different
     * character sets. The character set is usually set in the mime-type.
     *
     * If this is the case, use setCharset to specify that a different
     * encoding will be used. If this is set, the parser will automatically
     * convert all incoming data to UTF-8.
     *
     * @var string
     */
    protected $charset = 'UTF-8';

    /**
     * The list of character sets we support when decoding.
     *
     * This would be a const expression but for now we need to support PHP 5.5
     *
     * @var string[]
     */
    protected static $SUPPORTED_CHARSETS = [
        'UTF-8',
        'ISO-8859-1',
        'Windows-1252',
    ];

    /**
     * Parses an iCalendar or vCard file.
     *
     * Pass a stream or a string. If null is passed, the existing buffer is
     * used.
     *
     * @param string|resource|null $input
     * @param int $options Parser option flags; 0 keeps the current options
     *
     * @return \Sabre\VObject\Document
     */
    public function parse($input = null, $options = 0)
    {
        $this->root = null;

        if (!is_null($input)) {
            $this->setInput($input);
        }

        if (0 !== $options) {
            $this->options = $options;
        }

        $this->parseDocument();

        return $this->root;
    }

    /**
     * Sets the character set of the incoming data.
     *
     * By default all input will be assumed to be UTF-8. However, both
     * iCalendar and vCard might be encoded using different character sets.
     * The character set is usually set in the mime-type. If a different
     * character set is given here, the parser converts all incoming property
     * values to UTF-8.
     *
     * @param string $charset One of the entries of self::$SUPPORTED_CHARSETS
     *
     * @throws \InvalidArgumentException if the character set is not supported
     *
     * @return void
     */
    public function setCharset($charset)
    {
        // Strict comparison: on PHP < 8 a loose in_array() accepts e.g. 0.
        if (!in_array($charset, self::$SUPPORTED_CHARSETS, true)) {
            throw new \InvalidArgumentException('Unsupported encoding. (Supported encodings: ' . implode(', ', self::$SUPPORTED_CHARSETS) . ')');
        }
        $this->charset = $charset;
    }

    /**
     * Sets the input buffer. Must be a string or stream.
     *
     * @param resource|string $input
     *
     * @throws \InvalidArgumentException if $input is neither a string nor a stream
     *
     * @return void
     */
    public function setInput($input)
    {
        // Resetting the parser
        $this->lineIndex = 0;
        $this->startLine = 0;

        if (is_string($input)) {
            // Converting to a stream.
            $stream = fopen('php://temp', 'r+');
            fwrite($stream, $input);
            rewind($stream);
        } elseif (is_resource($input)) {
            $stream = $input;
        } else {
            throw new \InvalidArgumentException('This parser can only read from strings or streams.');
        }

        // A look-ahead line buffered while reading the previous input must
        // not leak into a different input. When the very same stream is
        // passed again the buffered line still belongs to it, so it is kept.
        if ($stream !== $this->input) {
            $this->lineBuffer = null;
        }

        $this->input = $stream;
    }

    /**
     * Parses an entire document.
     *
     * Reads the BEGIN line, creates the matching root component and then
     * adds every parsed line to it until the closing END line is reached.
     *
     * @throws ParseException if the document is not a VCARD or VCALENDAR, or
     *                        if the END line does not match the root component
     *
     * @return void
     */
    protected function parseDocument()
    {
        $line = $this->readLine();

        // BOM is ZERO WIDTH NO-BREAK SPACE (U+FEFF).
        // It's 0xEF 0xBB 0xBF in UTF-8 hex.
        if ("\xEF\xBB\xBF" === substr($line, 0, 3)) {
            $line = substr($line, 3);
        }

        switch (strtoupper($line)) {
            case 'BEGIN:VCALENDAR':
                $class = VCalendar::$componentMap['VCALENDAR'];
                break;
            case 'BEGIN:VCARD':
                $class = VCard::$componentMap['VCARD'];
                break;
            default:
                throw new ParseException('This parser only supports VCARD and VCALENDAR files');
        }

        $this->root = new $class([], false);

        while (true) {
            // Reading until we hit END:
            $line = $this->readLine();
            if ('END:' === strtoupper(substr($line, 0, 4))) {
                break;
            }
            $result = $this->parseLine($line);
            if ($result) {
                $this->root->add($result);
            }
        }

        $name = strtoupper(substr($line, 4));
        if ($name !== $this->root->name) {
            throw new ParseException('Invalid MimeDir file. expected: "END:' . $this->root->name . '" got: "END:' . $name . '"');
        }
    }

    /**
     * Parses a line, and if it hits a component, it will also attempt to parse
     * the entire component.
     *
     * @param string $line Unfolded line
     *
     * @throws ParseException if a nested component is closed by a
     *                        non-matching END line
     *
     * @return \Sabre\VObject\Node|false The parsed component or property, or
     *                                   false when the line was ignored
     */
    protected function parseLine($line)
    {
        // Anything that does not open a component is a property line.
        if ('BEGIN:' !== strtoupper(substr($line, 0, 6))) {
            $property = $this->readProperty($line);

            // A falsy result means the line was ignored.
            return $property ? $property : false;
        }

        // Start of a new component
        $component = $this->root->createComponent(substr($line, 6), [], false);

        while (true) {
            // Reading until we hit END:
            $line = $this->readLine();
            if ('END:' === strtoupper(substr($line, 0, 4))) {
                break;
            }
            $result = $this->parseLine($line);
            if ($result) {
                $component->add($result);
            }
        }

        $name = strtoupper(substr($line, 4));
        if ($name !== $component->name) {
            throw new ParseException('Invalid MimeDir file. expected: "END:' . $component->name . '" got: "END:' . $name . '"');
        }

        return $component;
    }

    /**
     * We need to look ahead 1 line every time to see if we need to 'unfold'
     * the next line.
     *
     * If that was not the case, we store it here.
     *
     * @var null|string
     */
    protected $lineBuffer;

    /**
     * The real current line number.
     *
     * @var int
     */
    protected $lineIndex = 0;

    /**
     * In the case of unfolded lines, this property holds the line number for
     * the start of the line.
     *
     * @var int
     */
    protected $startLine = 0;

    /**
     * Contains a 'raw' representation of the current line.
     *
     * Folded continuation lines are kept in it, joined by "\n ".
     *
     * @var string
     */
    protected $rawLine;

    /**
     * Reads a single line from the buffer.
     *
     * This method strips any newlines and also takes care of unfolding.
     *
     * @throws \Sabre\VObject\EofException if the end of the input is reached
     * @throws ParseException if reading from the stream fails
     *
     * @return string
     */
    protected function readLine()
    {
        if (!is_null($this->lineBuffer)) {
            // The previous call already looked ahead and fetched this line.
            $rawLine = $this->lineBuffer;
            $this->lineBuffer = null;
        } else {
            do {
                $eof = feof($this->input);

                $rawLine = fgets($this->input);

                if ($eof || (feof($this->input) && false === $rawLine)) {
                    throw new EofException('End of document reached prematurely');
                }
                if (false === $rawLine) {
                    throw new ParseException('Error reading from input stream');
                }
                $rawLine = rtrim($rawLine, "\r\n");
            } while ('' === $rawLine); // Skipping empty lines
            $this->lineIndex++;
        }
        $line = $rawLine;

        $this->startLine = $this->lineIndex;

        // Looking ahead for folded lines.
        while (true) {
            $nextLine = fgets($this->input);
            $this->lineIndex++;
            if (false === $nextLine) {
                // End of input (or read failure): nothing left to unfold.
                break;
            }

            $nextLine = rtrim($nextLine, "\r\n");
            // Compare against '' explicitly: a truthiness check would also
            // swallow a line consisting of just "0".
            if ('' === $nextLine) {
                break;
            }

            if ("\t" === $nextLine[0] || ' ' === $nextLine[0]) {
                // Continuation line: drop the single folding character.
                $continuation = substr($nextLine, 1);
                $line .= $continuation;
                $rawLine .= "\n " . $continuation;
            } else {
                // Start of the next logical line; keep it for the next call.
                $this->lineBuffer = $nextLine;
                break;
            }
        }
        $this->rawLine = $rawLine;

        return $line;
    }

    /**
     * Reads a property from a line.
     *
     * @param string $line Unfolded line
     *
     * @throws ParseException if the line is not a valid content line (unless
     *                        OPTION_IGNORE_INVALID_LINES is set) or uses an
     *                        unsupported character set
     *
     * @return \Sabre\VObject\Property|false The property, or false when the
     *                                       invalid line was ignored
     */
    protected function readProperty($line)
    {
        if ($this->options & self::OPTION_FORGIVING) {
            $propNameToken = 'A-Z0-9\-\._\\/';
        } else {
            $propNameToken = 'A-Z0-9\-\.';
        }

        $paramNameToken = 'A-Z0-9\-';
        $safeChar = '^";:,';
        $qSafeChar = '^"';

        $regex = "/
            ^(?P<name> [$propNameToken]+ ) (?=[;:])             # property name
            |
            (?<=:)(?P<propValue> .+)$                           # property value
            |
            ;(?P<paramName> [$paramNameToken]+) (?=[=;:])       # parameter name
            |
            (=|,)(?P<paramValue>                                # parameter value
                (?: [$safeChar]*) |
                \"(?: [$qSafeChar]+)\"
            ) (?=[;:,])
            /xi";

        preg_match_all($regex, $line, $matches, PREG_SET_ORDER);

        $property = [
            'name'       => null,
            'parameters' => [],
            'value'      => null,
        ];

        $lastParam = null;

        /*
         * Looping through all the tokens.
         *
         * Note that the named groups are tested in reverse order of their
         * declaration in the pattern: if a sub-pattern matched, the
         * subsequent named patterns will not show up in the result, while
         * the preceding ones are present (as empty strings).
         */
        foreach ($matches as $match) {
            if (isset($match['paramValue'])) {
                if ($match['paramValue'] && '"' === $match['paramValue'][0]) {
                    // Stripping the surrounding double quotes.
                    $value = substr($match['paramValue'], 1, -1);
                } else {
                    $value = $match['paramValue'];
                }

                $value = $this->unescapeParam($value);

                if (is_null($lastParam)) {
                    throw new ParseException('Invalid Mimedir file. Line starting at ' . $this->startLine . ' did not follow iCalendar/vCard conventions');
                }
                if (is_null($property['parameters'][$lastParam])) {
                    $property['parameters'][$lastParam] = $value;
                } elseif (is_array($property['parameters'][$lastParam])) {
                    $property['parameters'][$lastParam][] = $value;
                } else {
                    // Second value for this parameter: promote to a list.
                    $property['parameters'][$lastParam] = [
                        $property['parameters'][$lastParam],
                        $value,
                    ];
                }
                continue;
            }
            if (isset($match['paramName'])) {
                $lastParam = strtoupper($match['paramName']);
                if (!isset($property['parameters'][$lastParam])) {
                    $property['parameters'][$lastParam] = null;
                }
                continue;
            }
            if (isset($match['propValue'])) {
                $property['value'] = $match['propValue'];
                continue;
            }
            if (isset($match['name']) && $match['name']) {
                $property['name'] = strtoupper($match['name']);
                continue;
            }

            // @codeCoverageIgnoreStart
            throw new \LogicException('This code should not be reachable');
            // @codeCoverageIgnoreEnd
        }

        if (is_null($property['value'])) {
            $property['value'] = '';
        }
        if (!$property['name']) {
            if ($this->options & self::OPTION_IGNORE_INVALID_LINES) {
                return false;
            }
            throw new ParseException('Invalid Mimedir file. Line starting at ' . $this->startLine . ' did not follow iCalendar/vCard conventions');
        }

        // vCard 2.1 states that parameters may appear without a name, and only
        // a value. We can deduce the value based on its name.
        //
        // Our parser will get those as parameters without a value instead, so
        // we're filtering these parameters out first.
        $namedParameters = [];
        $namelessParameters = [];

        foreach ($property['parameters'] as $name => $value) {
            if (!is_null($value)) {
                $namedParameters[$name] = $value;
            } else {
                $namelessParameters[] = $name;
            }
        }

        $propObj = $this->root->createProperty($property['name'], null, $namedParameters);

        foreach ($namelessParameters as $namelessParameter) {
            $propObj->add(null, $namelessParameter);
        }

        // Only inspect ENCODING when present; feeding null to strtoupper()
        // is deprecated since PHP 8.1.
        if (isset($propObj['ENCODING']) && 'QUOTED-PRINTABLE' === strtoupper((string) $propObj['ENCODING'])) {
            $propObj->setQuotedPrintableValue($this->extractQuotedPrintableValue());
        } else {
            $charset = $this->charset;
            if (Document::VCARD21 === $this->root->getDocumentType() && isset($propObj['CHARSET'])) {
                // vCard 2.1 allows the character set to be specified per property.
                $charset = (string) $propObj['CHARSET'];
            }
            // Character set names are case-insensitive.
            switch (strtolower($charset)) {
                case 'utf-8':
                    break;
                case 'iso-8859-1':
                case 'windows-1252':
                    // mb_convert_encoding() also covers ISO-8859-1, which
                    // avoids utf8_encode() (deprecated as of PHP 8.2).
                    $property['value'] = mb_convert_encoding($property['value'], 'UTF-8', $charset);
                    break;
                default:
                    throw new ParseException('Unsupported CHARSET: ' . $propObj['CHARSET']);
            }
            $propObj->setRawMimeDirValue($property['value']);
        }

        return $propObj;
    }

    /**
     * Unescapes a property value.
     *
     * vCard 2.1 says:
     *   * Semi-colons must be escaped in some property values, specifically
     *     ADR, ORG and N.
     *   * Semi-colons must be escaped in parameter values, because semi-colons
     *     are also used to separate values.
     *   * No mention of escaping backslashes with another backslash.
     *   * newlines are not escaped either, instead QUOTED-PRINTABLE is used to
     *     span values over more than 1 line.
     *
     * vCard 3.0 says:
     *   * (rfc2425) Backslashes, newlines (\n or \N) and commas must be
     *     escaped, all the time.
     *   * Commas are used as delimiters in multiple values
     *   * (rfc2426) Adds to this that the semi-colon MUST also be escaped,
     *     as in some properties semi-colon is used for separators.
     *   * Properties using semi-colons: N, ADR, GEO, ORG
     *   * Both ADR and N's individual parts may be broken up further with a
     *     comma.
     *   * Properties using commas: NICKNAME, CATEGORIES
     *
     * vCard 4.0 (rfc6350) says:
     *   * Commas must be escaped.
     *   * Semi-colons may be escaped, an unescaped semi-colon _may_ be a
     *     delimiter, depending on the property.
     *   * Backslashes must be escaped
     *   * Newlines must be escaped as either \N or \n.
     *   * Some compound properties may contain multiple parts themselves, so a
     *     comma within a semi-colon delimited property may also be unescaped
     *     to denote multiple parts _within_ the compound property.
     *   * Text-properties using semi-colons: N, ADR, ORG, CLIENTPIDMAP.
     *   * Text-properties using commas: NICKNAME, RELATED, CATEGORIES, PID.
     *
     * Even though the spec says that commas must always be escaped, the
     * example for GEO in Section 6.5.2 seems to violate this.
     *
     * iCalendar 2.0 (rfc5545) says:
     *   * Commas or semi-colons may be used as delimiters, depending on the
     *     property.
     *   * Commas, semi-colons, backslashes, newline (\N or \n) are always
     *     escaped, unless they are delimiters.
     *   * Colons shall not be escaped.
     *   * Commas can be considered the 'default delimiter' and is described as
     *     the delimiter in cases where the order of the multiple values is
     *     insignificant.
     *   * Semi-colons are described as the delimiter for 'structured values'.
     *     They are specifically used as a delimiter in REQUEST-STATUS, RRULE,
     *     GEO and EXRULE. EXRULE is deprecated however.
     *
     * About the arguments of this method:
     *
     * If delimiter is not set (null) this method will just return a string.
     * If it's a comma or a semi-colon the string will be split on those
     * characters, and always return an array.
     *
     * @param string $input
     * @param string|null $delimiter
     *
     * @return string|string[]
     */
    public static function unescapeValue($input, $delimiter = ';')
    {
        // Escape sequences understood by this method and their replacement.
        $escapes = [
            '\\\\' => '\\',
            '\N'   => "\n",
            '\n'   => "\n",
            '\;'   => ';',
            '\,'   => ',',
        ];

        $regex = '# (?: (\\\\ (?: \\\\ | N | n | ; | , ) )';
        if ($delimiter) {
            // Quoting keeps delimiters with a special regex meaning literal.
            $regex .= ' | (' . preg_quote($delimiter, '#') . ')';
        }
        $regex .= ') #x';

        $tokens = preg_split($regex, $input, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

        $resultArray = [];
        $result = '';

        foreach ($tokens as $token) {
            if (isset($escapes[$token])) {
                $result .= $escapes[$token];
            } elseif ($delimiter && $token === $delimiter) {
                // Unescaped delimiter: the current part is complete.
                $resultArray[] = $result;
                $result = '';
            } else {
                $result .= $token;
            }
        }

        $resultArray[] = $result;

        return $delimiter ? $resultArray : $result;
    }

    /**
     * Unescapes a parameter value.
     *
     * vCard 2.1:
     *   * Does not mention a mechanism for this. In addition, double quotes
     *     are never used to wrap values.
     *   * This means that parameters can simply not contain colons or
     *     semi-colons.
     *
     * vCard 3.0 (rfc2425, rfc2426):
     *   * Parameters _may_ be surrounded by double quotes.
     *   * If this is not the case, semi-colon, colon and comma may simply not
     *     occur (the comma used for multiple parameter values though).
     *   * If it is surrounded by double-quotes, it may simply not contain
     *     double-quotes.
     *   * This means that a parameter can in no case encode double-quotes, or
     *     newlines.
     *
     * vCard 4.0 (rfc6350)
     *   * Behavior seems to be identical to vCard 3.0
     *
     * iCalendar 2.0 (rfc5545)
     *   * Behavior seems to be identical to vCard 3.0
     *
     * Parameter escaping mechanism (rfc6868) :
     *   * This rfc describes a new way to escape parameter values.
     *   * New-line is encoded as ^n
     *   * ^ is encoded as ^^.
     *   * " is encoded as ^'
     *
     * @param string $input
     *
     * @return string
     */
    private function unescapeParam($input)
    {
        // strtr() with a replacement table scans left to right and never
        // rescans its own output, so "^^n" becomes "^n" and not a newline.
        return strtr($input, [
            '^n'  => "\n",
            '^^'  => '^',
            '^\'' => '"',
        ]);
    }

    /**
     * Gets the full quoted printable value.
     *
     * We need a special method for this, because newlines have both a meaning
     * in vCards, and in QuotedPrintable.
     *
     * This method does not do any decoding.
     *
     * @return string
     */
    private function extractQuotedPrintableValue()
    {
        // We need to parse the raw line again to get the start of the value.
        //
        // We are basically looking for the first colon (:), but we need to
        // skip over the parameters first, as a double-quoted parameter value
        // may contain one. The possessive quantifiers keep the pattern from
        // backtracking excessively on long lines.
        $regex = '/^
            (?: [^:"]++ | "[^"]*+" )++   # name and parameters, quoted values included
            :                            # start of the value we really care about
            (.*)$
        /xs';

        if (preg_match($regex, $this->rawLine, $matches)) {
            $value = $matches[1];
        } else {
            // Unbalanced quotes (or no colon at all): fall back to the
            // first colon, if any.
            $colon = strpos($this->rawLine, ':');
            $value = false === $colon ? '' : (string) substr($this->rawLine, $colon + 1);
        }

        // Removing the first whitespace character from every line. Kind of
        // like unfolding, but we keep the newline.
        $value = str_replace("\n ", "\n", $value);

        // Microsoft products don't always correctly fold lines, they may be
        // missing a whitespace. So if 'forgiving' is turned on, we will take
        // those as well.
        if ($this->options & self::OPTION_FORGIVING) {
            // A trailing "=" is a quoted-printable soft line break.
            while ('=' === substr($value, -1)) {
                // Reading the line
                $this->readLine();
                // Grabbing the raw form
                $value .= "\n" . $this->rawLine;
            }
        }

        return $value;
    }
}