<?php
/**
 * This file is part of FPDI
 *
 * @package   setasign\Fpdi
 * @copyright Copyright (c) 2020 Setasign GmbH & Co. KG (https://www.setasign.com)
 * @license   http://opensource.org/licenses/mit-license The MIT License
 */

namespace setasign\Fpdi\PdfParser;

use setasign\Fpdi\PdfParser\CrossReference\CrossReference;
use setasign\Fpdi\PdfParser\CrossReference\CrossReferenceException;
use setasign\Fpdi\PdfParser\Type\PdfArray;
use setasign\Fpdi\PdfParser\Type\PdfBoolean;
use setasign\Fpdi\PdfParser\Type\PdfDictionary;
use setasign\Fpdi\PdfParser\Type\PdfHexString;
use setasign\Fpdi\PdfParser\Type\PdfIndirectObject;
use setasign\Fpdi\PdfParser\Type\PdfIndirectObjectReference;
use setasign\Fpdi\PdfParser\Type\PdfName;
use setasign\Fpdi\PdfParser\Type\PdfNull;
use setasign\Fpdi\PdfParser\Type\PdfNumeric;
use setasign\Fpdi\PdfParser\Type\PdfString;
use setasign\Fpdi\PdfParser\Type\PdfToken;
use setasign\Fpdi\PdfParser\Type\PdfType;

/**
 * A PDF parser class
 *
 * Combines a stream reader and a tokenizer to locate the file header, expose the
 * cross reference and turn tokens into PDF value objects.
 *
 * @package setasign\Fpdi\PdfParser
 */
class PdfParser
{
    /**
     * @var StreamReader
     */
    protected $streamReader;

    /**
     * @var Tokenizer
     */
    protected $tokenizer;

    /**
     * The file header.
     *
     * Stays null until {@see resolveFileHeader()} has run successfully.
     *
     * @var string
     */
    protected $fileHeader;

    /**
     * The offset to the file header.
     *
     * @var int
     */
    protected $fileHeaderOffset;

    /**
     * @var CrossReference
     */
    protected $xref;

    /**
     * All read objects, indexed by object number.
     *
     * Only filled by {@see getIndirectObject()} when caching is requested.
     *
     * @var array
     */
    protected $objects = [];

    /**
     * PdfParser constructor.
     *
     * Creates a tokenizer that works on the passed stream reader.
     *
     * @param StreamReader $streamReader
     */
    public function __construct(StreamReader $streamReader)
    {
        $this->streamReader = $streamReader;
        $this->tokenizer = new Tokenizer($streamReader);
    }

    /**
     * Removes cycled references.
     *
     * The cross reference instance is created with a reference to this parser, so
     * dropping it here breaks the parser <-> cross reference cycle.
     *
     * @internal
     */
    public function cleanUp()
    {
        $this->xref = null;
    }

    /**
     * Get the stream reader instance.
     *
     * @return StreamReader
     */
    public function getStreamReader()
    {
        return $this->streamReader;
    }

    /**
     * Get the tokenizer instance.
     *
     * @return Tokenizer
     */
    public function getTokenizer()
    {
        return $this->tokenizer;
    }

    /**
     * Resolves the file header.
     *
     * Searches the stream from its start for the "%PDF-" marker, remembers the
     * marker offset and the trimmed header line, and leaves the stream reader
     * positioned behind that line. Subsequent calls return the remembered offset.
     *
     * @throws PdfParserException If the marker cannot be found.
     * @return int The offset of the file header.
     */
    protected function resolveFileHeader()
    {
        if ($this->fileHeader !== null) {
            return $this->fileHeaderOffset;
        }

        $this->streamReader->reset(0);
        // Upper bound for the number of times the buffer is enlarged while searching.
        $maxIterations = 1000;

        while (true) {
            $buffer = $this->streamReader->getBuffer(false);
            $offset = \strpos($buffer, '%PDF-');
            if ($offset !== false) {
                break;
            }

            // Not found yet: try to enlarge the buffer and search again. Give up if the
            // buffer cannot be enlarged or the iteration budget is used up.
            if (!$this->streamReader->increaseLength(100) || (--$maxIterations === 0)) {
                throw new PdfParserException(
                    'Unable to find PDF file header.',
                    PdfParserException::FILE_HEADER_NOT_FOUND
                );
            }
        }

        $this->fileHeaderOffset = $offset;
        $this->streamReader->setOffset($offset);

        $this->fileHeader = \trim($this->streamReader->readLine());

        return $this->fileHeaderOffset;
    }

    /**
     * Get the cross reference instance.
     *
     * The instance is created on first use and reused until {@see cleanUp()} is called.
     *
     * @return CrossReference
     * @throws CrossReferenceException
     * @throws PdfParserException
     */
    public function getCrossReference()
    {
        if ($this->xref === null) {
            $this->xref = new CrossReference($this, $this->resolveFileHeader());
        }

        return $this->xref;
    }

    /**
     * Get the PDF version.
     *
     * The version is read from the file header. If the catalog dictionary has a
     * "Version" entry made of exactly two dot separated parts, that entry is used
     * instead.
     *
     * NOTE(review): the catalog entry replaces the header version unconditionally;
     * the PDF specification describes it as applying only when it is later than the
     * header version - confirm whether a comparison is wanted here.
     *
     * @return int[] An array of major and minor version.
     * @throws PdfParserException
     */
    public function getPdfVersion()
    {
        $this->resolveFileHeader();

        // preg_match() returns 1 on a match, 0 on no match and false on failure. Only a
        // match fills $result, so everything else has to be treated as "not found".
        if (\preg_match('/%PDF-(\d)\.(\d)/', $this->fileHeader, $result) !== 1) {
            throw new PdfParserException(
                'Unable to extract PDF version from file header.',
                PdfParserException::PDF_VERSION_NOT_FOUND
            );
        }
        list(, $major, $minor) = $result;

        $catalog = $this->getCatalog();
        if (isset($catalog->value['Version'])) {
            $versionParts = \explode(
                '.',
                PdfName::unescape(PdfType::resolve($catalog->value['Version'], $this)->value)
            );
            if (\count($versionParts) === 2) {
                list($major, $minor) = $versionParts;
            }
        }

        return [(int) $major, (int) $minor];
    }

    /**
     * Get the catalog dictionary.
     *
     * Resolves the "Root" entry of the trailer and ensures it is a dictionary.
     *
     * @return PdfDictionary
     * @throws Type\PdfTypeException
     * @throws CrossReferenceException
     * @throws PdfParserException
     */
    public function getCatalog()
    {
        $trailer = $this->getCrossReference()->getTrailer();
        $catalog = PdfType::resolve(PdfDictionary::get($trailer, 'Root'), $this);

        return PdfDictionary::ensure($catalog);
    }

    /**
     * Get an indirect object by its object number.
     *
     * A previously cached object is returned directly. Otherwise the object is
     * fetched through the cross reference and stored in the cache only if $cache
     * is true.
     *
     * @param int $objectNumber
     * @param bool $cache
     * @return PdfIndirectObject
     * @throws CrossReferenceException
     * @throws PdfParserException
     */
    public function getIndirectObject($objectNumber, $cache = false)
    {
        $objectNumber = (int) $objectNumber;
        if (isset($this->objects[$objectNumber])) {
            return $this->objects[$objectNumber];
        }

        $object = $this->getCrossReference()->getIndirectObject($objectNumber);

        if ($cache) {
            $this->objects[$objectNumber] = $object;
        }

        return $object;
    }

    /**
     * Read a PDF value.
     *
     * If no token is passed, the next token is taken from the tokenizer. The token
     * decides which value type is parsed. A numeric token triggers a look-ahead of up
     * to two further tokens to detect "<n> <g> obj" and "<n> <g> R"; tokens that were
     * read ahead but not consumed are pushed back onto the tokenizer stack.
     *
     * @param null|bool|string $token The token to start with, or null to read the next one.
     * @param null|string $expectedType Class name of the type that has to be read, or null for any.
     * @return bool|PdfArray|PdfBoolean|PdfDictionary|PdfHexString|PdfIndirectObject|PdfIndirectObjectReference|PdfName|PdfNull|PdfNumeric|PdfString|PdfToken
     *         False if there is no token left and no type is expected.
     * @throws Type\PdfTypeException If the read value is not of the expected type.
     */
    public function readValue($token = null, $expectedType = null)
    {
        if ($token === null) {
            $token = $this->tokenizer->getNextToken();
        }

        if ($token === false) {
            if ($expectedType !== null) {
                throw new Type\PdfTypeException('Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE);
            }

            return false;
        }

        switch ($token) {
            case '(':
                $this->ensureExpectedType($token, $expectedType);
                return PdfString::parse($this->streamReader);

            case '<':
                // A second "<" directly behind the first one starts a dictionary.
                if ($this->streamReader->getByte() === '<') {
                    $this->ensureExpectedType('<<', $expectedType);
                    $this->streamReader->addOffset(1);
                    return PdfDictionary::parse($this->tokenizer, $this->streamReader, $this);
                }

                $this->ensureExpectedType($token, $expectedType);
                return PdfHexString::parse($this->streamReader);

            case '/':
                $this->ensureExpectedType($token, $expectedType);
                return PdfName::parse($this->tokenizer, $this->streamReader);

            case '[':
                $this->ensureExpectedType($token, $expectedType);
                return PdfArray::parse($this->tokenizer, $this);

            default:
                if (\is_numeric($token)) {
                    $token2 = $this->tokenizer->getNextToken();
                    if ($token2 !== false) {
                        if (\is_numeric($token2)) {
                            $token3 = $this->tokenizer->getNextToken();

                            if ($token3 === 'obj') {
                                $this->ensureExpectedClass(PdfIndirectObject::class, $expectedType);

                                return PdfIndirectObject::parse(
                                    $token,
                                    $token2,
                                    $this,
                                    $this->tokenizer,
                                    $this->streamReader
                                );
                            }

                            if ($token3 === 'R') {
                                $this->ensureExpectedClass(PdfIndirectObjectReference::class, $expectedType);

                                return PdfIndirectObjectReference::create($token, $token2);
                            }

                            // Neither an object nor a reference: hand the token back.
                            if ($token3 !== false) {
                                $this->tokenizer->pushStack($token3);
                            }
                        }

                        // Pushed after $token3 so that it is on top of the stack.
                        $this->tokenizer->pushStack($token2);
                    }

                    $this->ensureExpectedClass(PdfNumeric::class, $expectedType);

                    return PdfNumeric::create($token);
                }

                if ($token === 'true' || $token === 'false') {
                    $this->ensureExpectedType($token, $expectedType);
                    return PdfBoolean::create($token === 'true');
                }

                if ($token === 'null') {
                    $this->ensureExpectedType($token, $expectedType);
                    return new PdfNull();
                }

                $this->ensureExpectedClass(PdfToken::class, $expectedType);

                $v = new PdfToken();
                $v->value = $token;

                return $v;
        }
    }

    /**
     * Ensures that the token will evaluate to an expected object type (or not).
     *
     * Tokens without an entry in the internal mapping never match an expected type.
     *
     * @param string $token
     * @param string|null $expectedType
     * @return bool Always true; a mismatch is reported through an exception.
     * @throws Type\PdfTypeException
     */
    private function ensureExpectedType($token, $expectedType)
    {
        static $mapping = [
            '(' => PdfString::class,
            '<' => PdfHexString::class,
            '<<' => PdfDictionary::class,
            '/' => PdfName::class,
            '[' => PdfArray::class,
            'true' => PdfBoolean::class,
            'false' => PdfBoolean::class,
            'null' => PdfNull::class
        ];

        if ($expectedType === null) {
            return true;
        }

        // isset() avoids an undefined index notice for tokens that are not mapped.
        $actualClass = isset($mapping[$token]) ? $mapping[$token] : null;
        $this->ensureExpectedClass($actualClass, $expectedType);

        return true;
    }

    /**
     * Ensures that a class name equals the expected type, if one is expected.
     *
     * @param string|null $actualClass Class name of the value that is about to be created.
     * @param string|null $expectedType Expected class name, or null to accept anything.
     * @return void
     * @throws Type\PdfTypeException If a type is expected and the class names differ.
     */
    private function ensureExpectedClass($actualClass, $expectedType)
    {
        if ($expectedType !== null && $actualClass !== $expectedType) {
            throw new Type\PdfTypeException('Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE);
        }
    }
}