<?php
/**
 * This file is part of FPDI
 *
 * @package   setasign\Fpdi
 * @copyright Copyright (c) 2020 Setasign GmbH & Co. KG (https://www.setasign.com)
 * @license   http://opensource.org/licenses/mit-license The MIT License
 */

namespace setasign\Fpdi\PdfParser\CrossReference;

use setasign\Fpdi\PdfParser\PdfParser;
use setasign\Fpdi\PdfParser\Type\PdfDictionary;
use setasign\Fpdi\PdfParser\Type\PdfIndirectObject;
use setasign\Fpdi\PdfParser\Type\PdfNumeric;
use setasign\Fpdi\PdfParser\Type\PdfStream;
use setasign\Fpdi\PdfParser\Type\PdfToken;
use setasign\Fpdi\PdfParser\Type\PdfTypeException;

/**
 * Class CrossReference
 *
 * This class processes the standard cross reference of a PDF document.
 *
 * On construction it locates the "startxref" pointer, reads the cross-reference section it points to and
 * then follows the "Prev" entries of the trailers, collecting one reader per section.
 *
 * @package setasign\Fpdi\PdfParser\CrossReference
 */
class CrossReference
{
    /**
     * The byte length in which the "startxref" keyword should be searched.
     *
     * @var int
     */
    public static $trailerSearchLength = 5500;

    /**
     * Value that is added to byte offsets before the stream reader is positioned.
     *
     * It is reset to zero by the constructor if reading a cross-reference with it fails with INVALID_DATA.
     *
     * @var int
     */
    protected $fileHeaderOffset = 0;

    /**
     * @var PdfParser
     */
    protected $parser;

    /**
     * The readers in the order they were read: index 0 is the one "startxref" points to.
     *
     * @var ReaderInterface[]
     */
    protected $readers = [];

    /**
     * CrossReference constructor.
     *
     * @param PdfParser $parser
     * @param int $fileHeaderOffset Value added to every byte offset before it is used to position the reader.
     * @throws CrossReferenceException
     * @throws PdfTypeException
     */
    public function __construct(PdfParser $parser, $fileHeaderOffset = 0)
    {
        $this->parser = $parser;
        $this->fileHeaderOffset = $fileHeaderOffset;

        $offset = $this->findStartXref();
        $reader = null;
        // Positions that were already read. A "Prev" entry that leads back to one of them would otherwise
        // keep this loop running forever while $this->readers grows without bound.
        $visited = [];

        /** @noinspection TypeUnsafeComparisonInspection */
        while ($offset != false) { // By doing an unsafe comparison we ignore faulty references to byte offset 0
            $position = $offset + $this->fileHeaderOffset;
            if (isset($visited[(string) $position])) {
                // circular "Prev" chain: everything reachable is already loaded, so stop following it
                break;
            }

            try {
                $reader = $this->readXref($position);
            } catch (CrossReferenceException $e) {
                // sometimes the file header offset is part of the byte offsets, so let's retry by resetting it to zero.
                if ($e->getCode() === CrossReferenceException::INVALID_DATA && $this->fileHeaderOffset !== 0) {
                    $this->fileHeaderOffset = 0;
                    $position = $offset;
                    $reader = $this->readXref($position);
                } else {
                    throw $e;
                }
            }

            $visited[(string) $position] = true;

            $trailer = $reader->getTrailer();
            $this->checkForEncryption($trailer);
            $this->readers[] = $reader;

            if (isset($trailer->value['Prev'])) {
                $offset = $trailer->value['Prev']->value;
            } else {
                $offset = false;
            }
        }

        if ($reader === null) {
            throw new CrossReferenceException('No cross-reference found.', CrossReferenceException::NO_XREF_FOUND);
        }

        // fix faulty sub-section header (only applied to the reader that was read last)
        if ($reader instanceof FixedReader) {
            /**
             * @var FixedReader $reader
             */
            $reader->fixFaultySubSectionShift();
        }
    }

    /**
     * Get the size of the cross reference.
     *
     * This is the value of the "Size" entry of the trailer returned by getTrailer().
     *
     * @return integer
     */
    public function getSize()
    {
        return $this->getTrailer()->value['Size']->value;
    }

    /**
     * Get the trailer dictionary.
     *
     * The trailer is taken from the first reader, i.e. the section the "startxref" pointer refers to.
     *
     * @return PdfDictionary
     */
    public function getTrailer()
    {
        return $this->readers[0]->getTrailer();
    }

    /**
     * Get the cross reference reader instances.
     *
     * @return ReaderInterface[]
     */
    public function getReaders()
    {
        return $this->readers;
    }

    /**
     * Get the offset by an object number.
     *
     * The readers are asked in the order they were read; the first result that is not false wins.
     *
     * @param int $objectNumber
     * @return integer|bool The offset or false if no reader knows the object number.
     */
    public function getOffsetFor($objectNumber)
    {
        foreach ($this->getReaders() as $reader) {
            $offset = $reader->getOffsetFor($objectNumber);
            if ($offset !== false) {
                return $offset;
            }
        }

        return false;
    }

    /**
     * Get an indirect object by its object number.
     *
     * @param int $objectNumber
     * @return PdfIndirectObject
     * @throws CrossReferenceException If no offset is known for the object number, if no indirect object can be
     *                                 read at that offset or if the object found there has another object number.
     */
    public function getIndirectObject($objectNumber)
    {
        $offset = $this->getOffsetFor($objectNumber);
        if ($offset === false) {
            throw new CrossReferenceException(
                \sprintf('Object (id:%s) not found.', $objectNumber),
                CrossReferenceException::OBJECT_NOT_FOUND
            );
        }

        $parser = $this->parser;

        $parser->getTokenizer()->clearStack();
        $parser->getStreamReader()->reset($offset + $this->fileHeaderOffset);

        try {
            /** @var PdfIndirectObject $object */
            $object = $parser->readValue(null, PdfIndirectObject::class);
        } catch (PdfTypeException $e) {
            throw new CrossReferenceException(
                \sprintf('Object (id:%s) not found at location (%s).', $objectNumber, $offset),
                CrossReferenceException::OBJECT_NOT_FOUND,
                $e
            );
        }

        // strict comparison: the object number has to match in type and value
        if ($object->objectNumber !== $objectNumber) {
            throw new CrossReferenceException(
                \sprintf('Wrong object found, got %s while %s was expected.', $object->objectNumber, $objectNumber),
                CrossReferenceException::OBJECT_NOT_FOUND
            );
        }

        return $object;
    }

    /**
     * Read the cross-reference table at a given offset.
     *
     * Internally the method will try to evaluate the best reader for this cross-reference.
     *
     * @param int $offset The position passed to the stream reader as it is (nothing is added to it here).
     * @return ReaderInterface
     * @throws CrossReferenceException
     * @throws PdfTypeException
     */
    protected function readXref($offset)
    {
        $this->parser->getStreamReader()->reset($offset);
        $this->parser->getTokenizer()->clearStack();
        $initValue = $this->parser->readValue();

        return $this->initReaderInstance($initValue);
    }

    /**
     * Get a cross-reference reader instance.
     *
     * An "xref" token results in a FixedReader or, if its creation fails with a CrossReferenceException, in a
     * LineReader. Every other value ends in a CrossReferenceException.
     *
     * @param PdfToken|PdfIndirectObject $initValue
     * @return ReaderInterface
     * @throws CrossReferenceException
     * @throws PdfTypeException
     */
    protected function initReaderInstance($initValue)
    {
        // Remember where the reader stands now, so that the LineReader fallback can start from the same place.
        // NOTE(review): readXref() is already handed an offset that includes the file header offset - confirm
        // that adding it again here matches what getPosition()/getOffset() of the stream reader report.
        $position = $this->parser->getStreamReader()->getPosition()
            + $this->parser->getStreamReader()->getOffset() + $this->fileHeaderOffset;

        if ($initValue instanceof PdfToken && $initValue->value === 'xref') {
            try {
                return new FixedReader($this->parser);
            } catch (CrossReferenceException $e) {
                $this->parser->getStreamReader()->reset($position);
                $this->parser->getTokenizer()->clearStack();

                return new LineReader($this->parser);
            }
        }

        if ($initValue instanceof PdfIndirectObject) {
            try {
                $stream = PdfStream::ensure($initValue->value);
            } catch (PdfTypeException $e) {
                throw new CrossReferenceException(
                    'Invalid object type at xref reference offset.',
                    CrossReferenceException::INVALID_DATA,
                    $e
                );
            }

            $type = PdfDictionary::get($stream->value, 'Type');
            if ($type->value !== 'XRef') {
                throw new CrossReferenceException(
                    'The xref position points to an incorrect object type.',
                    CrossReferenceException::INVALID_DATA
                );
            }

            // report encryption first, as it is the more specific problem
            $this->checkForEncryption($stream->value);

            throw new CrossReferenceException(
                'This PDF document probably uses a compression technique which is not supported by the ' .
                'free parser shipped with FPDI. (See https://www.setasign.com/fpdi-pdf-parser for more details)',
                CrossReferenceException::COMPRESSED_XREF
            );
        }

        throw new CrossReferenceException(
            'The xref position points to an incorrect object type.',
            CrossReferenceException::INVALID_DATA
        );
    }

    /**
     * Check for encryption.
     *
     * @param PdfDictionary $dictionary
     * @throws CrossReferenceException If the dictionary has an "Encrypt" entry.
     */
    protected function checkForEncryption(PdfDictionary $dictionary)
    {
        if (isset($dictionary->value['Encrypt'])) {
            throw new CrossReferenceException(
                'This PDF document is encrypted and cannot be processed with FPDI.',
                CrossReferenceException::ENCRYPTED
            );
        }
    }

    /**
     * Find the start position for the first cross-reference.
     *
     * The last self::$trailerSearchLength bytes are searched for the last occurrence of "startxref" (or
     * "startref" as a fallback) and the numeric value following the keyword is returned.
     *
     * @return int The byte-offset position of the first cross-reference.
     * @throws CrossReferenceException If the keyword is not found or is not followed by a numeric value.
     */
    protected function findStartXref()
    {
        $reader = $this->parser->getStreamReader();
        $reader->reset(-self::$trailerSearchLength, self::$trailerSearchLength);

        $buffer = $reader->getBuffer(false);
        $pos = \strrpos($buffer, 'startxref');
        $addOffset = 9; // length of "startxref"
        if ($pos === false) {
            // Some corrupted documents use startref instead of startxref
            $pos = \strrpos($buffer, 'startref');
            if ($pos === false) {
                throw new CrossReferenceException(
                    'Unable to find pointer to xref table',
                    CrossReferenceException::NO_STARTXREF_FOUND
                );
            }
            $addOffset = 8; // length of "startref"
        }

        // continue reading directly behind the keyword
        $reader->setOffset($pos + $addOffset);

        try {
            $value = $this->parser->readValue(null, PdfNumeric::class);
        } catch (PdfTypeException $e) {
            throw new CrossReferenceException(
                'Invalid data after startxref keyword.',
                CrossReferenceException::INVALID_DATA,
                $e
            );
        }

        return $value->value;
    }
}