1<?php
2/**
3 * This file is part of FPDI
4 *
5 * @package   setasign\Fpdi
6 * @copyright Copyright (c) 2020 Setasign GmbH & Co. KG (https://www.setasign.com)
7 * @license   http://opensource.org/licenses/mit-license The MIT License
8 */
9
10namespace setasign\Fpdi\PdfParser;
11
12use setasign\Fpdi\PdfParser\CrossReference\CrossReference;
13use setasign\Fpdi\PdfParser\CrossReference\CrossReferenceException;
14use setasign\Fpdi\PdfParser\Type\PdfArray;
15use setasign\Fpdi\PdfParser\Type\PdfBoolean;
16use setasign\Fpdi\PdfParser\Type\PdfDictionary;
17use setasign\Fpdi\PdfParser\Type\PdfHexString;
18use setasign\Fpdi\PdfParser\Type\PdfIndirectObject;
19use setasign\Fpdi\PdfParser\Type\PdfIndirectObjectReference;
20use setasign\Fpdi\PdfParser\Type\PdfName;
21use setasign\Fpdi\PdfParser\Type\PdfNull;
22use setasign\Fpdi\PdfParser\Type\PdfNumeric;
23use setasign\Fpdi\PdfParser\Type\PdfString;
24use setasign\Fpdi\PdfParser\Type\PdfToken;
25use setasign\Fpdi\PdfParser\Type\PdfType;
26
/**
 * A PDF parser class
 *
 * Combines a stream reader and a tokenizer to locate the file header, to give lazy access to the
 * cross-reference and to read PDF values and indirect objects from the underlying stream.
 *
 * @package setasign\Fpdi\PdfParser
 */
class PdfParser
{
    /**
     * The reader all parsing operations work on.
     *
     * @var StreamReader
     */
    protected $streamReader;

    /**
     * The tokenizer, created on top of the stream reader in the constructor.
     *
     * @var Tokenizer
     */
    protected $tokenizer;

    /**
     * The file header.
     *
     * Holds the trimmed line read at the position of the "%PDF-" marker. It stays unset until
     * resolveFileHeader() was executed successfully.
     *
     * @var string
     */
    protected $fileHeader;

    /**
     * The offset to the file header.
     *
     * @var int
     */
    protected $fileHeaderOffset;

    /**
     * The lazily created cross-reference instance (see getCrossReference()).
     *
     * @var CrossReference
     */
    protected $xref;

    /**
     * All read objects.
     *
     * Filled by getIndirectObject() when it is called with $cache = true; keyed by object number.
     *
     * @var PdfIndirectObject[]
     */
    protected $objects = [];

    /**
     * PdfParser constructor.
     *
     * @param StreamReader $streamReader The reader to parse from. A tokenizer is created for it.
     */
    public function __construct(StreamReader $streamReader)
    {
        $this->streamReader = $streamReader;
        $this->tokenizer = new Tokenizer($streamReader);
    }

    /**
     * Removes cycled references.
     *
     * The cross-reference instance is constructed with this parser instance (see getCrossReference()),
     * so dropping it breaks the parser <-> cross-reference cycle. A later call to getCrossReference()
     * creates a new instance. Already cached objects are left untouched.
     *
     * @internal
     */
    public function cleanUp()
    {
        $this->xref = null;
    }

    /**
     * Get the stream reader instance.
     *
     * @return StreamReader
     */
    public function getStreamReader()
    {
        return $this->streamReader;
    }

    /**
     * Get the tokenizer instance.
     *
     * @return Tokenizer
     */
    public function getTokenizer()
    {
        return $this->tokenizer;
    }

    /**
     * Resolves the file header.
     *
     * Searches the reader's buffer for the "%PDF-" marker, remembers its position and stores the
     * trimmed header line. The result is memoized: once a header was found, the stored offset is
     * returned without touching the stream reader again.
     *
     * @throws PdfParserException If the marker cannot be found (code FILE_HEADER_NOT_FOUND).
     * @return int The position at which the "%PDF-" marker was found.
     */
    protected function resolveFileHeader()
    {
        if ($this->fileHeader) {
            return $this->fileHeaderOffset;
        }

        $this->streamReader->reset(0);

        // Upper bound for the number of attempts to extend the buffer while searching.
        $maxIterations = 1000;
        while (true) {
            $buffer = $this->streamReader->getBuffer(false);
            $offset = \strpos($buffer, '%PDF-');
            if ($offset !== false) {
                break;
            }

            // Marker not in the buffer yet: request more data (presumably 100 more bytes - confirm
            // against StreamReader::increaseLength()) and give up if this fails or the limit is hit.
            if (!$this->streamReader->increaseLength(100) || (--$maxIterations === 0)) {
                throw new PdfParserException(
                    'Unable to find PDF file header.',
                    PdfParserException::FILE_HEADER_NOT_FOUND
                );
            }
        }

        $this->fileHeaderOffset = $offset;
        $this->streamReader->setOffset($offset);

        $this->fileHeader = \trim($this->streamReader->readLine());
        return $this->fileHeaderOffset;
    }

    /**
     * Get the cross reference instance.
     *
     * The instance is created on first access from this parser and the resolved file header offset
     * and is reused on subsequent calls (until cleanUp() is called).
     *
     * @return CrossReference
     * @throws CrossReferenceException
     * @throws PdfParserException
     */
    public function getCrossReference()
    {
        if ($this->xref === null) {
            $this->xref = new CrossReference($this, $this->resolveFileHeader());
        }

        return $this->xref;
    }

    /**
     * Get the PDF version.
     *
     * The version is taken from the file header ("%PDF-<major>.<minor>"). If the catalog dictionary
     * has a "Version" entry that consists of exactly two dot-separated parts, this entry overrides
     * the header version.
     *
     * @return int[] An array of major and minor version.
     * @throws Type\PdfTypeException
     * @throws CrossReferenceException
     * @throws PdfParserException If no version can be extracted from the file header
     *                            (code PDF_VERSION_NOT_FOUND).
     */
    public function getPdfVersion()
    {
        $this->resolveFileHeader();

        // preg_match() returns 1 on a match, 0 on no match and false on failure. Only a real match
        // fills both capture groups, so everything else is handled as "version not found".
        if (\preg_match('/%PDF-(\d)\.(\d)/', $this->fileHeader, $result) !== 1) {
            throw new PdfParserException(
                'Unable to extract PDF version from file header.',
                PdfParserException::PDF_VERSION_NOT_FOUND
            );
        }
        list(, $major, $minor) = $result;

        $catalog = $this->getCatalog();
        if (isset($catalog->value['Version'])) {
            $versionParts = \explode(
                '.',
                PdfName::unescape(PdfType::resolve($catalog->value['Version'], $this)->value)
            );
            // Only a well-formed "<major>.<minor>" value replaces the header version.
            if (\count($versionParts) === 2) {
                list($major, $minor) = $versionParts;
            }
        }

        return [(int) $major, (int) $minor];
    }

    /**
     * Get the catalog dictionary.
     *
     * Resolves the "Root" entry of the trailer and ensures that the result is a dictionary.
     *
     * @return PdfDictionary
     * @throws Type\PdfTypeException
     * @throws CrossReferenceException
     * @throws PdfParserException
     */
    public function getCatalog()
    {
        $trailer = $this->getCrossReference()->getTrailer();

        $catalog = PdfType::resolve(PdfDictionary::get($trailer, 'Root'), $this);

        return PdfDictionary::ensure($catalog);
    }

    /**
     * Get an indirect object by its object number.
     *
     * An object that is already cached is always returned from the cache, independent of $cache.
     * Otherwise it is read through the cross-reference and only stored if $cache is true.
     *
     * @param int $objectNumber The object number (cast to int before use).
     * @param bool $cache Whether a freshly read object should be kept in the internal cache.
     * @return PdfIndirectObject
     * @throws CrossReferenceException
     * @throws PdfParserException
     */
    public function getIndirectObject($objectNumber, $cache = false)
    {
        $objectNumber = (int) $objectNumber;
        if (isset($this->objects[$objectNumber])) {
            return $this->objects[$objectNumber];
        }

        $object = $this->getCrossReference()->getIndirectObject($objectNumber);

        if ($cache) {
            $this->objects[$objectNumber] = $object;
        }

        return $object;
    }

    /**
     * Read a PDF value.
     *
     * Dispatches on the given (or next) token and delegates to the matching type parser. For a
     * numeric token up to two further tokens are read ahead to detect "<n> <g> obj" and "<n> <g> R";
     * if neither matches, the read-ahead tokens are pushed back to the tokenizer and a numeric value
     * is created from the first token.
     *
     * @param null|bool|string $token The token to start with. If null, the next token is fetched
     *                                from the tokenizer. False stands for "no more tokens".
     * @param null|string $expectedType The class name of the expected result type or null to accept
     *                                  any type.
     * @return false|PdfArray|PdfBoolean|PdfDictionary|PdfHexString|PdfIndirectObject|PdfIndirectObjectReference|PdfName|PdfNull|PdfNumeric|PdfString|PdfToken
     *         False is only returned if there is no token and no type is expected.
     * @throws Type\PdfTypeException If the token does not evaluate to the expected type
     *                               (code INVALID_DATA_TYPE).
     */
    public function readValue($token = null, $expectedType = null)
    {
        if ($token === null) {
            $token = $this->tokenizer->getNextToken();
        }

        if ($token === false) {
            if ($expectedType !== null) {
                throw new Type\PdfTypeException('Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE);
            }
            return false;
        }

        switch ($token) {
            case '(':
                $this->ensureExpectedType($token, $expectedType);
                return PdfString::parse($this->streamReader);

            case '<':
                // A directly following second "<" marks a dictionary ("<<"), otherwise this is a
                // hex string. The second "<" is skipped before the dictionary is parsed.
                if ($this->streamReader->getByte() === '<') {
                    $this->ensureExpectedType('<<', $expectedType);
                    $this->streamReader->addOffset(1);
                    return PdfDictionary::parse($this->tokenizer, $this->streamReader, $this);
                }

                $this->ensureExpectedType($token, $expectedType);
                return PdfHexString::parse($this->streamReader);

            case '/':
                $this->ensureExpectedType($token, $expectedType);
                return PdfName::parse($this->tokenizer, $this->streamReader);

            case '[':
                $this->ensureExpectedType($token, $expectedType);
                return PdfArray::parse($this->tokenizer, $this);

            default:
                if (\is_numeric($token)) {
                    if (($token2 = $this->tokenizer->getNextToken()) !== false) {
                        // The third token is only fetched if the second one is numeric, too.
                        if (\is_numeric($token2) && ($token3 = $this->tokenizer->getNextToken()) !== false) {
                            switch ($token3) {
                                case 'obj':
                                    if ($expectedType !== null && $expectedType !== PdfIndirectObject::class) {
                                        throw new Type\PdfTypeException(
                                            'Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE
                                        );
                                    }

                                    return PdfIndirectObject::parse(
                                        $token,
                                        $token2,
                                        $this,
                                        $this->tokenizer,
                                        $this->streamReader
                                    );
                                case 'R':
                                    if ($expectedType !== null &&
                                        $expectedType !== PdfIndirectObjectReference::class
                                    ) {
                                        throw new Type\PdfTypeException(
                                            'Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE
                                        );
                                    }

                                    return PdfIndirectObjectReference::create($token, $token2);
                            }

                            // Neither an object nor a reference: give the third token back first...
                            $this->tokenizer->pushStack($token3);
                        }

                        // ...and the second token afterwards.
                        $this->tokenizer->pushStack($token2);
                    }

                    if ($expectedType !== null && $expectedType !== PdfNumeric::class) {
                        throw new Type\PdfTypeException(
                            'Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE
                        );
                    }
                    return PdfNumeric::create($token);
                }

                if ($token === 'true' || $token === 'false') {
                    $this->ensureExpectedType($token, $expectedType);
                    return PdfBoolean::create($token === 'true');
                }

                if ($token === 'null') {
                    $this->ensureExpectedType($token, $expectedType);
                    return new PdfNull();
                }

                // Everything else is passed through as a plain token.
                if ($expectedType !== null && $expectedType !== PdfToken::class) {
                    throw new Type\PdfTypeException(
                        'Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE
                    );
                }

                $v = new PdfToken();
                $v->value = $token;

                return $v;
        }
    }

    /**
     * Ensures that the token will evaluate to an expected object type (or not).
     *
     * Only the tokens listed in the internal mapping are supported: "(", "<", "<<", "/", "[",
     * "true", "false" and "null".
     *
     * @param string $token One of the tokens listed above.
     * @param string|null $expectedType The expected class name or null if any type is accepted.
     * @return bool Always true; a mismatch is reported through the exception.
     * @throws Type\PdfTypeException If the type the token maps to differs from the expected type
     *                               (code INVALID_DATA_TYPE).
     */
    private function ensureExpectedType($token, $expectedType)
    {
        static $mapping = [
            '(' => PdfString::class,
            '<' => PdfHexString::class,
            '<<' => PdfDictionary::class,
            '/' => PdfName::class,
            '[' => PdfArray::class,
            'true' => PdfBoolean::class,
            'false' => PdfBoolean::class,
            'null' => PdfNull::class
        ];

        if ($expectedType === null || $mapping[$token] === $expectedType) {
            return true;
        }

        throw new Type\PdfTypeException('Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE);
    }
}
379