<?php

/**
 * This file is part of FPDI
 *
 * @package   setasign\Fpdi
 * @copyright Copyright (c) 2020 Setasign GmbH & Co. KG (https://www.setasign.com)
 * @license   http://opensource.org/licenses/mit-license The MIT License
 */

namespace setasign\Fpdi\PdfParser;

/**
 * A tokenizer class.
 *
 * Splits the bytes delivered by a StreamReader into tokens. A token is either
 * a single delimiter character or a run of characters that is terminated by a
 * white space or delimiter character. Tokens that were pushed back through
 * pushStack() are handed out before any new bytes are read.
 *
 * @package setasign\Fpdi\PdfParser
 */
class Tokenizer
{
    /**
     * The reader all bytes are taken from.
     *
     * @var StreamReader
     */
    protected $streamReader;

    /**
     * A token stack.
     *
     * Tokens on this stack are returned by getNextToken() in last-in,
     * first-out order before the stream reader is consulted.
     *
     * @var string[]
     */
    protected $stack = [];

    /**
     * Tokenizer constructor.
     *
     * @param StreamReader $streamReader
     */
    public function __construct(StreamReader $streamReader)
    {
        $this->streamReader = $streamReader;
    }

    /**
     * Get the stream reader instance.
     *
     * @return StreamReader
     */
    public function getStreamReader()
    {
        return $this->streamReader;
    }

    /**
     * Clear the token stack.
     */
    public function clearStack()
    {
        $this->stack = [];
    }

    /**
     * Push a token onto the stack.
     *
     * The token pushed last is the first one returned by getNextToken().
     *
     * @param string $token
     */
    public function pushStack($token)
    {
        $this->stack[] = $token;
    }

    /**
     * Get next token.
     *
     * If the token stack is not empty its top element is returned. Otherwise
     * white spaces and comments (a "%" up to the end of the line) are skipped
     * and either a single delimiter character (one of "/[](){}<>") or a
     * regular token, running up to the next white space or delimiter
     * character, is returned.
     *
     * @return string|false The token or false if no more bytes are available.
     */
    public function getNextToken()
    {
        // array_pop() returns null for an empty stack.
        $token = \array_pop($this->stack);
        if ($token !== null) {
            return $token;
        }

        // Comments are skipped in a loop rather than by calling this method
        // recursively: the stack is known to be empty here and a long run of
        // comment lines must not grow the call stack.
        do {
            $byte = $this->streamReader->readByte();
            if ($byte === false) {
                return false;
            }

            if (
                $byte === "\x20" ||
                $byte === "\x0A" ||
                $byte === "\x0D" ||
                $byte === "\x0C" ||
                $byte === "\x09" ||
                $byte === "\x00"
            ) {
                if ($this->leapWhiteSpaces() === false) {
                    return false;
                }

                $byte = $this->streamReader->readByte();
                // Same guard as for the first read: without it a failed read
                // would fall through and yield a bogus token below.
                if ($byte === false) {
                    return false;
                }
            }

            $isComment = ($byte === '%');
            if ($isComment) {
                // Drop the remainder of the comment line.
                $this->streamReader->readLine();
            }
        } while ($isComment);

        switch ($byte) {
            case '/':
            case '[':
            case ']':
            case '(':
            case ')':
            case '{':
            case '}':
            case '<':
            case '>':
                return $byte;
        }

        /* This way is faster than checking single bytes.
         */
        $bufferOffset = $this->streamReader->getOffset();
        do {
            $lastBuffer = $this->streamReader->getBuffer(false);
            // Number of bytes, counted from $bufferOffset, that are neither
            // a white space nor a delimiter character.
            $pos = \strcspn(
                $lastBuffer,
                "\x00\x09\x0A\x0C\x0D\x20()<>[]{}/%",
                $bufferOffset
            );
        } while (
            // Break the loop if a delimiter or white space char is matched
            // in the current buffer or increase the buffers length
            $lastBuffer !== false &&
            (
                $bufferOffset + $pos === \strlen($lastBuffer) &&
                $this->streamReader->increaseLength()
            )
        );

        // The first byte of the token was already consumed by readByte(),
        // so the token starts one byte in front of $bufferOffset.
        $result = \substr($lastBuffer, $bufferOffset - 1, $pos + 1);
        $this->streamReader->setOffset($bufferOffset + $pos);

        return $result;
    }

    /**
     * Leap white spaces.
     *
     * Advances the offset of the stream reader past all consecutive white
     * space bytes (0x20, 0x0A, 0x0C, 0x0D, 0x09 and 0x00), asking the reader
     * for further content whenever the offset reaches the end of its buffer.
     *
     * @return bool False if the stream reader reports that no content is
     *              available, true otherwise.
     */
    public function leapWhiteSpaces()
    {
        do {
            if (!$this->streamReader->ensureContent()) {
                return false;
            }

            $buffer = $this->streamReader->getBuffer(false);
            $matches = \strspn($buffer, "\x20\x0A\x0C\x0D\x09\x00", $this->streamReader->getOffset());
            if ($matches > 0) {
                $this->streamReader->addOffset($matches);
            }
            // Repeat while the whole remaining buffer consisted of white spaces.
        } while ($this->streamReader->getOffset() >= $this->streamReader->getBufferLength());

        return true;
    }
}