<?php

declare(strict_types=1);

/*
 * This file is part of the league/commonmark package.
 *
 * (c) Colin O'Dell <colinodell@gmail.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace League\CommonMark;

use League\CommonMark\Exception\UnexpectedEncodingException;

/**
 * A character-based cursor over a single line of ASCII or UTF-8 text.
 *
 * Positions are counted in characters (not bytes). Columns are tracked
 * separately from positions because a tab advances the column to the next
 * multiple of 4 while only occupying one character position.
 */
class Cursor
{
    public const INDENT_LEVEL = 4;

    /**
     * @var string
     */
    private $line;

    /**
     * Length of the line, in characters
     *
     * @var int
     */
    private $length;

    /**
     * It's possible for this to be 1 char past the end, meaning we've parsed all chars and have
     * reached the end. In this state, any character-returning method MUST return null.
     *
     * @var int
     */
    private $currentPosition = 0;

    /**
     * @var int
     */
    private $column = 0;

    /**
     * Indent computed by the most recent getNextNonSpacePosition() scan
     *
     * @var int
     */
    private $indent = 0;

    /**
     * @var int
     */
    private $previousPosition = 0;

    /**
     * Memoized result of getNextNonSpacePosition(); null when it must be recomputed
     *
     * @var int|null
     */
    private $nextNonSpaceCache;

    /**
     * @var bool
     */
    private $partiallyConsumedTab = false;

    /**
     * @var bool
     */
    private $lineContainsTabs;

    /**
     * True when the character count differs from the byte count
     *
     * @var bool
     */
    private $isMultibyte;

    /**
     * Characters already extracted via mb_substr(), keyed by character index
     *
     * @var array<int, string>
     */
    private $charCache = [];

    /**
     * @param string $line The line being parsed (ASCII or UTF-8)
     *
     * @throws UnexpectedEncodingException if the line is not valid UTF-8
     */
    public function __construct(string $line)
    {
        if (!\mb_check_encoding($line, 'UTF-8')) {
            throw new UnexpectedEncodingException('Unexpected encoding - UTF-8 or ASCII was expected');
        }

        $this->line = $line;
        $this->length = \mb_strlen($line, 'UTF-8') ?: 0;
        $this->isMultibyte = $this->length !== \strlen($line);
        $this->lineContainsTabs = false !== \strpos($line, "\t");
    }

    /**
     * Returns the position of the next character which is not a space (or tab)
     *
     * The result is cached until the cursor moves. As a side effect, the
     * indent (in columns) between the current column and that character is
     * recorded for getIndent().
     *
     * @return int
     */
    public function getNextNonSpacePosition(): int
    {
        if ($this->nextNonSpaceCache !== null) {
            return $this->nextNonSpaceCache;
        }

        $i = $this->currentPosition;
        $cols = $this->column;

        while (($c = $this->getCharacter($i)) !== null) {
            if ($c === ' ') {
                $i++;
                $cols++;
            } elseif ($c === "\t") {
                $i++;
                // A tab moves to the next multiple-of-4 column
                $cols += (4 - ($cols % 4));
            } else {
                break;
            }
        }

        // $c is null only when the scan ran off the end of the line
        $nextNonSpace = ($c === null) ? $this->length : $i;
        $this->indent = $cols - $this->column;

        return $this->nextNonSpaceCache = $nextNonSpace;
    }

    /**
     * Returns the next character which isn't a space (or tab)
     *
     * @return string|null The character, or null if only spaces/tabs remain
     */
    public function getNextNonSpaceCharacter(): ?string
    {
        return $this->getCharacter($this->getNextNonSpacePosition());
    }

    /**
     * Calculates the current indent (number of spaces after current position)
     *
     * @return int
     */
    public function getIndent(): int
    {
        if ($this->nextNonSpaceCache === null) {
            // The indent is computed as a side effect of this scan
            $this->getNextNonSpacePosition();
        }

        return $this->indent;
    }

    /**
     * Whether the cursor is indented to INDENT_LEVEL
     *
     * @return bool
     */
    public function isIndented(): bool
    {
        return $this->getIndent() >= self::INDENT_LEVEL;
    }

    /**
     * Returns the character at the given position
     *
     * @param int|null $index Character index; defaults to the current position
     *
     * @return string|null The character, or null if the index is out of bounds
     */
    public function getCharacter(?int $index = null): ?string
    {
        if ($index === null) {
            $index = $this->currentPosition;
        }

        // Index out-of-bounds, or we're at the end
        if ($index < 0 || $index >= $this->length) {
            return null;
        }

        if ($this->isMultibyte) {
            if (isset($this->charCache[$index])) {
                return $this->charCache[$index];
            }

            return $this->charCache[$index] = \mb_substr($this->line, $index, 1, 'UTF-8');
        }

        // Single-byte line: byte offsets and character offsets coincide
        return $this->line[$index];
    }

    /**
     * Returns the next character (or null, if none) without advancing forwards
     *
     * @param int $offset
     *
     * @return string|null
     */
    public function peek(int $offset = 1): ?string
    {
        return $this->getCharacter($this->currentPosition + $offset);
    }

    /**
     * Whether the remainder is blank
     *
     * @return bool
     */
    public function isBlank(): bool
    {
        return $this->nextNonSpaceCache === $this->length || $this->getNextNonSpacePosition() === $this->length;
    }

    /**
     * Move the cursor forwards
     *
     * @return void
     */
    public function advance()
    {
        $this->advanceBy(1);
    }

    /**
     * Move the cursor forwards
     *
     * @param int  $characters       Number of characters to advance by
     * @param bool $advanceByColumns Whether to advance by columns instead of spaces
     *
     * @return void
     */
    public function advanceBy(int $characters, bool $advanceByColumns = false)
    {
        // The previous position is refreshed even when nothing is consumed
        $this->previousPosition = $this->currentPosition;

        if ($characters === 0) {
            return;
        }

        $this->nextNonSpaceCache = null;

        // Only lines known to contain tabs need the upcoming characters inspected
        $nextFewChars = '';
        if ($this->lineContainsTabs) {
            $nextFewChars = $this->isMultibyte
                ? \mb_substr($this->line, $this->currentPosition, $characters, 'UTF-8')
                // substr() may return false on PHP 7; normalize to a string
                : (string) \substr($this->line, $this->currentPosition, $characters);
        }

        // Optimization to avoid tab handling logic if we have no tabs
        if (\strpos($nextFewChars, "\t") === false) {
            $length = \min($characters, $this->length - $this->currentPosition);
            $this->partiallyConsumedTab = false;
            $this->currentPosition += $length;
            $this->column += $length;

            return;
        }

        // From here on $nextFewChars is known to contain at least one tab
        if ($characters === 1) {
            $asArray = [$nextFewChars];
        } elseif ($this->isMultibyte) {
            /** @var string[] $asArray */
            $asArray = \preg_split('//u', $nextFewChars, -1, \PREG_SPLIT_NO_EMPTY);
        } else {
            $asArray = \str_split($nextFewChars);
        }

        foreach ($asArray as $c) {
            if ($c === "\t") {
                $charsToTab = 4 - ($this->column % 4);
                if ($advanceByColumns) {
                    // The tab is only partially consumed when the requested
                    // columns run out before reaching the next tab stop; in
                    // that case the position stays on the tab character.
                    $this->partiallyConsumedTab = $charsToTab > $characters;
                    $charsToAdvance = $charsToTab > $characters ? $characters : $charsToTab;
                    $this->column += $charsToAdvance;
                    $this->currentPosition += $this->partiallyConsumedTab ? 0 : 1;
                    $characters -= $charsToAdvance;
                } else {
                    $this->partiallyConsumedTab = false;
                    $this->column += $charsToTab;
                    $this->currentPosition++;
                    $characters--;
                }
            } else {
                $this->partiallyConsumedTab = false;
                $this->currentPosition++;
                $this->column++;
                $characters--;
            }

            if ($characters <= 0) {
                break;
            }
        }
    }

    /**
     * Advances the cursor by a single space or tab, if present
     *
     * @return bool Whether the cursor moved
     */
    public function advanceBySpaceOrTab(): bool
    {
        $character = $this->getCharacter();

        if ($character === ' ' || $character === "\t") {
            $this->advanceBy(1, true);

            return true;
        }

        return false;
    }

    /**
     * Parse zero or more space/tab characters
     *
     * @return int Number of positions moved
     */
    public function advanceToNextNonSpaceOrTab(): int
    {
        $newPosition = $this->getNextNonSpacePosition();
        $this->advanceBy($newPosition - $this->currentPosition);
        $this->partiallyConsumedTab = false;

        return $this->currentPosition - $this->previousPosition;
    }

    /**
     * Parse zero or more space characters, including at most one newline.
     *
     * Tab characters are not parsed with this function.
     *
     * @return int Number of positions moved
     */
    public function advanceToNextNonSpaceOrNewline(): int
    {
        $remainder = $this->getRemainder();

        // Optimization: Avoid the regex if we know there are no spaces or newlines
        if ($remainder === '' || ($remainder[0] !== ' ' && $remainder[0] !== "\n")) {
            $this->previousPosition = $this->currentPosition;

            return 0;
        }

        $matches = [];
        \preg_match('/^ *(?:\n *)?/', $remainder, $matches, \PREG_OFFSET_CAPTURE);

        // [0][0] contains the matched text
        // [0][1] contains the index of that match
        // The match consists solely of spaces and "\n", so its byte length equals its character length
        $increment = $matches[0][1] + \strlen($matches[0][0]);

        $this->advanceBy($increment);

        return $this->currentPosition - $this->previousPosition;
    }

    /**
     * Move the position to the very end of the line
     *
     * @return int The number of characters moved
     */
    public function advanceToEnd(): int
    {
        $this->previousPosition = $this->currentPosition;
        $this->nextNonSpaceCache = null;

        $this->currentPosition = $this->length;

        return $this->currentPosition - $this->previousPosition;
    }

    /**
     * Returns the text from the current position to the end of the line
     *
     * If a tab has been partially consumed, the columns remaining up to the
     * next tab stop are returned as spaces in place of that tab.
     *
     * @return string
     */
    public function getRemainder(): string
    {
        if ($this->currentPosition >= $this->length) {
            return '';
        }

        $prefix = '';
        $position = $this->currentPosition;
        if ($this->partiallyConsumedTab) {
            // Skip the tab itself and substitute its unconsumed columns
            $position++;
            $charsToTab = 4 - ($this->column % 4);
            $prefix = \str_repeat(' ', $charsToTab);
        }

        $subString = $this->isMultibyte ?
            \mb_substr($this->line, $position, null, 'UTF-8') :
            \substr($this->line, $position);

        return $prefix . $subString;
    }

    /**
     * Returns the full line this cursor was created with
     *
     * @return string
     */
    public function getLine(): string
    {
        return $this->line;
    }

    /**
     * Whether the cursor has reached (or passed) the end of the line
     *
     * @return bool
     */
    public function isAtEnd(): bool
    {
        return $this->currentPosition >= $this->length;
    }

    /**
     * Try to match a regular expression
     *
     * Returns the matching text and advances to the end of that match
     *
     * @param string $regex
     *
     * @return string|null The matched text, or null if the remainder does not match
     */
    public function match(string $regex): ?string
    {
        $subject = $this->getRemainder();

        if (!\preg_match($regex, $subject, $matches, \PREG_OFFSET_CAPTURE)) {
            return null;
        }

        // $matches[0][0] contains the matched text
        // $matches[0][1] contains the index of that match

        if ($this->isMultibyte) {
            // PREG_OFFSET_CAPTURE always returns the byte offset, not the char offset, which is annoying
            $offset = \mb_strlen(\substr($subject, 0, $matches[0][1]), 'UTF-8');
            $matchLength = \mb_strlen($matches[0][0], 'UTF-8');
        } else {
            $offset = $matches[0][1];
            $matchLength = \strlen($matches[0][0]);
        }

        // Skip everything before the match as well as the match itself
        $this->advanceBy($offset + $matchLength);

        return $matches[0][0];
    }

    /**
     * Encapsulates the current state of this cursor in case you need to rollback later.
     *
     * WARNING: Do not parse or use the return value for ANYTHING except for
     * passing it back into restoreState(), as the number of values and their
     * contents may change in any future release without warning.
     *
     * @return array<mixed>
     */
    public function saveState()
    {
        return [
            $this->currentPosition,
            $this->previousPosition,
            $this->nextNonSpaceCache,
            $this->indent,
            $this->column,
            $this->partiallyConsumedTab,
        ];
    }

    /**
     * Restore the cursor to a previous state.
     *
     * Pass in the value previously obtained by calling saveState().
     *
     * @param array<mixed> $state
     *
     * @return void
     */
    public function restoreState($state)
    {
        // Order must mirror the array built by saveState()
        [
            $this->currentPosition,
            $this->previousPosition,
            $this->nextNonSpaceCache,
            $this->indent,
            $this->column,
            $this->partiallyConsumedTab,
        ] = $state;
    }

    /**
     * Returns the current position, in characters
     *
     * @return int
     */
    public function getPosition(): int
    {
        return $this->currentPosition;
    }

    /**
     * Returns the text between the previous position and the current position
     *
     * @return string
     */
    public function getPreviousText(): string
    {
        return \mb_substr($this->line, $this->previousPosition, $this->currentPosition - $this->previousPosition, 'UTF-8');
    }

    /**
     * Returns part of the line, using character (not byte) offsets
     *
     * @param int      $start  Character index to start from
     * @param int|null $length Number of characters, or null for everything up to the end
     *
     * @return string
     */
    public function getSubstring(int $start, ?int $length = null): string
    {
        if ($this->isMultibyte) {
            return \mb_substr($this->line, $start, $length, 'UTF-8');
        }

        // substr() returns false on PHP 7 when $start lies beyond the end of
        // the string, which would violate the declared string return type.
        if ($length !== null) {
            return (string) \substr($this->line, $start, $length);
        }

        return (string) \substr($this->line, $start);
    }

    /**
     * Returns the current column (tabs count as advancing to the next multiple of 4)
     *
     * @return int
     */
    public function getColumn(): int
    {
        return $this->column;
    }
}