<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Lexer\Lexer;

/**
 * Base class for parser modes.
 *
 * This class and its subclasses are used to reduce the effort required to
 * register modes with the Lexer: it supplies no-op defaults for the connect
 * hooks, a default accepts() implementation and a set of reusable regular
 * expression fragments.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 */
abstract class AbstractMode implements ModeInterface
{
    /** @var Lexer $Lexer will be injected on loading FIXME this should be done by setter */
    public $Lexer;

    /**
     * Modes for which accepts() returns true.
     *
     * Entries are compared strictly against the value passed to accepts(),
     * so they must be of the same type as that value (presumably mode name
     * strings; verify against callers).
     *
     * @var array
     */
    protected $allowedModes = [];

    /**
     * Zero-width assertion: not at the start of a paragraph break.
     *
     * Paragraph boundaries are blank lines, i.e. two newlines possibly
     * separated by horizontal whitespace. The lexer compiles all patterns
     * with the `s` (DOTALL) flag, so a plain `.*` inside an entry-pattern
     * lookahead would match across blank lines and let an unclosed delimiter
     * greedily consume following paragraphs. Place this assertion before a
     * character class (or `.`) to stop the match at a paragraph boundary.
     */
    protected const NOT_AT_PARA_BREAK = '(?!\n[ \t]*\n)';

    /**
     * Quantified group matching any run of characters that does not cross the
     * start of a paragraph break. Convenience for the common case of
     * "consume until paragraph end".
     *
     * Example:
     *   return '\*\*(?=' . self::CONTENT_UNTIL_PARA . '\*\*)';
     */
    protected const CONTENT_UNTIL_PARA = '(?:' . self::NOT_AT_PARA_BREAK . '.)*';

    /**
     * Character class: a single "non-word" character, i.e. whitespace (`\s`)
     * or any ASCII punctuation character except the underscore.
     *
     * The `_` is excluded because it is itself a delimiter for emphasis in
     * GFM/CommonMark; treating it as non-word would let `__foo` incorrectly
     * open emphasis at the second `_`.
     *
     * Multibyte rationale: the lexer compiles patterns without the `u` flag,
     * so UTF-8 is treated as individual bytes. Every byte of a multibyte
     * sequence (lead and continuation bytes alike) is >= 0x80 and therefore
     * falls outside this ASCII-only class. Checking that the surrounding
     * context matches NON_WORD_CHAR positively thus treats multibyte letters
     * as word-like, preventing intraword matches in non-Latin text
     * (e.g. `für_etwas`, `日本_語`) without requiring `u` flag support across
     * the whole lexer.
     */
    protected const NON_WORD_CHAR = '[\s!"#$%&\'()*+,\-./:;<=>?@\[\\\\\]^`{|}~]';

    /**
     * Zero-width assertion: current position is preceded by a non-word
     * character, or is at a `^` position. `^` always matches at the start of
     * input; it additionally matches at the start of each line only when the
     * pattern is compiled in multiline mode (NOTE(review): assumed to be the
     * case for the lexer; confirm). See {@see self::NON_WORD_CHAR} for the
     * multibyte reasoning.
     */
    protected const NO_WORD_BEFORE = '(?:^|(?<=' . self::NON_WORD_CHAR . '))';

    /**
     * Zero-width assertion: current position is followed by a non-word
     * character, or is at the very end of input (`\z`). Complement to
     * {@see self::NO_WORD_BEFORE}.
     */
    protected const NO_WORD_AFTER = '(?:\z|(?=' . self::NON_WORD_CHAR . '))';

    /** @inheritdoc */
    abstract public function getSort();

    /**
     * Default no-op implementation; subclasses override it when needed.
     *
     * @inheritdoc
     */
    public function preConnect()
    {
    }

    /**
     * Default no-op implementation; subclasses override it when needed.
     *
     * @inheritdoc
     */
    public function connectTo($mode)
    {
    }

    /**
     * Default no-op implementation; subclasses override it when needed.
     *
     * @inheritdoc
     */
    public function postConnect()
    {
    }

    /**
     * Reports whether the given mode is listed in $allowedModes.
     *
     * @inheritdoc
     */
    public function accepts($mode)
    {
        // The (array) cast tolerates subclasses that set $allowedModes to null
        // or to a single scalar. The comparison is strict so that non-string
        // arguments such as true or 0 cannot loosely match an arbitrary mode
        // name and be reported as accepted.
        return in_array($mode, (array) $this->allowedModes, true);
    }
}