<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Parsing\Lexer\Lexer;
use dokuwiki\Parsing\ModeRegistry;

/**
 * Base class for every parser mode (syntax component) in the Parser.
 *
 * Besides reducing the effort required to register modes with the Lexer, this
 * class defines the mode contract the engine relies on: getSort() and handle()
 * are abstract and must be implemented by every mode; preConnect(), connectTo(),
 * postConnect() and accepts() carry default implementations subclasses override
 * as needed. Parser, Handler and ModeRegistry type-hint this class directly.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 */
abstract class AbstractMode
{
    /**
     * @var Lexer the lexer this mode registers its patterns with.
     *
     * Injected via setLexer() by Parser::addMode() / addBaseMode() before any
     * connect callback runs, so every core mode and plugin reads it as
     * $this->Lexer from connectTo(). External code reads it via getLexer().
     *
     * The property is typed and has no default: reading it before setLexer()
     * has been called raises an Error.
     */
    protected Lexer $Lexer;

    /**
     * @var ModeRegistry the registry of the parse this mode belongs to.
     *
     * Injected via setModeRegistry() by Parser::addMode() / addBaseMode()
     * before any connect/handle callback runs, so subclasses may read
     * $this->registry unconditionally from preConnect(), connectTo(),
     * postConnect(), handle() and accepts().
     */
    protected ModeRegistry $registry;

    /**
     * @var string[] mode names accepted as nested content inside this mode.
     *
     * Resolved once in setModeRegistry(): allowedCategories() mapped to concrete
     * mode names via the registry, then passed through filterAllowedModes(). A
     * subclass that does not use categories may instead assign this list
     * directly; that list is passed through filterAllowedModes() as well, which
     * leaves it untouched unless the subclass overrides the filter.
     *
     * Deliberately left without a native type so subclasses that redeclare the
     * property untyped stay compatible.
     */
    protected $allowedModes = [];

    //region Pattern building blocks

    /**
     * Zero-width assertion: not at the start of a paragraph break.
     *
     * Paragraph boundaries are blank lines — two newlines possibly separated
     * by horizontal whitespace. The lexer compiles all patterns with the `s`
     * (DOTALL) flag, so a plain `.*` inside an entry-pattern lookahead would
     * match across blank lines and let an unclosed delimiter greedily consume
     * following paragraphs. Place this assertion before a character class to
     * stop the match at a paragraph boundary.
     */
    protected const NOT_AT_PARA_BREAK = '(?!\n[ \t]*\n)';

    /**
     * Quantified group matching any character that does not start a paragraph
     * break. Convenience for the common case of "consume until paragraph end".
     *
     * Example:
     *   return '\*\*(?=' . self::CONTENT_UNTIL_PARA . '\*\*)';
     */
    protected const CONTENT_UNTIL_PARA = '(?:' . self::NOT_AT_PARA_BREAK . '.)*';

    /**
     * Character class: a single "non-word" character — ASCII whitespace or
     * any ASCII punctuation character except the underscore.
     *
     * The `_` is excluded because it is itself a delimiter for emphasis in
     * GFM/CommonMark; treating it as non-word would let `__foo` incorrectly
     * open emphasis at the second `_`.
     *
     * Multibyte rationale: the lexer compiles patterns without the `u` flag,
     * so UTF-8 is treated as individual bytes. Multibyte characters begin
     * with bytes >= 0x80, which fall outside every ASCII character class.
     * Checking that the surrounding context matches NON_WORD_CHAR positively
     * therefore correctly treats multibyte letters as word-like — preventing
     * intraword matches in non-Latin text (e.g. `für_etwas`, `日本_語`)
     * without requiring `u` flag support across the whole lexer.
     */
    protected const NON_WORD_CHAR = '[\s!"#$%&\'()*+,\-./:;<=>?@\[\\\\\]^`{|}~]';

    /**
     * Zero-width assertion: current position is preceded by a non-word
     * character, or is at the start of input/line. See {@see self::NON_WORD_CHAR}
     * for the multibyte reasoning.
     */
    protected const NO_WORD_BEFORE = '(?:^|(?<=' . self::NON_WORD_CHAR . '))';

    /**
     * Zero-width assertion: current position is followed by a non-word
     * character, or is at the end of input. Complement to
     * {@see self::NO_WORD_BEFORE}.
     */
    protected const NO_WORD_AFTER = '(?:\z|(?=' . self::NON_WORD_CHAR . '))';

    //endregion

    //region Lexer connection

    /**
     * Returns a number used to determine in which order modes are added.
     *
     * @return int
     */
    abstract public function getSort();

    /**
     * Handle a matched token from the lexer.
     *
     * @param string $match The matched text
     * @param int $state The lexer state (DOKU_LEXER_ENTER, _EXIT, _MATCHED, etc.)
     * @param int $pos Byte position in the source
     * @param Handler $handler The handler (for addCall, status, etc.)
     * @return bool
     */
    abstract public function handle($match, $state, $pos, Handler $handler);

    /**
     * Called before any calls to connectTo.
     *
     * The default implementation does nothing; override to prepare state the
     * connect callbacks need.
     *
     * @return void
     */
    public function preConnect()
    {
    }

    /**
     * Connects the mode.
     *
     * The default implementation does nothing; override to register this
     * mode's patterns with $this->Lexer.
     *
     * @param string $mode
     * @return void
     */
    public function connectTo($mode)
    {
    }

    /**
     * Called after all calls to connectTo.
     *
     * The default implementation does nothing.
     *
     * @return void
     */
    public function postConnect()
    {
    }

    //endregion

    //region Dependency injection

    /**
     * Attach the registry of the parse this mode is taking part in and resolve
     * the set of modes this mode accepts as nested content.
     *
     * Called by Parser::addMode() / addBaseMode() as the mode joins the parser.
     * This is the earliest point the per-parse registry is available, so the
     * accepted-mode list is resolved here, once: allowedCategories() mapped to
     * concrete mode names via the registry taxonomy (complete by now, plugin
     * modes included). A subclass that declares no categories contributes its
     * directly-assigned $allowedModes instead. Either list is then passed
     * through filterAllowedModes() and stored back into $allowedModes.
     *
     * @param ModeRegistry $registry
     * @return void
     */
    public function setModeRegistry(ModeRegistry $registry): void
    {
        $this->registry = $registry;

        $categories = $this->allowedCategories();
        // No categories declared: fall back to whatever the subclass assigned.
        $modes = $categories !== []
            ? $registry->getModesForCategories($categories)
            : (array) $this->allowedModes;

        // The filter always runs, for category-derived and direct lists alike.
        $this->allowedModes = $this->filterAllowedModes($modes);
    }

    /**
     * Attach the lexer this mode registers its patterns with.
     *
     * Called by Parser::addMode() / addBaseMode() as the mode joins the parser,
     * before any connect callback runs.
     *
     * @param Lexer $lexer
     * @return void
     */
    public function setLexer(Lexer $lexer): void
    {
        $this->Lexer = $lexer;
    }

    /**
     * The lexer this mode registers its patterns with.
     *
     * Must not be called before setLexer(): the backing property is typed and
     * uninitialized until then.
     *
     * @return Lexer
     */
    public function getLexer(): Lexer
    {
        return $this->Lexer;
    }

    //endregion

    //region Nested mode resolution

    /**
     * CATEGORY_* constants whose modes may nest inside this mode.
     *
     * Override to declare the categories this mode accepts. setModeRegistry()
     * resolves them to concrete mode names via the parse's taxonomy, once, at
     * the moment the registry is attached; accepts() only consults the stored
     * result. Returning [] (the default) means the directly-assigned
     * $this->allowedModes is used as the input list instead (empty unless a
     * subclass sets it).
     *
     * @return string[]
     */
    protected function allowedCategories(): array
    {
        return [];
    }

    /**
     * Post-process the resolved allowedModes list.
     *
     * Override to remove entries (e.g. a mode excluding itself to prevent
     * self-nesting). Applied by setModeRegistry() to the category-derived list
     * or, when no categories are declared, to the directly-assigned list. The
     * default returns the list unchanged.
     *
     * @param string[] $modes
     * @return string[]
     */
    protected function filterAllowedModes(array $modes): array
    {
        return $modes;
    }

    /**
     * Check if the given mode is accepted inside this mode.
     *
     * Performs a strict lookup in $allowedModes. That list is resolved in
     * setModeRegistry(); until it has been called, only entries a subclass
     * assigned directly are reported as accepted.
     *
     * @param string $mode
     * @return bool
     */
    public function accepts($mode)
    {
        return in_array($mode, $this->allowedModes, true);
    }

    //endregion
}