xref: /dokuwiki/inc/Parsing/ParserMode/AbstractMode.php (revision 75364f13219a5af44f52c564ea0a62df64c3a17f)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Handler;
6use dokuwiki\Parsing\Lexer\Lexer;
7use dokuwiki\Parsing\ModeRegistry;
8
9/**
10 * Base class for every parser mode (syntax component) in the Parser.
11 *
12 * Besides reducing the effort required to register modes with the Lexer, this
13 * class defines the mode contract the engine relies on: getSort() and handle()
14 * are abstract and must be implemented by every mode; preConnect(), connectTo(),
15 * postConnect() and accepts() carry default implementations subclasses override
16 * as needed. Parser, Handler and ModeRegistry type-hint this class directly.
17 *
18 * @author Harry Fuecks <hfuecks@gmail.com>
19 */
abstract class AbstractMode
{
    /**
     * @var Lexer lexer on which this mode registers its patterns.
     *
     * Assigned through setLexer(), which Parser::addMode() / addBaseMode()
     * call ahead of every connect callback. Core modes and plugins therefore
     * use $this->Lexer inside connectTo(); code outside the class hierarchy
     * goes through getLexer().
     */
    protected Lexer $Lexer;

    /**
     * @var ModeRegistry registry belonging to the parse this mode is part of.
     *
     * Assigned through setModeRegistry() as the mode joins the parser, ahead
     * of every connect/handle callback. Subclasses can rely on it being set
     * in preConnect(), connectTo(), postConnect(), handle() and accepts().
     */
    protected ModeRegistry $registry;

    /**
     * @var string[] names of the modes that may appear nested inside this one.
     *
     * Populated a single time by setModeRegistry(): when allowedCategories()
     * yields categories they are translated to mode names by the registry,
     * otherwise the value a subclass assigned here is taken unchanged. Either
     * way the list is then run through filterAllowedModes().
     *
     * Deliberately left without a native type so that subclasses which
     * redeclare the property without one stay compatible.
     */
    protected $allowedModes = [];

    //region Pattern building blocks

    /**
     * Zero-width assertion that fails at the start of a paragraph break.
     *
     * A paragraph break is a blank line: two newlines with optional spaces or
     * tabs in between. Because the lexer compiles patterns with the `s`
     * (DOTALL) flag, an unguarded `.*` in an entry-pattern lookahead crosses
     * blank lines, so an unclosed delimiter could swallow the paragraphs that
     * follow. Prefixing the consumed character with this assertion makes the
     * match stop at the paragraph boundary instead.
     */
    protected const NOT_AT_PARA_BREAK = '(?!\n[ \t]*\n)';

    /**
     * Repeated group that consumes characters up to, but not into, the next
     * paragraph break. Shorthand for the usual "everything until the
     * paragraph ends" lookahead.
     *
     * Example:
     *     return '\*\*(?=' . self::CONTENT_UNTIL_PARA . '\*\*)';
     */
    protected const CONTENT_UNTIL_PARA = '(?:' . self::NOT_AT_PARA_BREAK . '.)*';

    /**
     * Character class for one "non-word" character: ASCII whitespace or any
     * ASCII punctuation character other than the underscore.
     *
     * The underscore is left out because it is an emphasis delimiter in
     * GFM/CommonMark; counting it as non-word would let `__foo` wrongly open
     * emphasis at its second `_`.
     *
     * Multibyte rationale: patterns are compiled without the `u` flag, so
     * UTF-8 text is matched byte by byte. Every byte of a multibyte character
     * is >= 0x80 and thus outside this ASCII-only class. Requiring the
     * neighbouring context to match this class *positively* consequently
     * treats multibyte letters as word characters, which avoids intraword
     * matches in non-Latin text (e.g. `für_etwas`, `日本_語`) without the
     * lexer needing `u` flag support.
     */
    protected const NON_WORD_CHAR = '[\s!"#$%&\'()*+,\-./:;<=>?@\[\\\\\]^`{|}~]';

    /**
     * Zero-width assertion: the position is at the start of input/line or is
     * directly preceded by a non-word character. The multibyte reasoning is
     * given at {@see self::NON_WORD_CHAR}.
     */
    protected const NO_WORD_BEFORE = '(?:^|(?<=' . self::NON_WORD_CHAR . '))';

    /**
     * Zero-width assertion: the position is at the end of input or is
     * directly followed by a non-word character. Counterpart of
     * {@see self::NO_WORD_BEFORE}.
     */
    protected const NO_WORD_AFTER = '(?:\z|(?=' . self::NON_WORD_CHAR . '))';

    //endregion

    //region Lexer connection

    /**
     * Sort key deciding the order in which modes are added.
     *
     * @return int
     */
    abstract public function getSort();

    /**
     * Process one token the lexer matched for this mode.
     *
     * @param string $match The text that was matched
     * @param int $state The lexer state (DOKU_LEXER_ENTER, _EXIT, _MATCHED, etc.)
     * @param int $pos Byte offset of the match in the source
     * @param Handler $handler The handler (for addCall, status, etc.)
     * @return bool
     */
    abstract public function handle($match, $state, $pos, Handler $handler);

    /**
     * Hook invoked before any connectTo() call. Does nothing by default.
     *
     * @return void
     */
    public function preConnect()
    {
    }

    /**
     * Hook that connects this mode to the given mode. Does nothing by
     * default; subclasses override it to register their patterns.
     *
     * @param string $mode
     * @return void
     */
    public function connectTo($mode)
    {
    }

    /**
     * Hook invoked once all connectTo() calls are done. Does nothing by
     * default.
     *
     * @return void
     */
    public function postConnect()
    {
    }

    //endregion

    //region Dependency injection

    /**
     * Store the registry of the current parse and work out which modes may
     * nest inside this one.
     *
     * Parser::addMode() / addBaseMode() call this while the mode joins the
     * parser. It is the first moment the per-parse registry exists, so the
     * accepted-mode list is computed here, exactly once:
     *
     *  - categories declared by allowedCategories() are translated to concrete
     *    mode names through the registry taxonomy (already complete at this
     *    point, plugin modes included);
     *  - with no categories declared, the $allowedModes value assigned by the
     *    subclass is the starting list;
     *  - the starting list is finally passed through filterAllowedModes().
     *
     * @param ModeRegistry $registry
     * @return void
     */
    public function setModeRegistry(ModeRegistry $registry): void
    {
        // Store the registry first: allowedCategories() overrides may read it.
        $this->registry = $registry;

        $categories = $this->allowedCategories();
        if ($categories === []) {
            // Nothing declared: fall back to the directly assigned list.
            $candidates = (array) $this->allowedModes;
        } else {
            $candidates = $registry->getModesForCategories($categories);
        }

        $this->allowedModes = $this->filterAllowedModes($candidates);
    }

    /**
     * Store the lexer on which this mode registers its patterns.
     *
     * Parser::addMode() / addBaseMode() call this while the mode joins the
     * parser, ahead of every connect callback.
     *
     * @param Lexer $lexer
     * @return void
     */
    public function setLexer(Lexer $lexer): void
    {
        $this->Lexer = $lexer;
    }

    /**
     * Accessor for the lexer this mode registers its patterns with.
     *
     * @return Lexer
     */
    public function getLexer(): Lexer
    {
        return $this->Lexer;
    }

    //endregion

    //region Nested mode resolution

    /**
     * CATEGORY_* constants naming the groups of modes allowed inside this one.
     *
     * Override to declare accepted categories. setModeRegistry() translates
     * them to concrete mode names through the parse's taxonomy when the
     * registry is attached. The default, [], means "take $this->allowedModes
     * unchanged" (which is itself empty unless a subclass assigns it).
     *
     * @return string[]
     */
    protected function allowedCategories(): array
    {
        return [];
    }

    /**
     * Final adjustment of the resolved list of accepted modes.
     *
     * Override to drop entries, e.g. so a mode can exclude itself and avoid
     * self-nesting. Invoked from setModeRegistry() after the starting list
     * has been determined. The default returns the list untouched.
     *
     * @param string[] $modes
     * @return string[]
     */
    protected function filterAllowedModes(array $modes): array
    {
        return $modes;
    }

    /**
     * Tell whether the named mode may appear nested inside this mode.
     *
     * Only consults the list prepared by setModeRegistry(); the comparison
     * is strict, so the name must match exactly.
     *
     * @param string $mode
     * @return bool
     */
    public function accepts($mode)
    {
        $accepted = $this->allowedModes;

        return in_array($mode, $accepted, true);
    }

    //endregion
}
255