xref: /dokuwiki/inc/Parsing/ParserMode/AbstractMode.php (revision 6b33ca93d31259cdb3d5ae2fc49b5215b6bde268)
1<?php
2
3namespace dokuwiki\Parsing\ParserMode;
4
5use dokuwiki\Parsing\Lexer\Lexer;
6
7/**
 * This class and its subclasses are used to reduce the effort required to register
 * modes with the Lexer.
10 *
11 * @author Harry Fuecks <hfuecks@gmail.com>
12 */
abstract class AbstractMode implements ModeInterface
{
    /** @var Lexer $Lexer will be injected on loading FIXME this should be done by setter */
    public $Lexer;

    /**
     * Names of the modes that {@see self::accepts()} reports as allowed.
     * Empty by default.
     */
    protected $allowedModes = [];

    /**
     * Zero-width assertion: not at the start of a paragraph break.
     *
     * Paragraph boundaries are blank lines — two newlines possibly separated
     * by horizontal whitespace. The lexer compiles all patterns with the `s`
     * (DOTALL) flag, so a plain `.*` inside an entry-pattern lookahead would
     * match across blank lines and let an unclosed delimiter greedily consume
     * following paragraphs. Place this assertion before a character class to
     * stop the match at a paragraph boundary.
     */
    protected const NOT_AT_PARA_BREAK = '(?!\n[ \t]*\n)';

    /**
     * Quantified group matching any character that does not start a paragraph
     * break. Convenience for the common case of "consume until paragraph end".
     *
     * Example:
     *     return '\*\*(?=' . self::CONTENT_UNTIL_PARA . '\*\*)';
     */
    protected const CONTENT_UNTIL_PARA = '(?:' . self::NOT_AT_PARA_BREAK . '.)*';

    /**
     * Character class: a single "non-word" character — ASCII whitespace or
     * any ASCII punctuation character except the underscore.
     *
     * The `_` is excluded because it is itself a delimiter for emphasis in
     * GFM/CommonMark; treating it as non-word would let `__foo` incorrectly
     * open emphasis at the second `_`.
     *
     * Multibyte rationale: the lexer compiles patterns without the `u` flag,
     * so UTF-8 is treated as individual bytes. Multibyte characters begin
     * with bytes >= 0x80, which fall outside every ASCII character class.
     * Checking that the surrounding context matches NON_WORD_CHAR positively
     * therefore correctly treats multibyte letters as word-like — preventing
     * intraword matches in non-Latin text (e.g. `für_etwas`, `日本_語`)
     * without requiring `u` flag support across the whole lexer.
     */
    protected const NON_WORD_CHAR = '[\s!"#$%&\'()*+,\-./:;<=>?@\[\\\\\]^`{|}~]';

    /**
     * Zero-width assertion: current position is preceded by a non-word
     * character, or is at the start of input/line. See {@see self::NON_WORD_CHAR}
     * for the multibyte reasoning.
     */
    protected const NO_WORD_BEFORE = '(?:^|(?<=' . self::NON_WORD_CHAR . '))';

    /**
     * Zero-width assertion: current position is followed by a non-word
     * character, or is at the end of input. Complement to
     * {@see self::NO_WORD_BEFORE}.
     */
    protected const NO_WORD_AFTER = '(?:\z|(?=' . self::NON_WORD_CHAR . '))';

    /** @inheritdoc */
    abstract public function getSort();

    /**
     * @inheritdoc
     *
     * No-op in this base class; subclasses may override.
     */
    public function preConnect()
    {
    }

    /**
     * @inheritdoc
     *
     * No-op in this base class; subclasses may override.
     */
    public function connectTo($mode)
    {
    }

    /**
     * @inheritdoc
     *
     * No-op in this base class; subclasses may override.
     */
    public function postConnect()
    {
    }

    /**
     * @inheritdoc
     *
     * Reports whether $mode is one of the entries in $allowedModes.
     */
    public function accepts($mode)
    {
        // The cast normalises a non-array value (null becomes [], a scalar
        // becomes a one-element list) before the lookup.
        $allowed = (array) $this->allowedModes;

        // Strict comparison: loose `==` would let values such as `true`
        // match every mode name and would compare numeric-looking strings
        // by value instead of by identity.
        return in_array($mode, $allowed, true);
    }
}
96