xref: /dokuwiki/inc/Parsing/ParserMode/GfmLink.php (revision eb15e634e1400f6c4d78f5fb40179ca25f41574d)
1e89aeebdSAndreas Gohr<?php
2e89aeebdSAndreas Gohr
3e89aeebdSAndreas Gohrnamespace dokuwiki\Parsing\ParserMode;
4e89aeebdSAndreas Gohr
5e89aeebdSAndreas Gohruse dokuwiki\Parsing\Handler;
674031e46SAndreas Gohruse dokuwiki\Parsing\Helpers\Escape;
7*eb15e634SAndreas Gohruse dokuwiki\Parsing\Helpers\HtmlEntity;
81e28e406SAndreas Gohruse dokuwiki\Parsing\Helpers\Link;
91e28e406SAndreas Gohruse dokuwiki\Parsing\Helpers\Media as MediaHelper;
10e89aeebdSAndreas Gohr
/**
 * GFM inline link [text](url) with optional title [text](url "title").
 *
 * The link text may be either plain text (the common case) or an inline
 * image `![alt](imgUrl)` — the Markdown equivalent of DW's
 * `[[target|{{imgUrl}}]]`. The image-as-label form emits a single link
 * handler call with a media descriptor array in the label slot, reusing
 * the same flow that `Internallink` already drives. No new handler
 * instructions; renderers (xhtml, odt, metadata, …) already know how to
 * render a link whose label is a media descriptor.
 *
 * Mirrors DW's `Internallink` architecture: a permissive outer pattern
 * plus handle-time parsing, rather than encoding every GFM rule at
 * pattern level.
 *
 * Deliberately not supported (see skip.php for the affected spec examples):
 *
 *   - Reference links [text][id] / [text][] / [foo] — the single-pass
 *     lexer cannot resolve forward references to [foo]: url definitions.
 *   - Pointy-bracket destinations [link](<foo bar>) — the simplified
 *     pattern will happily match, but handle() produces an internallink
 *     with a broken src; spec tests for this stay in skip.php.
 *   - Balanced-parens inside URLs [link](foo(bar)) — matches truncate
 *     at first `)`, producing odd output; also in skip.php.
 *   - Title HTML attribute — DokuWiki link handler instructions have no
 *     title-attribute slot, and plumbing one through every renderer just
 *     for this is out of scope. The title parses cleanly but is discarded.
 *   - Mixed text + image in the label ([prefix ![alt](img) suffix](url))
 *     — matches DW's policy: Internallink only converts the label to a
 *     media descriptor when it matches `^{{…}}$` exactly.
 */
class GfmLink extends AbstractMode
{
    // URL slot character set: any non-paren / non-newline char, OR a
    // backslash-escape sequence so an escaped `\)` doesn't terminate the
    // URL early (spec examples 504/506/508). Backslash-unescape is
    // applied post-extraction; the pattern only needs to keep escaped
    // close-parens from prematurely ending the match.
    //
    // The escaped character is spelled `[^\n]` rather than `.` on
    // purpose: `.` changes meaning with the dotall (`s`) modifier, and
    // this fragment is compiled twice — once by the lexer and once by the
    // modifier-less preg_match() in handle(). With `[^\n]` both sites
    // agree and an escape can never swallow a line break.
    // NOTE(review): the lexer's compile flags are not visible in this
    // file; the explicit class is equivalent to `.` when dotall is off.
    private const URL_CHAR = '(?:\\\\[^\n]|[^)\n])';

    // Image sub-pattern reused for both the label alternative in the main
    // pattern and the image-as-label detector in handle(). No capture
    // groups here — the lexer wraps user patterns in a capture and
    // additional captures would renumber unpredictably.
    private const IMAGE_SUB = '!\[[^\[\]\n]*\]\(' . self::URL_CHAR . '+\)';

    /** @inheritdoc */
    public function getSort()
    {
        return 300;
    }

    /** @inheritdoc */
    public function connectTo($mode)
    {
        // Outer shape: `[text-or-image](url)`. Text class forbids brackets
        // and newlines; the image alternative explicitly matches one
        // inline image. The URL slot is a permissive run of URL_CHAR
        // (anything but `)` / newline, plus backslash escapes) — handle()
        // does URL / title splitting post-entry, mirroring how DW
        // Internallink parses inside `[[...]]`.
        $pattern = '\[(?!\[)(?:[^\[\]\n]+|' . self::IMAGE_SUB . ')\]\(' . self::URL_CHAR . '+\)';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_link');
    }

    /** @inheritdoc */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // Detect image-as-label `[![alt](img)](target)`. Parallels
        // Internallink's `^{{…}}$` check — when the label is exactly an
        // inline image, parse it into a media descriptor; otherwise
        // treat the label as plain text.
        if (preg_match('/^\[(' . self::IMAGE_SUB . ')\]\((' . self::URL_CHAR . '+)\)$/', $match, $m)) {
            $label     = $this->parseImageDescriptor($m[1]);
            $targetUrl = $this->extractUrl($m[2]);
        } else {
            // Plain text label can't contain `]`, so the first `](` is
            // the label/target separator.
            $sep       = strpos($match, '](');
            $label     = Escape::unescapeBackslashes(substr($match, 1, $sep - 1));
            // Drop the closing `)` of the match along with the separator.
            $targetUrl = $this->extractUrl(substr($match, $sep + 2, -1));
        }

        // Classify on the raw URL so windowssharelink detection sees the
        // literal `\\host\path` runs intact — GFM's `\\` → `\` collapse
        // would otherwise destroy the share prefix.
        [$call, $args] = Link::classify($targetUrl, $label);
        if ($call !== 'windowssharelink') {
            // $args[0] is unescaped only now, after classification.
            $args[0] = Escape::unescapeBackslashes($args[0]);
        }
        $handler->addCall($call, $args, $pos);
        return true;
    }

    /**
     * Extract the URL from a parenthesized payload: trim surrounding
     * whitespace, take the first whitespace-delimited token, then
     * apply GFM's URL-slot transformations (entity decoding;
     * backslash-unescape happens later, after Link::classify, because
     * windowssharelink detection needs the raw `\\` runs intact).
     * Any trailing title is discarded (no renderer slot for it).
     *
     * @param string $inside text between the parentheses, without them
     * @return string the entity-decoded URL token (may be empty)
     */
    private function extractUrl(string $inside): string
    {
        $inside = trim($inside);
        // Everything from the first space / tab / newline on is the
        // optional title and is cut off.
        $url    = substr($inside, 0, strcspn($inside, " \t\n"));
        return HtmlEntity::decode($url);
    }

    /**
     * Parse an inline image sub-match `![alt](imgUrl)` into the media
     * descriptor shape Media::parseMedia() returns, so the link handler
     * can treat it as a media label identically to `[[page|{{img}}]]`.
     *
     * @param string $imageMatch the full `![alt](imgUrl)` text
     * @return array media descriptor with the keys type, src, title,
     *               align, width, height, cache and linking
     */
    private function parseImageDescriptor(string $imageMatch): array
    {
        // The alt text cannot contain brackets, so the first `](` splits
        // alt from URL. Offset 2 skips the leading `![`.
        $sep    = strpos($imageMatch, '](');
        $alt    = Escape::unescapeBackslashes(substr($imageMatch, 2, $sep - 2));
        $imgUrl = Escape::unescapeBackslashes($this->extractUrl(substr($imageMatch, $sep + 2, -1)));

        $p = MediaHelper::parseParameters($imgUrl);
        $type = (media_isexternal($p['src']) || link_isinterwiki($p['src']))
            ? 'externalmedia'
            : 'internalmedia';

        return [
            'type'    => $type,
            'src'     => $p['src'],
            // An empty alt text is reported as "no title".
            'title'   => $alt !== '' ? $alt : null,
            'align'   => $p['align'],
            'width'   => $p['width'],
            'height'  => $p['height'],
            'cache'   => $p['cache'],
            'linking' => $p['linking'],
        ];
    }
}
147