<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Parsing\Helpers\Escape;
use dokuwiki\Parsing\Helpers\HtmlEntity;
use dokuwiki\Parsing\Helpers\Link;
use dokuwiki\Parsing\Helpers\Media as MediaHelper;

/**
 * GFM inline link [text](url) with optional title [text](url "title").
 *
 * The link text may be either plain text (the common case) or an inline
 * image `[![alt](imgUrl)](target)` — the Markdown equivalent of DW's
 * `[[target|{{imgUrl}}]]`. The image-as-label form emits a single link
 * handler call with a media descriptor array in the label slot, reusing
 * the same flow that `Internallink` already drives. No new handler
 * instructions; renderers (xhtml, odt, metadata, …) already know how to
 * render a link whose label is a media descriptor.
 *
 * Mirrors DW's `Internallink` architecture: a permissive outer pattern
 * plus handle-time parsing, rather than encoding every GFM rule at
 * pattern level.
 *
 * Deliberately not supported (see skip.php for the affected spec examples):
 *
 * - Reference links [text][id] / [text][] / [foo] — the single-pass
 *   lexer cannot resolve forward references to [foo]: url definitions.
 * - Pointy-bracket destinations [link](<foo bar>) — the simplified
 *   pattern will happily match, but handle() produces an internallink
 *   with a broken src; spec tests for this stay in skip.php.
 * - Balanced-parens inside URLs [link](foo(bar)) — matches truncate
 *   at first `)`, producing odd output; also in skip.php.
 * - Title HTML attribute — DokuWiki link handler instructions have no
 *   title-attribute slot, and plumbing one through every renderer just
 *   for this is out of scope. The title parses cleanly but is discarded.
 * - Mixed text + image in the label ([prefix ![alt](img) suffix](url))
 *   — matches DW's policy: Internallink only converts the label to a
 *   media descriptor when it matches `^{{…}}$` exactly.
 */
class GfmLink extends AbstractMode
{
    // URL slot character set: any non-paren / non-newline char, OR a
    // backslash-escape sequence so an escaped `\)` doesn't terminate the
    // URL early (spec examples 504/506/508). Backslash-unescape is
    // applied post-extraction; the pattern only needs to keep escaped
    // close-parens from prematurely ending the match.
    private const URL_CHAR = '(?:\\\\.|[^)\n])';

    // Label character set: forbids unescaped `[` / `]` so the outer
    // bracket pair stays balanced, but allows `\[` / `\]` so an escaped
    // bracket can appear inside the label (spec example 523). The same
    // backslash-escape trick the URL slot already uses. A bare `\n` is
    // permitted as long as it is not followed by a blank line — soft
    // line breaks inside link text are allowed by the spec, blank lines
    // are not (and they would also tie up `\n#`-anchored block modes).
    private const LABEL_CHAR = '(?:\\\\.|[^\[\]\n]|\n(?![ \t]*\n))';

    // Image sub-pattern reused for both the label alternative in the main
    // pattern and the image-as-label detector in handle(). No capture
    // groups here — the lexer wraps user patterns in a capture and
    // additional captures would renumber unpredictably.
    private const IMAGE_SUB = '!\[' . self::LABEL_CHAR . '*\]\(' . self::URL_CHAR . '+\)';

    /** @inheritdoc */
    public function getSort()
    {
        return 300;
    }

    /** @inheritdoc */
    public function connectTo($mode)
    {
        // Outer shape: `[text-or-image](url)`. Text class forbids
        // unescaped brackets and blank lines but allows `\[` / `\]` and
        // soft line breaks; the image alternative explicitly matches one
        // inline image. URL slot is permissive — handle() does URL /
        // title splitting post-entry, mirroring how DW Internallink
        // parses inside `[[...]]`.
        $pattern = '\[(?!\[)(?:' . self::LABEL_CHAR . '+|' . self::IMAGE_SUB . ')\]\(' . self::URL_CHAR . '+\)';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_link');
    }

    /**
     * Turn one lexer match into a single link handler call.
     *
     * @param string  $match   the complete `[label](target)` text matched by the lexer
     * @param int     $state   lexer state (unused here)
     * @param int     $pos     byte position of the match, forwarded to the handler call
     * @param Handler $handler receives the resulting link instruction
     * @return bool always true
     */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // Detect image-as-label `[![alt](img)](target)`. Parallels
        // Internallink's `^{{…}}$` check — when the label is exactly an
        // inline image, parse it into a media descriptor; otherwise
        // treat the label as plain text.
        if (preg_match('/^\[(' . self::IMAGE_SUB . ')\]\((' . self::URL_CHAR . '+)\)$/', $match, $m)) {
            $label = $this->parseImageDescriptor($m[1]);
            $targetUrl = $this->extractUrl($m[2]);
        } else {
            // A plain text label may contain an escaped `\]`, and `(` is
            // a legal label character, so the first literal `](` is not
            // necessarily the separator (e.g. `[a\](b](c)`). Split with
            // the label grammar instead of a raw substring search.
            [$rawLabel, $rawTarget] = $this->splitAtSeparator($match, 1);
            $label = Escape::unescapeBackslashes($rawLabel);
            $targetUrl = $this->extractUrl($rawTarget);
        }

        // Classify on the raw URL so windowssharelink detection sees the
        // literal `\\host\path` runs intact — GFM's `\\` → `\` collapse
        // would otherwise destroy the share prefix.
        [$call, $args] = Link::classify($targetUrl, $label);
        if ($call !== 'windowssharelink') {
            $args[0] = Escape::unescapeBackslashes($args[0]);
        }
        $handler->addCall($call, $args, $pos);
        return true;
    }

    /**
     * Split `…label](payload)` into its raw label and raw payload.
     *
     * The label is read with LABEL_CHAR, the same character grammar the
     * lexer pattern uses, so a backslash-escaped `\]` followed by `(`
     * inside the label is not mistaken for the label/target separator.
     *
     * @param string $text   full match, ending in the closing `)`
     * @param int    $offset byte offset of the first label character
     *                       (1 for `[label](…)`, 2 for `![alt](…)`)
     * @return string[] two elements: raw label, raw payload (text between `](` and the final `)`)
     */
    private function splitAtSeparator(string $text, int $offset): array
    {
        // \G pins the match to $offset; the payload is taken verbatim up
        // to the final `)` exactly like the previous substr() logic did.
        $pattern = '/\G(' . self::LABEL_CHAR . '*)\]\(((?s:.*))\)\z/';
        if (preg_match($pattern, $text, $parts, 0, $offset)) {
            return [$parts[1], $parts[2]];
        }

        // Fallback for input the grammar above does not accept: keep the
        // historic behaviour of splitting at the first literal `](`.
        $sep = strpos($text, '](', $offset);
        if ($sep === false) {
            return [substr($text, $offset), ''];
        }
        return [substr($text, $offset, $sep - $offset), substr($text, $sep + 2, -1)];
    }

    /**
     * Extract the URL from a parenthesized payload: trim surrounding
     * whitespace, take the first whitespace-delimited token, then
     * apply GFM's URL-slot transformations (entity decoding;
     * backslash-unescape happens later, after Link::classify, because
     * windowssharelink detection needs the raw `\\` runs intact).
     * Any trailing title is discarded (no renderer slot for it).
     *
     * @param string $inside text between the parentheses, without the parentheses
     * @return string the entity-decoded URL token
     */
    private function extractUrl(string $inside): string
    {
        $inside = trim($inside);
        $url = substr($inside, 0, strcspn($inside, " \t\n")); // remove optional title
        return HtmlEntity::decode($url);
    }

    /**
     * Parse an inline image sub-match `![alt](url)` into the media
     * descriptor shape Media::parseMedia() returns, so the link handler
     * can treat it as a media label identically to `[[page|{{img}}]]`.
     *
     * @param string $imageMatch the complete `![alt](url)` text
     * @return array media descriptor with the keys type, src, title,
     *               align, width, height, cache and linking
     */
    private function parseImageDescriptor(string $imageMatch): array
    {
        // Offset 2 skips the leading `![`; the alt text may itself hold
        // an escaped `\](`, so split with the label grammar.
        [$rawAlt, $rawTarget] = $this->splitAtSeparator($imageMatch, 2);
        $alt = Escape::unescapeBackslashes($rawAlt);
        $imgUrl = Escape::unescapeBackslashes($this->extractUrl($rawTarget));

        $p = MediaHelper::parseParameters($imgUrl);
        $type = (media_isexternal($p['src']) || link_isinterwiki($p['src']))
            ? 'externalmedia'
            : 'internalmedia';

        return [
            'type' => $type,
            'src' => $p['src'],
            'title' => $alt !== '' ? $alt : null,
            'align' => $p['align'],
            'width' => $p['width'],
            'height' => $p['height'],
            'cache' => $p['cache'],
            'linking' => $p['linking'],
        ];
    }
}