<?php

namespace dokuwiki\Parsing\ParserMode;

use dokuwiki\Parsing\Handler;
use dokuwiki\Parsing\Helpers\Escape;
use dokuwiki\Parsing\Helpers\HtmlEntity;
use dokuwiki\Parsing\Helpers\Link;
use dokuwiki\Parsing\Helpers\Media as MediaHelper;

/**
 * GFM inline link [text](url) with optional title [text](url "title").
 *
 * The link text may be either plain text (the common case) or an inline
 * image `![alt](imgUrl)` — the Markdown equivalent of DW's
 * `[[target|{{imgUrl}}]]`. The image-as-label form emits a single link
 * handler call with a media descriptor array in the label slot, reusing
 * the same flow that `Internallink` already drives. No new handler
 * instructions; renderers (xhtml, odt, metadata, …) already know how to
 * render a link whose label is a media descriptor.
 *
 * Mirrors DW's `Internallink` architecture: a permissive outer pattern
 * plus handle-time parsing, rather than encoding every GFM rule at
 * pattern level.
 *
 * Deliberately not supported (see skip.php for the affected spec examples):
 *
 *  - Reference links [text][id] / [text][] / [foo] — the single-pass
 *    lexer cannot resolve forward references to [foo]: url definitions.
 *  - Pointy-bracket destinations [link](<foo bar>) — the simplified
 *    pattern will happily match, but handle() produces an internallink
 *    with a broken src; spec tests for this stay in skip.php.
 *  - Balanced-parens inside URLs [link](foo(bar)) — matches truncate
 *    at first `)`, producing odd output; also in skip.php.
 *  - Title HTML attribute — DokuWiki link handler instructions have no
 *    title-attribute slot, and plumbing one through every renderer just
 *    for this is out of scope. The title parses cleanly but is discarded.
 *  - Mixed text + image in the label ([prefix ![alt](img) suffix](url))
 *    — matches DW's policy: Internallink only converts the label to a
 *    media descriptor when it matches `^{{…}}$` exactly.
 */
class GfmLink extends AbstractMode
{
    // URL slot character set: any non-paren / non-newline char, OR a
    // backslash-escape sequence so an escaped `\)` doesn't terminate the
    // URL early (spec examples 504/506/508). Backslash-unescape is
    // applied post-extraction; the pattern only needs to keep escaped
    // close-parens from prematurely ending the match.
    private const URL_CHAR = '(?:\\\\.|[^)\n])';

    // Label character set: forbids unescaped `[` / `]` so the outer
    // bracket pair stays balanced, but allows `\[` / `\]` so an escaped
    // bracket can appear inside the label (spec example 523). The same
    // backslash-escape trick the URL slot already uses. A bare `\n` is
    // permitted as long as it is not followed by a blank line — soft
    // line breaks inside link text are allowed by the spec, blank lines
    // are not (and they would also tie up `\n#`-anchored block modes).
    private const LABEL_CHAR = '(?:\\\\.|[^\[\]\n]|\n(?![ \t]*\n))';

    // Image sub-pattern reused for both the label alternative in the main
    // pattern and the image-as-label detector in handle(). No capture
    // groups here — the lexer wraps user patterns in a capture and
    // additional captures would renumber unpredictably.
    private const IMAGE_SUB = '!\[' . self::LABEL_CHAR . '*\]\(' . self::URL_CHAR . '+\)';

    /** @inheritdoc */
    public function getSort()
    {
        return 300;
    }

    /** @inheritdoc */
    public function connectTo($mode)
    {
        // Outer shape: `[text-or-image](url)`. Text class forbids
        // unescaped brackets and blank lines but allows `\[` / `\]`; the
        // image alternative explicitly matches one inline image. URL
        // slot is permissive — handle() does URL / title splitting
        // post-entry, mirroring how DW Internallink parses inside `[[...]]`.
        $pattern = '\[(?!\[)(?:' . self::LABEL_CHAR . '+|' . self::IMAGE_SUB . ')\]\(' . self::URL_CHAR . '+\)';
        $this->Lexer->addSpecialPattern($pattern, $mode, 'gfm_link');
    }

    /** @inheritdoc */
    public function handle($match, $state, $pos, Handler $handler)
    {
        // Detect image-as-label `[![alt](imgUrl)](target)`. Parallels
        // Internallink's `^{{…}}$` check — when the label is exactly an
        // inline image, parse it into a media descriptor; otherwise
        // treat the label as plain text.
        $imageLabelRegex = '/^\[(' . self::IMAGE_SUB . ')\]\((' . self::URL_CHAR . '+)\)$/';
        if (preg_match($imageLabelRegex, $match, $m)) {
            $label = $this->parseImageDescriptor($m[1]);
            $targetUrl = $this->extractUrl($m[2]);
        } else {
            // A plain text label may contain an escaped `\]`, so the
            // first literal `](` is not necessarily the separator
            // (e.g. `[a\](b](url)`); split with the label grammar instead.
            [$rawLabel, $rawTarget] = $this->splitLabelAndPayload($match, false);
            $label = Escape::unescapeBackslashes($rawLabel);
            $targetUrl = $this->extractUrl($rawTarget);
        }

        // Classify on the raw URL so windowssharelink detection sees the
        // literal `\\host\path` runs intact — GFM's `\\` → `\` collapse
        // would otherwise destroy the share prefix.
        [$call, $args] = Link::classify($targetUrl, $label);
        if ($call !== 'windowssharelink') {
            $args[0] = Escape::unescapeBackslashes($args[0]);
        }
        $handler->addCall($call, $args, $pos);
        return true;
    }

    /**
     * Split a matched `[label](payload)` or `![alt](payload)` string into
     * its raw (still backslash-escaped) label and parenthesized payload.
     *
     * The split re-applies the LABEL_CHAR / URL_CHAR sub-patterns that
     * the lexer pattern is built from, so a backslash-escaped `\]`
     * directly followed by `(` inside the label is not mistaken for the
     * label/target separator. If that re-parse does not match, the
     * method falls back to splitting at the first literal `](`.
     *
     * @param string $text    the complete bracketed match
     * @param bool   $isImage true for the `![alt](…)` form (alt may be
     *                        empty), false for the `[label](…)` form
     * @return string[] two elements: raw label, raw payload between the parens
     */
    private function splitLabelAndPayload(string $text, bool $isImage): array
    {
        $opener = $isImage ? '!\[' : '\[';
        $quantifier = $isImage ? '*' : '+';
        $regex = '/^' . $opener . '(' . self::LABEL_CHAR . $quantifier . ')\]\((' . self::URL_CHAR . '+)\)$/D';
        if (preg_match($regex, $text, $parts) === 1) {
            return [$parts[1], $parts[2]];
        }

        // Fallback: split at the first literal `](`.
        $skip = $isImage ? 2 : 1;
        $sep = strpos($text, '](');
        if ($sep === false) {
            // No separator at all: everything after the opener is label.
            return [substr($text, $skip), ''];
        }
        return [substr($text, $skip, $sep - $skip), substr($text, $sep + 2, -1)];
    }

    /**
     * Extract the URL from a parenthesized payload: trim surrounding
     * whitespace, take the first whitespace-delimited token, then
     * apply GFM's URL-slot transformations (entity decoding;
     * backslash-unescape happens later, after Link::classify, because
     * windowssharelink detection needs the raw `\\` runs intact).
     * Any trailing title is discarded (no renderer slot for it).
     *
     * @param string $inside the text between the parentheses
     * @return string the entity-decoded URL token
     */
    private function extractUrl(string $inside): string
    {
        $inside = trim($inside);
        $url = substr($inside, 0, strcspn($inside, " \t\n")); // remove optional title
        return HtmlEntity::decode($url);
    }

    /**
     * Parse an inline image sub-match `![alt](imgUrl)` into the media
     * descriptor shape Media::parseMedia() returns, so the link handler
     * can treat it as a media label identically to `[[page|{{img}}]]`.
     *
     * @param string $imageMatch the complete `![alt](imgUrl)` text
     * @return array media descriptor (type, src, title, align, width,
     *               height, cache, linking)
     */
    private function parseImageDescriptor(string $imageMatch): array
    {
        // Escape-aware split: the alt text may itself contain `\](`.
        [$rawAlt, $rawSrc] = $this->splitLabelAndPayload($imageMatch, true);
        $alt = Escape::unescapeBackslashes($rawAlt);
        $imgUrl = Escape::unescapeBackslashes($this->extractUrl($rawSrc));

        $p = MediaHelper::parseParameters($imgUrl);
        $type = (media_isexternal($p['src']) || link_isinterwiki($p['src']))
            ? 'externalmedia'
            : 'internalmedia';

        return [
            'type' => $type,
            'src' => $p['src'],
            // An empty alt text means "no title".
            'title' => $alt !== '' ? $alt : null,
            'align' => $p['align'],
            'width' => $p['width'],
            'height' => $p['height'],
            'cache' => $p['cache'],
            'linking' => $p['linking'],
        ];
    }
}