<?php

use dokuwiki\Extension\SyntaxPlugin;
use dokuwiki\HTTP\DokuHTTPClient;
use DOMWrap\Document;

/**
 * DokuWiki Plugin scrape (Syntax Component)
 *
 * Handles the {{scrape>...}} syntax: fetches a remote resource and embeds it
 * into the rendered page. Images are embedded as external media, HTML is cut
 * down to the part selected by a query and purified, any other text/* content
 * is shown preformatted.
 *
 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
 * @author  Andreas Gohr <gohr@cosmocode.de>
 */
class syntax_plugin_scrape extends SyntaxPlugin
{
    /**
     * Load the autoloader of the libraries bundled in the plugin's vendor directory
     */
    public function __construct()
    {
        require_once __DIR__ . '/vendor/autoload.php';
    }

    /** @inheritdoc */
    public function getType()
    {
        return 'substition';
    }

    /** @inheritdoc */
    public function getPType()
    {
        return 'block';
    }

    /** @inheritdoc */
    public function getSort()
    {
        return 301;
    }

    /** @inheritdoc */
    public function connectTo($mode)
    {
        $this->Lexer->addSpecialPattern('{{scrape>.+?}}', $mode, 'plugin_scrape');
    }

    /**
     * Split the matched syntax into URL, title, query and the "inner" flag
     *
     * The match is taken apart as "URL[#hash] [query]|title". A hash is turned
     * into an ID selector that is prepended to the query. Without any query the
     * inner content of the body is used. A trailing "~" on the query requests
     * the inner HTML of the selected element instead of the element itself.
     *
     * @inheritdoc
     * @return array with the keys 'url', 'title', 'query' and 'inner'
     */
    public function handle($match, $state, $pos, Doku_Handler $handler)
    {
        // strip the leading "{{scrape>" and the trailing "}}"
        $match = substr($match, 9, -2);

        [$url, $title] = sexplode('|', $match, 2);
        [$url, $query] = sexplode(' ', $url, 2);
        //FIXME handle refresh parameter?
        [$url, $hash] = sexplode('#', $url, 2);

        if ($hash) $query = trim('#' . $hash . ' ' . $query);
        if (!$query) $query = 'body ~';

        $inner = false;
        if (substr($query, -1) === '~') {
            $query = rtrim($query, '~ ');
            $inner = true;
        }

        return [
            'url' => $url,
            'title' => $title,
            'query' => $query,
            'inner' => $inner
        ];
    }

    /**
     * Fetch the remote resource and add it to the document
     *
     * Only the 'xhtml' mode is handled. The URL has to match the regular
     * expression configured in 'allowedre', otherwise an error message is
     * written to the document instead.
     *
     * @inheritdoc
     * @return bool false for any mode other than 'xhtml', true otherwise
     */
    public function render($mode, Doku_Renderer $R, $data)
    {
        if ($mode !== 'xhtml') return false;

        // support interwiki shortcuts
        if (strpos($data['url'], '>') !== false) {
            [$iw, $ref] = explode('>', $data['url'], 2);
            $data['url'] = $R->_resolveInterWiki($iw, $ref);
        }

        // check if URL is allowed; an empty configuration allows nothing
        $re = $this->getConf('allowedre');
        if (!$re || !preg_match('/' . $re . '/i', $data['url'])) {
            $R->doc .= 'This URL is not allowed for scraping';
            return true;
        }

        // fetch remote data
        $http = new DokuHTTPClient();
        $resp = $http->get($data['url']);

        if (!$resp) {
            $R->doc .= 'Failed to load remote ressource';
            return true;
        }

        // determine mime type; the header is not guaranteed to be present
        $header = $http->resp_headers['content-type'] ?? '';
        // NOTE(review): a repeated header is presumably reported as an array by
        // the HTTP client - confirm; if so the last value is used
        if (is_array($header)) $header = (string) end($header);
        [$mime, $charset] = $this->parseContentType($header);

        if (preg_match('/image\/(gif|png|jpe?g)/', $mime)) {
            // image embed
            $R->externalmedia($data['url'], $data['title']);
        } elseif (preg_match('/text\//', $mime)) {
            $resp = $this->toUtf8($resp, $charset);

            if (preg_match('/text\/html/', $mime)) {
                // display HTML
                $R->doc .= $this->cleanHTML($data, $resp);

                //FIXME support directory listings?
            } else {
                // display as code
                $R->preformatted($resp);
            }
        } else {
            $R->doc .= 'Failed to handle mime type ' . hsc($mime);
        }

        return true;
    }

    /**
     * Split a Content-Type header value into mime type and charset
     *
     * Copes with a missing header, a missing charset parameter, quoted charset
     * values and additional parameters following the charset.
     *
     * @param string $header the raw header value, may be empty
     * @return string[] lower cased mime type and charset, each '' when not given
     */
    private function parseContentType($header)
    {
        [$mime, $params] = array_pad(explode(';', (string) $header, 2), 2, '');

        $charset = '';
        if (preg_match('/charset\s*=\s*["\']?([^"\';\s]+)/i', $params, $found)) {
            $charset = strtolower($found[1]);
        }

        return [strtolower(trim($mime)), $charset];
    }

    /**
     * Make sure the fetched text is UTF-8
     *
     * Text declared as UTF-8 is returned untouched. Text without any declared
     * charset is returned untouched when it already is well-formed UTF-8, this
     * avoids double encoding it. Everything else is assumed to be latin1.
     *
     * @param string $text the fetched response body
     * @param string $charset lower cased charset as declared by the server, '' if none
     * @return string
     */
    private function toUtf8($text, $charset)
    {
        if ($charset === 'utf-8' || $charset === 'utf8') return $text;

        // with the u modifier, matching fails on input that is not valid UTF-8
        if ($charset === '' && preg_match('//u', $text) === 1) return $text;

        // we just assume it to be latin1
        if (function_exists('mb_convert_encoding')) {
            return mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
        }

        // deprecated as of PHP 8.2, only used when mbstring is not available
        return utf8_encode($text);
    }

    /**
     * Extract the wanted part of a HTML document and sanitize it
     *
     * The elements matching the query get lists, tables and links adjusted
     * with DokuWiki's class names before the result is run through HTMLPurifier.
     *
     * @param array $data the instruction data created by handle()
     * @param string $resp the fetched HTML
     * @return string the purified HTML
     */
    private function cleanHTML($data, $resp)
    {
        global $conf;

        // extract the wanted part from the HTML using the given query
        $doc = new Document();
        $doc->html($resp);

        $pq = $doc->find($data['query']);

        // fix lists to match DokuWiki's style
        $pq->find('li')->wrapInner('<div class="li" />');

        // fix tables to match DokuWiki's style
        $pq->find('table')->addClass('inline')->wrap('<div class="table" />');

        // fix links to match DokuWiki's style
        foreach ($pq->find('a') as $link) {
            $href = $link->attr('href');
            [$ext, $mime] = mimetype($href, true);

            if ($ext && $mime != 'text/html') {
                // is it a known mediafile?
                $link->addClass('mediafile');
                $link->addClass('mf_' . $ext);
                if ($conf['target']['media']) {
                    $link->attr('target', $conf['target']['media']);
                }
            } elseif ($href) {
                // treat it as external
                if ($conf['target']['extern']) {
                    $link->attr('target', $conf['target']['extern']);
                }
                $link->addClass('urlextern');
            }
            $link->removeAttr('style');
        }

        // either the inner HTML of the selection or each selected node itself
        $html = '';
        if ($data['inner']) {
            $html .= $pq->html();
        } else {
            $pq->each(function ($node) use (&$html) {
                $html .= $node->ownerDocument->saveXML($node) . "\n";
            });
        }

        // clean up HTML
        $config = HTMLPurifier_Config::createDefault();
        $config->set('Attr.EnableID', true);
        $config->set('Attr.IDPrefix', 'scrape___');
        $config->set('URI.Base', $data['url']);
        $config->set('URI.MakeAbsolute', true);
        $config->set('Attr.AllowedFrameTargets', ['_blank', '_self', '_parent', '_top']);

        // the serializer cache lives below DokuWiki's cache directory
        $cachedir = $conf['cachedir'] . '/_HTMLPurifier';
        io_mkdir_p($cachedir);
        $config->set('Cache.SerializerPath', $cachedir);

        $purifier = new HTMLPurifier($config);
        $html = $purifier->purify($html);

        return trim($html);
    }
}