1<?php
2
3use dokuwiki\Extension\SyntaxPlugin;
4use dokuwiki\HTTP\DokuHTTPClient;
5use DOMWrap\Document;
6
7/**
8 * DokuWiki Plugin scrape (Syntax Component)
9 *
10 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
11 * @author  Andreas Gohr <gohr@cosmocode.de>
12 */
class syntax_plugin_scrape extends SyntaxPlugin
{
    /**
     * Load the plugin's bundled composer autoloader so the third-party
     * classes used below (DOMWrap, HTMLPurifier) can be resolved.
     */
    public function __construct()
    {
        require_once __DIR__ . '/vendor/autoload.php';
    }

    /** @inheritdoc */
    public function getType()
    {
        return 'substition';
    }

    /** @inheritdoc */
    public function getPType()
    {
        return 'block';
    }

    /** @inheritdoc */
    public function getSort()
    {
        return 301;
    }

    /** @inheritdoc */
    public function connectTo($mode)
    {
        $this->Lexer->addSpecialPattern('{{scrape>.+?}}', $mode, 'plugin_scrape');
    }

    /**
     * Parse a `{{scrape>URL[#id] [query][~]|title}}` match into its parts.
     *
     * - everything after the first `|` is the title
     * - everything after the first space in the remainder is the query
     * - a `#fragment` on the URL is turned into an `#fragment` selector that
     *   is prepended to the query
     * - a trailing `~` on the query requests the inner HTML of the match
     * - without any query, `body ~` is used
     *
     * @inheritdoc
     * @return array with the keys 'url', 'title', 'query' and 'inner'
     */
    public function handle($match, $state, $pos, Doku_Handler $handler)
    {
        // strip the leading '{{scrape>' (9 bytes) and the trailing '}}'
        $match = substr($match, 9, -2);
        [$url, $title] = sexplode('|', $match, 2);
        [$url, $query] = sexplode(' ', $url, 2);
        //FIXME handle refresh parameter?
        [$url, $hash] = sexplode('#', $url, 2);
        if ($hash) $query = trim('#' . $hash . ' ' . $query);
        if (!$query) $query = 'body ~';

        // a trailing tilde is a flag, not part of the selector itself
        $inner = false;
        if (substr($query, -1) == '~') {
            $query = rtrim($query, '~ ');
            $inner = true;
        }

        return [
            'url' => $url,
            'title' => $title,
            'query' => $query,
            'inner' => $inner
        ];
    }

    /**
     * Fetch the configured URL and embed it into the XHTML output.
     *
     * Images are embedded as external media, HTML is filtered through
     * cleanHTML() and any other text/* response is shown preformatted.
     * Error conditions are written as plain messages into the document.
     *
     * @inheritdoc
     * @return bool false for any renderer mode other than 'xhtml', true otherwise
     */
    public function render($mode, Doku_Renderer $R, $data)
    {
        if ($mode != 'xhtml') return false;

        // support interwiki shortcuts
        if (strpos($data['url'], '>') !== false) {
            [$iw, $ref] = explode('>', $data['url'], 2);
            $data['url'] = $R->_resolveInterWiki($iw, $ref);
        }

        // check if URL is allowed; an empty 'allowedre' setting denies everything
        $re = $this->getConf('allowedre');
        if (!$re || !preg_match('/' . $re . '/i', $data['url'])) {
            $R->doc .= 'This URL is not allowed for scraping';
            return true;
        }

        // fetch remote data
        $http = new DokuHTTPClient();
        $resp = $http->get($data['url']);

        if (!$resp) {
            $R->doc .= 'Failed to load remote ressource';
            return true;
        }

        // determine mime type; the header may be missing entirely
        [$mime, $charset] = $this->parseContentType($http->resp_headers['content-type'] ?? '');

        if (preg_match('/image\/(gif|png|jpe?g)/', $mime)) {
            // image embed
            $R->externalmedia($data['url'], $data['title']);
        } elseif (preg_match('/text\//', $mime)) {
            $resp = $this->toUtf8($resp, $charset);

            if (preg_match('/text\/html/', $mime)) {
                // display HTML
                $R->doc .= $this->cleanHTML($data, $resp);

                //FIXME support directory listings?
            } else {
                // display as code
                $R->preformatted($resp);
            }
        } else {
            $R->doc .= 'Failed to handle mime type ' . hsc($mime);
            return true;
        }

        return true;
    }

    /**
     * Split a Content-Type header value into mime type and charset.
     *
     * Quotes around the charset value and any further parameters following
     * it are ignored, so `text/html; charset="UTF-8"; foo=bar` yields
     * `['text/html', 'utf-8']`.
     *
     * @param string|string[] $header raw header value
     * @return string[] lower-cased [mime, charset]; either may be an empty string
     */
    private function parseContentType($header)
    {
        // NOTE(review): presumably the HTTP client hands back an array when the
        // header was sent more than once - use the last value in that case
        if (is_array($header)) {
            $header = end($header);
        }

        [$mime, $params] = sexplode(';', (string) $header, 2);
        $mime = trim(strtolower((string) $mime));

        $charset = '';
        if (preg_match('/charset\s*=\s*["\']?([^"\';\s]+)/i', (string) $params, $m)) {
            $charset = strtolower($m[1]);
        }

        return [$mime, $charset];
    }

    /**
     * Make sure a fetched text body is UTF-8.
     *
     * Bodies declared as UTF-8 are returned untouched. When no charset was
     * declared at all, the body is also left alone if it already is valid
     * UTF-8. Everything else is assumed to be Latin-1 and converted.
     *
     * @param string $text the response body
     * @param string $charset lower-cased charset from the Content-Type header, may be empty
     * @return string
     */
    private function toUtf8($text, $charset)
    {
        if ($charset === 'utf-8' || $charset === 'utf8') {
            return $text;
        }

        // undeclared charset: avoid double-encoding content that is UTF-8 already
        if ($charset === '' && mb_check_encoding($text, 'UTF-8')) {
            return $text;
        }

        // we just assume it to be latin1 (utf8_encode() is deprecated since PHP 8.2)
        return mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
    }

    /**
     * Extract the queried part of a HTML document and make it safe to embed.
     *
     * The nodes matching $data['query'] are adjusted to DokuWiki's markup
     * conventions (list items, tables, link classes and targets), serialized
     * and finally passed through HTMLPurifier, which also makes relative
     * URLs absolute against $data['url'].
     *
     * @param array $data the data returned by handle()
     * @param string $resp the UTF-8 encoded HTML document
     * @return string the cleaned HTML fragment
     */
    private function cleanHTML($data, $resp)
    {
        global $conf;

        // extract the wanted part from the HTML using the given query
        $doc = new Document();
        $doc->html($resp);

        $pq = $doc->find($data['query']);

        // fix lists to match DokuWiki's style
        $pq->find('li')->wrapInner('<div class="li" />');

        // fix tables to match DokuWiki's style
        $pq->find('table')->addClass('inline')->wrap('<div class="table" />');

        // fix links to match DokuWiki's style
        foreach ($pq->find('a') as $link) {
            // read the target once; cast so mimetype() always receives a string
            $href = (string) $link->attr('href');

            [$ext, $mime] = mimetype($href, true);
            if ($ext && $mime != 'text/html') {
                // is it a known mediafile?
                $link->addClass('mediafile');
                $link->addClass('mf_' . $ext);
                if ($conf['target']['media']) {
                    $link->attr('target', $conf['target']['media']);
                }
            } elseif ($href) {
                // treat it as external
                if ($conf['target']['extern']) {
                    $link->attr('target', $conf['target']['extern']);
                }
                $link->addClass('urlextern');
            }
            $link->removeAttr('style');
        }

        // serialize either the content of the match or the matched nodes themselves
        $html = '';
        if ($data['inner']) {
            $html .= $pq->html();
        } else {
            $pq->each(function ($node) use (&$html) {
                $html .= $node->ownerDocument->saveXML($node) . "\n";
            });
        }

        // clean up HTML
        $config = HTMLPurifier_Config::createDefault();
        $config->set('Attr.EnableID', true);
        $config->set('Attr.IDPrefix', 'scrape___');
        $config->set('URI.Base', $data['url']);
        $config->set('URI.MakeAbsolute', true);
        $config->set('Attr.AllowedFrameTargets', ['_blank', '_self', '_parent', '_top']);
        // HTMLPurifier needs a writable directory for its definition cache
        io_mkdir_p($conf['cachedir'] . '/_HTMLPurifier');
        $config->set('Cache.SerializerPath', $conf['cachedir'] . '/_HTMLPurifier');
        $purifier = new HTMLPurifier($config);
        $html = $purifier->purify($html);

        return trim($html);
    }
}
189