1<?php
2/**
3 * DokuWiki Plugin scrape (Syntax Component)
4 *
5 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
6 * @author  Andreas Gohr <gohr@cosmocode.de>
7 */
8
9// must be run within Dokuwiki
10if (!defined('DOKU_INC')) die();
11if (!defined('DOKU_PLUGIN')) define('DOKU_PLUGIN',DOKU_INC.'lib/plugins/');
12
13require_once DOKU_PLUGIN.'syntax.php';
14
15define('HTMLPURIFIER_PREFIX', dirname(__FILE__));
16require_once HTMLPURIFIER_PREFIX.'/HTMLPurifier.standalone.php';
17require_once HTMLPURIFIER_PREFIX.'/phpQuery-onefile.php';
18
19
20
/**
 * Syntax component embedding (parts of) a remote resource into a wiki page.
 *
 * Recognised wiki syntax, as parsed by handle():
 *
 *   {{scrape>URL[#id] [query][~]|title}}
 *
 * - URL    remote address or an interwiki shortcut of the form "wiki>reference"
 * - #id    optional fragment, prepended to the query as an id selector
 * - query  optional selector handed to phpQuery (defaults to "body ~")
 * - ~      trailing tilde: embed only the inner HTML of the matched elements
 * - title  optional title, used when the resource turns out to be an image
 */
class syntax_plugin_scrape extends DokuWiki_Syntax_Plugin {

    /**
     * Syntax mode type of this plugin.
     *
     * @return string the literal 'substition' (this exact spelling is the value returned; do not "correct" it)
     */
    public function getType() {
        return 'substition';
    }

    /**
     * Paragraph type of the produced output.
     *
     * @return string always 'block'
     */
    public function getPType() {
        return 'block';
    }

    /**
     * Sort value of this syntax mode.
     *
     * @return int always 301
     */
    public function getSort() {
        return 301;
    }

    /**
     * Register the {{scrape>...}} pattern with the lexer.
     *
     * @param string $mode parser mode the pattern is connected to
     */
    public function connectTo($mode) {
        $this->Lexer->addSpecialPattern('{{scrape>.+?}}',$mode,'plugin_scrape');
    }

    /**
     * Split the matched syntax into URL, title, query and the inner-HTML flag.
     *
     * @param string       $match   the full text matched by the pattern
     * @param int          $state   lexer state (unused)
     * @param int          $pos     position of the match (unused)
     * @param Doku_Handler $handler the handler object (unused)
     * @return array with the keys 'url', 'title' (null when not given), 'query' and 'inner'
     */
    public function handle($match, $state, $pos, Doku_Handler $handler){
        // strip the leading '{{scrape>' (9 bytes) and the trailing '}}'
        $match = substr($match, 9, -2);

        // array_pad() guarantees that both list() targets exist even when the
        // separator is absent; a bare list()/explode() would raise an
        // "undefined offset" notice for every optional part that is missing.
        // The title stays null when missing, exactly as before.
        list($target, $title) = array_pad(explode('|', $match, 2), 2, null);

        // surrounding whitespace would otherwise yield an empty URL
        // ("{{scrape> http://..." ) or hide the trailing '~' marker
        list($url, $query) = array_pad(explode(' ', trim($target), 2), 2, '');
        //FIXME handle refresh parameter?
        list($url, $hash)  = array_pad(explode('#', $url, 2), 2, '');

        $query = trim($query);
        if($hash !== '')  $query = trim('#'.$hash.' '.$query);
        if($query === '') $query = 'body ~';

        // a trailing tilde requests the inner HTML of the matched elements
        $inner = false;
        if(substr($query,-1) == '~'){
            $query = rtrim($query,'~ ');
            $inner = true;
        }

        return array(
            'url'   => $url,
            'title' => $title,
            'query' => $query,
            'inner' => $inner,
        );
    }

    /**
     * Fetch the remote resource and add it to the rendered document.
     *
     * Images are embedded as external media, HTML is filtered through
     * display_html(), any other text/* type is shown preformatted. Problems
     * are reported as a short message inside the document.
     *
     * @param string        $mode output format being rendered
     * @param Doku_Renderer $R    the renderer to write to
     * @param array         $data the array produced by handle()
     * @return bool false for any mode other than 'xhtml', true otherwise
     */
    public function render($mode, Doku_Renderer $R, $data) {
        if($mode != 'xhtml') return false;

        // support interwiki shortcuts
        if(strpos($data['url'],'>') !== false){
            list($iw,$ref) = explode('>',$data['url'],2);
            $data['url'] = $R->_resolveInterWiki($iw,$ref);
        }

        // check if URL is allowed (an empty expression allows nothing)
        $re = $this->getConf('allowedre');
        if(!$re || !preg_match('/'.$re.'/i',$data['url'])){
            $R->doc .= 'This URL is not allowed for scraping';
            return true;
        }

        // fetch remote data
        $http = new DokuHTTPClient();
        $resp = $http->get($data['url']);

        if(!$resp){
            $R->doc .= 'Failed to load remote ressource';
            return true;
        }

        // determine mime type; the header is not guaranteed to be present
        $header = isset($http->resp_headers['content-type']) ? $http->resp_headers['content-type'] : '';
        list($mime,$charset) = $this->parseContentType($header);

        if(preg_match('/image\/(gif|png|jpe?g)/',$mime)){
            // image embed
            $R->externalmedia ($data['url'], $data['title']);
        }elseif(preg_match('/text\//',$mime)){
            $isUtf8 = ($charset === 'utf-8' || $charset === 'utf8');
            if(!$isUtf8 && $charset === ''){
                // No charset declared in the header: converting blindly
                // would double-encode bodies that already are UTF-8, so
                // check the bytes first.
                $isUtf8 = $this->isValidUtf8($resp);
            }
            if(!$isUtf8){
                $resp = $this->latin1ToUtf8($resp); // we just assume it to be latin1
            }

            if(preg_match('/text\/html/',$mime)){
                // display HTML
                $this->display_html($data,$resp,$R);

                //FIXME support directory listings?
            }else{
                // display as code
                $R->preformatted($resp);
            }
        }else{
            $R->doc .= 'Failed to handle mime type '.hsc($mime);
        }

        return true;
    }

    /**
     * Split a Content-Type header value into mime type and charset.
     *
     * @param string|array $header raw header value; when an array is given
     *                             (defensive: several values) the last entry is used
     * @return array list of (lower-cased mime type, lower-cased charset or '')
     */
    private function parseContentType($header){
        if(is_array($header)) $header = end($header);

        $params = explode(';', strtolower((string) $header));
        $mime   = trim(array_shift($params));

        // the charset is not necessarily the first parameter and may be quoted
        $charset = '';
        foreach($params as $param){
            if(preg_match('/^\s*charset\s*=\s*"?([^";\s]+)/',$param,$found)){
                $charset = $found[1];
                break;
            }
        }

        return array($mime,$charset);
    }

    /**
     * Check whether the given bytes form a valid UTF-8 sequence.
     *
     * @param string $text
     * @return bool
     */
    private function isValidUtf8($text){
        // with the /u modifier PCRE validates the subject and preg_match()
        // fails (returns false) on malformed UTF-8
        return preg_match('//u',$text) === 1;
    }

    /**
     * Convert ISO-8859-1 encoded text to UTF-8.
     *
     * @param string $text
     * @return string
     */
    private function latin1ToUtf8($text){
        // utf8_encode() is deprecated as of PHP 8.2; prefer mbstring when
        // it is available and keep the old call as a fallback
        if(function_exists('mb_convert_encoding')){
            return mb_convert_encoding($text,'UTF-8','ISO-8859-1');
        }
        return utf8_encode($text);
    }

    /**
     * Extract the queried part of an HTML page, restyle it to match
     * DokuWiki's markup, purify it and append it to the document.
     *
     * @param array         $data the array produced by handle(), 'url' already resolved
     * @param string        $resp the HTML body of the remote page
     * @param Doku_Renderer $R    the renderer whose doc is appended to
     */
    private function display_html($data,$resp,$R){
        global $conf;

        // extract the wanted part from the HTML using the given query
        phpQuery::newDocument($resp);
        $pq = pq($data['query']);

        // fix lists to match DokuWiki's style
        $pq->find('li')->wrapInner('<div class="li" />');

        // fix tables to match DokuWiki's style
        $pq->find('table')->addClass('inline');

        // fix links to match DokuWiki's style
        foreach($pq->find('a') as $link){
            $plink = pq($link);
            // anchors without href must not hand a non-string to mimetype()
            $href  = (string) $plink->attr('href');
            list($ext,$mime) = mimetype($href,true);
            if($ext && $mime != 'text/html'){
                // is it a known mediafile?
                $plink->addClass('mediafile');
                $plink->addClass('mf_'.$ext);
                if($conf['target']['media']){
                    $plink->attr('target',$conf['target']['media']);
                }
            }elseif($href){
                // treat it as external
                if($conf['target']['extern']){
                    $plink->attr('target',$conf['target']['extern']);
                }
                $plink->addClass('urlextern');
            }
            $plink->removeAttr('style');
        }

        // get all wanted HTML by converting the DOMElements back to HTML
        $html = '';
        if($data['inner']){
            $html .= $pq->html();
        }else{
            foreach($pq->elements as $elem){
                $html .= $elem->ownerDocument->saveXML($elem);
            }
        }

        // clean up HTML; relative URIs are made absolute against the source URL
        $purifier = new HTMLPurifier();
        $purifier->config->set('Attr.EnableID', true);
        $purifier->config->set('Attr.IDPrefix', 'scrape___');
        $purifier->config->set('URI.Base', $data['url']);
        $purifier->config->set('URI.MakeAbsolute', true);
        $purifier->config->set('Attr.AllowedFrameTargets',array('_blank','_self','_parent','_top'));
        // the serializer cache directory has to exist before it is configured
        io_mkdir_p($conf['cachedir'].'/_HTMLPurifier');
        $purifier->config->set('Cache.SerializerPath',$conf['cachedir'].'/_HTMLPurifier');
        $html = $purifier->purify($html);

        $R->doc .= $html;
    }


}
183
184// vim:ts=4:sw=4:et:
185