xref: /plugin/amazonlight/syntax.php (revision 339b7067b86eeab507021f36265cc3f7aa7a0c8a)
1<?php
2
3use dokuwiki\HTTP\DokuHTTPClient;
4use DOMWrap\Document;
5
6/**
7 * DokuWiki Plugin amazonlight (Syntax Component)
8 *
9 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
10 * @author  Andreas Gohr <andi@splitbrain.org>
11 */
class syntax_plugin_amazonlight extends DokuWiki_Syntax_Plugin
{

    /** @var array<string, string> Amazon host name to query for each supported country code */
    const REGIONS = [
        'us' => 'www.amazon.com',
        'ca' => 'www.amazon.ca',
        'de' => 'www.amazon.de',
        'gb' => 'www.amazon.co.uk',
        'fr' => 'www.amazon.fr',
        'jp' => 'www.amazon.co.jp',
    ];

    /** @var DokuHTTPClient HTTP client preconfigured with browser-like request headers */
    protected DokuHTTPClient $http;

    /**
     * @inheritDoc
     *
     * Note: 'substition' (sic) is the spelling DokuWiki itself uses for this mode type.
     */
    public function getType()
    {
        return 'substition';
    }

    /** @inheritDoc */
    public function getPType()
    {
        return 'block';
    }

    /** @inheritDoc */
    public function getSort()
    {
        return 160;
    }

    /**
     * Connect lookup pattern to lexer.
     *
     * Matches `{{amazon>...}}` where the payload consists of word characters,
     * colons, dashes, spaces and equal signs.
     *
     * @param string $mode Parser mode
     */
    public function connectTo($mode)
    {
        $this->Lexer->addSpecialPattern('\{\{amazon>[\w:\\- =]+\}\}', $mode, 'plugin_amazonlight');
    }

    /**
     * Set up the HTTP client used for all requests to Amazon.
     *
     * The headers mimic a desktop Chrome 119 browser on Linux.
     */
    public function __construct()
    {
        // The header value must not contain the header name itself.
        $userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36';

        $http = new DokuHTTPClient();
        // NOTE(review): DokuWiki's HTTP client appears to build the User-Agent header from
        // its $agent property - confirm against the bundled HTTPClient. Both are set here so
        // the browser-like agent is sent either way.
        $http->agent = $userAgent;
        $http->headers['User-Agent'] = $userAgent;
        $http->headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7';
        $http->headers['Accept-Language'] = 'en-US,en;q=0.9';
        $http->headers['Upgrade-Insecure-Requests'] = '1';
        $http->headers['Sec-Ch-Ua'] = '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"';
        $http->headers['Sec-Ch-Ua-Mobile'] = '?0';
        $http->headers['Sec-Ch-Ua-Platform'] = '"Linux"';
        $http->headers['Sec-Fetch-Dest'] = 'document';
        $http->headers['Sec-Fetch-Mode'] = 'navigate';
        $http->headers['Sec-Fetch-Site'] = 'none';
        $http->headers['Sec-Fetch-User'] = '?1';
        $this->http = $http;
    }

    /**
     * Parse `{{amazon>[country:]ASIN [WIDTHxHEIGHT]}}` into render parameters.
     *
     * @inheritDoc
     * @return array with the keys 'imgw', 'imgh', 'asin' and 'country'
     */
    public function handle($match, $state, $pos, Doku_Handler $handler)
    {
        // strip the leading '{{amazon>' (9 characters) and the trailing '}}'
        $payload = trim(substr($match, 9, -2));
        list($ctry, $asin) = sexplode(':', $payload, 2);

        // no country given? then the only part is the ASIN
        if ($asin === null || $asin === '') {
            $asin = $ctry;
            $ctry = 'us';
        }

        // default parameters...
        $params = [
            'imgw' => $this->getConf('imgw'),
            'imgh' => $this->getConf('imgh'),
        ];

        // ...can be overridden by a WIDTHxHEIGHT token after the ASIN
        list($asin, $more) = sexplode(' ', $asin, 2);
        $params['asin'] = $asin;

        // $more is null when nothing follows the ASIN; cast so preg_match always gets a string
        if (preg_match('/(\d+)x(\d+)/i', (string)$more, $size)) {
            $params['imgw'] = $size[1];
            $params['imgh'] = $size[2];
        }

        // correct country given? 'uk' is accepted as an alias for 'gb'
        if ($ctry === 'uk') {
            $ctry = 'gb';
        }
        if (!array_key_exists($ctry, self::REGIONS)) {
            $ctry = 'us';
        }
        $params['country'] = $ctry;

        return $params;
    }

    /**
     * Render the product box, or a plain interwiki link when no data is available.
     *
     * @inheritDoc
     */
    public function render($mode, Doku_Renderer $renderer, $data)
    {
        if ($mode !== 'xhtml') {
            return false;
        }

        $html = $this->output($data);
        if (!$html) {
            // no product data: fall back to a simple link to the product
            if ($data['country'] == 'de') {
                $renderer->interwikilink('Amazon', 'Amazon.de', 'amazon.de', $data['asin']);
            } else {
                $renderer->interwikilink('Amazon', 'Amazon', 'amazon', $data['asin']);
            }
            return true;
        }

        $renderer->doc .= $html;

        return true;
    }

    /**
     * Build the HTML for one product box.
     *
     * Errors while obtaining the product data are reported through msg().
     *
     * @param array $param parameters as returned by handle()
     * @return string|false the HTML, false when the product data could not be obtained
     */
    protected function output($param)
    {
        global $conf;

        try {
            $data = $this->fetchCachedData($param['asin'], $param['country']);
        } catch (Exception $e) {
            msg(hsc($e->getMessage()), -1);
            return false;
        }

        $img = ml($data['img'], ['w' => $param['imgw'], 'h' => $param['imgh']]);

        // everything placed into an attribute is escaped; $img is used as returned by ml()
        $url = hsc($data['url']);
        $width = hsc((string)$param['imgw']);
        $height = hsc((string)$param['imgh']);
        $target = '';
        if ($conf['target']['extern']) {
            $target = ' target="' . hsc($conf['target']['extern']) . '"';
        }

        $html = '<div class="amazon">';
        $html .= '<a href="' . $url . '"' . $target . '>';
        $html .= '<img src="' . $img . '" width="' . $width . '" height="' . $height . '" alt="" />';
        $html .= '</a>';

        $html .= '<div class="amazon_title">';
        $html .= '<a href="' . $url . '"' . $target . '>';
        $html .= hsc($data['title']);
        $html .= '</a>';
        $html .= '</div>';

        $html .= '<div class="amazon_author">';
        $html .= hsc($data['author']);
        $html .= '</div>';

        $html .= '<div class="amazon_isbn">';
        $html .= hsc($data['isbn']);
        $html .= '</div>';

        $html .= '</div>';

        return $html;
    }


    /**
     * Forever cache the fetched data
     *
     * The cache file is never expired here. A cache file that cannot be decoded
     * into an array with a 'url' entry is treated like a missing one and refetched.
     * The configured partner tag for the country (if any) is appended to the URL
     * on every call and is not part of the cached data.
     *
     * @param string $asin
     * @param string $country
     * @return array the product data as returned by fetchData()
     * @throws Exception when the data has to be fetched and fetching fails
     */
    protected function fetchCachedData($asin, $country)
    {
        $cachefile = getCacheName($country . '-' . $asin, '.amazonlight');

        $data = null;
        if (file_exists($cachefile)) {
            $data = json_decode((string)file_get_contents($cachefile), true);
        }

        if (!is_array($data) || !isset($data['url'])) {
            $data = $this->fetchData($asin, $country);
            $json = json_encode($data);
            // only store what could actually be encoded
            if ($json !== false) {
                io_saveFile($cachefile, $json);
            }
        }

        $partner = $this->getConf('partner_' . $country);
        if ($partner) {
            $data['url'] .= '?tag=' . rawurlencode($partner);
        }

        return $data;
    }

    /**
     * Fetch the meta data
     *
     * Requests the region's home page first, then the product page. The product
     * page is tried up to three times, sleeping 2, 4 and 6 seconds before the
     * respective attempt; the first usable response ends the retries.
     *
     * @param string $asin
     * @param string $country one of the keys of self::REGIONS
     * @return array with the keys 'title', 'author', 'rating', 'isbn', 'img' and 'url'
     * @throws Exception on unsupported country, failed requests, anti-bot pages or a missing title
     */
    protected function fetchData($asin, $country)
    {
        if (!array_key_exists($country, self::REGIONS)) {
            throw new Exception('Unsupported country ' . $country);
        }
        $region = self::REGIONS[$country];

        // get homepage cookies first
        $this->http->get('https://' . $region);

        $url = 'https://' . $region . '/dp/' . $asin;

        $maxAttempts = 3;
        $html = '';
        for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
            $isLastAttempt = ($attempt === $maxAttempts);
            sleep($attempt * 2);

            $html = $this->http->get($url);
            if (!$html) {
                if (!$isLastAttempt) continue; // try a few times
                throw new Exception('Failed to fetch data. Status ' . $this->http->status);
            }
            if (preg_match('/(captcha|api-services-support@amazon.com)/i', $html)) {
                if (!$isLastAttempt) continue; // try a few times
                throw new Exception('Anti-Bot mechanisms triggered, cannot fetch data');
            }

            // usable page received, no need for further attempts
            break;
        }

        $doc = new Document();
        $doc->html($html);

        $result = [
            'title' => $this->extract($doc, '#productTitle'),
            'author' => $this->extract($doc, '#bylineInfo a'),
            'rating' => $this->extract($doc, '#averageCustomerReviews span.a-declarative a > span'),
            'isbn' => $this->extract($doc, '#rpi-attribute-book_details-isbn10 .rpi-attribute-value'),
            'img' => $this->extract($doc, '#imgTagWrapperId img', 'src'),
            'url' => $url,
        ];

        // fall back to the page's <title> when the product title element is missing
        if (!$result['title']) {
            $result['title'] = $this->extract($doc, 'title');
        }
        if (!$result['title']) {
            throw new Exception('Could not find title in data');
        }

        return $result;
    }

    /**
     * Extract text or attribute from a selector
     *
     * Only the first element matching the selector is used. Surrounding
     * whitespace is removed from the result.
     *
     * @param Document $doc
     * @param string $selector
     * @param string|null $attr attribute to extract, omit for text
     * @return string the trimmed value, empty string when nothing matches
     */
    protected function extract(Document $doc, string $selector, $attr = null): string
    {
        $element = $doc->find($selector)->first();
        if (!$element) {
            return '';
        }

        $value = $attr ? $element->attr($attr) : $element->text();

        // cast guards the string return type against non-string results
        return trim((string)$value);
    }
}
279
280