xref: /plugin/amazonlight/syntax.php (revision f220a1a74c1a9f19ac2dfafbaed86194c1a88f90)
1<?php
2
3use dokuwiki\HTTP\DokuHTTPClient;
4use DOMWrap\Document;
5
6/**
7 * DokuWiki Plugin amazonlight (Syntax Component)
8 *
9 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
10 * @author  Andreas Gohr <andi@splitbrain.org>
11 */
class syntax_plugin_amazonlight extends DokuWiki_Syntax_Plugin
{

    /** @var array Amazon host name to query for each supported country code */
    const REGIONS = [
        'us' => 'www.amazon.com',
        'ca' => 'www.amazon.ca',
        'de' => 'www.amazon.de',
        'gb' => 'www.amazon.co.uk',
        'fr' => 'www.amazon.fr',
        'jp' => 'www.amazon.co.jp',
    ];

    /** @var DokuHTTPClient HTTP client preconfigured with browser-like request headers */
    protected DokuHTTPClient $http;

    /** @inheritDoc */
    public function getType()
    {
        return 'substition';
    }

    /** @inheritDoc */
    public function getPType()
    {
        return 'block';
    }

    /** @inheritDoc */
    public function getSort()
    {
        return 160;
    }

    /**
     * Connect lookup pattern to lexer.
     *
     * Matches {{amazon>...}} where the inner part consists of word characters,
     * colons, backslashes, dashes, spaces and equal signs.
     *
     * @param string $mode Parser mode
     */
    public function connectTo($mode)
    {
        $this->Lexer->addSpecialPattern('\{\{amazon>[\w:\\- =]+\}\}', $mode, 'plugin_amazonlight');
    }

    /**
     * Set up the HTTP client with headers mimicking a desktop Chrome browser
     */
    public function __construct()
    {
        // the header value must not repeat the header name
        $agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
            . 'Chrome/119.0.0.0 Safari/537.36';

        $http = new DokuHTTPClient();
        // NOTE(review): DokuWiki's HTTPClient appears to build the outgoing User-Agent
        // header from its public $agent property, so both are set here - confirm
        $http->agent = $agent;
        $http->headers['User-Agent'] = $agent;
        $http->headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7';
        $http->headers['Accept-Language'] = 'en-US,en;q=0.9';
        $http->headers['Upgrade-Insecure-Requests'] = '1';
        $http->headers['Sec-Ch-Ua'] = '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"';
        $http->headers['Sec-Ch-Ua-Mobile'] = '?0';
        $http->headers['Sec-Ch-Ua-Platform'] = '"Linux"';
        $http->headers['Sec-Fetch-Dest'] = 'document';
        $http->headers['Sec-Fetch-Mode'] = 'navigate';
        $http->headers['Sec-Fetch-Site'] = 'none';
        $http->headers['Sec-Fetch-User'] = '?1';
        $this->http = $http;
    }

    /**
     * Parse the matched syntax into the parameters used for rendering
     *
     * Accepted forms are "[country:]ASIN [WIDTHxHEIGHT] [price|showprice|noprice]".
     *
     * @inheritDoc
     * @return array with the keys imgw, imgh, price, asin and country
     */
    public function handle($match, $state, $pos, Doku_Handler $handler)
    {
        // strip the leading "{{amazon>" (9 bytes) and the trailing "}}"
        $match = substr($match, 9, -2);
        [$ctry, $asin] = sexplode(':', $match, 2, '');

        // no country given?
        if (empty($asin)) {
            $asin = $ctry;
            $ctry = 'us';
        }

        // default parameters...
        $params = [
            'imgw' => $this->getConf('imgw'),
            'imgh' => $this->getConf('imgh'),
            'price' => $this->getConf('showprice'),
        ];

        // ...can be overridden by options following the ASIN
        // (default to '' so the regular expressions below never receive null)
        [$asin, $more] = sexplode(' ', $asin, 2, '');
        $params['asin'] = $asin;

        if (preg_match('/(\d+)x(\d+)/i', $more, $size)) {
            $params['imgw'] = $size[1];
            $params['imgh'] = $size[2];
        }
        if (preg_match('/noprice/i', $more)) {
            $params['price'] = false;
        } elseif (preg_match('/(show)?price/i', $more)) {
            $params['price'] = true;
        }

        // correct country given? anything without a configured region falls back to us
        if ($ctry === 'uk') $ctry = 'gb';
        if (!isset(self::REGIONS[$ctry])) {
            $ctry = 'us';
        }
        $params['country'] = $ctry;

        return $params;
    }

    /**
     * Render the product box, or a plain interwiki link when no data could be fetched
     *
     * @inheritDoc
     */
    public function render($mode, Doku_Renderer $renderer, $data)
    {
        if ($mode !== 'xhtml') {
            return false;
        }

        $html = $this->output($data);
        if ($html) {
            $renderer->doc .= $html;
            return true;
        }

        // fetching failed: fall back to a simple link to the product
        if ($data['country'] === 'de') {
            $renderer->interwikilink('Amazon', 'Amazon.de', 'amazon.de', $data['asin']);
        } else {
            $renderer->interwikilink('Amazon', 'Amazon', 'amazon', $data['asin']);
        }

        return true;
    }

    /**
     * Build the HTML for a product box
     *
     * A fetch failure is reported through msg() and signalled by returning false.
     *
     * @param array $param parameters as returned by handle()
     * @return string|false the HTML, false when the product data could not be fetched
     */
    protected function output($param)
    {
        global $conf;

        try {
            $data = $this->fetchData($param['asin'], $param['country']);
        } catch (Exception $e) {
            msg(hsc($e->getMessage()), -1);
            return false;
        }

        // both links share the same, attribute-escaped target
        $link = '<a href="' . hsc($data['url']) . '"';
        if ($conf['target']['extern']) {
            $link .= ' target="' . hsc($conf['target']['extern']) . '"';
        }
        $link .= '>';

        $html = '<div class="amazon">';

        // only show an image when one was found, otherwise it would be a broken image
        if ($data['img'] !== '') {
            $img = ml($data['img'], ['w' => $param['imgw'], 'h' => $param['imgh']]);
            $html .= $link;
            $html .= '<img src="' . $img . '" width="' . hsc((string)$param['imgw']) . '"'
                . ' height="' . hsc((string)$param['imgh']) . '" alt="" />';
            $html .= '</a>';
        }

        $html .= '<div class="amazon_title">';
        $html .= $link;
        $html .= hsc($data['title']);
        $html .= '</a>';
        $html .= '</div>';

        $html .= '<div class="amazon_author">';
        $html .= hsc($data['author']);
        $html .= '</div>';

        $html .= '<div class="amazon_isbn">';
        $html .= hsc($data['isbn']);
        $html .= '</div>';

        if ($param['price'] && $data['price']) {
            $html .= '<div class="amazon_price">' . hsc($data['price']) . '</div>';
        }
        $html .= '</div>';

        return $html;
    }

    /**
     * Fetch the meta data
     *
     * Requests the region's homepage first, then the product page (up to three
     * times with an increasing pause) and extracts the details from the HTML.
     *
     * @param string $asin
     * @param string $country one of the keys of self::REGIONS
     * @return array with the keys title, author, rating, price, isbn, img and url
     * @throws Exception when the page cannot be fetched, is blocked or has no title
     */
    protected function fetchData($asin, $country)
    {
        $partner = $this->getConf('partner_' . $country);
        if (!$partner) $partner = 'none';
        $region = self::REGIONS[$country] ?? self::REGIONS['us'];

        // get homepage cookies first
        $this->http->get('https://' . $region);

        $url = 'https://' . $region . '/dp/' . rawurlencode($asin);

        $html = '';
        $maxAttempts = 3;
        for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
            // wait a little longer before each retry (1s, 2s), never before the first request
            if ($attempt > 1) sleep($attempt - 1);
            $isLast = ($attempt === $maxAttempts);

            $html = $this->http->get($url);
            if (!$html) {
                if (!$isLast) continue; // try a few times
                throw new Exception('Failed to fetch data. Status ' . $this->http->status);
            }
            if (preg_match('/(captcha|api-services-support@amazon\.com)/i', $html)) {
                if (!$isLast) continue; // try a few times
                throw new Exception('Anti-Bot mechanisms triggered, cannot fetch data');
            }

            // got a usable page, do not request it again
            break;
        }

        $doc = new Document();
        $doc->html($html);

        $result = [
            'title' => $this->extract($doc, '#productTitle'),
            'author' => $this->extract($doc, '#bylineInfo a'),
            'rating' => $this->extract($doc, '#averageCustomerReviews span.a-declarative a > span'),
            'price' => $this->extract($doc, '.priceToPay'),
            'isbn' => $this->extract($doc, '#rpi-attribute-book_details-isbn10 .rpi-attribute-value'),
            'img' => $this->extract($doc, '#imgTagWrapperId img', 'src'),
            'url' => $url . '?tag=' . rawurlencode($partner),
        ];

        // fall back to the page title when the product title element is missing
        if (!$result['title']) {
            $result['title'] = $this->extract($doc, 'title');
        }
        if (!$result['title']) {
            throw new Exception('Could not find title in data');
        }

        return $result;
    }

    /**
     * Extract text or attribute from a selector
     *
     * Only the first matching element is used. The value is returned with
     * surrounding whitespace removed.
     *
     * @param Document $doc
     * @param string $selector
     * @param string|null $attr attribute to extract, omit for text
     * @return string empty when nothing matched
     */
    protected function extract(Document $doc, string $selector, $attr = null): string
    {
        $element = $doc->find($selector)->first();
        if ($element === null) {
            return '';
        }

        $value = $attr ? $element->attr($attr) : $element->text();

        // cast guards the string return type should the library hand back null
        return trim((string)$value);
    }
}
266
267