xref: /plugin/doi/Resolver/IsbnIsbnDeResolver.php (revision 307c69801b7d8b027f4d0b7119b53e92e2931cb6)
1*307c6980SAndreas Gohr<?php
2*307c6980SAndreas Gohr
3*307c6980SAndreas Gohrnamespace dokuwiki\plugin\doi\Resolver;
4*307c6980SAndreas Gohr
5*307c6980SAndreas Gohruse dokuwiki\HTTP\DokuHTTPClient;
6*307c6980SAndreas Gohr
/**
 * ISBN resolver scraping isbn.de
 *
 * Book metadata is read from the OpenGraph meta tags and from the person/publisher
 * links found in the HTML of the isbn.de search result page.
 */
class IsbnIsbnDeResolver extends AbstractIsbnResolver
{
    /** @inheritdoc */
    public function getFallbackURL($id)
    {
        return 'https://www.isbn.de/buecher/suche/' . rawurlencode($id);
    }

    /** @inheritdoc */
    public function getData($id)
    {
        // fetchCachedData() is not defined in this class; presumably the inherited
        // implementation caches the result of fetchData() - verify in the parent class
        return $this->fetchCachedData($id);
    }

    /**
     * @inheritdoc
     * @throws \Exception when the page can not be fetched or contains no ISBN
     */
    protected function fetchData($id)
    {
        $http = new DokuHTTPClient();
        $url = $this->getFallbackURL($id);

        $html = $http->get($url);
        if (!$html) throw new \Exception('Could not fetch data from isbn.de. ' . $http->error);

        // start from the default result provided outside this class and fill in what we find
        $data = $this->defaultResult;

        // without an ISBN meta tag we did not land on a book page
        $data['id'] = $this->extract('/<meta property="og:book:isbn" content="([^"]+)"/', $html);
        if (!$data['id']) throw new \Exception('ISBN not found at isbn.de.');
        $data['url'] = $this->extract('/<meta property="og:url" content="([^"]+)"/', $html);

        $data['title'] = $this->extract('/<meta property="og:title" content="([^"]+)"/', $html);
        // only the leading four digit year is wanted; anything following it is optional
        // so that a year-only release date is accepted as well
        $data['published'] = $this->extract(
            '/<meta property="og:book:release_date" content="(\d{4})[^"]*"/',
            $html
        );

        $data['authors'] = $this->extractAll('/<a href="\/person\/.*?">(.+?)<\/a>/', $html);
        $data['publisher'] = $this->extract('/<a href="\/verlag\/.*?">(.+?)<\/a>/', $html);

        return $data;
    }

    /**
     * Extract a value from a HTML string using a regex
     *
     * @param string $regex the pattern to apply
     * @param string $html the HTML to search in
     * @param int $group the capture group to return
     * @return string the entity decoded match, empty string if nothing matched
     */
    protected function extract($regex, $html, $group = 1)
    {
        if (preg_match($regex, $html, $m) && isset($m[$group])) {
            return $this->decodeEntities($m[$group]);
        }
        return '';
    }

    /**
     * Extract all matching values from a HTML string using a regex
     *
     * @param string $regex the pattern to apply
     * @param string $html the HTML to search in
     * @param int $group the capture group to collect
     * @return string[] the unique, entity decoded matches, empty array if nothing matched
     */
    protected function extractAll($regex, $html, $group = 1)
    {
        if (preg_match_all($regex, $html, $m) && isset($m[$group])) {
            $all = array_map([$this, 'decodeEntities'], $m[$group]);
            // array_unique() keeps the original keys, reindex to get a gapless list
            return array_values(array_unique($all));
        }
        return [];
    }

    /**
     * Decode HTML entities in a scraped value
     *
     * Flags and charset are passed explicitly because the default flags of
     * html_entity_decode() differ between PHP versions (single quotes were not
     * decoded by default before PHP 8.1).
     *
     * @param string $value
     * @return string
     */
    private function decodeEntities($value)
    {
        return html_entity_decode($value, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }
}
84