xref: /plugin/statistics/SearchEngines.php (revision a171b9c797be51c8fdf6d3246181382ad1aedc5b)
12d987c80SAndreas Gohr<?php
22d987c80SAndreas Gohr
32d987c80SAndreas Gohrnamespace dokuwiki\plugin\statistics;
42d987c80SAndreas Gohr
/**
 * Defines regular expressions for the most common search engines
 *
 * Every definition maps an engine key to its display name, its home page URL,
 * a host name regex (stored without delimiters) and the list of query string
 * parameters that may carry the search terms for that engine.
 */
class SearchEngines
{
    /**
     * Query parameters probed, in this order, when no known engine matches the host
     *
     * @var string[]
     */
    protected const GENERIC_PARAMS = ['search', 'query', 'q', 'keywords', 'keyword'];

    /**
     * Search engine definitions with regex patterns and metadata
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /**
     * Registers the wiki's own search as an additional engine
     *
     * The entry has an empty regex, so it is never matched by host name in
     * analyzeReferrer(); it is only reachable through the getters.
     */
    public function __construct()
    {
        // Add the internal DokuWiki search engine
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(), // NOTE(review): presumably the link to the wiki itself - confirm against wl()
            'regex' => '',
            'params' => ['q']
        ];
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The referrer is lowercased as a whole before parsing, so the returned
     * query is lowercased as well (for its ASCII characters). Hosts matching a
     * known engine are reported under that engine's key; any other host that
     * carries one of the generic search parameters is reported as
     * "generic_<name>", where <name> is derived from the host name.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    public function analyzeReferrer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        $params = $this->parseParams($urlparts);

        $query = '';
        $engineKey = '';
        $engineName = '';

        // check domain against known search engines, first match wins
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }
            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            $engineKey = $key;
            $engineName = $engine['name'];
            $query = $this->firstParam($params, $engine['params']);
            break;
        }

        // try some generic search engine parameters if no specific engine matched
        if (!$engineKey) {
            $query = $this->firstParam($params, self::GENERIC_PARAMS);
            if ($query !== '') {
                $engineName = $this->genericEngineName($domain);
                $engineKey = 'generic_' . $engineName;
            }
        }

        // still no hit? not a search engine
        if (!$engineKey || !$query) {
            return null;
        }

        $query = $this->cleanQuery($query);
        if (!$query) {
            return null;
        }

        return [
            'engine' => $engineKey,
            'name' => $engineName,
            'query' => $query
        ];
    }

    /**
     * Get search engine information by key
     *
     * @param string $key The search engine key
     * @return array|null The search engine data or null if not found
     */
    public function getSearchEngine(string $key): ?array
    {
        return $this->searchEngines[$key] ?? null;
    }

    /**
     * Get all search engines
     *
     * @return array All search engine definitions
     */
    public function getAllSearchEngines(): array
    {
        return $this->searchEngines;
    }

    /**
     * Decode the request parameters of an already parsed URL
     *
     * The fragment is used as a fallback when the query string is empty.
     *
     * @param array $urlparts Result of parse_url()
     * @return array Decoded parameters as produced by parse_str(); values may be strings or arrays
     */
    protected function parseParams(array $urlparts): array
    {
        $qpart = $urlparts['query'] ?? '';
        if (!$qpart && isset($urlparts['fragment'])) {
            $qpart = $urlparts['fragment']; // google does this
        }

        $params = [];
        if ($qpart) {
            parse_str($qpart, $params);
        }
        return $params;
    }

    /**
     * Return the first non-empty string value among the given parameter names
     *
     * Array values (e.g. from "q[]=foo") are ignored: they cannot be a search
     * phrase and would otherwise reach trim() and raise a TypeError.
     *
     * @param array $params Decoded request parameters
     * @param string[] $names Parameter names to probe, in order of preference
     * @return string The value found or an empty string
     */
    protected function firstParam(array $params, array $names): string
    {
        foreach ($names as $name) {
            if (isset($params[$name]) && is_string($params[$name]) && !empty($params[$name])) {
                return $params[$name];
            }
        }
        return '';
    }

    /**
     * Derive a short engine name from a host name
     *
     * Strips a trailing TLD (optionally preceded by ".co") and returns the
     * last remaining label, e.g. "search.example.co.uk" becomes "example".
     *
     * @param string $domain The referrer's host name
     * @return string
     */
    protected function genericEngineName(string $domain): string
    {
        $stripped = (string) preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain); // strip tld
        $labels = explode('.', $stripped);
        return (string) array_pop($labels);
    }

    /**
     * Normalize a raw query value
     *
     * Removes a leading "cache:..." or "related:..." operator token, collapses
     * runs of spaces and trims the result.
     *
     * @param string $query The decoded query value
     * @return string The cleaned query, possibly empty
     */
    protected function cleanQuery(string $query): string
    {
        // The value is already URL-decoded, so terms are separated by spaces rather
        // than "+". Stop the operator token at whitespace (or a literal plus) to keep
        // the search terms that follow it.
        $query = (string) preg_replace('/^(cache|related):[^\s+]+/', '', $query); // non-search queries
        $query = (string) preg_replace('/ +/', ' ', $query); // ws compact
        return trim($query);
    }
}
205