xref: /plugin/statistics/SearchEngines.php (revision 6811247a0bc8a9bcfa5be848932c2907af4329fc)
12d987c80SAndreas Gohr<?php
22d987c80SAndreas Gohr
32d987c80SAndreas Gohrnamespace dokuwiki\plugin\statistics;
42d987c80SAndreas Gohr
/**
 * Extract search engine information from the HTTP referer
 *
 * A referer is first matched against a list of known search engines (by host
 * name). If none matches, any referer carrying a typical search parameter is
 * treated as a generic search engine named after its domain.
 *
 * We use the HTTP specification misspelling of "referer" here
 */
class SearchEngines
{
    /**
     * Search engine definitions keyed by engine identifier
     *
     * Each 'regex' is matched (without delimiters, case-sensitively) against the
     * lower-cased host name. The optional "(\.com?)?" part accepts second-level
     * country domains such as "google.co.uk" and "google.com.br". 'params' lists
     * the URL parameters that may carry the search query, in order of preference.
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected static array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'https://www.google.com',
            'regex' => '^(\w+\.)*google(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'https://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'https://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.com?)?\.([a-z]{2,5})$',
            'params' => ['text', 'query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'https://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'https://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'https://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'https://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'https://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'https://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'https://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'https://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'ecosia' => [
            'name' => 'Ecosia',
            'url' => 'https://www.ecosia.org',
            'regex' => '^(\w+\.)*ecosia\.org$',
            'params' => ['q']
        ],
        'qwant' => [
            'name' => 'Qwant',
            'url' => 'https://www.qwant.com',
            'regex' => '^(\w+\.)*qwant\.com$',
            'params' => ['q']
        ],
        'avg' => [
            'name' => 'AVG Safe Search',
            'url' => 'https://search.avg.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ],
        'brave' => [
            'name' => 'Brave Search',
            'url' => 'https://search.brave.com',
            'regex' => '^(\w+\.)*search\.brave\.com$',
            'params' => ['q']
        ],
    ];

    /** @var string|null The search engine key, null when the referer is no search engine */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The search query, null when none could be extracted */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        // explicit comparison: an engine key of "0" must not be treated as "no engine"
        return $this->engine !== null && $this->engine !== '';
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if none was found
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['name'] : ucwords($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['url'] : null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not recognized.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);
        if ($result === null) {
            return;
        }

        $this->engine = $result['engine'];
        $this->name = $result['name'];
        $this->query = $result['query'];
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        // The whole URL is lower-cased, so host matching is case-insensitive
        // and extracted queries are lower-cased as well
        $urlparts = parse_url(strtolower($referer));
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            // seriously malformed URL or no host part at all
            return null;
        }

        $domain = $urlparts['host'];
        // fall back to the fragment when the URL has no query string
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Known search engines take precedence over the generic detection
        return $this->matchKnownEngine($domain, $params)
            ?? $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * A known engine is reported even when no query could be extracted.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (preg_match('/' . $engine['regex'] . '/', $domain) !== 1) {
                continue;
            }

            return [
                'engine' => $key,
                'name' => $engine['name'],
                'query' => $this->extractQuery($params, $engine['params'])
            ];
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Unlike known engines, a generic engine is only reported when a query was found.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if ($query === null) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (including second-level
        // ones like ".co.uk" or ".com.br"), then use the last remaining label
        $engineName = preg_replace('/(\.com?)?\.([a-z]{2,5})$/', '', $domain) ?? $domain;
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * The first parameter that yields a non-empty cleaned query wins.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() produces arrays for "q[]=..." style parameters; those
            // are not usable as a query and must not reach cleanQuery()
            if (!is_string($value) || $value === '') {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query !== null) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if nothing is left after cleaning
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query) ?? '';
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query) ?? '';
        $query = trim($query);

        // strict comparison so that a search for "0" is kept
        return $query === '' ? null : $query;
    }
}
306