xref: /plugin/statistics/SearchEngines.php (revision 45f4cdff9eeee357a9c7da871bb438b139ad9748)
12d987c80SAndreas Gohr<?php
22d987c80SAndreas Gohr
32d987c80SAndreas Gohrnamespace dokuwiki\plugin\statistics;
42d987c80SAndreas Gohr
/**
 * Extract search engine information from the HTTP referer
 *
 * A referer is first matched against the list of known search engines (by host name).
 * If none matches, a set of commonly used query parameter names is tried instead.
 *
 * We use the HTTP specification misspelling of "referer" here
 */
class SearchEngines
{
    /** @var array Search engine definitions with regex patterns and metadata */
    protected static array $searchEngines = [
        'dokuwiki' => [
            'name' => 'DokuWiki Internal Search',
            'url' => DOKU_URL,
            'regex' => '', // set in constructor
            'params' => ['q']
        ],
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'ecosia' => [
            'name' => 'Ecosia',
            'url' => 'https://www.ecosia.org',
            'regex' => '^(\w+\.)*ecosia\.org$',
            'params' => ['q']
        ],
        'qwant' => [
            'name' => 'Qwant',
            'url' => 'https://www.qwant.com',
            'regex' => '^(\w+\.)*qwant\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /** @var string|null The search engine key */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Fills in the host pattern of the 'dokuwiki' entry from DOKU_URL and then
     * analyzes the given referer.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        // Add regex matching ourselves. parse_url() yields null/false when DOKU_URL has no
        // usable host; the cast keeps preg_quote() from receiving a non-string in that case.
        $ownHost = (string)parse_url(DOKU_URL, PHP_URL_HOST);
        self::$searchEngines['dokuwiki']['regex'] = '^' . preg_quote($ownHost, '/') . '$';
        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return (bool)$this->engine;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if no query could be extracted
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier (key of the engine definitions)
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['name'] : ucwords($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier (key of the engine definitions)
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['url'] : null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not recognized.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);

        if ($result) {
            $this->engine = $result['engine'];
            $this->name = $result['name'];
            $this->query = $result['query'];
        }
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        // The whole referer is lower-cased before parsing, so host and query are matched
        // case-insensitively and the extracted query is lower case as well.
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // Fall back to the fragment when there is no query string (e.g. "/#q=term")
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first engine whose host pattern matches wins, even when no query can be
     * extracted from the parameters ('query' is null in that case).
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $query = $this->extractQuery($params, $engine['params']);
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Unlike known engines, an unknown host only counts as a search engine when one of
     * the generic parameters actually carries a query.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (and an optional ".co" before it),
        // then use the last remaining host label
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * The parameter names are tried in the given order; the first one yielding a
     * non-empty cleaned query wins.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() turns "q[]=x" style parameters into arrays. Those are not usable
            // as a query and must not be handed to cleanQuery(), which only accepts strings.
            if (!is_string($value) || $value === '') {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}
308