xref: /plugin/statistics/SearchEngines.php (revision aecf8e88a958911ceea01f601a32b2453018b637)
<?php

namespace dokuwiki\plugin\statistics;

/**
 * Detects whether a referrer URL comes from a search engine and extracts the search query
 *
 * The class holds host-name regular expressions and query parameter names for the most
 * common search engines. A referrer is analyzed once in the constructor; the result is
 * then available through the getters.
 */
class SearchEngines
{
    /**
     * Search engine definitions keyed by engine id.
     *
     * 'regex' is matched (without delimiters) against the lower-cased referrer host,
     * 'params' lists the query parameters that may carry the search terms, in order of preference.
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /** @var string The referrer URL being analyzed */
    protected string $referrer;

    /** @var bool Whether the referrer is from a search engine */
    protected bool $isSearchEngine = false;

    /** @var string|null The search engine name */
    protected ?string $engineName = null;

    /** @var string|null The search engine key (a key of $searchEngines or 'generic_<name>') */
    protected ?string $engineKey = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Register the internal search engine and analyze the given referrer
     *
     * @param string $referrer The referrer URL to analyze
     */
    public function __construct(string $referrer)
    {
        // Add the internal DokuWiki search engine. Its regex is empty, so it is never
        // matched by host name in matchKnownEngine(); it only provides name and URL.
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q']
        ];

        $this->referrer = $referrer;
        $this->analyze();
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if the referrer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return $this->isSearchEngine;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        return $this->engineName;
    }

    /**
     * Get the search engine URL
     *
     * Generic engines (key 'generic_*') have no entry in the definitions and thus no URL.
     *
     * @return string|null The search engine URL or null if unknown or not a search engine
     */
    public function getUrl(): ?string
    {
        if (!$this->engineKey) {
            return null;
        }

        return $this->searchEngines[$this->engineKey]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Analyze the referrer and populate member variables
     */
    protected function analyze(): void
    {
        $result = $this->analyzeReferrer($this->referrer);

        if ($result) {
            $this->isSearchEngine = true;
            $this->engineKey = $result['engine'];
            $this->engineName = $result['name'];
            $this->query = $result['query'];
        }
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The whole URL is lower-cased before parsing, so the returned query is lower-case too.
     *
     * @param string $referer The HTTP referer URL
     * @return array{engine: string, name: string, query: string}|null null if not a search engine
     */
    protected function analyzeReferrer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        // parse_url() returns false for seriously malformed URLs
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // the fragment is only consulted when the URL has no query component
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        if (!$queryString) {
            return null;
        }

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array{engine: string, name: string, query: string}|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            // a matching host without a usable query is not a hit; keep looking
            $query = $this->extractQuery($params, $engine['params']);
            if ($query) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * The engine name is derived from the host: the trailing TLD (optionally preceded
     * by '.co') is stripped and the last remaining host label is used.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array{engine: string, name: string, query: string}|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain
        $host = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain) ?? $domain;
        // end() needs a real variable: it takes its argument by reference
        $labels = explode('.', $host);
        $engineName = end($labels);

        return [
            'engine' => 'generic_' . $engineName,
            'name' => ucfirst($engineName),
            'query' => $query
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * Returns the first parameter (in the order of $paramNames) that yields a non-empty
     * cleaned query. Array values, as produced by parse_str() for 'q[]=...' style
     * parameters, are ignored.
     *
     * @param array $params URL parameters
     * @param string[] $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            if (empty($params[$param]) || !is_string($params[$param])) {
                continue;
            }

            $query = $this->cleanQuery($params[$param]);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries
        $query = preg_replace('/^(cache|related):[^\+]+/', '', $query) ?? '';
        // Compact whitespace
        $query = preg_replace('/ +/', ' ', $query) ?? '';
        $query = trim($query);

        return $query ?: null;
    }
}
301