xref: /plugin/statistics/SearchEngines.php (revision a73005f250d1a2d2de9eb3e2d23ac62fd0fe3095)
12d987c80SAndreas Gohr<?php
22d987c80SAndreas Gohr
32d987c80SAndreas Gohrnamespace dokuwiki\plugin\statistics;
42d987c80SAndreas Gohr
/**
 * Detects whether a referrer URL comes from a search engine and extracts the search query
 *
 * The referrer's host is matched against the regular expressions of a list of
 * well known search engines. When no known engine matches, a set of commonly
 * used query parameter names is tried instead and the engine name is derived
 * from the host name.
 *
 * The analysis is performed lazily on first access and then cached.
 */
class SearchEngines
{
    /**
     * @var array Search engine definitions, keyed by engine id. Each entry holds:
     *            'name'   => display name
     *            'url'    => home page of the engine
     *            'regex'  => pattern (without delimiters) matched against the lowercased host,
     *                        an empty pattern excludes the engine from host matching
     *            'params' => query parameter names that may carry the search term, in priority order
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The referrer URL being analyzed */
    protected ?string $referrer = null;

    /** @var array|null Cached analysis result, null when the referrer is not a search engine */
    protected ?array $analysisResult = null;

    /**
     * @var bool Whether the referrer has been analyzed already. Needed because a null
     *           result is a valid outcome that must be cached as well.
     */
    protected bool $analyzed = false;

    /**
     * @param string|null $referrer The referrer URL to analyze, null to analyze nothing
     */
    public function __construct(?string $referrer = null)
    {
        // Register the internal DokuWiki search. Its regex is empty, so it is never
        // matched by host name; the entry is only available for lookups by key.
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q'],
        ];

        $this->referrer = $referrer;
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if the referrer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return $this->getAnalysis() !== null;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        $analysis = $this->getAnalysis();
        return $analysis['name'] ?? null;
    }

    /**
     * Get the search engine URL
     *
     * Only engines from the list of known engines have a URL. For engines that were
     * detected through generic query parameters this returns null.
     *
     * @return string|null The search engine URL or null if unknown or not a search engine
     */
    public function getUrl(): ?string
    {
        $analysis = $this->getAnalysis();
        if ($analysis === null) {
            return null;
        }

        return $this->searchEngines[$analysis['engine']]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        $analysis = $this->getAnalysis();
        return $analysis['query'] ?? null;
    }

    /**
     * Get or perform analysis of the current referrer
     *
     * The referrer is analyzed at most once. Negative results are cached too.
     *
     * @return array|null Analysis result or null if not a search engine
     */
    protected function getAnalysis(): ?array
    {
        if ($this->analysisResult === null && !$this->analyzed && $this->referrer !== null) {
            $this->analysisResult = $this->analyzeReferrer($this->referrer);
            $this->analyzed = true;
        }

        return $this->analysisResult;
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The whole URL is lowercased before it is parsed, so the returned query is
     * lowercase as well. When the URL has no query string, its fragment is used.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzeReferrer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null; // also covers URLs that parse_url() could not parse at all
        }

        $domain = $urlparts['host'];
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        if ($queryString === '') {
            return null;
        }

        parse_str($queryString, $params);

        // known engines take precedence, generic search parameters are the fallback
        return $this->matchKnownEngine($domain, $params) ?? $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            // a matching host without a usable query does not count, keep looking
            $query = $this->extractQuery($params, $engine['params']);
            if ($query !== null) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query,
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * The engine name is derived from the domain: the top level domain (and an
     * optional preceding ".co") is stripped and the last remaining label is used.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if ($query === null) {
            return null;
        }

        // Generate engine name from domain
        $stripped = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        // end() needs a real variable, a function result can not be passed by reference
        $labels = explode('.', $stripped);
        $engineName = end($labels);

        return [
            'engine' => 'generic_' . $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * The parameter names are tried in the given order, the first one yielding a
     * non-empty cleaned query wins.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            // parse_str() creates arrays for parameters like q[]=foo, those are no queries
            if (!isset($params[$param]) || !is_string($params[$param])) {
                continue;
            }

            $query = $this->cleanQuery($params[$param]);
            if ($query !== null) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if nothing is left after cleaning
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries
        $query = preg_replace('/^(cache|related):[^\+]+/', '', $query);
        // Compact whitespace
        $query = preg_replace('/ +/', ' ', $query);
        $query = trim($query);

        // explicit comparison, a search for "0" is a valid query
        return $query === '' ? null : $query;
    }
}
295