xref: /plugin/statistics/SearchEngines.php (revision c428ec28b92989bfa72433f30fd3e015edbb413c)
12d987c80SAndreas Gohr<?php
22d987c80SAndreas Gohr
32d987c80SAndreas Gohrnamespace dokuwiki\plugin\statistics;
42d987c80SAndreas Gohr
/**
 * Extract search engine information from the HTTP referer
 *
 * We use the HTTP specification misspelling of "referer" here
 *
 * The referer is analyzed once in the constructor; the result is then available through
 * isSearchEngine(), getEngine() and getQuery(). getName() and getUrl() are static lookups
 * into the table of known engines.
 */
class SearchEngines
{
    /**
     * @var array Search engine definitions with regex patterns and metadata
     *
     * Each entry is keyed by the engine identifier and contains:
     *  - name:   human readable name
     *  - url:    home page of the engine
     *  - regex:  pattern (without delimiters) matched against the lowercased referer host
     *  - params: query parameter names that may carry the search term, checked in order
     */
    protected static array $searchEngines = [
        'dokuwiki' => [
            'name' => 'DokuWiki Internal Search',
            'url' => DOKU_URL,
            'regex' => '', // set in constructor
            'params' => ['q']
        ],
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /** @var string|null The search engine key */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Registers the pattern for this wiki's own host and analyzes the given referer.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        // Add regex matching ourselves. The referer is lowercased before matching, so our own
        // host has to be lowercased too. The cast covers parse_url() returning null/false when
        // DOKU_URL has no usable host, so preg_quote() always receives a string.
        $ownHost = strtolower((string)parse_url(DOKU_URL, PHP_URL_HOST));
        self::$searchEngines['dokuwiki']['regex'] = '^' . preg_quote($ownHost, '/') . '$';
        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return (bool)$this->engine;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return self::$searchEngines[$engine]['name'] ?? ucfirst($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return self::$searchEngines[$engine]['url'] ?? null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * Leaves engine, name and query at null when the referer is not recognized as a search.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);

        if ($result) {
            $this->engine = $result['engine'];
            $this->name = $result['name'];
            $this->query = $result['query'];
        }
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * The whole URL is lowercased first, so both the host and the returned query are lowercase.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];

        // Some engines put the search into the fragment. An empty query ("?#q=foo") is reported
        // as '' by PHP 8 rather than being omitted, so check for emptiness instead of null
        // before falling back to the fragment.
        $queryString = ($urlparts['query'] ?? '') ?: ($urlparts['fragment'] ?? '');

        if (!$queryString) {
            return null;
        }

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * Engines are tried in definition order; an engine only counts as a match when its host
     * pattern matches and one of its parameters carries a usable query.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            // an empty pattern would match every host, never treat a placeholder as a match
            if ($engine['regex'] === '') {
                continue;
            }

            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            $query = $this->extractQuery($params, $engine['params']);
            if ($query) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Used for hosts that are not in the list of known engines but carry a typical search
     * parameter. The engine identifier is derived from the host name.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain: drop the TLD (and an optional ".co" before it),
        // then use the last remaining label, e.g. "search.example.co.uk" becomes "example"
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * Returns the first non-empty cleaned value found under one of the given names.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() creates arrays for "q[]=..." style parameters. Those are not search
            // terms and must not reach cleanQuery(), which only accepts strings.
            if (empty($value) || !is_scalar($value)) {
                continue;
            }

            $query = $this->cleanQuery((string)$value);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}
303