1<?php
2
3namespace dokuwiki\plugin\statistics;
4
5/**
 * Extract search engine information from the HTTP referer
7 *
8 * We use the HTTP specification misspelling of "referer" here
9 */
class SearchEngines
{
    /**
     * Known search engines, keyed by engine identifier.
     *
     * Each entry holds:
     *  - name:   human readable name of the engine
     *  - url:    home page of the engine
     *  - regex:  pattern (without delimiters) matched against the lowercased referer host
     *  - params: URL parameter names that are checked, in order, for the search query
     *
     * Entries are tried in declaration order and the first matching regex wins.
     * The "(\.com?)?" part of a pattern accepts second-level country domains
     * such as google.co.uk as well as google.com.br.
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected static array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'https://www.google.com',
            'regex' => '^(\w+\.)*google(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'https://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'https://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'https://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'https://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'https://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'https://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'https://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'https://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'https://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'https://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'ecosia' => [
            'name' => 'Ecosia',
            'url' => 'https://www.ecosia.org',
            'regex' => '^(\w+\.)*ecosia\.org$',
            'params' => ['q'],
        ],
        'qwant' => [
            'name' => 'Qwant',
            'url' => 'https://www.qwant.com',
            'regex' => '^(\w+\.)*qwant\.com$',
            'params' => ['q'],
        ],
        'avg' => [
            'name' => 'AVG Safe Search',
            'url' => 'https://search.avg.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
        'brave' => [
            'name' => 'Brave Search',
            'url' => 'https://search.brave.com',
            'regex' => '^(\w+\.)*search\.brave\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The search engine key, null when the referer is no search engine */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The cleaned search query, null when none could be extracted */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Analyzes the referer right away; the result is available through the getters.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        // explicit null check: a derived engine key like "0" is still a detected engine
        return $this->engine !== null;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if none was found
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return self::$searchEngines[$engine]['name'] ?? ucwords((string) $engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return self::$searchEngines[$engine]['url'] ?? null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not a search engine.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);
        if (!$result) {
            return;
        }

        $this->engine = $result['engine'];
        $this->name = $result['name'];
        $this->query = $result['query'];
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * The referer is lowercased with strtolower() before it is parsed, so host,
     * parameter names and the query are all matched in lowercase. Parameters are
     * read from the query string; the fragment is used only when the URL has no
     * query string at all.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        // parse_url() returns false for seriously malformed URLs; isset() covers that case too
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Known engines take precedence, generic search parameters are the fallback
        return $this->matchKnownEngine($domain, $params)
            ?? $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * A known engine is reported even when no query could be extracted; 'query'
     * is null in that case.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            return [
                'engine' => $key,
                'name' => $engine['name'],
                'query' => $this->extractQuery($params, $engine['params']),
            ];
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Any site whose URL carries a non-empty value in one of the common search
     * parameters is treated as a search engine. The engine identifier is the
     * last host label left after stripping the top level domain (including an
     * optional ".co"/".com" second level), e.g. "search.example.com.au" gives
     * "example".
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if ($query === null) {
            return null;
        }

        // Generate engine name from domain
        $stripped = preg_replace('/(\.com?)?\.([a-z]{2,5})$/', '', $domain) ?? $domain;
        $domainParts = explode('.', $stripped);
        $engineName = (string) array_pop($domainParts);

        // a host like "foo..com" leaves no usable label to name the engine
        if ($engineName === '') {
            return null;
        }

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * The parameter names are tried in the given order; the first one that
     * yields a non-empty cleaned query wins.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() produces arrays for "q[]=x" style parameters; those are no search queries
            if (!is_string($value) || $value === '') {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query !== null) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if nothing is left after cleaning
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query) ?? $query;
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query) ?? $query;
        $query = trim($query);

        // compare against '' explicitly so that a query of "0" is kept
        return $query === '' ? null : $query;
    }
}
306