xref: /plugin/statistics/SearchEngines.php (revision 10dcb86f38cab1d9cfa19911e1d59c086626d7c8)
1<?php
2
3namespace dokuwiki\plugin\statistics;
4
5/**
6 * Extract search engine information from the HTTP referer
7 *
8 * We use the HTTP specification misspelling of "referer" here
9 */
class SearchEngines
{
    /**
     * Known search engines, keyed by engine identifier.
     *
     * Each entry holds:
     *  - name:   display name of the engine
     *  - url:    home page of the engine
     *  - regex:  pattern (without delimiters) matched against the lower-cased referer host
     *  - params: URL parameter names that may carry the search query, checked in order
     *
     * The "(\.com?)?\.tld" suffix accepts plain TLDs (google.de) as well as
     * second-level country domains (google.co.uk, google.com.br).
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected static array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'ecosia' => [
            'name' => 'Ecosia',
            'url' => 'https://www.ecosia.org',
            'regex' => '^(\w+\.)*ecosia\.org$',
            'params' => ['q'],
        ],
        'qwant' => [
            'name' => 'Qwant',
            'url' => 'https://www.qwant.com',
            'regex' => '^(\w+\.)*qwant\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The search engine key, null when the referer is not a search engine */
    protected ?string $engine = null;

    /** @var string|null The search engine display name */
    protected ?string $name = null;

    /** @var string|null The cleaned search query, null when none could be extracted */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Analyzes the given referer right away; the results are available
     * through the getters afterwards.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return (bool)$this->engine;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if none was found
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier, e.g. as returned by getEngine()
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return self::$searchEngines[$engine]['name'] ?? ucwords($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier, e.g. as returned by getEngine()
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return self::$searchEngines[$engine]['url'] ?? null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not
     * recognized as coming from a search engine.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);
        if (!$result) {
            return;
        }

        $this->engine = $result['engine'];
        $this->name = $result['name'];
        $this->query = $result['query'];
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        // The whole URL is lower-cased, so the extracted query text is lower-cased as well
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            // not an absolute URL (or unparsable) - nothing to match against
            return null;
        }

        $domain = $urlparts['host'];
        // Fall back to the fragment when there is no query string at all
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Known engines take precedence over the generic parameter heuristic
        return $this->matchKnownEngine($domain, $params)
            ?? $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first engine whose host pattern matches wins, even when no
     * query could be extracted (the 'query' entry is null then).
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            return [
                'engine' => $key,
                'name' => $engine['name'],
                'query' => $this->extractQuery($params, $engine['params']),
            ];
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Any host is treated as a search engine when one of the common search
     * parameters carries a usable query. The engine identifier is derived
     * from the host: the TLD (including a "co."/"com." second level) is
     * stripped and the last remaining label is used.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain, e.g. "search.example.com.au" -> "example"
        $engineName = preg_replace('/(\.com?)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * The parameter names are tried in the given order; the first one
     * yielding a non-empty cleaned query is used.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            // parse_str() produces arrays for "q[]=..." style parameters; those
            // are not usable as a query and must not be passed to cleanQuery()
            if (empty($params[$param]) || !is_string($params[$param])) {
                continue;
            }

            $query = $this->cleanQuery($params[$param]);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if nothing usable is left
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}
300