xref: /plugin/statistics/SearchEngines.php (revision c428ec28b92989bfa72433f30fd3e015edbb413c)
1<?php
2
3namespace dokuwiki\plugin\statistics;
4
/**
 * Extract search engine information from the HTTP referer
 *
 * We use the HTTP specification misspelling of "referer" here
 */
class SearchEngines
{
    /**
     * Search engine definitions keyed by engine identifier
     *
     * Each entry holds the display 'name', the engine's home 'url', a 'regex' (without delimiters)
     * that is matched against the lower-cased referer host, and the list of URL 'params' that may
     * carry the search term (checked in the given order).
     *
     * The "(\.com?)?" part of the patterns accepts second-level country domains such as
     * google.co.uk as well as google.com.au.
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected static array $searchEngines = [
        'dokuwiki' => [
            'name' => 'DokuWiki Internal Search',
            'url' => DOKU_URL,
            'regex' => '', // set in constructor
            'params' => ['q'],
        ],
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query', 'text'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The search engine key, null when the referer is not a search engine */
    protected ?string $engine = null;

    /** @var string|null The search engine name, null when the referer is not a search engine */
    protected ?string $name = null;

    /** @var string|null The search query, null when the referer is not a search engine */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Registers the wiki's own host as the pattern of the 'dokuwiki' engine and
     * analyzes the given referer right away.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        // Add regex matching ourselves. The referer host is lower-cased before matching,
        // so the own host has to be lower-cased as well. parse_url() may return null/false
        // for an unusable DOKU_URL, hence the string cast.
        $ownHost = strtolower((string)parse_url(DOKU_URL, PHP_URL_HOST));
        self::$searchEngines['dokuwiki']['regex'] = '^' . preg_quote($ownHost, '/') . '$';

        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return $this->engine !== null;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        // the parameter is untyped for backward compatibility; normalize before using it
        $engine = (string)$engine;

        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['name'] : ucfirst($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        // the parameter is untyped for backward compatibility; normalize before using it
        $engine = (string)$engine;

        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['url'] : null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not recognized.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);

        if ($result !== null) {
            $this->engine = $result['engine'];
            $this->name = $result['name'];
            $this->query = $result['query'];
        }
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * The query string is inspected first. When it does not yield a search term, the
     * URL fragment is tried as well, because some engines carry the term after the '#'.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        // The whole URL is lower-cased, so host, parameter names and the returned
        // query are all lower case.
        $urlparts = parse_url(strtolower($referer));
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];

        foreach (['query', 'fragment'] as $part) {
            $paramString = $urlparts[$part] ?? '';
            if ($paramString === '') {
                continue;
            }

            parse_str($paramString, $params);

            // Known engines take precedence over the generic parameter guess
            $result = $this->matchKnownEngine($domain, $params)
                ?? $this->matchGenericEngine($domain, $params);
            if ($result !== null) {
                return $result;
            }
        }

        return null;
    }

    /**
     * Try to match against known search engines
     *
     * An engine only counts as a match when its host pattern matches and one of its
     * parameters holds a usable search term.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            // an empty pattern would match every host, skip such unconfigured entries
            if ($engine['regex'] === '' || !preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            $query = $this->extractQuery($params, $engine['params']);
            if ($query !== null) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query,
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Any host is accepted as long as one of the common search parameters is present.
     * The engine identifier is derived from the host name.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if ($query === null) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (including second-level country
        // domains like .co.uk or .com.au) and use the last remaining host label
        $engineName = preg_replace('/(\.com?)?\.([a-z]{2,5})$/', '', $domain) ?? $domain;
        $domainParts = explode('.', $engineName);
        $engineName = (string)array_pop($domainParts);

        if ($engineName === '') {
            return null;
        }

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * The parameter names are checked in the given order, the first one that yields
     * a non-empty cleaned query wins.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() creates arrays for parameters like "q[]=foo"; those are not queries
            if (!is_string($value)) {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query !== null) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if nothing is left after cleaning
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query) ?? $query;
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query) ?? $query;
        $query = trim($query);

        // compare against the empty string explicitly so that a query of "0" survives
        return $query === '' ? null : $query;
    }
}
303