xref: /plugin/statistics/SearchEngines.php (revision 45f4cdff9eeee357a9c7da871bb438b139ad9748)
1<?php
2
3namespace dokuwiki\plugin\statistics;
4
/**
 * Extract search engine information from the HTTP referer
 *
 * We use the HTTP specification misspelling of "referer" here
 */
class SearchEngines
{
    /**
     * @var array Search engine definitions, keyed by engine identifier. Each entry holds:
     *   - 'name'   display name of the engine
     *   - 'url'    home page of the engine
     *   - 'regex'  PCRE (without delimiters) matched against the lower-cased referer host
     *   - 'params' URL parameter names that may carry the search query, checked in order
     */
    protected static array $searchEngines = [
        'dokuwiki' => [
            'name' => 'DokuWiki Internal Search',
            'url' => DOKU_URL,
            'regex' => '', // set in constructor
            'params' => ['q']
        ],
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            // (\.com?)? covers both "google.co.uk" and "google.com.au" style country domains
            'regex' => '^(\w+\.)*google(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            // (\.com?)? covers both "yandex.co.il" and "yandex.com.tr" style country domains
            'regex' => '^(\w+\.)*yandex(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'ecosia' => [
            'name' => 'Ecosia',
            'url' => 'https://www.ecosia.org',
            'regex' => '^(\w+\.)*ecosia\.org$',
            'params' => ['q']
        ],
        'qwant' => [
            'name' => 'Qwant',
            'url' => 'https://www.qwant.com',
            'regex' => '^(\w+\.)*qwant\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /** @var string|null The search engine key */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Registers the host of DOKU_URL as the pattern for the internal "dokuwiki" engine and
     * then analyzes the given referer.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        // Add regex matching ourselves. The referer host is lower-cased before it is matched,
        // so the configured host has to be lower-cased too. The cast guards against parse_url()
        // returning null/false when DOKU_URL carries no usable host.
        $ownHost = strtolower((string) parse_url(DOKU_URL, PHP_URL_HOST));
        self::$searchEngines['dokuwiki']['regex'] = '^' . preg_quote($ownHost, '/') . '$';

        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return (bool) $this->engine;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if none could be extracted
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier, as returned by getEngine()
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        if (isset(self::$searchEngines[$engine])) {
            return self::$searchEngines[$engine]['name'];
        }

        // the parameter is untyped for backward compatibility; cast so ucwords() never sees null
        return ucwords((string) $engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier, as returned by getEngine()
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['url'] : null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not recognized.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);

        if ($result) {
            $this->engine = $result['engine'];
            $this->name = $result['name'];
            $this->query = $result['query'];
        }
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * The whole URL is lower-cased first, so both the host and the extracted query are
     * returned in lower case.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        // parse_url() returns false for seriously malformed URLs
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // fall back to the fragment when the URL has no query string at all
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first definition whose regex matches the domain wins, even when no query could be
     * extracted from the parameters (the 'query' entry is null in that case).
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $query = $this->extractQuery($params, $engine['params']);
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Any site that passes a non-empty value in one of the usual search parameters is treated
     * as a search engine; its identifier is derived from the domain.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        // strict check: "0" is a valid search query
        if ($query === null) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (including a ".co"/".com" second level
        // as in "example.co.uk" or "example.com.au") and use the last remaining label
        $engineName = (string) preg_replace('/(\.com?)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check, in order of priority
     * @return string|null Cleaned query of the first usable parameter or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() produces arrays for "q[]=..." style parameters; only plain,
            // non-empty strings can be a search query
            if (!is_string($value) || $value === '') {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query !== null) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if nothing is left after cleaning
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        // preg_replace() returns null on PCRE errors, hence the casts
        $query = (string) preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = (string) preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        return $query === '' ? null : $query;
    }
}
308