xref: /plugin/statistics/SearchEngines.php (revision aecf8e88a958911ceea01f601a32b2453018b637)
1<?php
2
3namespace dokuwiki\plugin\statistics;
4
/**
 * Recognises search engine referrers and extracts the search query from them
 *
 * A referrer URL is passed to the constructor and analysed immediately. The
 * result is then available through isSearchEngine(), getName(), getUrl() and
 * getQuery().
 *
 * The referrer is lower-cased as a whole before it is parsed, so host names
 * and parameter names match case-insensitively and the extracted query is
 * always returned in lower case.
 */
class SearchEngines
{
    /**
     * Known search engines, keyed by an internal engine key
     *
     * Each entry holds:
     *  - name:   the display name
     *  - url:    the home page of the engine
     *  - regex:  pattern (without delimiters) matched against the referrer host
     *  - params: URL parameters that may carry the search query, in order of preference
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string The referrer URL being analyzed */
    protected string $referrer;

    /** @var bool Whether the referrer is from a search engine */
    protected bool $isSearchEngine = false;

    /** @var string|null The search engine display name, null if no engine was detected */
    protected ?string $engineName = null;

    /** @var string|null The search engine key, null if no engine was detected */
    protected ?string $engineKey = null;

    /** @var string|null The cleaned search query, null if no engine was detected */
    protected ?string $query = null;

    /**
     * Register the internal DokuWiki search and analyze the given referrer
     *
     * @param string $referrer The HTTP referrer URL to analyze
     */
    public function __construct(string $referrer)
    {
        // The internal search has no host regex, so matchKnownEngine() never
        // selects it; the entry only provides name/url/params metadata.
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q'],
        ];

        $this->referrer = $referrer;
        $this->analyze();
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if a search engine and a non-empty query were detected
     */
    public function isSearchEngine(): bool
    {
        return $this->isSearchEngine;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        return $this->engineName;
    }

    /**
     * Get the search engine URL
     *
     * Only engines listed in $searchEngines have a URL. Engines that were
     * detected generically (key prefixed with "generic_") yield null.
     *
     * @return string|null The search engine URL or null if unknown
     */
    public function getUrl(): ?string
    {
        if (!$this->engineKey) {
            return null;
        }

        return $this->searchEngines[$this->engineKey]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The lower-cased search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Analyze the referrer and populate member variables
     *
     * Leaves all members at their defaults when no search engine is detected.
     */
    protected function analyze(): void
    {
        $result = $this->analyzeReferrer($this->referrer);
        if (!$result) {
            return;
        }

        $this->isSearchEngine = true;
        $this->engineKey = $result['engine'];
        $this->engineName = $result['name'];
        $this->query = $result['query'];
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * Parameters are read from the query string, or from the fragment when
     * the URL carries no (or an empty) query string.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzeReferrer(string $referer): ?array
    {
        // Lower-casing the whole URL makes host and parameter names match
        // case-insensitively; as a side effect the query is lower-cased too.
        $urlparts = parse_url(strtolower($referer));
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null; // malformed URL or no host to match against
        }

        $domain = $urlparts['host'];

        // parse_url() may report an empty query as '' instead of omitting it,
        // so fall back to the fragment for both the absent and the empty case.
        $queryString = $urlparts['query'] ?? '';
        if ($queryString === '') {
            $queryString = $urlparts['fragment'] ?? '';
        }

        if (!$queryString) {
            return null;
        }

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first engine whose host regex matches AND that yields a non-empty
     * query wins. A matching host without a usable query does not stop the
     * search through the remaining engines.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            $query = $this->extractQuery($params, $engine['params']);
            if ($query) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query,
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Any host is accepted as long as one of the common search parameters
     * carries a query. The engine name is derived from the host: the trailing
     * TLD (optionally preceded by ".co") is stripped and the last remaining
     * label is used, e.g. "search.example.org" becomes "example".
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain
        $baseDomain = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        // explode() always returns at least one element. The result is stored
        // in a variable because end()/array_pop() take their argument by
        // reference and raise a notice when given a function result directly.
        $labels = explode('.', $baseDomain);
        $engineName = end($labels);

        return [
            'engine' => 'generic_' . $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * Returns the first parameter from $paramNames that holds a usable query.
     *
     * @param array $params URL parameters as produced by parse_str()
     * @param array $paramNames Parameter names to check, in order of preference
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() turns "q[]=foo" style parameters into arrays; those
            // are not search queries and must not reach cleanQuery(string).
            if (!is_string($value) || $value === '') {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if nothing usable remains
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries
        $query = preg_replace('/^(cache|related):[^\+]+/', '', $query);
        // Compact whitespace
        $query = preg_replace('/ +/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}
301