xref: /plugin/statistics/SearchEngines.php (revision a73005f250d1a2d2de9eb3e2d23ac62fd0fe3095)
1<?php
2
3namespace dokuwiki\plugin\statistics;
4
/**
 * Detects search engine referrers and extracts the search query from them
 *
 * The class holds host name patterns for the most common search engines and
 * falls back to a list of generic query parameter names for unknown hosts.
 */
class SearchEngines
{
    /**
     * @var array Search engine definitions keyed by engine id. Each entry has the keys
     *            'name' (display name), 'url' (engine home page), 'regex' (pattern matched
     *            against the referrer host, without delimiters) and 'params' (query
     *            parameter names that may carry the search term, checked in order)
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /** @var string|null The referrer URL being analyzed */
    protected ?string $referrer = null;

    /** @var array|null Cached analysis result, null if the referrer is not a search engine */
    protected ?array $analysisResult = null;

    /** @var bool Whether the referrer has already been analyzed (also caches negative results) */
    protected bool $analysisDone = false;

    /**
     * Constructor
     *
     * Registers an additional 'dokuwiki' entry for the wiki's internal search. Its
     * regex is empty, so it is never matched against a referrer host.
     *
     * @param string|null $referrer The referrer URL to analyze, null to analyze nothing
     */
    public function __construct(?string $referrer = null)
    {
        // Add the internal DokuWiki search engine
        // NOTE(review): wl() is not defined in this file - presumably DokuWiki's link helper
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q']
        ];

        if ($referrer !== null) {
            $this->referrer = $referrer;
        }
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if the referrer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return $this->getAnalysis() !== null;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        $analysis = $this->getAnalysis();
        return $analysis['name'] ?? null;
    }

    /**
     * Get the search engine URL
     *
     * Only engines from the definition list have a URL; generically detected
     * engines yield null.
     *
     * @return string|null The search engine URL or null if unknown or not a search engine
     */
    public function getUrl(): ?string
    {
        $analysis = $this->getAnalysis();
        if (!$analysis) {
            return null;
        }

        return $this->searchEngines[$analysis['engine']]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        $analysis = $this->getAnalysis();
        return $analysis['query'] ?? null;
    }

    /**
     * Get or perform analysis of the current referrer
     *
     * The referrer is analyzed at most once; the outcome is cached even when the
     * referrer turns out not to be a search engine.
     *
     * @return array|null Analysis result or null if not a search engine
     */
    protected function getAnalysis(): ?array
    {
        if (!$this->analysisDone && $this->referrer !== null) {
            $this->analysisResult = $this->analyzeReferrer($this->referrer);
            $this->analysisDone = true;
        }

        return $this->analysisResult;
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The whole referrer is lowercased before parsing, so the returned query is
     * lowercase as well. When the URL has no query string, the fragment is
     * parsed as the query string instead.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzeReferrer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        if (!$queryString) {
            return null;
        }

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * An engine only counts as matched when its host pattern matches and one of
     * its query parameters carries a usable query; otherwise the next engine is tried.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $query = $this->extractQuery($params, $engine['params']);
                if ($query) {
                    return [
                        'engine' => $key,
                        'name' => $engine['name'],
                        'query' => $query
                    ];
                }
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * The engine name is derived from the domain: the TLD (optionally preceded
     * by ".co") is stripped and the last remaining label is used.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        // array_pop() takes its argument by reference, so it needs a real variable
        $labels = explode('.', $engineName);
        $engineName = array_pop($labels);

        return [
            'engine' => 'generic_' . $engineName,
            'name' => ucfirst($engineName),
            'query' => $query
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * Parameter names are checked in the given order; the first one yielding a
     * non-empty cleaned query wins.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            if (empty($params[$param])) {
                continue;
            }

            // parse_str() yields nested arrays for parameters like "q[]=foo";
            // those are not usable as a query and must not reach cleanQuery()
            if (!is_string($params[$param])) {
                continue;
            }

            $query = $this->cleanQuery($params[$param]);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries
        $query = preg_replace('/^(cache|related):[^\+]+/', '', $query);
        // Compact whitespace
        $query = preg_replace('/ +/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }

}
295