xref: /plugin/statistics/SearchEngines.php (revision 06bd4382454fe99f88f03cf234b281e15118ab82)
1<?php
2
3namespace dokuwiki\plugin\statistics;
4
5/**
6 * Defines regular expressions for the most common search engines
7 */
class SearchEngines
{
    /**
     * Search engine definitions, keyed by an internal engine identifier.
     *
     * Each definition consists of:
     *  - name:   human readable engine name
     *  - url:    home page of the engine
     *  - regex:  pattern (without delimiters) matched against the lower-cased referrer host;
     *            an empty pattern means the engine is never detected by host name
     *  - params: query parameters that may carry the search terms, checked in order
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            // "(\.com?)?" covers google.com, google.co.uk and google.com.au style hosts
            'regex' => '^(\w+\.)*google(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            // "(\.com?)?" covers yandex.ru, yandex.com and yandex.com.tr style hosts
            'regex' => '^(\w+\.)*yandex(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /** @var string|null The referrer URL being analyzed */
    protected ?string $referrer = null;

    /** @var array|null Cached analysis result, null when the referrer is not a search engine */
    protected ?array $analysisResult = null;

    /** @var bool Whether the current referrer has been analyzed (also true for negative results) */
    protected bool $analyzed = false;

    /**
     * Register the wiki's own search as an engine and optionally set the referrer
     *
     * @param string|null $referrer The HTTP referrer URL to analyze, if already known
     */
    public function __construct(?string $referrer = null)
    {
        // Add the internal DokuWiki search engine; its empty regex keeps it out of host matching
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q']
        ];

        if ($referrer !== null) {
            $this->setReferrer($referrer);
        }
    }

    /**
     * Set the referrer URL to analyze
     *
     * Any previously cached analysis result is discarded.
     *
     * @param string $referrer The HTTP referrer URL
     */
    public function setReferrer(string $referrer): void
    {
        $this->referrer = $referrer;
        $this->analysisResult = null;
        $this->analyzed = false;
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if the referrer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        $this->analyze();
        return $this->analysisResult !== null;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        $this->analyze();
        return $this->analysisResult['name'] ?? null;
    }

    /**
     * Get the search engine URL
     *
     * @return string|null The URL of a known search engine; null if the referrer is not a
     *                     search engine or the engine was only detected generically
     */
    public function getUrl(): ?string
    {
        $this->analyze();
        if (!$this->analysisResult) {
            return null;
        }

        // generic engines ("generic_*" keys) have no definition and thus no URL
        $engineKey = $this->analysisResult['engine'];
        return $this->searchEngines[$engineKey]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        $this->analyze();
        return $this->analysisResult['query'] ?? null;
    }

    /**
     * Analyze the current referrer once and cache the outcome
     *
     * A dedicated flag is used instead of checking the result for null, so that
     * referrers which are not search engines are not parsed again on every accessor call.
     */
    protected function analyze(): void
    {
        if ($this->analyzed || $this->referrer === null) {
            return; // Already analyzed or no referrer set
        }

        $this->analysisResult = $this->analyzeReferrer($this->referrer);
        $this->analyzed = true;
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The whole URL is lower-cased before parsing, so the returned query is lower case, too.
     * Known engines are detected by host name. Any other host is treated as a generic search
     * engine when it carries one of a few common search parameters.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    public function analyzeReferrer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null; // seriously malformed or host-less URL
        }

        $domain = $urlparts['host'];
        $qpart = $urlparts['query'] ?? '';
        if (!$qpart && isset($urlparts['fragment'])) {
            $qpart = $urlparts['fragment']; // google does this
        }

        $params = [];
        if ($qpart) {
            parse_str($qpart, $params);
        }

        $query = '';
        $engineKey = '';
        $engineName = '';

        // check domain against known search engines, first match wins
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) continue; // skip engines without regex (like dokuwiki)

            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $engineKey = $key;
                $engineName = $engine['name'];
                $query = $this->firstParam($params, $engine['params']);
                break;
            }
        }

        // try some generic search engine parameters if no specific engine matched
        if (!$engineKey) {
            $query = $this->firstParam($params, ['search', 'query', 'q', 'keywords', 'keyword']);
            if ($query !== '') {
                // generate name from domain: strip the tld (including .co.xx and .com.xx
                // style suffixes) and use the last remaining host label
                $labels = explode('.', (string) preg_replace('/(\.com?)?\.([a-z]{2,5})$/', '', $domain));
                $engineName = array_pop($labels);
                $engineKey = 'generic_' . $engineName;
            }
        }

        // still no hit? not a search engine
        if (!$engineKey || !$query) {
            return null;
        }

        $query = $this->cleanQuery($query);
        if (!$query) {
            return null;
        }

        return [
            'engine' => $engineKey,
            'name' => $engineName,
            'query' => $query
        ];
    }

    /**
     * Return the first non-empty string value among the given parameter names
     *
     * parse_str() produces arrays for parameters like "q[]=foo". Those are ignored because
     * a search query has to be a plain string. As with empty(), the value "0" counts as empty.
     *
     * @param array $params Parsed query parameters
     * @param string[] $names Parameter names to check, in order of preference
     * @return string The first usable value or an empty string if there is none
     */
    protected function firstParam(array $params, array $names): string
    {
        foreach ($names as $name) {
            $value = $params[$name] ?? null;
            if (is_string($value) && !empty($value)) {
                return $value;
            }
        }
        return '';
    }

    /**
     * Remove non-search operators and superfluous whitespace from a query
     *
     * @param string $query The URL-decoded query
     * @return string The cleaned query, may be empty
     */
    protected function cleanQuery(string $query): string
    {
        // Drop a leading "cache:..." or "related:..." operator. The query is already URL-decoded
        // here, so the operator ends at the first whitespace (a decoded "+") or literal plus sign.
        $query = (string) preg_replace('/^(cache|related):[^\s+]+/', '', $query);
        $query = (string) preg_replace('/ +/', ' ', $query); // ws compact
        return trim($query);
    }

    /**
     * Get search engine information by key
     *
     * @param string $key The search engine key
     * @return array|null The search engine data or null if not found
     */
    public function getSearchEngine(string $key): ?array
    {
        return $this->searchEngines[$key] ?? null;
    }

    /**
     * Get all search engines
     *
     * @return array All search engine definitions
     */
    public function getAllSearchEngines(): array
    {
        return $this->searchEngines;
    }
}
287