xref: /plugin/statistics/SearchEngines.php (revision a171b9c797be51c8fdf6d3246181382ad1aedc5b)
1<?php
2
3namespace dokuwiki\plugin\statistics;
4
/**
 * Registry of well-known search engines and referrer analysis
 *
 * Holds host name patterns and query parameter names for the most common
 * search engines and uses them to extract the search terms a visitor used
 * from an HTTP referrer URL.
 */
class SearchEngines
{
    /**
     * Search engine definitions, indexed by engine key
     *
     * Every entry consists of:
     *  - name:   display name of the engine
     *  - url:    home page of the engine
     *  - regex:  PCRE pattern (without delimiters) that is matched against the lower-cased
     *            referrer host; engines with an empty pattern are skipped during analysis
     *  - params: names of the query parameters that may carry the search terms,
     *            checked in the given order
     *
     * @var array Search engine definitions with regex patterns and metadata
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /**
     * Register the wiki's own search as an additional engine
     *
     * The entry has an empty regex, so it is never detected by analyzeReferrer();
     * it is only available through getSearchEngine() and getAllSearchEngines().
     */
    public function __construct()
    {
        // Add the internal DokuWiki search engine
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q']
        ];
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The referrer is lower-cased before it is parsed, so the returned query is
     * always lower case. The host is checked against the known engines first; if
     * none matches, a handful of generic parameter names is tried and the engine
     * name is derived from the host.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    public function analyzeReferrer(string $referer): ?array
    {
        $referer = strtolower($referer);

        // parse the referer; seriously malformed URLs make parse_url() return false
        $urlparts = parse_url($referer);
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        $qpart = $urlparts['query'] ?? '';
        if (!$qpart && isset($urlparts['fragment'])) {
            $qpart = $urlparts['fragment']; // google does this
        }

        $params = [];
        if ($qpart) {
            parse_str($qpart, $params);
        }

        $query = '';
        $engineKey = '';
        $engineName = '';

        // check domain against known search engines, first match wins
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) continue; // skip engines without regex (like dokuwiki)
            if (!preg_match('/' . $engine['regex'] . '/', $domain)) continue;

            $engineKey = $key;
            $engineName = $engine['name'];
            $query = $this->firstQueryValue($params, $engine['params']);
            break;
        }

        // try some generic search engine parameters if no specific engine matched
        if (!$engineKey) {
            $query = $this->firstQueryValue($params, ['search', 'query', 'q', 'keywords', 'keyword']);
            if ($query !== '') {
                // generate name from domain: strip the tld, keep the last remaining label
                $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
                $engineName = explode('.', $engineName);
                $engineName = array_pop($engineName);
                $engineKey = 'generic_' . $engineName;
            }
        }

        // still no hit? not a search engine
        if (!$engineKey || $query === '') {
            return null;
        }

        // clean the query
        // Drop a leading cache:/related: operator together with its argument. parse_str() has
        // already decoded "+" to a space, so the argument ends at whitespace (or a literal "+");
        // stopping only at "+" would swallow the actual search terms that follow it.
        $query = preg_replace('/^(cache|related):[^\s+]+/', '', $query) ?? '';
        $query = preg_replace('/ +/', ' ', $query) ?? ''; // ws compact
        $query = trim($query);

        if (!$query) {
            return null;
        }

        return [
            'engine' => $engineKey,
            'name' => $engineName,
            'query' => $query
        ];
    }

    /**
     * Get search engine information by key
     *
     * @param string $key The search engine key
     * @return array|null The search engine data or null if not found
     */
    public function getSearchEngine(string $key): ?array
    {
        return $this->searchEngines[$key] ?? null;
    }

    /**
     * Get all search engines
     *
     * @return array All search engine definitions
     */
    public function getAllSearchEngines(): array
    {
        return $this->searchEngines;
    }

    /**
     * Pick the first usable value among the given query parameters
     *
     * Only non-empty string values count. parse_str() turns parameters written as
     * "q[]=..." or "q[a]=..." into arrays; those come from a referrer we do not control
     * and cannot be treated as search terms, so they are skipped.
     *
     * @param array $params Parsed query parameters as produced by parse_str()
     * @param string[] $names Parameter names to check, in order of preference
     * @return string The first matching value or an empty string if there is none
     */
    private function firstQueryValue(array $params, array $names): string
    {
        foreach ($names as $name) {
            $value = $params[$name] ?? null;
            if (is_string($value) && !empty($value)) {
                return $value;
            }
        }

        return '';
    }
}
205