<?php

namespace dokuwiki\plugin\statistics;

/**
 * Defines regular expressions for the most common search engines
 *
 * Given an HTTP referrer, this class decides whether the visit came from a
 * search engine and, if so, extracts the engine and the search terms.
 * Hosts that match no known engine are still treated as a (generic) search
 * engine when the URL carries a typical search parameter.
 */
class SearchEngines
{
    /**
     * @var array Search engine definitions keyed by engine id. Each entry holds:
     *  - name:   display name of the engine
     *  - url:    home page of the engine
     *  - regex:  PCRE pattern (without delimiters) matched against the lower-cased referrer host;
     *            an empty pattern means the engine is never matched by host
     *  - params: query parameters that are checked, in order, for the search terms
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            // (\.com?)? covers both google.co.uk and google.com.br style hosts
            'regex' => '^(\w+\.)*google(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The referrer URL being analyzed */
    protected ?string $referrer = null;

    /** @var array|null Cached analysis result, null when the referrer is not a search engine */
    protected ?array $analysisResult = null;

    /** @var bool Whether the current referrer has been analyzed (also true for negative results) */
    protected bool $analyzed = false;

    /**
     * @param string|null $referrer Optional HTTP referrer URL to analyze
     */
    public function __construct(?string $referrer = null)
    {
        // Add the internal DokuWiki search engine; its empty regex keeps it out of host matching
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q'],
        ];

        if ($referrer !== null) {
            $this->setReferrer($referrer);
        }
    }

    /**
     * Set the referrer URL to analyze
     *
     * Any previously cached analysis is discarded.
     *
     * @param string $referrer The HTTP referrer URL
     */
    public function setReferrer(string $referrer): void
    {
        $this->referrer = $referrer;
        // Clear cache
        $this->analysisResult = null;
        $this->analyzed = false;
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if the referrer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        $this->analyze();
        return $this->analysisResult !== null;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        $this->analyze();
        return $this->analysisResult['name'] ?? null;
    }

    /**
     * Get the search engine URL
     *
     * Only known engines have a URL; generic (guessed) engines yield null.
     *
     * @return string|null The search engine URL or null if not a (known) search engine
     */
    public function getUrl(): ?string
    {
        $this->analyze();
        if (!$this->analysisResult) {
            return null;
        }

        $engineKey = $this->analysisResult['engine'];
        return $this->searchEngines[$engineKey]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        $this->analyze();
        return $this->analysisResult['query'] ?? null;
    }

    /**
     * Analyze the current referrer
     *
     * The outcome is cached until the referrer changes. Negative outcomes are
     * cached as well so repeated getter calls do not re-parse the URL.
     */
    protected function analyze(): void
    {
        if ($this->analyzed || $this->referrer === null) {
            return; // Already analyzed or no referrer set
        }

        $this->analysisResult = $this->analyzeReferrer($this->referrer);
        $this->analyzed = true;
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The whole URL is lower-cased before parsing, so the returned query is
     * lower-case as well.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    public function analyzeReferrer(string $referer): ?array
    {
        $referer = strtolower($referer);

        // parse the referer
        $urlparts = parse_url($referer);
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        $qpart = $urlparts['query'] ?? '';
        if (!$qpart && isset($urlparts['fragment'])) {
            $qpart = $urlparts['fragment']; // google does this
        }

        $params = [];
        if ($qpart) {
            parse_str($qpart, $params);
        }

        $query = '';
        $engineKey = '';
        $engineName = '';

        // check domain against known search engines, the first matching engine wins
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $engineKey = $key;
                $engineName = $engine['name'];
                $query = $this->findQueryParameter($params, $engine['params']);
                break;
            }
        }

        // try some generic search engine parameters if no specific engine matched
        if (!$engineKey) {
            $query = $this->findQueryParameter($params, ['search', 'query', 'q', 'keywords', 'keyword']);
            if ($query !== '') {
                // generate name from domain: strip the tld, then keep the last remaining label
                $labels = explode('.', preg_replace('/(\.com?)?\.([a-z]{2,5})$/', '', $domain));
                $engineName = array_pop($labels);
                $engineKey = 'generic_' . $engineName;
            }
        }

        // still no hit? not a search engine
        if (!$engineKey || !$query) {
            return null;
        }

        $query = $this->cleanQuery($query);
        if (!$query) {
            return null;
        }

        return [
            'engine' => $engineKey,
            'name' => $engineName,
            'query' => $query,
        ];
    }

    /**
     * Return the first non-empty string value among the given parameter names
     *
     * @param array $params Parsed URL parameters as produced by parse_str()
     * @param string[] $candidates Parameter names to check, in order of preference
     * @return string The raw query or an empty string if none was found
     */
    protected function findQueryParameter(array $params, array $candidates): string
    {
        foreach ($candidates as $param) {
            $value = $params[$param] ?? null;
            // parse_str() turns "q[]=x" into an array; those cannot be used as a query
            if (is_string($value) && !empty($value)) {
                return $value;
            }
        }
        return '';
    }

    /**
     * Strip non-search operators and normalize whitespace in a query
     *
     * @param string $query The raw (already URL-decoded) query
     * @return string The cleaned query, may be empty
     */
    protected function cleanQuery(string $query): string
    {
        // Drop a leading "cache:<url>" / "related:<url>" operator. parse_str() already decoded
        // "+" to spaces, so the operator's argument ends at the first whitespace (or literal "+").
        $query = preg_replace('/^(cache|related):[^\s+]+/', '', $query); // non-search queries
        $query = preg_replace('/ +/', ' ', $query); // ws compact
        return trim($query);
    }

    /**
     * Get search engine information by key
     *
     * @param string $key The search engine key
     * @return array|null The search engine data or null if not found
     */
    public function getSearchEngine(string $key): ?array
    {
        return $this->searchEngines[$key] ?? null;
    }

    /**
     * Get all search engines
     *
     * @return array All search engine definitions
     */
    public function getAllSearchEngines(): array
    {
        return $this->searchEngines;
    }
}