<?php

namespace dokuwiki\plugin\statistics;

/**
 * Extract search engine information from the HTTP referer
 *
 * We use the HTTP specification misspelling of "referer" here
 *
 * The referer is analyzed once in the constructor. The detected engine
 * identifier and the cleaned search query are then available through
 * the getters.
 */
class SearchEngines
{
    /**
     * Search engine definitions, keyed by engine identifier
     *
     * Each entry has:
     *  - 'name':   human readable engine name
     *  - 'url':    home page of the engine
     *  - 'regex':  pattern (without delimiters) that is matched against the lowercased referer host
     *  - 'params': URL parameter names that may carry the search query, in order of preference
     *
     * @var array
     */
    protected static array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'https://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'https://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'https://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'https://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'https://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'https://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'https://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'https://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'https://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'https://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'https://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'ecosia' => [
            'name' => 'Ecosia',
            'url' => 'https://www.ecosia.org',
            'regex' => '^(\w+\.)*ecosia\.org$',
            'params' => ['q']
        ],
        'qwant' => [
            'name' => 'Qwant',
            'url' => 'https://www.qwant.com',
            'regex' => '^(\w+\.)*qwant\.com$',
            'params' => ['q']
        ],
        'avg' => [
            'name' => 'AVG Safe Search',
            'url' => 'https://search.avg.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ],
        'brave' => [
            'name' => 'Brave Search',
            'url' => 'https://search.brave.com',
            'regex' => '^(\w+\.)*search\.brave\.com$',
            'params' => ['q']
        ],
    ];

    /** @var string|null The search engine key, null if the referer is no search engine */
    protected ?string $engine = null;

    /** @var string|null The search engine name, null if the referer is no search engine */
    protected ?string $name = null;

    /** @var string|null The cleaned search query, null if none could be extracted */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Immediately analyzes the given referer and populates the member variables.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return (bool)$this->engine;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if not a search engine or no query was found
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier (a key of the known engines or a generic one)
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['name'] : ucwords($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['url'] : null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * Leaves all members at null when the referer is not recognized as a search engine.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);

        if ($result) {
            $this->engine = $result['engine'];
            $this->name = $result['name'];
            $this->query = $result['query'];
        }
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * The whole URL is lowercased before parsing, so the returned query is lowercase
     * (for ASCII characters) as well.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        // parse_url() returns false on seriously malformed URLs
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // some engines pass the query in the fragment instead of the query string
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first engine whose host pattern matches wins, even when no query could be
     * extracted from the parameters ('query' is null in that case).
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $query = $this->extractQuery($params, $engine['params']);
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Any host is treated as a search engine when one of the common search
     * parameters carries a non-empty query. The engine identifier is derived
     * from the host name.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        // explicit null check: a query of "0" is a valid search term
        if ($query === null) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (and optional .co), keep the last label
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * Returns the first parameter (in the order of $paramNames) that yields a
     * non-empty query after cleaning.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() produces arrays for "q[]=..." style parameters. Those are
            // not usable as a search query and must not be passed to cleanQuery()
            if (!is_string($value) || $value === '') {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query !== null) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if nothing is left after cleaning
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        // compare against the empty string so that a query of "0" is kept
        return $query === '' ? null : $query;
    }
}