<?php

namespace dokuwiki\plugin\statistics;

/**
 * Extract search engine information from the HTTP referer
 *
 * An instance is built from a single referer URL. The URL's host is matched
 * against a table of known search engines; when none matches, the URL is
 * still treated as a search referer if it carries one of a few common query
 * parameter names.
 *
 * We use the HTTP specification misspelling of "referer" here
 */
class SearchEngines
{
    /**
     * Search engine definitions with regex patterns and metadata
     *
     * Each entry is keyed by the engine identifier and contains:
     *  - name:   display name
     *  - url:    home page of the engine
     *  - regex:  pattern (without delimiters) matched against the referer host
     *  - params: query parameter names that may hold the search terms, in priority order
     *
     * @var array
     */
    protected static array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'ecosia' => [
            'name' => 'Ecosia',
            'url' => 'https://www.ecosia.org',
            'regex' => '^(\w+\.)*ecosia\.org$',
            'params' => ['q'],
        ],
        'qwant' => [
            'name' => 'Qwant',
            'url' => 'https://www.qwant.com',
            'regex' => '^(\w+\.)*qwant\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The search engine key */
    protected ?string $engine = null;

    /** @var string|null The search engine name (set by analyze(); no getter in this class) */
    protected ?string $name = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Immediately analyzes the given referer; the results are available
     * through the getters afterwards.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return (bool)$this->engine;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier (a key of the known engines or a generic one)
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['name'] : ucwords($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['url'] : null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * Leaves all members at null when the referer is not recognized as a search.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);

        if ($result) {
            $this->engine = $result['engine'];
            $this->name = $result['name'];
            $this->query = $result['query'];
        }
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * The whole URL is lower-cased before parsing, so the returned query is
     * lower case as well. When the URL has no query string, its fragment is
     * parsed for parameters instead.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first engine whose host pattern matches wins. A match is reported
     * even when no query could be extracted ('query' is null then).
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $query = $this->extractQuery($params, $engine['params']);
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query,
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Unlike known engines, a generic match requires a non-empty query.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (and an optional
        // ".co" before it), then keep the last remaining label
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * Parameter names are tried in the given order; the first one yielding a
     * non-empty cleaned query wins.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() turns bracketed parameters ("q[]=foo") into arrays.
            // The referer is untrusted input, so ignore such values instead of
            // passing them to cleanQuery(), which only accepts strings.
            if (!is_string($value) || empty($value)) {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}