<?php

namespace dokuwiki\plugin\statistics;

/**
 * Extract search engine information from the HTTP referer
 *
 * We use the HTTP specification misspelling of "referer" here
 */
class SearchEngines
{
    /**
     * Search engine definitions with regex patterns and metadata
     *
     * The definitions are tried in order and the first one whose regex matches the
     * (lower-cased) referer host wins. Each entry has:
     *  - name:   human readable engine name
     *  - url:    home page of the engine
     *  - regex:  PCRE pattern without delimiters, applied to the host name only
     *  - params: URL parameter names that may carry the search query, tried in order
     *
     * @var array
     */
    protected static array $searchEngines = [
        'dokuwiki' => [
            'name' => 'DokuWiki Internal Search',
            'url' => DOKU_URL,
            'regex' => '', // set in constructor
            'params' => ['q'],
        ],
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            // (\.com?)? covers both google.co.uk and google.com.br style domains
            'regex' => '^(\w+\.)*google(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            // (\.com?)? covers both yandex.co.xx and yandex.com.tr style domains
            'regex' => '^(\w+\.)*yandex(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'ecosia' => [
            'name' => 'Ecosia',
            'url' => 'https://www.ecosia.org',
            'regex' => '^(\w+\.)*ecosia\.org$',
            'params' => ['q'],
        ],
        'qwant' => [
            'name' => 'Qwant',
            'url' => 'https://www.qwant.com',
            'regex' => '^(\w+\.)*qwant\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The search engine key */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Registers the pattern for the wiki's own host and immediately analyzes the referer.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        // Add regex matching ourselves. The referer host is lower-cased before it is
        // matched (case-sensitively), so our own host has to be lower-cased as well.
        // The string cast covers a DOKU_URL from which no host can be parsed.
        $ownHost = strtolower((string) parse_url(DOKU_URL, PHP_URL_HOST));
        self::$searchEngines['dokuwiki']['regex'] = '^' . preg_quote($ownHost, '/') . '$';

        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return (bool) $this->engine;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if none could be extracted
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier as returned by getEngine()
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        if (isset(self::$searchEngines[$engine])) {
            return self::$searchEngines[$engine]['name'];
        }

        return ucwords($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier as returned by getEngine()
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        if (isset(self::$searchEngines[$engine])) {
            return self::$searchEngines[$engine]['url'];
        }

        return null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is no search engine.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);
        if (!$result) {
            return;
        }

        $this->engine = $result['engine'];
        $this->name = $result['name'];
        $this->query = $result['query'];
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * The whole URL is lower-cased first, so host matching is case-insensitive and
     * the returned query is lower-cased as well.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // Parameters are read from the query string; the fragment is only used when there is no query string
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Try to match against known search engines first, then fall back to generic search parameters
        return $this->matchKnownEngine($domain, $params)
            ?? $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * A matching host is reported as search engine even when no query could be extracted.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            return [
                'engine' => $key,
                'name' => $engine['name'],
                'query' => $this->extractQuery($params, $engine['params']),
            ];
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Any host counts as a search engine here as long as one of the common search
     * parameters carries a query. The engine identifier is derived from the host name.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (including .co.xx and .com.xx)
        // and use the right-most remaining label
        $engineName = preg_replace('/(\.com?)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * The parameter names are tried in order; the first one yielding a non-empty
     * cleaned query wins.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            if (empty($params[$param])) {
                continue;
            }

            // parse_str() returns arrays for parameters like "q[]=foo". The referer is
            // untrusted input, so such values must not reach the string-typed cleanQuery().
            if (!is_string($params[$param])) {
                continue;
            }

            $query = $this->cleanQuery($params[$param]);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if nothing usable is left
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}