<?php

namespace dokuwiki\plugin\statistics;

/**
 * Extract search engine information from the HTTP referer
 *
 * We use the HTTP specification misspelling of "referer" here.
 *
 * The referer's host is first compared against a table of known search engines.
 * If none of them yields a query, a small set of common search parameter names
 * is tried and the engine identifier is derived from the host name instead.
 */
class SearchEngines
{
    /**
     * Search engine definitions with regex patterns and metadata
     *
     * Each 'regex' is written without delimiters and is matched against the
     * lower-cased referer host. The "(\.com?)?" part lets country-specific hosts
     * of both the "google.co.uk" and the "google.com.au" form match.
     * 'params' lists the URL parameters that may carry the search terms, in
     * order of preference.
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected static array $searchEngines = [
        'dokuwiki' => [
            'name' => 'DokuWiki Internal Search',
            'url' => DOKU_URL,
            'regex' => '', // set in constructor
            'params' => ['q'],
        ],
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.com?)?\.([a-z]{2,5})$',
            // 'text' is the parameter used by the Yandex web search pages
            'params' => ['query', 'text'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The search engine key */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Completes the 'dokuwiki' engine definition with a pattern for the wiki's
     * own host and then analyzes the given referer.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        // The referer host is lower-cased before matching, so the wiki's own host
        // has to be lower-cased too. The cast covers a DOKU_URL without a host,
        // for which parse_url() does not return a string.
        $ownHost = strtolower((string)parse_url(DOKU_URL, PHP_URL_HOST));
        self::$searchEngines['dokuwiki']['regex'] = '^' . preg_quote($ownHost, '/') . '$';

        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return $this->engine !== null;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier, as returned by getEngine()
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return self::$searchEngines[$engine]['name'] ?? ucfirst((string)$engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier, as returned by getEngine()
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return self::$searchEngines[$engine]['url'] ?? null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not recognized.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);
        if ($result === null) {
            return;
        }

        $this->engine = $result['engine'];
        $this->name = $result['name'];
        $this->query = $result['query'];
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * The whole URL is lower-cased first, so the returned query is lower case.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];

        // Fall back to the fragment when there is no usable query string. The
        // explicit check for '' is needed because PHP 8 reports a bare "?" as an
        // empty query instead of omitting the key.
        $queryString = $urlparts['query'] ?? '';
        if ($queryString === '') {
            $queryString = $urlparts['fragment'] ?? '';
        }
        if ($queryString === '') {
            return null;
        }

        parse_str($queryString, $params);

        // Known engines take precedence over the generic parameter guess
        return $this->matchKnownEngine($domain, $params)
            ?? $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * An engine only counts as matched when its host pattern matches and one of
     * its parameters holds a usable query; otherwise the next engine is tried.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            $query = $this->extractQuery($params, $engine['params']);
            if ($query !== null) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query,
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * The engine identifier is the host label left of the TLD, so
     * "search.example.co.uk" and "www.example.com.au" both give "example".
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);
        if ($query === null) {
            return null;
        }

        // Generate engine name from domain: drop the TLD (and a preceding
        // "co"/"com" label), then keep the right-most remaining label
        $stripped = preg_replace('/(\.com?)?\.([a-z]{2,5})$/', '', $domain) ?? $domain;
        $domainParts = explode('.', $stripped);
        $engineName = array_pop($domainParts);

        // A host such as ".com" leaves nothing to name the engine with
        if ($engineName === null || $engineName === '') {
            return null;
        }

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * Returns the first parameter from $paramNames that holds a non-empty query
     * after cleaning.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() produces arrays for "q[]=..." style parameters; those
            // are not search queries and must not reach cleanQuery()
            if (!is_string($value) || $value === '') {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query !== null) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if nothing is left after cleaning
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = (string)preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = (string)preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        // Compare against '' explicitly so that a literal "0" survives
        return $query === '' ? null : $query;
    }
}