<?php

namespace dokuwiki\plugin\statistics;

/**
 * Extract search engine information from the HTTP referer
 *
 * We use the HTTP specification misspelling of "referer" here
 */
class SearchEngines
{
    /**
     * @var array Search engine definitions, keyed by engine identifier.
     *
     * Each entry holds:
     *  - 'name'   human readable engine name
     *  - 'url'    home page of the engine
     *  - 'regex'  pattern (without delimiters) matched against the referer's host name
     *  - 'params' URL parameter names that may carry the search query, in priority order
     */
    protected static array $searchEngines = [
        'dokuwiki' => [
            'name' => 'DokuWiki Internal Search',
            'url' => DOKU_URL,
            'regex' => '', // set in constructor
            'params' => ['q']
        ],
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /** @var string|null The search engine key */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Registers the wiki's own host as the "dokuwiki" engine and analyzes the given referer.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        // Add regex matching ourselves. parse_url() returns null/false when DOKU_URL has no
        // usable host; the cast avoids handing a non-string to preg_quote().
        $ownHost = (string)parse_url(DOKU_URL, PHP_URL_HOST);
        self::$searchEngines['dokuwiki']['regex'] = '^' . preg_quote($ownHost, '/') . '$';
        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return (bool)$this->engine;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if no query could be extracted
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['name'] : ucwords($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['url'] : null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not recognized.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);

        if ($result) {
            $this->engine = $result['engine'];
            $this->name = $result['name'];
            $this->query = $result['query'];
        }
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * The referer is lower-cased before parsing, so extracted queries are lower case.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // Fall back to the fragment when the URL carries no query string
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first engine whose regex matches the domain wins. A match is reported even
     * when no query could be extracted; 'query' is null in that case.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $query = $this->extractQuery($params, $engine['params']);
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Unlike known engines, an unknown domain is only reported when a query was found.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (and an optional ".co"),
        // then use the last remaining label
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * Parameter names are tried in the given order; the first one yielding a
     * non-empty cleaned query wins.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            if (empty($params[$param])) {
                continue;
            }

            // parse_str() produces arrays for bracketed parameters such as "q[]=foo".
            // The referer is untrusted input, so skip anything that is not a plain string
            // instead of letting cleanQuery() raise a TypeError.
            if (!is_string($params[$param])) {
                continue;
            }

            $query = $this->cleanQuery($params[$param]);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}