<?php

namespace dokuwiki\plugin\statistics;

/**
 * Detects whether a referrer URL comes from a search engine and extracts the search query
 *
 * The referrer's host name is matched against a list of known search engines. If none of
 * them matches, a set of commonly used query parameter names is tried so that unknown
 * engines can still be recognized. The analysis runs once in the constructor; the result
 * is available through the getter methods afterwards.
 */
class SearchEngines
{
    /**
     * Search engine definitions, indexed by engine key
     *
     * 'regex' is a PCRE pattern without delimiters that is matched against the lowercased
     * host name of the referrer. 'params' lists the query parameters that may carry the
     * search term, in the order they are tried.
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string The referrer URL being analyzed */
    protected string $referrer;

    /** @var bool Whether the referrer is from a search engine */
    protected bool $isSearchEngine = false;

    /** @var string|null The search engine name */
    protected ?string $engineName = null;

    /** @var string|null The search engine key */
    protected ?string $engineKey = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Register the internal DokuWiki search and analyze the given referrer
     *
     * @param string $referrer The referrer URL to analyze
     */
    public function __construct(string $referrer)
    {
        // Add the internal DokuWiki search engine. Its regex is empty, so it is never
        // matched by matchKnownEngine(); the entry only provides name and URL metadata.
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q'],
        ];

        $this->referrer = $referrer;
        $this->analyze();
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if the referrer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return $this->isSearchEngine;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        return $this->engineName;
    }

    /**
     * Get the search engine URL
     *
     * Only engines from the definition list have a URL. Generically detected engines
     * (keys starting with "generic_") are not in that list and yield null.
     *
     * @return string|null The search engine URL or null if none is known
     */
    public function getUrl(): ?string
    {
        if (!$this->engineKey) {
            return null;
        }

        return $this->searchEngines[$this->engineKey]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Analyze the referrer and populate member variables
     *
     * The members keep their defaults when the referrer is not recognized.
     */
    protected function analyze(): void
    {
        $result = $this->analyzeReferrer($this->referrer);
        if (!$result) {
            return;
        }

        $this->isSearchEngine = true;
        $this->engineKey = $result['engine'];
        $this->engineName = $result['name'];
        $this->query = $result['query'];
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The whole URL is lowercased before parsing, so the returned query is lowercase too.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzeReferrer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // Fall back to the fragment when the URL has no query string
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        if (!$queryString) {
            return null;
        }

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first engine whose host pattern matches and that yields a usable query wins.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            $query = $this->extractQuery($params, $engine['params']);
            if ($query) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query,
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Any host is accepted as long as one of the common search parameters carries a
     * usable query. The engine name is derived from the host name.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain: drop the TLD (and an optional ".co" before it),
        // then keep the last remaining label, e.g. "search.example.co.uk" -> "example"
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        // array_pop() takes its argument by reference, so it needs a real variable
        $labels = explode('.', $engineName);
        $engineName = array_pop($labels);

        return [
            'engine' => 'generic_' . $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * The parameter names are tried in the given order; the first one that yields a
     * non-empty cleaned query is used.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() produces arrays for bracketed parameters such as "q[]=foo";
            // those are no search terms and must not reach cleanQuery(string)
            if (!is_string($value) || $value === '') {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries
        // NOTE(review): values arriving here have usually been decoded by parse_str(),
        // where "+" separators have already become spaces. The pattern may therefore strip
        // the rest of the query as well. Confirm this is intended.
        $query = preg_replace('/^(cache|related):[^\+]+/', '', $query);
        // Compact whitespace
        $query = preg_replace('/ +/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}