<?php

namespace dokuwiki\plugin\statistics;

/**
 * Defines regular expressions for the most common search engines
 *
 * Given a referrer URL, an instance determines whether the referrer looks like
 * a search engine result page and, if so, which engine it was and what the
 * visitor searched for.
 */
class SearchEngines
{
    /**
     * @var array Search engine definitions with regex patterns and metadata
     *
     * Each entry is keyed by an internal engine id and holds:
     *  - 'name'   display name
     *  - 'url'    home page of the engine
     *  - 'regex'  pattern (without delimiters) matched against the lowercased referrer host
     *  - 'params' URL parameter names that may carry the search query, in order of preference
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /** @var string|null The referrer URL being analyzed */
    protected ?string $referrer = null;

    /** @var array|null Cached analysis result */
    protected ?array $analysisResult = null;

    /**
     * Register the internal DokuWiki search and remember the referrer to analyze
     *
     * @param string|null $referrer The referrer URL to analyze, null for none
     */
    public function __construct(?string $referrer = null)
    {
        // Add the internal DokuWiki search engine. Its regex is empty, so
        // matchKnownEngine() never matches it against a referrer host.
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q']
        ];

        if ($referrer !== null) {
            $this->referrer = $referrer;
        }
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if the referrer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return $this->getAnalysis() !== null;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        $analysis = $this->getAnalysis();
        return $analysis['name'] ?? null;
    }

    /**
     * Get the search engine URL
     *
     * Only engines listed in $searchEngines have a URL; generically detected
     * engines ('generic_*') yield null.
     *
     * @return string|null The search engine URL or null if unknown or not a search engine
     */
    public function getUrl(): ?string
    {
        $analysis = $this->getAnalysis();
        if (!$analysis) {
            return null;
        }

        return $this->searchEngines[$analysis['engine']]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        $analysis = $this->getAnalysis();
        return $analysis['query'] ?? null;
    }

    /**
     * Get or perform analysis of the current referrer
     *
     * A successful analysis is cached in $analysisResult. A null result is not
     * cached, so a referrer that is not a search engine is analyzed again on
     * each call.
     *
     * @return array|null Analysis result or null if not a search engine
     */
    protected function getAnalysis(): ?array
    {
        if ($this->analysisResult === null && $this->referrer !== null) {
            $this->analysisResult = $this->analyzeReferrer($this->referrer);
        }

        return $this->analysisResult;
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The whole referrer is lowercased before parsing, so the returned query is
     * lowercase as well. When the URL has no query string, its fragment is
     * parsed instead.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzeReferrer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        if (!$queryString) {
            return null;
        }

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first engine whose regex matches the domain and whose parameters
     * yield a non-empty query wins.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            $query = $this->extractQuery($params, $engine['params']);
            if ($query) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Any referrer carrying one of the common search parameter names is treated
     * as a search engine; its name is derived from the host.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (and an optional
        // ".co" before it), then keep the last remaining host label
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        // array_pop() takes its argument by reference, so the exploded parts
        // must live in a variable to avoid an "Only variables should be passed
        // by reference" notice
        $labels = explode('.', $engineName);
        $engineName = array_pop($labels);

        return [
            'engine' => 'generic_' . $engineName,
            'name' => ucfirst($engineName),
            'query' => $query
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * Parameter names are tried in the given order; the first one that yields a
     * non-empty cleaned query is used.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() produces arrays for "q[]=x" style parameters; those
            // are not usable queries and would raise a TypeError in cleanQuery()
            if (!is_string($value) || empty($value)) {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries
        $query = preg_replace('/^(cache|related):[^\+]+/', '', $query);
        // Compact whitespace
        $query = preg_replace('/ +/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}