<?php

namespace dokuwiki\plugin\statistics;

/**
 * Extract search engine information from the HTTP referer
 *
 * A referer is first compared against a list of known search engines (matched by
 * host name). If none matches, a set of commonly used query parameter names is
 * tried to detect unknown ("generic") search engines.
 *
 * We use the HTTP specification misspelling of "referer" here
 */
class SearchEngines
{
    /**
     * @var array Search engine definitions, keyed by engine identifier. Each entry has:
     *            - name:   human readable name
     *            - url:    home page of the engine
     *            - regex:  pattern (without delimiters) matched against the lowercased referer host
     *            - params: query parameter names that may carry the search term, in order of preference
     */
    protected static array $searchEngines = [
        'dokuwiki' => [
            'name' => 'DokuWiki Internal Search',
            'url' => DOKU_URL,
            'regex' => '', // set in constructor
            'params' => ['q']
        ],
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /** @var string|null The search engine key */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Registers the host of this wiki as the pattern of the 'dokuwiki' engine and
     * analyzes the given referer right away.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        // Add regex matching ourselves. The referer host is lowercased before matching,
        // so the pattern has to be lowercase as well. The string cast avoids passing
        // null to preg_quote() when DOKU_URL has no host component.
        $host = strtolower((string) parse_url(DOKU_URL, PHP_URL_HOST));
        self::$searchEngines['dokuwiki']['regex'] = '^' . preg_quote($host, '/') . '$';

        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return (bool)$this->engine;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * The referer is lowercased before it is parsed, so the query is returned in lowercase.
     *
     * @return string|null The search query or null if none could be extracted
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['name'] : ucwords($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['url'] : null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not recognized.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);

        if ($result) {
            $this->engine = $result['engine'];
            $this->name = $result['name'];
            $this->query = $result['query'];
        }
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        // lowercasing normalizes host and parameter names (and the query itself)
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // parameters are taken from the fragment when there is no query string
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first engine whose pattern matches the domain wins. A match is reported
     * even when no query could be extracted ('query' is null then).
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $query = $this->extractQuery($params, $engine['params']);
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Unlike known engines, a generic engine is only reported when a query was found.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (and an optional ".co")
        // and use the last remaining label, e.g. search.example.co.uk -> example
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * Returns the first parameter that yields a non-empty query after cleaning.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            // parse_str() creates (nested) arrays for bracketed names like q[]=foo.
            // Those can not be used as a query and must not reach cleanQuery()
            if (empty($params[$param]) || !is_string($params[$param])) {
                continue;
            }

            $query = $this->cleanQuery($params[$param]);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}