<?php

namespace dokuwiki\plugin\statistics;

/**
 * Extract search engine information from the HTTP referer
 *
 * We use the HTTP specification misspelling of "referer" here
 */
class SearchEngines
{
    /**
     * Search engine definitions with regex patterns and metadata
     *
     * Each entry is keyed by the engine identifier and holds:
     *  - name:   human readable name
     *  - url:    home page of the engine
     *  - regex:  pattern (without delimiters) matched against the lower-cased referer host
     *  - params: URL parameters that may carry the search term, checked in the given order
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected static array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'https://www.google.com',
            // (\.com?)? covers google.co.uk as well as google.com.au style hosts
            'regex' => '^(\w+\.)*google(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'https://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'https://www.yandex.ru',
            // (\.com?)? covers yandex.com.tr style hosts
            'regex' => '^(\w+\.)*yandex(\.com?)?\.([a-z]{2,5})$',
            // the web search uses "text"; "query" is kept for backward compatibility
            'params' => ['text', 'query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'https://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'https://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'https://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'https://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'https://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'https://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'https://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'https://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'ecosia' => [
            'name' => 'Ecosia',
            'url' => 'https://www.ecosia.org',
            'regex' => '^(\w+\.)*ecosia\.org$',
            'params' => ['q'],
        ],
        'qwant' => [
            'name' => 'Qwant',
            'url' => 'https://www.qwant.com',
            'regex' => '^(\w+\.)*qwant\.com$',
            'params' => ['q'],
        ],
        'avg' => [
            'name' => 'AVG Safe Search',
            'url' => 'https://search.avg.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
        'brave' => [
            'name' => 'Brave Search',
            'url' => 'https://search.brave.com',
            'regex' => '^(\w+\.)*search\.brave\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The search engine key */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return (bool)$this->engine;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if none could be extracted
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['name'] : ucwords($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['url'] : null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not recognized.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);

        if ($result) {
            $this->engine = $result['engine'];
            $this->name = $result['name'];
            $this->query = $result['query'];
        }
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * The whole URL is lower-cased before parsing, so the returned query is lower case, too.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // fall back to the fragment when the URL has no query string
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first engine whose regex matches the domain wins, even when no query can be extracted;
     * 'query' is null in that case.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $this->extractQuery($params, $engine['params']),
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Any host is treated as a search engine when one of the common search parameters carries
     * a usable query. The engine identifier is derived from the host name.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (including .co.xx and .com.xx forms)
        // and use the last remaining label, e.g. search.example.com.au -> example
        $engineName = preg_replace('/(\.com?)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * The parameter names are checked in order; the first one that yields a non-empty cleaned
     * query wins.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() produces arrays for keys like "q[]=..."; such values are not search
            // terms and would trigger a TypeError in cleanQuery()
            if (!is_string($value) || $value === '') {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}