<?php

namespace dokuwiki\plugin\statistics;

/**
 * Extract search engine information from the HTTP referer
 *
 * We use the HTTP specification misspelling of "referer" here
 */
class SearchEngines
{
    /**
     * @var array Search engine definitions, keyed by engine identifier. Each entry holds
     *            the display 'name', the engine 'url', a host 'regex' (without delimiters)
     *            and the list of query 'params' that may carry the search terms.
     */
    protected static array $searchEngines = [
        'dokuwiki' => [
            'name' => 'DokuWiki Internal Search',
            'url' => DOKU_URL,
            'regex' => '', // set in constructor
            'params' => ['q'],
        ],
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The search engine key */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Registers the host of this wiki as the pattern of the internal 'dokuwiki' engine
     * and then analyzes the given referer.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        // Add regex matching ourselves. Referer hosts are lowercased before they are
        // matched, so the wiki host has to be lowercased as well. The string cast covers
        // parse_url() returning null/false when DOKU_URL has no usable host; the pattern
        // then becomes '^$', which only matches an empty host.
        $ownHost = strtolower((string)parse_url(DOKU_URL, PHP_URL_HOST));
        self::$searchEngines['dokuwiki']['regex'] = '^' . preg_quote($ownHost, '/') . '$';

        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        // compare against null: a derived engine key like "0" must still count as a match
        return $this->engine !== null;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine key or null if the referer is not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The name of a known engine, otherwise $engine is returned unchanged
     */
    public static function getName($engine): ?string
    {
        return self::$searchEngines[$engine]['name'] ?? $engine;
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return self::$searchEngines[$engine]['url'] ?? null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not recognized.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);
        if (!$result) {
            return;
        }

        $this->engine = $result['engine'];
        $this->name = $result['name'];
        $this->query = $result['query'];
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * The whole URL is lowercased first, so host and query are both returned in lower case.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // the fragment is used as a fallback when the URL has no query string
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';
        if (!$queryString) {
            return null;
        }

        parse_str($queryString, $params);

        // Known search engines take precedence over generic search parameters
        return $this->matchKnownEngine($domain, $params)
            ?? $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            // a matching host without a usable query does not end the search,
            // later definitions are still tried
            $query = $this->extractQuery($params, $engine['params']);
            if ($query) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query,
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);
        if (!$query) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (and an optional ".co" before it)
        // and keep the last remaining label, e.g. "search.example.co.uk" becomes "example"
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * The parameter names are checked in the given order, the first one yielding a
     * non-empty cleaned query wins.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            // parse_str() turns parameters like "q[]=x" into arrays. Those are not search
            // terms and must not be handed to cleanQuery(), which only accepts strings.
            if (empty($params[$param]) || !is_string($params[$param])) {
                continue;
            }

            $query = $this->cleanQuery($params[$param]);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries
        // NOTE(review): values arrive here already URL-decoded by parse_str(), so a "+"
        // separator has become a space and this pattern consumes everything up to the
        // next literal "+" (usually the whole query) - confirm that this is intended.
        $query = preg_replace('/^(cache|related):[^\+]+/', '', $query);
        // Compact whitespace
        $query = preg_replace('/ +/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}