<?php

namespace dokuwiki\plugin\statistics;

/**
 * Extract search engine information from the HTTP referer
 *
 * We use the HTTP specification misspelling of "referer" here
 */
class SearchEngines
{
    /**
     * @var array Search engine definitions, keyed by engine identifier.
     *            Each entry has 'name' (display name), 'url' (engine home page),
     *            'regex' (delimiter-less pattern matched against the referer host)
     *            and 'params' (query parameter names checked in order).
     */
    protected static array $searchEngines = [
        'dokuwiki' => [
            'name' => 'DokuWiki Internal Search',
            'url' => DOKU_URL,
            'regex' => '', // set in constructor
            'params' => ['q'],
        ],
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'ecosia' => [
            'name' => 'Ecosia',
            'url' => 'https://www.ecosia.org',
            'regex' => '^(\w+\.)*ecosia\.org$',
            'params' => ['q'],
        ],
        'qwant' => [
            'name' => 'Qwant',
            'url' => 'https://www.qwant.com',
            'regex' => '^(\w+\.)*qwant\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The search engine key */
    protected ?string $engine = null;

    /** @var string|null The search engine name */
    protected ?string $name = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Constructor
     *
     * Registers a pattern matching this wiki's own host and then analyzes the referer.
     *
     * @param string $referer The HTTP referer URL to analyze
     */
    public function __construct(string $referer)
    {
        // Add regex matching ourselves. The referer is lowercased before matching, so the
        // configured host has to be lowercased as well or a mixed-case DOKU_URL never matches.
        // The string cast covers parse_url() returning null/false when no host is available.
        $ownHost = strtolower((string)parse_url(DOKU_URL, PHP_URL_HOST));
        self::$searchEngines['dokuwiki']['regex'] = '^' . preg_quote($ownHost, '/') . '$';

        $this->analyze($referer);
    }

    /**
     * Check if the referer is from a search engine
     *
     * @return bool True if the referer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return (bool)$this->engine;
    }

    /**
     * Get the search engine identifier from the referer
     *
     * @return string|null The search engine or null if not a search engine
     */
    public function getEngine(): ?string
    {
        return $this->engine;
    }

    /**
     * Get the search query from the referer
     *
     * @return string|null The search query or null if none could be extracted
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Get the search engine name for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string If we have a name for the engine, return it, otherwise return capitalized $engine
     */
    public static function getName($engine): string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['name'] : ucwords($engine);
    }

    /**
     * Get the search engine URL for the given engine identifier
     *
     * @param string $engine The engine identifier
     * @return string|null The search engine URL or null if not defined
     */
    public static function getUrl($engine): ?string
    {
        return isset(self::$searchEngines[$engine]) ? self::$searchEngines[$engine]['url'] : null;
    }

    /**
     * Analyze the referer and populate member variables
     *
     * The members keep their null defaults when the referer is not recognized.
     *
     * @param string $referer The HTTP referer URL
     */
    protected function analyze(string $referer): void
    {
        $result = $this->analyzereferer($referer);

        if ($result) {
            $this->engine = $result['engine'];
            $this->name = $result['name'];
            $this->query = $result['query'];
        }
    }

    /**
     * Analyze a referer URL to extract search engine information and query
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzereferer(string $referer): ?array
    {
        // The whole URL is lowercased, so the extracted query is lowercased too
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // Fall back to the fragment when the URL carries no query string
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first definition whose pattern matches the domain wins; the returned
     * query is null when none of that engine's parameters holds a usable value.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach (self::$searchEngines as $key => $engine) {
            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $query = $this->extractQuery($params, $engine['params']);
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query,
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain: strip the TLD (and an optional ".co"),
        // then use the last remaining host label
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        $domainParts = explode('.', $engineName);
        $engineName = array_pop($domainParts);

        return [
            'engine' => $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            // parse_str() produces arrays for bracketed parameters such as "q[]=foo".
            // Those cannot be a plain search query and would raise a TypeError in cleanQuery().
            if (empty($params[$param]) || !is_string($params[$param])) {
                continue;
            }

            $query = $this->cleanQuery($params[$param]);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries (cache: and related: prefixes)
        $query = preg_replace('/^(cache|related):[^\s]+\s*/', '', $query);
        // Compact whitespace
        $query = preg_replace('/\s+/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}