<?php

namespace dokuwiki\plugin\statistics;

/**
 * Detects whether a referrer URL points to a search engine result page and
 * extracts the engine's name, home URL and the search query from it.
 *
 * Known engines are matched by host name using the regular expressions
 * defined in $searchEngines. Unknown hosts are still treated as search
 * engines when their URL carries one of a few common search parameters.
 */
class SearchEngines
{
    /**
     * Search engine definitions, keyed by an internal engine id.
     *
     * 'regex' is matched (without modifiers) against the lowercased host name,
     * 'params' lists the URL parameters that may carry the search query, in
     * order of preference.
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string|null The referrer URL being analyzed */
    protected ?string $referrer = null;

    /** @var array|null Cached analysis result, null when the referrer is not a search engine */
    protected ?array $analysisResult = null;

    /** @var bool Whether the referrer has been analyzed already (also caches negative results) */
    protected bool $analyzed = false;

    /**
     * @param string|null $referrer The referrer URL to analyze; without one, all getters report "no search engine"
     */
    public function __construct(?string $referrer = null)
    {
        // Register the wiki's own search. Its regex is empty, so it is never
        // matched by host name in matchKnownEngine().
        // NOTE(review): wl() is DokuWiki's link helper, presumably returning the wiki's start page URL — confirm.
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q'],
        ];

        if ($referrer !== null) {
            $this->referrer = $referrer;
        }
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if the referrer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return $this->getAnalysis() !== null;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        $analysis = $this->getAnalysis();
        return $analysis['name'] ?? null;
    }

    /**
     * Get the search engine URL
     *
     * Only engines defined in $searchEngines have a URL; generically detected
     * engines ('generic_*' ids) yield null here even though isSearchEngine()
     * is true for them.
     *
     * @return string|null The search engine URL or null if unknown or not a search engine
     */
    public function getUrl(): ?string
    {
        $analysis = $this->getAnalysis();
        if (!$analysis) {
            return null;
        }

        return $this->searchEngines[$analysis['engine']]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The (lowercased, whitespace-compacted) search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        $analysis = $this->getAnalysis();
        return $analysis['query'] ?? null;
    }

    /**
     * Get or perform analysis of the current referrer
     *
     * The analysis runs at most once per referrer; a negative outcome is
     * remembered as well so repeated getter calls do not re-run all regexes.
     *
     * @return array|null Analysis result or null if not a search engine
     */
    protected function getAnalysis(): ?array
    {
        if (!$this->analyzed && $this->referrer !== null) {
            $this->analysisResult = $this->analyzeReferrer($this->referrer);
            $this->analyzed = true;
        }

        return $this->analysisResult;
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzeReferrer(string $referer): ?array
    {
        // The whole URL is lowercased before parsing, so host matching is
        // case-insensitive and the extracted query is lowercased as well.
        $urlparts = parse_url(strtolower($referer));
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        // Fall back to the fragment for engines that put the query behind '#'
        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';

        if (!$queryString) {
            return null;
        }

        parse_str($queryString, $params);

        // Try to match against known search engines
        $result = $this->matchKnownEngine($domain, $params);
        if ($result) {
            return $result;
        }

        // Try generic search parameters
        return $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            // A matching host without a usable query is not a search hit;
            // keep looking, another definition may still apply.
            $query = $this->extractQuery($params, $engine['params']);
            if ($query) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query,
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * The engine name is derived from the host: the TLD (and an optional
     * '.co' before it) is stripped and the last remaining label is used,
     * e.g. 'search.example.org' becomes 'example'.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);

        if (!$query) {
            return null;
        }

        // Generate engine name from domain
        $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        // array_pop() takes its argument by reference, so it needs a real variable
        $labels = explode('.', $engineName);
        $engineName = array_pop($labels);

        return [
            'engine' => 'generic_' . $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * Returns the first non-empty cleaned value among the given parameter
     * names. Non-string values (parse_str() produces arrays for 'q[]=x')
     * are ignored.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            if (empty($params[$param]) || !is_string($params[$param])) {
                continue;
            }

            $query = $this->cleanQuery($params[$param]);
            if ($query) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if invalid
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search prefixes like "cache:ID:url". The prefix ends at the
        // first '+' or whitespace: parse_str() has usually already decoded the
        // '+' separators to spaces by the time the query arrives here.
        $query = preg_replace('/^(cache|related):[^\+\s]+/', '', $query);
        // Compact whitespace
        $query = preg_replace('/ +/', ' ', $query);
        $query = trim($query);

        return $query ?: null;
    }
}