<?php

namespace dokuwiki\plugin\statistics;

/**
 * Recognises search engine referrers and extracts the search query from them
 *
 * An instance is created for a single referrer URL. The URL is analysed once in the
 * constructor; the result is then available through the getters.
 *
 * A referrer is first matched against the list of known engines (by host name regex).
 * If none matches, any host that carries one of a few common search parameter names is
 * reported as a "generic" engine named after its domain.
 */
class SearchEngines
{
    /**
     * Known search engines, keyed by an internal identifier
     *
     * Each entry holds:
     *  - name:   human readable engine name
     *  - url:    home page of the engine
     *  - regex:  pattern (without delimiters) matched against the lower-cased referrer host;
     *            an empty pattern means the entry is never matched by host
     *  - params: query parameter names to look for the search term, in order of preference
     *
     * @var array<string, array{name: string, url: string, regex: string, params: string[]}>
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /** @var string The referrer URL being analyzed */
    protected string $referrer;

    /** @var bool Whether the referrer is from a search engine */
    protected bool $isSearchEngine = false;

    /** @var string|null The search engine name */
    protected ?string $engineName = null;

    /** @var string|null The search engine key (a key of $searchEngines or "generic_<domain>") */
    protected ?string $engineKey = null;

    /** @var string|null The search query */
    protected ?string $query = null;

    /**
     * Register the wiki's own search as an engine and analyze the given referrer
     *
     * @param string $referrer The referrer URL to analyze
     */
    public function __construct(string $referrer)
    {
        // Add the internal DokuWiki search engine. Its regex is empty, so host matching skips
        // it; the entry only provides name/url metadata for the 'dokuwiki' key.
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q'],
        ];

        $this->referrer = $referrer;
        $this->analyze();
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if the referrer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        return $this->isSearchEngine;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        return $this->engineName;
    }

    /**
     * Get the search engine URL
     *
     * Only engines listed in $searchEngines have a URL; generic engines yield null.
     *
     * @return string|null The search engine URL or null if unknown or not a search engine
     */
    public function getUrl(): ?string
    {
        if ($this->engineKey === null) {
            return null;
        }

        return $this->searchEngines[$this->engineKey]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        return $this->query;
    }

    /**
     * Analyze the referrer and populate member variables
     */
    protected function analyze(): void
    {
        $result = $this->analyzeReferrer($this->referrer);
        if ($result === null) {
            return; // not a search engine, keep the defaults
        }

        $this->isSearchEngine = true;
        $this->engineKey = $result['engine'];
        $this->engineName = $result['name'];
        $this->query = $result['query'];
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The whole URL is lower-cased before parsing, so the returned query is lower-case too.
     * Parameters are read from the query string, or from the fragment when the URL has no
     * query string.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    protected function analyzeReferrer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        // parse_url() returns false for seriously malformed URLs
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null;
        }

        $queryString = $urlparts['query'] ?? $urlparts['fragment'] ?? '';
        if ($queryString === '') {
            return null;
        }

        parse_str($queryString, $params);
        $domain = $urlparts['host'];

        // Known engines take precedence, generic parameter detection is the fallback
        return $this->matchKnownEngine($domain, $params)
            ?? $this->matchGenericEngine($domain, $params);
    }

    /**
     * Try to match against known search engines
     *
     * The first engine whose regex matches the domain AND for which a non-empty query can
     * be extracted wins.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchKnownEngine(string $domain, array $params): ?array
    {
        foreach ($this->searchEngines as $key => $engine) {
            if ($engine['regex'] === '') {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (!preg_match('/' . $engine['regex'] . '/', $domain)) {
                continue;
            }

            $query = $this->extractQuery($params, $engine['params']);
            if ($query !== null) {
                return [
                    'engine' => $key,
                    'name' => $engine['name'],
                    'query' => $query,
                ];
            }
        }

        return null;
    }

    /**
     * Try to match against generic search parameters
     *
     * Any host is accepted as long as one of the common search parameter names carries a
     * query. The engine is named after the last host label that remains once the TLD
     * (and an optional ".co" in front of it) has been stripped.
     *
     * @param string $domain The domain to check
     * @param array $params URL parameters
     * @return array|null Match result or null
     */
    protected function matchGenericEngine(string $domain, array $params): ?array
    {
        $genericParams = ['search', 'query', 'q', 'keywords', 'keyword'];
        $query = $this->extractQuery($params, $genericParams);
        if ($query === null) {
            return null;
        }

        // Generate engine name from domain
        $stripped = (string) preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain);
        // array_pop() takes its argument by reference, so it needs a real variable
        $labels = explode('.', $stripped);
        $engineName = array_pop($labels);

        return [
            'engine' => 'generic_' . $engineName,
            'name' => ucfirst($engineName),
            'query' => $query,
        ];
    }

    /**
     * Extract and clean search query from parameters
     *
     * The parameter names are tried in the given order; the first one that yields a
     * non-empty cleaned query is used.
     *
     * @param array $params URL parameters
     * @param array $paramNames Parameter names to check
     * @return string|null Cleaned query or null
     */
    protected function extractQuery(array $params, array $paramNames): ?string
    {
        foreach ($paramNames as $param) {
            $value = $params[$param] ?? null;

            // parse_str() produces arrays for "q[]=x" style parameters; those cannot be a
            // search query and must not reach cleanQuery(), which only accepts strings
            if (!is_string($value) || $value === '') {
                continue;
            }

            $query = $this->cleanQuery($value);
            if ($query !== null) {
                return $query;
            }
        }

        return null;
    }

    /**
     * Clean and validate search query
     *
     * @param string $query Raw query string
     * @return string|null Cleaned query or null if nothing is left after cleaning
     */
    protected function cleanQuery(string $query): ?string
    {
        // Remove non-search queries: a leading "cache:" or "related:" operator together
        // with everything up to the first literal "+"
        $query = (string) preg_replace('/^(cache|related):[^\+]+/', '', $query);
        // Compact whitespace
        $query = (string) preg_replace('/ +/', ' ', $query);
        $query = trim($query);

        // Compare against '' explicitly so that a query of "0" is kept
        return $query === '' ? null : $query;
    }
}