<?php

namespace dokuwiki\plugin\statistics;

/**
 * Defines regular expressions for the most common search engines
 *
 * An instance can either be used statelessly through analyzeReferrer(), or be
 * given a referrer (constructor or setReferrer()) and then queried through
 * isSearchEngine(), getName(), getUrl() and getQuery(). The stateful accessors
 * share one cached analysis of the current referrer.
 */
class SearchEngines
{
    /**
     * @var array Search engine definitions with regex patterns and metadata
     *
     * Each entry is keyed by an engine identifier and holds:
     *  - 'name'   display name of the engine
     *  - 'url'    home page of the engine
     *  - 'regex'  pattern (without delimiters) matched against the referrer host;
     *             an empty pattern means the engine is never matched by host
     *  - 'params' query parameter names that may carry the search term, in priority order
     */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q']
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query']
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p']
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query']
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw']
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor']
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q']
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q']
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q']
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q']
        ]
    ];

    /** @var string|null The referrer URL being analyzed */
    protected ?string $referrer = null;

    /** @var array|null Cached analysis result, null when the referrer is not a search engine */
    protected ?array $analysisResult = null;

    /**
     * @var bool Whether the current referrer has already been analyzed
     *
     * Needed in addition to $analysisResult because a null result ("not a
     * search engine") is a valid, cacheable outcome.
     */
    protected bool $analyzed = false;

    /**
     * Register the internal DokuWiki search and optionally set a referrer
     *
     * @param string|null $referrer The HTTP referrer URL to analyze, if already known
     */
    public function __construct(?string $referrer = null)
    {
        // Add the internal DokuWiki search engine; its URL is whatever wl() returns.
        // The empty regex keeps it from ever being matched by host name.
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q']
        ];

        if ($referrer !== null) {
            $this->setReferrer($referrer);
        }
    }

    /**
     * Set the referrer URL to analyze
     *
     * Discards any cached analysis of a previously set referrer.
     *
     * @param string $referrer The HTTP referrer URL
     */
    public function setReferrer(string $referrer): void
    {
        $this->referrer = $referrer;
        $this->analysisResult = null; // Clear cache
        $this->analyzed = false;
    }

    /**
     * Check if the referrer is from a search engine
     *
     * @return bool True if the referrer is from a search engine
     */
    public function isSearchEngine(): bool
    {
        $this->analyze();
        return $this->analysisResult !== null;
    }

    /**
     * Get the search engine name
     *
     * @return string|null The search engine name or null if not a search engine
     */
    public function getName(): ?string
    {
        $this->analyze();
        return $this->analysisResult['name'] ?? null;
    }

    /**
     * Get the search engine URL
     *
     * Only engines present in the definition list have a URL; engines that were
     * detected generically (by query parameter only) yield null.
     *
     * @return string|null The search engine URL or null if unknown or not a search engine
     */
    public function getUrl(): ?string
    {
        $this->analyze();
        if (!$this->analysisResult) {
            return null;
        }

        $engineKey = $this->analysisResult['engine'];
        return $this->searchEngines[$engineKey]['url'] ?? null;
    }

    /**
     * Get the search query
     *
     * @return string|null The search query or null if not a search engine
     */
    public function getQuery(): ?string
    {
        $this->analyze();
        return $this->analysisResult['query'] ?? null;
    }

    /**
     * Analyze the current referrer
     *
     * Runs at most once per referrer; the outcome (including a negative one)
     * is cached until setReferrer() is called again.
     */
    protected function analyze(): void
    {
        // the $analysisResult check is kept so a result that was populated
        // without going through this method is still respected
        if ($this->analyzed || $this->analysisResult !== null || $this->referrer === null) {
            return; // Already analyzed or no referrer set
        }

        $this->analysisResult = $this->analyzeReferrer($this->referrer);
        $this->analyzed = true;
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The referrer is lowercased before parsing, so the returned query is
     * lowercase as well.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    public function analyzeReferrer(string $referer): ?array
    {
        $referer = strtolower($referer);

        // parse the referer
        $urlparts = parse_url($referer);
        if (!isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        $qpart = $urlparts['query'] ?? '';
        if (!$qpart && isset($urlparts['fragment'])) {
            $qpart = $urlparts['fragment']; // google does this
        }

        $params = [];
        if ($qpart) {
            parse_str($qpart, $params);
        }

        $query = '';
        $engineKey = '';
        $engineName = '';

        // check domain against known search engines
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $engineKey = $key;
                $engineName = $engine['name'];

                // check the known parameters for content
                $query = $this->findQuery($params, $engine['params']);
                break; // first matching engine wins
            }
        }

        // try some generic search engine parameters if no specific engine matched
        if (!$engineKey) {
            $query = $this->findQuery($params, ['search', 'query', 'q', 'keywords', 'keyword']);
            if ($query !== '') {
                // generate name from domain
                $engineName = preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain); // strip tld
                $engineName = explode('.', $engineName);
                $engineName = array_pop($engineName);
                $engineKey = 'generic_' . $engineName;
            }
        }

        // still no hit? not a search engine
        if (!$engineKey || !$query) {
            return null;
        }

        // clean the query
        $query = preg_replace('/^(cache|related):[^\+]+/', '', $query); // non-search queries
        $query = preg_replace('/ +/', ' ', $query); // ws compact
        $query = trim($query);

        if (!$query) {
            return null;
        }

        return [
            'engine' => $engineKey,
            'name' => $engineName,
            'query' => $query
        ];
    }

    /**
     * Pick the first usable search term from parsed URL parameters
     *
     * parse_str() produces nested arrays for parameters such as "q[]=foo". Those
     * are skipped, because the referrer is untrusted input and only plain string
     * values can be cleaned up and returned as a query.
     *
     * @param array $params Parameters as produced by parse_str()
     * @param string[] $names Parameter names to look at, in priority order
     * @return string The first non-empty string value, or '' if there is none
     */
    protected function findQuery(array $params, array $names): string
    {
        foreach ($names as $name) {
            if (!empty($params[$name]) && is_string($params[$name])) {
                return $params[$name];
            }
        }
        return '';
    }

    /**
     * Get search engine information by key
     *
     * @param string $key The search engine key
     * @return array|null The search engine data or null if not found
     */
    public function getSearchEngine(string $key): ?array
    {
        return $this->searchEngines[$key] ?? null;
    }

    /**
     * Get all search engines
     *
     * @return array All search engine definitions
     */
    public function getAllSearchEngines(): array
    {
        return $this->searchEngines;
    }
}