<?php

namespace dokuwiki\plugin\statistics;

/**
 * Defines regular expressions for the most common search engines
 *
 * Each engine definition is an array with the keys:
 *  - name:   human readable engine name
 *  - url:    home page of the engine
 *  - regex:  pattern (without delimiters) matched against the referrer's host name;
 *            an empty pattern excludes the engine from referrer matching
 *  - params: query string parameters that may carry the search term, in order of preference
 */
class SearchEngines
{
    /** @var array<string, array{name: string, url: string, regex: string, params: string[]}> Search engine definitions */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^(\w+\.)*google(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            'regex' => '^(\w+\.)*yandex(\.co)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /**
     * Registers the wiki's own search as an additional engine
     *
     * The entry has an empty regex, so analyzeReferrer() never matches it against a host name.
     */
    public function __construct()
    {
        // Add the internal DokuWiki search engine
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q'],
        ];
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The referrer is lower-cased before parsing, so the returned query is lower case
     * (for ASCII characters). When the host matches no known engine, a set of common
     * search parameter names is tried and a "generic_<name>" engine key is derived from
     * the host name.
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    public function analyzeReferrer(string $referer): ?array
    {
        $referer = strtolower($referer);

        // parse the referer; parse_url() returns false for seriously malformed URLs
        $urlparts = parse_url($referer);
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        $qpart = $urlparts['query'] ?? '';
        if (!$qpart && isset($urlparts['fragment'])) {
            $qpart = $urlparts['fragment']; // google does this
        }

        $params = [];
        if ($qpart) {
            parse_str($qpart, $params);
        }

        $query = '';
        $engineKey = '';
        $engineName = '';

        // check domain against known search engines, the first matching engine wins
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $engineKey = $key;
                $engineName = $engine['name'];
                $query = $this->firstQueryValue($params, $engine['params']);
                break;
            }
        }

        // try some generic search engine parameters if no specific engine matched
        if (!$engineKey) {
            $query = $this->firstQueryValue($params, ['search', 'query', 'q', 'keywords', 'keyword']);
            if ($query !== '') {
                // generate name from domain: strip the tld, then keep the last remaining label
                $labels = explode('.', preg_replace('/(\.co)?\.([a-z]{2,5})$/', '', $domain));
                $engineName = array_pop($labels);
                $engineKey = 'generic_' . $engineName;
            }
        }

        // still no hit? not a search engine
        if (!$engineKey || !$query) {
            return null;
        }

        // clean the query
        // Drop a leading "cache:..." / "related:..." operator. parse_str() has already decoded
        // "+" into spaces, so the operator ends at the first whitespace or literal plus sign.
        $query = (string) preg_replace('/^(cache|related):[^\+\s]+/', '', $query);
        $query = (string) preg_replace('/ +/', ' ', $query); // ws compact
        $query = trim($query);

        if (!$query) {
            return null;
        }

        return [
            'engine' => $engineKey,
            'name' => $engineName,
            'query' => $query,
        ];
    }

    /**
     * Return the first usable search term found in the given parameters
     *
     * Only string values are accepted: parse_str() turns parameters such as "q[]=x" into
     * arrays, which must not reach the string cleanup in analyzeReferrer(). The values ''
     * and '0' count as empty.
     *
     * @param array $params Parsed query string parameters
     * @param string[] $candidates Parameter names to check, in order of preference
     * @return string The first non-empty string value, or '' if there is none
     */
    protected function firstQueryValue(array $params, array $candidates): string
    {
        foreach ($candidates as $candidate) {
            $value = $params[$candidate] ?? '';
            if (is_string($value) && $value !== '' && $value !== '0') {
                return $value;
            }
        }

        return '';
    }

    /**
     * Get search engine information by key
     *
     * @param string $key The search engine key
     * @return array|null The search engine data or null if not found
     */
    public function getSearchEngine(string $key): ?array
    {
        return $this->searchEngines[$key] ?? null;
    }

    /**
     * Get all search engines
     *
     * @return array All search engine definitions
     */
    public function getAllSearchEngines(): array
    {
        return $this->searchEngines;
    }
}