<?php

namespace dokuwiki\plugin\statistics;

/**
 * Defines regular expressions for the most common search engines
 *
 * Every definition consists of a display name, the engine's home page URL,
 * a host name regex (stored without delimiters) and the list of query
 * parameters that may carry the search phrase.
 */
class SearchEngines
{
    /** @var string[] Parameter names tried when the host matches no known engine */
    protected const GENERIC_PARAMS = ['search', 'query', 'q', 'keywords', 'keyword'];

    /** @var array Search engine definitions with regex patterns and metadata */
    protected array $searchEngines = [
        'google' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            // optional second-level label covers google.co.uk as well as google.com.au
            'regex' => '^(\w+\.)*google(\.com?)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'bing' => [
            'name' => 'Bing',
            'url' => 'http://www.bing.com',
            'regex' => '^(\w+\.)*bing(\.co)?\.([a-z]{2,5})$',
            'params' => ['q'],
        ],
        'yandex' => [
            'name' => 'Яндекс (Yandex)',
            'url' => 'http://www.yandex.ru',
            // optional second-level label covers yandex.com.tr style hosts
            'regex' => '^(\w+\.)*yandex(\.com?)?\.([a-z]{2,5})$',
            'params' => ['query'],
        ],
        'yahoo' => [
            'name' => 'Yahoo!',
            'url' => 'http://www.yahoo.com',
            'regex' => '^(\w+\.)*yahoo\.com$',
            'params' => ['p'],
        ],
        'naver' => [
            'name' => '네이버 (Naver)',
            'url' => 'http://www.naver.com',
            'regex' => '^search\.naver\.com$',
            'params' => ['query'],
        ],
        'baidu' => [
            'name' => '百度 (Baidu)',
            'url' => 'http://www.baidu.com',
            'regex' => '^(\w+\.)*baidu\.com$',
            'params' => ['wd', 'word', 'kw'],
        ],
        'ask' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*ask\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'ask_search_results' => [
            'name' => 'Ask',
            'url' => 'http://www.ask.com',
            'regex' => '^(\w+\.)*search-results\.com$',
            'params' => ['ask', 'q', 'searchfor'],
        ],
        'babylon' => [
            'name' => 'Babylon',
            'url' => 'http://search.babylon.com',
            'regex' => '^search\.babylon\.com$',
            'params' => ['q'],
        ],
        'aol' => [
            'name' => 'AOL Search',
            'url' => 'http://search.aol.com',
            'regex' => '^(\w+\.)*(aol)?((search|recherches?|images|suche|alicesuche)\.)aol(\.co)?\.([a-z]{2,5})$',
            'params' => ['query', 'q'],
        ],
        'duckduckgo' => [
            'name' => 'DuckDuckGo',
            'url' => 'http://duckduckgo.com',
            'regex' => '^duckduckgo\.com$',
            'params' => ['q'],
        ],
        'google_avg' => [
            'name' => 'Google',
            'url' => 'http://www.google.com',
            'regex' => '^search\.avg\.com$',
            'params' => ['q'],
        ],
    ];

    /**
     * Registers the wiki's own search as an additional engine
     *
     * The entry has an empty regex, so it is never matched against referrer
     * hosts; it is only available through the lookup methods.
     */
    public function __construct()
    {
        // Add the internal DokuWiki search engine
        $this->searchEngines['dokuwiki'] = [
            'name' => 'DokuWiki Internal Search',
            'url' => wl(),
            'regex' => '',
            'params' => ['q'],
        ];
    }

    /**
     * Analyze a referrer URL to extract search engine information and query
     *
     * The referrer is lowercased as a whole before parsing, so the returned
     * query is always lowercase. Known engines are matched by host name; if
     * none matches, a set of generic parameter names is tried and the engine
     * is named after the host ("generic_<name>").
     *
     * @param string $referer The HTTP referer URL
     * @return array|null Array with 'engine', 'name', 'query' keys or null if not a search engine
     */
    public function analyzeReferrer(string $referer): ?array
    {
        $urlparts = parse_url(strtolower($referer));
        if (!is_array($urlparts) || !isset($urlparts['host'])) {
            return null;
        }

        $domain = $urlparts['host'];
        $qpart = $urlparts['query'] ?? '';
        if (!$qpart && isset($urlparts['fragment'])) {
            $qpart = $urlparts['fragment']; // google does this
        }

        $params = [];
        if ($qpart) {
            parse_str($qpart, $params);
        }

        $query = '';
        $engineKey = '';
        $engineName = '';

        // check domain against known search engines, first match wins
        foreach ($this->searchEngines as $key => $engine) {
            if (!$engine['regex']) {
                continue; // skip engines without regex (like dokuwiki)
            }

            if (preg_match('/' . $engine['regex'] . '/', $domain)) {
                $engineKey = $key;
                $engineName = $engine['name'];
                $query = $this->findQuery($params, $engine['params']);
                break;
            }
        }

        // try some generic search engine parameters if no specific engine matched
        if (!$engineKey) {
            $query = $this->findQuery($params, self::GENERIC_PARAMS);
            if ($query !== '') {
                $engineName = $this->genericEngineName($domain);
                $engineKey = 'generic_' . $engineName;
            }
        }

        // still no hit? not a search engine
        if (!$engineKey || !$query) {
            return null;
        }

        // Drop a leading "cache:..." / "related:..." token (non-search queries).
        // parse_str() has already decoded "+" into spaces, so the token has to
        // end at whitespace too, otherwise the whole phrase would be discarded.
        $query = (string) preg_replace('/^(cache|related):[^\s+]+/', '', $query);
        $query = (string) preg_replace('/ +/', ' ', $query); // ws compact
        $query = trim($query);

        if (!$query) {
            return null;
        }

        return [
            'engine' => $engineKey,
            'name' => $engineName,
            'query' => $query,
        ];
    }

    /**
     * Get search engine information by key
     *
     * @param string $key The search engine key
     * @return array|null The search engine data or null if not found
     */
    public function getSearchEngine(string $key): ?array
    {
        return $this->searchEngines[$key] ?? null;
    }

    /**
     * Get all search engines
     *
     * @return array All search engine definitions
     */
    public function getAllSearchEngines(): array
    {
        return $this->searchEngines;
    }

    /**
     * Return the first usable search phrase among the given parameter names
     *
     * parse_str() produces arrays for "q[]=x" style input; such values are
     * ignored because they cannot be treated as a search phrase and would
     * otherwise reach string functions and raise a TypeError.
     *
     * @param array $params Parsed query (or fragment) parameters
     * @param string[] $names Parameter names in order of preference
     * @return string The phrase, or an empty string if none was found
     */
    private function findQuery(array $params, array $names): string
    {
        foreach ($names as $name) {
            $value = $params[$name] ?? '';
            if (is_string($value) && $value !== '' && $value !== '0') {
                return $value;
            }
        }
        return '';
    }

    /**
     * Derive a short engine name from a host name
     *
     * Strips the TLD (including an optional ".co"/".com" second-level label)
     * and returns the last remaining host label.
     *
     * @param string $domain Lowercased host name of the referrer
     * @return string The derived name, e.g. "example" for "search.example.co.uk"
     */
    private function genericEngineName(string $domain): string
    {
        $stripped = (string) preg_replace('/(\.com?)?\.([a-z]{2,5})$/', '', $domain); // strip tld
        $labels = explode('.', $stripped);
        return (string) array_pop($labels);
    }
}