<?php

namespace Jaybizzle\CrawlerDetect;

/**
 * Detects crawlers/bots/spiders by matching a user agent string against a
 * list of known crawler regular expressions.
 *
 * The user agent is taken either from an explicitly supplied string or from
 * the HTTP_* headers (by default those found in $_SERVER).
 */
class CrawlerDetect
{
    /**
     * The user agent.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * Headers that contain a user agent.
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Regex matches captured by the most recent isCrawler() call.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * List of strings to remove from the user agent before running the crawler regex.
     * Over a large list of user agents, this gives us about a 55% speed increase!
     *
     * @var array
     */
    protected static $ignore = array(
        'Safari.[\d\.]*',
        'Firefox.[\d\.]*',
        'Chrome.[\d\.]*',
        'Chromium.[\d\.]*',
        'MSIE.[\d\.]',
        'Opera\/[\d\.]*',
        'Mozilla.[\d\.]*',
        'AppleWebKit.[\d\.]*',
        'Trident.[\d\.]*',
        'Windows NT.[\d\.]*',
        'Android.[\d\.]*',
        'Macintosh.',
        'Ubuntu',
        'Linux',
        'Intel',
        'Mac OS X',
        'Gecko.[\d\.]*',
        'KHTML',
        'CriOS.[\d\.]*',
        'CPU iPhone OS ([0-9_])* like Mac OS X',
        'CPU OS ([0-9_])* like Mac OS X',
        'iPod',
        'like Gecko',
        'compatible',
        'x86_..',
        'i686',
        'x64',
        'X11',
        'rv:[\d\.]*',
        'Version.[\d\.]*',
        'WOW64',
        'Win64',
        'Dalvik.[\d\.]*',
        '\.NET CLR [\d\.]*',
        'Presto.[\d\.]*',
        'Media Center PC',
    );

    /**
     * Array of regular expressions to match against the user agent.
     *
     * Entries are joined with "|" and matched case-insensitively, so each
     * entry must be a valid PCRE fragment with "/" escaped.
     *
     * @var array
     */
    protected static $crawlers = array(
        '008\\/',
        'A6-Indexer',
        'Aboundex',
        'Accoona-AI-Agent',
        'acoon',
        'AddThis',
        'ADmantX',
        'AHC',
        'Airmail',
        'Anemone',
        'Arachmo',
        'archive-com',
        'B-l-i-t-z-B-O-T',
        'bibnum\.bnf',
        'biglotron',
        'binlar',
        'BingPreview',
        'boitho\.com-dc',
        'BUbiNG',
        'Butterfly\\/',
        'BuzzSumo',
        'CC Metadata Scaper',
        'Cerberian Drtrs',
        'changedetection',
        'Charlotte',
        'clips\.ua\.ac\.be',
        'CloudFlare-AlwaysOnline',
        'coccoc',
        'Commons-HttpClient',
        'convera',
        'cosmos',
        'Covario-IDS',
        'curl',
        'CyberPatrol',
        'DataparkSearch',
        'dataprovider',
        'Digg',
        'DomainAppender',
        'drupact',
        'EARTHCOM',
        'ec2linkfinder',
        'ElectricMonk',
        'Embedly',
        'europarchive\.org',
        'EventMachine HttpClient',
        'ezooms',
        'eZ Publish Link Validator',
        'facebookexternalhit',
        'Feedfetcher-Google',
        'FeedValidator',
        'FindLinks',
        'findlink',
        'findthatfile',
        'Flamingo_SearchEngine',
        'fluffy',
        'getprismatic\.com',
        'g00g1e\.net',
        'GigablastOpenSource',
        'grub-client',
        'Genieo',
        'Go-http-client',
        'Google-HTTP-Java-Client',
        'Google favicon',
        'Google Keyword Suggestion',
        'heritrix',
        'Holmes',
        'htdig',
        'httpunit',
        'httrack',
        'ichiro',
        'igdeSpyder',
        'InAGist',
        'InfoWizards Reciprocal Link System PRO',
        'integromedb',
        'IODC',
        'IOI',
        'ips-agent',
        'iZSearch',
        'L\.webis',
        'Larbin',
        'libwww',
        'Link Valet',
        'linkdex',
        'LinkExaminer',
        'LinkWalker',
        'Lipperhey Link Explorer',
        'Lipperhey SEO Service',
        'LongURL API',
        'ltx71',
        'lwp-trivial',
        'MegaIndex\.ru',
        'mabontland',
        'MagpieRSS',
        'Mediapartners-Google',
        'MetaURI',
        'Mnogosearch',
        'mogimogi',
        'Morning Paper',
        'Mrcgiguy',
        'MVAClient',
        'netresearchserver',
        'NewsGator',
        'newsme',
        'NG-Search',
        '^NING\\/',
        'Notifixious',
        'nutch',
        'NutchCVS',
        'Nymesis',
        'oegp',
        'online link validator',
        'Online Website Link Checker',
        'Orbiter',
        'ow\.ly',
        'Ploetz \+ Zeller',
        'page2rss',
        'panscient',
        'Peew',
        'phpcrawl',
        'Pizilla',
        'Plukkie',
        'Pompos',
        'postano',
        'PostPost',
        'postrank',
        'proximic',
        'PycURL',
        'Python-httplib2',
        'python-requests',
        'Python-urllib',
        'Qseero',
        'Qwantify',
        'Radian6',
        'RebelMouse',
        'REL Link Checker',
        'RetrevoPageAnalyzer',
        'Riddler',
        'Robosourcer',
        'Ruby',
        'SBIder',
        'ScoutJet',
        'ScoutURLMonitor',
        'Scrapy',
        'Scrubby',
        'SearchSight',
        'semanticdiscovery',
        'SEOstats',
        'Seznam screenshot-generator',
        'ShopWiki',
        'SiteBar',
        'siteexplorer\.info',
        'slider\.com',
        'slurp',
        'Snappy',
        'sogou',
        'speedy',
        'Sqworm',
        'StackRambler',
        'Stratagems Kumo',
        'summify',
        'teoma',
        'theoldreader\.com',
        'TinEye',
        'Traackr.com',
        'truwoGPS',
        'tweetedtimes\.com',
        'Twikle',
        'UnwindFetchor',
        'updated',
        'urlresolver',
        'Validator\.nu\\/LV',
        'Vagabondo',
        'Vivante Link Checker',
        'Vortex',
        'voyager\\/',
        'VYU2',
        'W3C-checklink',
        'W3C_CSS_Validator_JFouffa',
        'W3C_I18n-Checker',
        'W3C-mobileOK',
        'W3C_Unicorn',
        'W3C_Validator',
        'WebIndex',
        'Websquash\.com',
        'webcollage',
        'webmon ',
        'WeSEE:Search',
        'wf84',
        'wget',
        'WomlpeFactory',
        'wotbox',
        'Xenu Link Sleuth',
        'XML Sitemaps Generator',
        'Y!J-ASR',
        'yacy',
        'Yahoo Link Preview',
        'Yahoo! Slurp China',
        'Yahoo! Slurp',
        'YahooSeeker',
        'YahooSeeker-Testing',
        'YandexImages',
        'YandexMetrika',
        'yandex',
        'yanga',
        'yeti',
        'yoogliFetchAgent',
        'Zao',
        'ZyBorg',
        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
    );

    /**
     * All possible HTTP headers that represent the
     * User-Agent string.
     *
     * @var array
     */
    protected static $uaHttpHeaders = array(
        // The default User-Agent string.
        'HTTP_USER_AGENT',
        // Header can occur on devices using Opera Mini.
        'HTTP_X_OPERAMINI_PHONE_UA',
        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
        'HTTP_X_DEVICE_USER_AGENT',
        'HTTP_X_ORIGINAL_USER_AGENT',
        'HTTP_X_SKYFIRE_PHONE',
        'HTTP_X_BOLT_PHONE_UA',
        'HTTP_DEVICE_STOCK_UA',
        'HTTP_X_UCBROWSER_DEVICE_UA',
    );

    /**
     * Class constructor.
     *
     * @param array|null  $headers   Header map to read the user agent from; $_SERVER is used when null or empty.
     * @param string|null $userAgent Explicit user agent; when empty it is derived from the headers.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        // Headers must be stored first: setUserAgent() falls back to them.
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Set HTTP headers.
     *
     * @param array|null $httpHeaders Header map; anything that is not a non-empty array falls back to $_SERVER.
     */
    public function setHttpHeaders($httpHeaders = null)
    {
        // use global _SERVER if $httpHeaders aren't defined
        if (!is_array($httpHeaders) || !count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }

        // clear existing headers
        $this->httpHeaders = array();

        // Only save HTTP headers. In PHP land, that means only _SERVER vars that
        // start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (substr($key, 0, 5) === 'HTTP_') {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return user agent headers.
     *
     * @return array
     */
    public function getUaHttpHeaders()
    {
        return self::$uaHttpHeaders;
    }

    /**
     * Set the user agent.
     *
     * A non-empty argument is stored as-is. Otherwise the values of every
     * known user agent header present in the stored HTTP headers are joined
     * with single spaces; if none is present the user agent becomes null.
     *
     * @param string|null $userAgent
     *
     * @return string|null The user agent that was stored.
     */
    public function setUserAgent($userAgent = null)
    {
        if (false === empty($userAgent)) {
            return $this->userAgent = $userAgent;
        }

        $combined = '';

        foreach ($this->getUaHttpHeaders() as $altHeader) {
            if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow.
                $combined .= $this->httpHeaders[$altHeader].' ';
            }
        }

        return $this->userAgent = (!empty($combined) ? trim($combined) : null);
    }

    /**
     * Build the user agent regex.
     *
     * @return string The crawler patterns joined into one alternation group (no delimiters).
     */
    public function getRegex()
    {
        return '('.implode('|', self::$crawlers).')';
    }

    /**
     * Build the replacement regex.
     *
     * @return string The ignore patterns joined into one alternation group (no delimiters).
     */
    public function getIgnored()
    {
        return '('.implode('|', self::$ignore).')';
    }

    /**
     * Check user agent string against the regex.
     *
     * @param string|null $userAgent User agent to test; the stored user agent is used when null.
     *
     * @return bool True when the user agent matches a known crawler pattern.
     */
    public function isCrawler($userAgent = null)
    {
        $agent = is_null($userAgent) ? $this->userAgent : $userAgent;

        // Forget the previous call's result so getMatches() never reports a
        // crawler that belongs to an earlier user agent.
        $this->matches = array();

        // Cast to string: the stored user agent may be null, and passing null
        // as a PCRE subject is deprecated as of PHP 8.1. An empty subject
        // simply does not match.
        $agent = (string) preg_replace('/'.$this->getIgnored().'/i', '', (string) $agent);

        return (bool) preg_match('/'.$this->getRegex().'/i', $agent, $this->matches);
    }

    /**
     * Return the matches.
     *
     * @return string|null The text matched by the last isCrawler() call, or null when nothing matched.
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}