1<?php
2
3namespace Jaybizzle\CrawlerDetect;
4
/**
 * Detects bots, crawlers and spiders by matching a user agent string
 * against a list of known crawler patterns.
 *
 * The user agent is either supplied explicitly or assembled from the
 * HTTP_* request headers listed in self::$uaHttpHeaders.
 */
class CrawlerDetect
{
    /**
     * The user agent currently under inspection, or null when none is known.
     *
     * @var string|null
     */
    protected $userAgent = null;

    /**
     * Request headers whose key starts with "HTTP_".
     *
     * @var array
     */
    protected $httpHeaders = array();

    /**
     * Regex matches produced by the most recent isCrawler() call.
     * Empty when that call found no crawler.
     *
     * @var array
     */
    protected $matches = array();

    /**
     * List of strings to remove from the user agent before running the crawler regex
     * Over a large list of user agents, this gives us about a 55% speed increase!
     *
     * Each entry is a regex fragment; entries are joined with "|" by getIgnored().
     *
     * @var array
     */
    protected static $ignore = array(
        'Safari.[\d\.]*',
        'Firefox.[\d\.]*',
        'Chrome.[\d\.]*',
        'Chromium.[\d\.]*',
        // Quantified like its siblings so the whole version number is stripped.
        'MSIE.[\d\.]*',
        'Opera\/[\d\.]*',
        'Mozilla.[\d\.]*',
        'AppleWebKit.[\d\.]*',
        'Trident.[\d\.]*',
        'Windows NT.[\d\.]*',
        'Android.[\d\.]*',
        'Macintosh.',
        'Ubuntu',
        'Linux',
        'Intel',
        'Mac OS X',
        'Gecko.[\d\.]*',
        'KHTML',
        'CriOS.[\d\.]*',
        'CPU iPhone OS ([0-9_])* like Mac OS X',
        'CPU OS ([0-9_])* like Mac OS X',
        'iPod',
        'like Gecko',
        'compatible',
        'x86_..',
        'i686',
        'x64',
        'X11',
        'rv:[\d\.]*',
        'Version.[\d\.]*',
        'WOW64',
        'Win64',
        'Dalvik.[\d\.]*',
        '\.NET CLR [\d\.]*',
        'Presto.[\d\.]*',
        'Media Center PC',
    );

    /**
     * Array of regular expressions to match against the user agent.
     *
     * Each entry is a regex fragment; entries are joined with "|" by getRegex()
     * and evaluated case-insensitively with "/" as the delimiter, so a literal
     * slash must be written as "\\/".
     *
     * @var array
     */
    protected static $crawlers = array(
        '008\\/',
        'A6-Indexer',
        'Aboundex',
        'Accoona-AI-Agent',
        'acoon',
        'AddThis',
        'ADmantX',
        'AHC',
        'Airmail',
        'Anemone',
        'Arachmo',
        'archive-com',
        'B-l-i-t-z-B-O-T',
        'bibnum\.bnf',
        'biglotron',
        'binlar',
        'BingPreview',
        'boitho\.com-dc',
        'BUbiNG',
        'Butterfly\\/',
        'BuzzSumo',
        'CC Metadata Scaper',
        'Cerberian Drtrs',
        'changedetection',
        'Charlotte',
        'clips\.ua\.ac\.be',
        'CloudFlare-AlwaysOnline',
        'coccoc',
        'Commons-HttpClient',
        'convera',
        'cosmos',
        'Covario-IDS',
        'curl',
        'CyberPatrol',
        'DataparkSearch',
        'dataprovider',
        'Digg',
        'DomainAppender',
        'drupact',
        'EARTHCOM',
        'ec2linkfinder',
        'ElectricMonk',
        'Embedly',
        'europarchive\.org',
        'EventMachine HttpClient',
        'ezooms',
        'eZ Publish Link Validator',
        'facebookexternalhit',
        'Feedfetcher-Google',
        'FeedValidator',
        'FindLinks',
        'findlink',
        'findthatfile',
        'Flamingo_SearchEngine',
        'fluffy',
        'getprismatic\.com',
        'g00g1e\.net',
        'GigablastOpenSource',
        'grub-client',
        'Genieo',
        'Go-http-client',
        'Google-HTTP-Java-Client',
        'Google favicon',
        'Google Keyword Suggestion',
        'heritrix',
        'Holmes',
        'htdig',
        'httpunit',
        'httrack',
        'ichiro',
        'igdeSpyder',
        'InAGist',
        'InfoWizards Reciprocal Link System PRO',
        'integromedb',
        'IODC',
        'IOI',
        'ips-agent',
        'iZSearch',
        'L\.webis',
        'Larbin',
        'libwww',
        'Link Valet',
        'linkdex',
        'LinkExaminer',
        'LinkWalker',
        'Lipperhey Link Explorer',
        'Lipperhey SEO Service',
        'LongURL API',
        'ltx71',
        'lwp-trivial',
        'MegaIndex\.ru',
        'mabontland',
        'MagpieRSS',
        'Mediapartners-Google',
        'MetaURI',
        'Mnogosearch',
        'mogimogi',
        'Morning Paper',
        'Mrcgiguy',
        'MVAClient',
        'netresearchserver',
        'NewsGator',
        'newsme',
        'NG-Search',
        '^NING\\/',
        'Notifixious',
        'nutch',
        'NutchCVS',
        'Nymesis',
        'oegp',
        'online link validator',
        'Online Website Link Checker',
        'Orbiter',
        'ow\.ly',
        'Ploetz \+ Zeller',
        'page2rss',
        'panscient',
        'Peew',
        'phpcrawl',
        'Pizilla',
        'Plukkie',
        'Pompos',
        'postano',
        'PostPost',
        'postrank',
        'proximic',
        'PycURL',
        'Python-httplib2',
        'python-requests',
        'Python-urllib',
        'Qseero',
        'Qwantify',
        'Radian6',
        'RebelMouse',
        'REL Link Checker',
        'RetrevoPageAnalyzer',
        'Riddler',
        'Robosourcer',
        'Ruby',
        'SBIder',
        'ScoutJet',
        'ScoutURLMonitor',
        'Scrapy',
        'Scrubby',
        'SearchSight',
        'semanticdiscovery',
        'SEOstats',
        'Seznam screenshot-generator',
        'ShopWiki',
        'SiteBar',
        'siteexplorer\.info',
        'slider\.com',
        'slurp',
        'Snappy',
        'sogou',
        'speedy',
        'Sqworm',
        'StackRambler',
        'Stratagems Kumo',
        'summify',
        'teoma',
        'theoldreader\.com',
        'TinEye',
        // Dot escaped so it only matches a literal ".", like the other domain entries.
        'Traackr\.com',
        'truwoGPS',
        'tweetedtimes\.com',
        'Twikle',
        'UnwindFetchor',
        'updated',
        'urlresolver',
        'Validator\.nu\\/LV',
        'Vagabondo',
        'Vivante Link Checker',
        'Vortex',
        'voyager\\/',
        'VYU2',
        'W3C-checklink',
        'W3C_CSS_Validator_JFouffa',
        'W3C_I18n-Checker',
        'W3C-mobileOK',
        'W3C_Unicorn',
        'W3C_Validator',
        'WebIndex',
        'Websquash\.com',
        'webcollage',
        'webmon ',
        'WeSEE:Search',
        'wf84',
        'wget',
        'WomlpeFactory',
        'wotbox',
        'Xenu Link Sleuth',
        'XML Sitemaps Generator',
        'Y!J-ASR',
        'yacy',
        'Yahoo Link Preview',
        'Yahoo! Slurp China',
        'Yahoo! Slurp',
        'YahooSeeker',
        'YahooSeeker-Testing',
        'YandexImages',
        'YandexMetrika',
        'yandex',
        'yanga',
        'yeti',
        'yoogliFetchAgent',
        'Zao',
        'ZyBorg',
        // Generic catch-all: any token ending in bot/crawler/archiver/transcoder/spider,
        // except "bot" directly preceded by "cu" (negative lookbehind).
        '[a-z0-9\\-_]*((?<!cu)bot|crawler|archiver|transcoder|spider)',
    );

    /**
     * All possible HTTP headers that represent the
     * User-Agent string.
     *
     * @var array
     */
    protected static $uaHttpHeaders = array(
        // The default User-Agent string.
        'HTTP_USER_AGENT',
        // Header can occur on devices using Opera Mini.
        'HTTP_X_OPERAMINI_PHONE_UA',
        // Vodafone specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
        'HTTP_X_DEVICE_USER_AGENT',
        'HTTP_X_ORIGINAL_USER_AGENT',
        'HTTP_X_SKYFIRE_PHONE',
        'HTTP_X_BOLT_PHONE_UA',
        'HTTP_DEVICE_STOCK_UA',
        'HTTP_X_UCBROWSER_DEVICE_UA',
    );

    /**
     * Class constructor.
     *
     * Headers are stored first because setUserAgent() falls back to them
     * when no explicit user agent is given.
     *
     * @param array|null  $headers   Request headers; $_SERVER is used when null or empty.
     * @param string|null $userAgent Explicit user agent; derived from the headers when empty.
     */
    public function __construct(array $headers = null, $userAgent = null)
    {
        $this->setHttpHeaders($headers);
        $this->setUserAgent($userAgent);
    }

    /**
     * Set HTTP headers.
     *
     * Replaces any previously stored headers. Only entries whose key starts
     * with "HTTP_" are kept.
     *
     * @param array|null $httpHeaders Request headers; $_SERVER is used when
     *                                this is not an array or is empty.
     */
    public function setHttpHeaders($httpHeaders = null)
    {
        // use global _SERVER if $httpHeaders aren't defined
        if (!is_array($httpHeaders) || !count($httpHeaders)) {
            $httpHeaders = $_SERVER;
        }
        // clear existing headers
        $this->httpHeaders = array();
        // Only save HTTP headers. In PHP land, that means only _SERVER vars that
        // start with HTTP_.
        foreach ($httpHeaders as $key => $value) {
            if (substr($key, 0, 5) === 'HTTP_') {
                $this->httpHeaders[$key] = $value;
            }
        }
    }

    /**
     * Return user agent headers.
     *
     * @return array Names of the HTTP_* headers that may carry a user agent.
     */
    public function getUaHttpHeaders()
    {
        return self::$uaHttpHeaders;
    }

    /**
     * Set the user agent.
     *
     * A non-empty argument is stored as-is. Otherwise the user agent is built
     * by joining, with single spaces, every non-empty stored header named in
     * getUaHttpHeaders(), in that list's order.
     *
     * @param string|null $userAgent
     *
     * @return string|null The user agent now in effect, or null when none could be determined.
     */
    public function setUserAgent($userAgent = null)
    {
        if (false === empty($userAgent)) {
            return $this->userAgent = $userAgent;
        } else {
            $this->userAgent = null;
            foreach ($this->getUaHttpHeaders() as $altHeader) {
                if (false === empty($this->httpHeaders[$altHeader])) { // @todo: should use getHttpHeader(), but it would be slow.
                    $this->userAgent .= $this->httpHeaders[$altHeader].' ';
                }
            }

            // Drop the trailing separator; collapse "nothing found" back to null.
            return $this->userAgent = (!empty($this->userAgent) ? trim($this->userAgent) : null);
        }
    }

    /**
     * Build the user agent regex.
     *
     * @return string All crawler patterns as one alternation group, without delimiters.
     */
    public function getRegex()
    {
        return '('.implode('|', self::$crawlers).')';
    }

    /**
     * Build the replacement regex.
     *
     * @return string All ignore patterns as one alternation group, without delimiters.
     */
    public function getIgnored()
    {
        return '('.implode('|', self::$ignore).')';
    }

    /**
     * Check user agent string against the regex.
     *
     * The result of each call replaces the previous one: after a call that
     * finds no crawler, getMatches() returns null.
     *
     * @param string|null $userAgent User agent to test; the stored one is used when null.
     *
     * @return bool True when the user agent matches a known crawler pattern.
     */
    public function isCrawler($userAgent = null)
    {
        // Cast so that "no user agent" (null) is handled as an empty string;
        // passing null to preg_replace() is deprecated as of PHP 8.1.
        $agent = (string) (is_null($userAgent) ? $this->userAgent : $userAgent);

        // Forget the previous call's result so a miss never reports a stale hit.
        $this->matches = array();

        // Strip common browser tokens first; this shortens the subject string.
        // preg_replace() yields null on a PCRE error, hence the cast.
        $agent = (string) preg_replace('/'.$this->getIgnored().'/i', '', $agent);

        $result = preg_match('/'.$this->getRegex().'/i', $agent, $matches);

        if ($result === 1) {
            $this->matches = $matches;
        }

        return (bool) $result;
    }

    /**
     * Return the matches.
     *
     * @return string|null The text matched by the most recent isCrawler() call,
     *                     or null when nothing has matched.
     */
    public function getMatches()
    {
        return isset($this->matches[0]) ? $this->matches[0] : null;
    }
}
424