<?php
#Ensure a numeric timeout in seconds; converts e.g. '1 week' to seconds.
#  $expiry : a number of seconds (int, float or numeric string), or a duration
#            string that strtotime() understands once prefixed with '+'
#            ('1 week', '2days', ...). Falsy values (0, '', null) yield 0.
#  returns : the duration as a number (numeric strings are normalized to int/float).
#  throws  : Exception when strtotime() cannot parse the duration.
function expiry_ensurenum($expiry){
    if(!$expiry) return 0;
    #normalize numeric strings ('3600') to real numbers so callers always get a number.
    if(is_numeric($expiry)) return $expiry+0;

    #strtotime requires a space between the number and the unit.
    $spaced=preg_replace("/^([0-9]+)([a-zA-Z])/",'$1 $2',$expiry);
    #the offset is resolved relative to timestamp 0, so the result is the length in seconds.
    $seconds=strtotime("+$spaced",0);
    if($seconds===false){
        throw new Exception("Invalid timeout: [ $expiry ]");
    }
    return $seconds;
}

#Turn an expiry length into the timepoint before which things count as expired.
#  $expiry     : expiry length, anything expiry_ensurenum() accepts.
#  $expirytime : when set (non-null), it is returned unchanged and $expiry is ignored.
#  returns     : time() minus the expiry length, or 0 when the length is falsy.
function expiry_totime($expiry,$expirytime=null){
    if(isset($expirytime)) return $expirytime;
    $seconds=expiry_ensurenum($expiry);
    return $seconds ? time()-$seconds : 0;
}

#Check a url with curl.
#  options: verifypeer, verifypeername (defaults to same as verifypeer), cacertfile,
#           dbg (echo the raw curl output), nobody (ask curl not to fetch the body).
#  returns: status or [status,errormsg]. On a curl error the status is
#           408 for curl errno 28, 495 for errno 60, the errno itself when it is
#           below 100, and 500 otherwise.
function linkcheck_checkurl_curl($url,$o=[]) {
    $o=array_merge([
        'verifypeer'=>true,
        'verifypeername'=>NULL, #defaults to same as verifypeer.
        'cacertfile'=>NULL,
        'dbg'=>false,
        'nobody'=>true,
    ],$o);

    $ch=curl_init($url);
    #curl_init() returns false on failure; bail out instead of passing false to curl_setopt().
    if($ch===false) return [500,'curl_init failed'];

    $verifyname=$o['verifypeername']??$o['verifypeer'];
    if($o['cacertfile']) curl_setopt($ch,CURLOPT_CAINFO,$o['cacertfile']);
    curl_setopt($ch,CURLOPT_SSL_VERIFYPEER,$o['verifypeer']?1:0);
    curl_setopt($ch,CURLOPT_SSL_VERIFYHOST,$verifyname?2:0);
    curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
    curl_setopt($ch,CURLOPT_FOLLOWLOCATION,1);
    curl_setopt($ch,CURLOPT_MAXREDIRS,5);
    curl_setopt($ch,CURLOPT_TIMEOUT,10);
    curl_setopt($ch,CURLOPT_NOBODY,$o['nobody']?1:0);
    #without this kaggle and amazon return 404 (maybe they don't like the HEAD method that curl might be setting with the NOBODY option).
    #CURLOPT_HTTPGET would turn the NOBODY option off (curl ends up getting the body); CUSTOMREQUEST is better, it doesn't turn off NOBODY.
    curl_setopt($ch,CURLOPT_CUSTOMREQUEST,'GET');

    $output=curl_exec($ch);
    if($o['dbg']){
        echo "Got output:\n".(php_sapi_name() === 'cli'?$output:htmlspecialchars($output))."\n";
    }

    $errno=curl_errno($ch);
    if(!$errno){
        $ret=curl_getinfo($ch,CURLINFO_HTTP_CODE);
    }
    else{
        if($errno==28) $code=408;       #reported as 'Request Timeout'.
        elseif($errno==60) $code=495;   #reported as 'SSL Certificate Error'.
        elseif($errno<100) $code=$errno; #other curl errors are passed through as-is.
        else $code=500;
        $ret=[$code,curl_error($ch)];
    }
    curl_close($ch);
    return $ret;
}

#Check a url with PHP's stream wrappers (used when curl is not available).
#  options: verifypeer, verifypeername (defaults to same as verifypeer), cacertfile.
#  returns: the status code of the last HTTP status line received for a HEAD request
#           (redirect responses come first in the header list), or 404 when no
#           response could be obtained at all.
function linkcheck_checkurl_stream($url,$o=[]) {
    $o=array_merge([
        'verifypeer'=>true,
        'verifypeername'=>null, #defaults to same as verifypeer.
        'cacertfile'=>NULL,
    ],$o);

    $ssl=[
        'verify_peer'=>$o['verifypeer'],
        'verify_peer_name'=>$o['verifypeername']??$o['verifypeer'],
    ];
    if($o['cacertfile']) $ssl['cafile']=$o['cacertfile'];
    $context=stream_context_create(['ssl'=>$ssl,'http'=>['method'=>'HEAD']]);

    #get_headers() hands back the response headers even for 4xx/5xx answers, so the real status can be reported.
    $headers=@get_headers($url,false,$context);
    if($headers===false) return 404;

    $code=404;
    foreach($headers as $header){
        if(preg_match('#^HTTP/\S+\s+(\d{3})#',$header,$m)) $code=(int)$m[1];
    }
    return $code;
}


#Returns the http code for a url (or [code,errormsg], see linkcheck_checkurl_curl).
#Uses curl when available and the stream wrappers otherwise.
#  throws: Exception when the CA bundle needs to be downloaded and the download fails.
function linkcheck_checkurl($url,$o=[]) {
    $o=array_merge([
        'verifypeer'=>true,
        'cacertfile'=>NULL,
        'autodownloadcacertfile'=>true, #if cacertfile is specified but doesn't exist (or is out of date), download it.
        'cacertfileexpiry'=>'1 month', #The revisions on https://curl.se/docs/caextract.html are 1-2 months apart.
        'cacertfileexpirytime'=>null, #alternative to cacertfileexpiry time length, provide a timepoint.
        'dbg'=>false,
    ],$o);
    if(!$o['verifypeer']) $o['cacertfile']=NULL; #if verifypeer is off, we don't need a cacertfile.

    #autodownload cacertfile.
    $cafile=$o['cacertfile'];
    if($cafile && $o['autodownloadcacertfile']){
        $stale=!is_file($cafile)
            || filemtime($cafile)<expiry_totime($o['cacertfileexpiry'],$o['cacertfileexpirytime']);
        if($stale){
            if($o['dbg']) echo "<li> Downloading cacert.pem ...\n";
            if(!($pem=file_get_contents("https://curl.haxx.se/ca/cacert.pem"))){
                throw new Exception("Failed to download cacert.pem.");
            }
            $dir=dirname($cafile);
            if(!is_dir($dir)) mkdir($dir,0755,true); #recursive, so nested target directories work too.
            file_put_contents($cafile,$pem);
        }
    }

    if(function_exists('curl_init')) return linkcheck_checkurl_curl($url,$o);
    return linkcheck_checkurl_stream($url,$o);
}

#get db connection. create cache table if it doesn't exist.
#codegroup and lastcheckdate are redundant but make it easier to look at the db contents.
#  $dbfile: path of the sqlite file; anything that is not a string is returned
#           unchanged (so an already opened connection can be passed through).
function linkcheck_db($dbfile){
    if(!is_string($dbfile)) return $dbfile;

    #a missing file and an empty (zero byte) file both lack the cache table.
    $isnew=!file_exists($dbfile) || filesize($dbfile)===0;
    if($isnew){
        $dir=dirname($dbfile);
        if(!is_dir($dir)) mkdir($dir,0755,true);
    }
    $db=new SQLite3($dbfile);
    if($isnew){
        $db->exec("CREATE TABLE IF NOT EXISTS linkcheck_cache (url TEXT PRIMARY KEY, codegroup TEXT, code INTEGER, msg TEXT, lastcheck INTEGER, lastcheckdate DATETIME, pages TEXT)");
    }
    return $db;
}

#Run a query and return all result rows as a list of associative arrays.
#  throws: Exception when the query fails.
function linkcheck_db_query($db,$query){
    $res=$db->query($query);
    if($res===false){
        throw new Exception("Query failed: ".$db->lastErrorMsg());
    }
    $rows=[];
    while($row=$res->fetchArray(SQLITE3_ASSOC)){
        $rows[]=$row;
    }
    return $rows;
}

#maps to one of valid, invalid, error.
function linkcheck_code2group($code){
    #200-399 => 'valid', 400-499 => 'invalid', everything else => 'error'.
    if($code>=200 && $code<=399) return 'valid';
    if($code>=400 && $code<=499) return 'invalid';
    return 'error';
}

#Short description for the status codes listed below; '' for any other code.
function linkcheck_code2msg($code){
    $msgs=[
        400=>'Bad Request',
        401=>'Unauthorized',
        402=>'Payment Required',
        403=>'Forbidden',
        404=>'Not Found',
        405=>'Method Not Allowed',
        406=>'Not Acceptable',
        407=>'Proxy Authentication Required (RFC 7235)',
        408=>'Request Timeout',
        495=>'SSL Certificate Error',
    ];
    return $msgs[$code]??'';
}

#Check a url, caching the outcome in the linkcheck_cache table.
#  options (all other options are passed on to linkcheck_checkurl):
#    dbfile          : sqlite file or connection (see linkcheck_db); when empty, no cache is used.
#    cacheexpiry     : how long a cached result stays fresh (seconds or e.g. '1 week').
#    cacheexpirytime : alternative to cacheexpiry, a timepoint; results older than it are rechecked.
#    requireexists   : only check urls that already have a row in the table.
#  returns: status or [status,msg] (the array form is used when a message is available).
function linkcheck_checkurl_withcache($url,$o=[]) {
    $o=array_merge([
        'dbfile'=>NULL,
        'cacheexpiry'=>604800, #when not given, defaults to 604800, #'1 week',
        'cacheexpirytime'=>null, #alternative to cacheexpiry time length, provide a timepoint.
        'requireexists'=>true, #whether to restrict checks to the urls that have previously been inserted into the database.
    ],$o);
    if(!$o['dbfile']) return linkcheck_checkurl($url,$o);

    $db=linkcheck_db($o['dbfile']);
    $url_=$db->escapeString($url);
    $row=$db->querySingle("SELECT code,msg,lastcheck FROM linkcheck_cache WHERE url='$url_'", true);

    if($o['requireexists'] && !$row) return [500,"url [ $url ] not in database"];
    if(!isset($o['cacheexpirytime'])) $o['cacheexpirytime']=expiry_totime($o['cacheexpiry']);

    #a row that has never been checked (lastcheck is NULL) must be checked no matter
    #what the expiry is; comparing NULL against an expiry time of 0 would skip it.
    $needscheck=!$row
        || !isset($row['lastcheck'])
        || $row['lastcheck']<$o['cacheexpirytime'];

    if($needscheck){
        if(!$row) $row=['url'=>$url];
        $result=linkcheck_checkurl($url,$o);
        $haserror=is_array($result);

        $row['code']=($haserror?$result[0]:$result)+0; #ensures integer
        $row['codegroup']=linkcheck_code2group($row['code']);
        $row['msg']=$haserror?$result[1]:'';
        $msg_=$db->escapeString($row['msg']);
        $row['lastcheck']=time();
        $row['lastcheckdate']=date('Y-m-d H:i:s',$row['lastcheck']);

        #with requireexists off the row may not be there yet; create it so the UPDATE below has a target.
        if(!$o['requireexists']){
            $db->exec("INSERT OR IGNORE INTO linkcheck_cache(url) VALUES ('$url_')");
        }
        $db->exec("UPDATE linkcheck_cache SET codegroup='{$row['codegroup']}', code={$row['code']},msg='$msg_',lastcheck={$row['lastcheck']},lastcheckdate='{$row['lastcheckdate']}' WHERE url='$url_'");
    }
    return $row['msg'] ? [$row['code'],$row['msg']] : $row['code'];
}



#just some testing in the command line...
/*
if(php_sapi_name() === 'cli' && getenv('AHMETLIBPHP')){
    require_once getenv('AHMETLIBPHP').'/ahmet.php';
    #ve(linkcheck_checkurl_withcache('https://localhost/',['cacertfile'=>'C:/downloads/cacert.pem','verifypeer'=>false,'dbg'=>true,'dbfile'=>'C:/downloads/linkcheck_cache.sqlite','requireexists'=>false]));
    #oxford doesn't like programmatic access. it returns 403. nothing we can do.
    #ve(linkcheck_checkurl('https://academic.oup.com/bioinformatics/article/24/24/2872/196843',['dbg'=>1,'nobody'=>0]));
}
*/