<?php #ensure numeric timeout. convert e.g., '1 week' to #seconds. function expiry_ensurenum($expiry){ if(!$expiry) return 0; if(!is_numeric($expiry)) { $expiry_=$expiry; #backup for error message. #strtotime requires a space betwen the number and the unit. $expiry=preg_replace("/^([0-9]+)([a-zA-Z])/",'$1 $2',$expiry); if(($expiry = strtotime("+$expiry",0))===false){ throw new Exception("Invalid timeout: [ $expiry_ ]"); } } return $expiry; } function expiry_totime($expiry,$expirytime=null){ if(isset($expirytime)) return $expirytime; $expiry=expiry_ensurenum($expiry); if(!$expiry) return 0; return time()-$expiry; } #return status or [status,errormsg] function linkcheck_checkurl_curl($url,$o=[]) { $o=array_merge([ 'verifypeer'=>true, 'verifypeername'=>NULL, #defaults to same as verifypeer. 'cacertfile'=>NULL, 'dbg'=>false, 'nobody'=>true, ],$o); $ch = curl_init($url); if($o['cacertfile']) curl_setopt($ch, CURLOPT_CAINFO, $o['cacertfile']); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $o['verifypeer']?1:0); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, ($o['verifypeername']??$o['verifypeer'])?2:0); curl_setopt($ch,CURLOPT_RETURNTRANSFER,1); curl_setopt($ch,CURLOPT_FOLLOWLOCATION,1); curl_setopt($ch,CURLOPT_MAXREDIRS, 5); curl_setopt($ch,CURLOPT_TIMEOUT,10); curl_setopt($ch,CURLOPT_NOBODY, $o['nobody']?1:0); curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'GET'); #without this kaggle and amazon return 404 (maybe they don't like HEAD method that curl might be setting with NOBODY option. #curl_setopt($ch, CURLOPT_HTTPGET, 1); #this turns off NOBODY option (and curl ends up getting the body. CUSTOMREQUEST is better; it doesn't turn off NOBODY.) $output = curl_exec($ch); if($o['dbg']){ echo "Got output:\n".(php_sapi_name() === 'cli'?$output:htmlspecialchars($output))."\n"; } $curl_errno = curl_errno($ch); if($curl_errno) { if($curl_errno == 28) { $code = 408; } else if($curl_errno == 60) { $code = 495; } else if($curl_errno<100) $code=$curl_errno; else $code=500; $msg=curl_error($ch); $ret=[$code,$msg]; } else{ $ret = curl_getinfo($ch, CURLINFO_HTTP_CODE); } curl_close($ch); return $ret; } function linkcheck_checkurl_stream($url,$o=[]) { $o=array_merge([ 'verifypeer'=>true, 'verifypeername'=>null, #defaults to same as verifypeer. 'cacertfile'=>NULL, ],$o); $ossl=['verify_peer'=>$o['verifypeer'],'verify_peer_name'=> $o['verifypeername']??$o['verifypeer']]; if($o['cacertfile']) $ossl['cafile']=$o['cacertfile']; $context=['ssl'=>$ossl,'http'=>['method'=>'HEAD']]; $s=@file_get_contents($url, false, stream_context_create($context),0,0); #getting a length of zero for efficiency; it results in empty string on successful request. if($s!==false){ return 200; } else return 404; } #returns the http code for a url. function linkcheck_checkurl($url,$o=[]) { $o=array_merge([ 'verifypeer'=>true, 'cacertfile'=>NULL, 'autodownloadcacertfile'=>true, #if cacertfile is specified but doesn't exist (or is out of date), download it. 'cacertfileexpiry'=>'1 month', #The revisions on https://curl.se/docs/caextract.html are 1-2months apart. 'cacertfileexpirytime'=>null, #alternative to cacertfileexpiry time length, provide a timepoint. 'dbg'=>false, ],$o); if(!$o['verifypeer']) $o['cacertfile']=NULL; #if verifypeer is off, we don't need a cacertfile. #autodownload cacertfile. if($o['cacertfile'] && $o['autodownloadcacertfile']&&(!is_file($o['cacertfile'])||filemtime($o['cacertfile'])<expiry_totime($o['cacertfileexpiry'],$o['cacertfileexpirytime']))){ if($o['dbg']) echo "<li> Downloading cacert.pem ...\n"; if(!($s=file_get_contents("https://curl.haxx.se/ca/cacert.pem"))){ throw new Exception("Failed to download cacert.pem."); } if(!is_dir(dirname($o['cacertfile']))) mkdir(dirname($o['cacertfile']),0755); file_put_contents($o['cacertfile'],$s); } if(function_exists('curl_init')) return linkcheck_checkurl_curl($url,$o); else return linkcheck_checkurl_stream($url,$o); } #get db connection. create cache table if it doesn't exist. #codegroup and lastcheckdate are redundant but make it easier to look at the db contents. function linkcheck_db($dbfile){ if(!is_string($dbfile)) return $dbfile; if(!file_exists($dbfile)) { if(!is_dir(dirname($dbfile))) mkdir(dirname($dbfile),0755); $db = new SQLite3($dbfile); $db->exec("CREATE TABLE IF NOT EXISTS linkcheck_cache (url TEXT PRIMARY KEY, codegroup TEXT, code INTEGER, msg TEXT, lastcheck INTEGER, lastcheckdate DATETIME, pages TEXT)"); } else $db = new SQLite3($dbfile); return $db; } function linkcheck_db_query($db,$query){ $rows=[]; $res = $db->query($query); while($row=$res->fetchArray(SQLITE3_ASSOC)){ $rows[]=$row; } return $rows; } #maps to one of valid, invalid, error. function linkcheck_code2group($code){ if($code>=200 && $code<=399) return 'valid'; elseif($code>=400&&$code<=499) return 'invalid'; else return 'error'; } function linkcheck_code2msg($code){ $msgs=[400=>'Bad Request',401=>'Unauthorized',402=>'Payment Required',403=>'Forbidden',404=>'Not Found',405=>'Method Not Allowed',406=>'Not Acceptable',407=>'Proxy Authentication Required (RFC 7235)',408=>'Request Timeout',495=>'SSL Certificate Error']; return $msgs[$code]??''; } function linkcheck_checkurl_withcache($url,$o=[]) { $o=array_merge([ 'dbfile'=>NULL, 'cacheexpiry'=>604800, #when not given, defaults to 604800, #'1 week', 'cacheexpirytime'=>null, #alternative to cacheexpiry time length, provide a timepoint. 'requireexists'=>true, #whether to restrict checks to the urls that have previously been inserted into the database. ],$o); if(!$o['dbfile']) return linkcheck_checkurl($url,$o); $db=linkcheck_db($o['dbfile']); $url_=$db->escapeString($url); $row=$db->querySingle("SELECT code,msg,lastcheck FROM linkcheck_cache WHERE url='$url_'", true); if($o['requireexists']&&!$row) return [500,"url [ $url ] not in database"]; if(!isset($o['cacheexpirytime'])) $o['cacheexpirytime']=expiry_totime($o['cacheexpiry']); if(!$row || $row['lastcheck']<$o['cacheexpirytime']){ if(!$row) $row=['url'=>$url_]; $r=linkcheck_checkurl($url,$o); $row['code']=(is_array($r)?$r[0]:$r)+0; #ensures integer $row['codegroup']=linkcheck_code2group($row['code']); if(is_array($r)){ $row['msg']=$r[1]; $msg_=$db->escapeString($row['msg']); } else{ $msg_=$row['msg']=''; } $row['lastcheck']=time(); $row['lastcheckdate']=date('Y-m-d H:i:s',$row['lastcheck']); if(!$o['requireexists']) $db->exec("INSERT OR IGNORE INTO linkcheck_cache(url) VALUES ('$url_')"); $db->exec("UPDATE linkcheck_cache SET codegroup='$row[codegroup]', code=$row[code],msg='$msg_',lastcheck=$row[lastcheck],lastcheckdate='$row[lastcheckdate]' WHERE url='$url_'"); } return $row['msg'] ? [$row['code'],$row['msg']] : $row['code']; } #just some testing in the command line... /* if(php_sapi_name() === 'cli' && getenv('AHMETLIBPHP')){ require_once getenv('AHMETLIBPHP').'/ahmet.php'; #ve(linkcheck_checkurl_withcache('https://localhost/',['cacertfile'=>'C:/downloads/cacert.pem','verifypeer'=>false,'dbg'=>true,'dbfile'=>'C:/downloads/linkcheck_cache.sqlite','requireexists'=>false])); #oxford doesn't like programmatic access. it returns 403. nothing we can do. #ve(linkcheck_checkurl('https://academic.oup.com/bioinformatics/article/24/24/2872/196843',['dbg'=>1,'nobody'=>0])); } */