1<?php
#Ensure a numeric timeout: converts e.g. '1 week' to the equivalent number of seconds.
#Normalizes a timeout to a number of seconds.
#  $expiry: a number (or numeric string) of seconds, or a relative-time string
#           understood by strtotime(), e.g. '1 week', '2days', '1 month'.
#  returns: the number of seconds (int or float); 0 for any empty/falsy input.
#  throws:  Exception when a non-numeric value cannot be parsed by strtotime().
function expiry_ensurenum($expiry){
    if(!$expiry) return 0;
    #numeric strings (e.g. '3600') are converted to real numbers, so the caller always gets a number back.
    if(is_numeric($expiry)) return $expiry+0;

    #strtotime requires a space between the number and the unit.
    $spaced=preg_replace("/^([0-9]+)([a-zA-Z])/",'$1 $2',$expiry);
    #evaluating the offset relative to timestamp 0 yields the length of the period in seconds.
    $seconds=strtotime("+$spaced",0);
    if($seconds===false){
        throw new Exception("Invalid timeout: [ $expiry ]");
    }
    return $seconds;
}
#Turns a lifetime into a cutoff timestamp (things last touched before the cutoff are expired).
#  $expiry:     lifetime as accepted by expiry_ensurenum() (seconds or e.g. '1 week').
#  $expirytime: optional explicit cutoff; when not null it is returned as-is and $expiry is ignored.
#  returns:     the explicit cutoff, or (now - lifetime), or 0 when the lifetime is empty/zero.
function expiry_totime($expiry,$expirytime=null){
    if($expirytime!==null){
        return $expirytime;
    }
    $seconds=expiry_ensurenum($expiry);
    return $seconds ? time()-$seconds : 0;
}
21#return status or [status,errormsg]
#Checks a url using the curl extension.
#  $url: the url to request.
#  $o:   options:
#          verifypeer     - verify the server certificate (default true).
#          verifypeername - verify the certificate host name (defaults to same as verifypeer).
#          cacertfile     - path of a CA bundle to use, if any.
#          dbg            - echo the raw curl output (html-escaped unless running in the cli).
#          nobody         - do not download the response body (default true).
#          timeout        - maximum number of seconds for the whole request (default 10).
#          maxredirs      - maximum number of redirects to follow (default 5).
#  returns: the http status code (int) when the transfer succeeded, or [code,errormsg] on a curl error,
#           where code is 408 for curl error 28 (timeout), 495 for curl error 60 (certificate problem),
#           the curl error number itself when it is below 100, and 500 otherwise.
function linkcheck_checkurl_curl($url,$o=[]) {
    $o=array_merge([
        'verifypeer'=>true,
        'verifypeername'=>NULL, #defaults to same as verifypeer.
        'cacertfile'=>NULL,
        'dbg'=>false,
        'nobody'=>true,
        'timeout'=>10,
        'maxredirs'=>5,
    ],$o);

    $ch = curl_init($url);
    #curl_init returns false on failure; report it the same way as any other curl error.
    if($ch===false) return [500,"Failed to initialize curl for url [ $url ]"];

    if($o['cacertfile']) curl_setopt($ch, CURLOPT_CAINFO, $o['cacertfile']);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $o['verifypeer']?1:0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, ($o['verifypeername']??$o['verifypeer'])?2:0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, (int)$o['maxredirs']);
    curl_setopt($ch, CURLOPT_TIMEOUT, (int)$o['timeout']);
    curl_setopt($ch, CURLOPT_NOBODY, $o['nobody']?1:0);
    #without this kaggle and amazon return 404 (maybe they don't like the HEAD method that curl might be setting with the NOBODY option).
    #CURLOPT_HTTPGET is not used because it turns off the NOBODY option (curl ends up getting the body); CUSTOMREQUEST doesn't.
    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'GET');

    $output = curl_exec($ch);
    if($o['dbg']){
        echo "Got output:\n".(php_sapi_name() === 'cli'?$output:htmlspecialchars((string)$output))."\n";
    }

    $curl_errno = curl_errno($ch);
    if($curl_errno) {
        if($curl_errno == 28) $code = 408;       #operation timed out.
        elseif($curl_errno == 60) $code = 495;   #certificate could not be verified.
        elseif($curl_errno<100) $code=$curl_errno;
        else $code=500;
        $ret=[$code,curl_error($ch)];
    }
    else{
        $ret = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    }
    curl_close($ch);
    return $ret;
}
#Checks a url using php's stream wrappers (fallback for when curl is not available).
#  $url: the url to request (a HEAD request is made for http/https urls).
#  $o:   options:
#          verifypeer     - verify the server certificate (default true).
#          verifypeername - verify the certificate host name (defaults to same as verifypeer).
#          cacertfile     - path of a CA bundle to use, if any.
#  returns: the http status code of the final response when one was received;
#           otherwise 200 when the request succeeded and 404 when it failed.
function linkcheck_checkurl_stream($url,$o=[]) {
    $o=array_merge([
        'verifypeer'=>true,
        'verifypeername'=>null, #defaults to same as verifypeer.
        'cacertfile'=>NULL,
    ],$o);

    $ossl=['verify_peer'=>$o['verifypeer'],'verify_peer_name'=> $o['verifypeername']??$o['verifypeer']];
    if($o['cacertfile']) $ossl['cafile']=$o['cacertfile'];
    $context=stream_context_create(['ssl'=>$ossl,'http'=>['method'=>'HEAD']]);

    #php>=8.4 remembers the headers of the last http request globally; clear them so a stale response is never misread.
    if(function_exists('http_clear_last_response_headers')) http_clear_last_response_headers();

    #getting a length of zero for efficiency; it results in empty string on successful request.
    $s=@file_get_contents($url, false, $context,0,0);

    #the http wrapper exposes the response headers even when the request fails with a 4xx/5xx status.
    if(function_exists('http_get_last_response_headers')) $headers=http_get_last_response_headers();
    else $headers=isset($http_response_header)?$http_response_header:null;

    #when redirects were followed there are several status lines; the last one is the final response.
    $status=0;
    foreach((array)$headers as $header){
        if(preg_match('#^HTTP/\S+\s+(\d{3})#',$header,$m)) $status=(int)$m[1];
    }

    if($s!==false) return $status?$status:200;
    #a failure without a usable error status (e.g. connection problem) keeps being reported as 404.
    return $status>=400?$status:404;
}
79
80
81#returns the http code for a url.
#Checks a url, using curl when available and php streams otherwise.
#  $url: the url to check.
#  $o:   options (also passed on to the curl/stream checker):
#          verifypeer             - verify the server certificate (default true).
#          cacertfile             - path of a CA bundle; ignored when verifypeer is off.
#          autodownloadcacertfile - download the bundle when cacertfile is missing or out of date (default true).
#          cacertfileexpiry       - maximum age of the bundle (default '1 month').
#          cacertfileexpirytime   - alternative to cacertfileexpiry: an explicit cutoff timestamp.
#          dbg                    - echo debugging output.
#  returns: the http status code, or [code,errormsg] (see linkcheck_checkurl_curl).
#  throws:  Exception when the CA bundle has to be refreshed but cannot be downloaded or stored.
function linkcheck_checkurl($url,$o=[]) {
    $o=array_merge([
        'verifypeer'=>true,
        'cacertfile'=>NULL,
        'autodownloadcacertfile'=>true, #if cacertfile is specified but doesn't exist (or is out of date), download it.
        'cacertfileexpiry'=>'1 month', #The revisions on https://curl.se/docs/caextract.html are 1-2months apart.
        'cacertfileexpirytime'=>null, #alternative to cacertfileexpiry time length, provide a timepoint.
        'dbg'=>false,
    ],$o);
    if(!$o['verifypeer']) $o['cacertfile']=NULL; #if verifypeer is off, we don't need a cacertfile.

    #autodownload cacertfile.
    if($o['cacertfile'] && $o['autodownloadcacertfile']){
        $cafile=$o['cacertfile'];
        $outdated=!is_file($cafile)||filemtime($cafile)<expiry_totime($o['cacertfileexpiry'],$o['cacertfileexpirytime']);
        if($outdated){
            if($o['dbg']) echo "<li> Downloading cacert.pem ...\n";
            if(!($s=file_get_contents("https://curl.haxx.se/ca/cacert.pem"))){
                throw new Exception("Failed to download cacert.pem.");
            }
            #create missing parent directories too; the second is_dir() covers another process creating it first.
            $dir=dirname($cafile);
            if(!is_dir($dir) && !mkdir($dir,0755,true) && !is_dir($dir)){
                throw new Exception("Failed to create directory [ $dir ] for cacert.pem.");
            }
            #without this check a failed write would trigger a new download on every call.
            if(file_put_contents($cafile,$s)===false){
                throw new Exception("Failed to write cacert file [ $cafile ].");
            }
        }
    }

    if(function_exists('curl_init')) return linkcheck_checkurl_curl($url,$o);
    else return linkcheck_checkurl_stream($url,$o);
}
105
106#get db connection. create cache table if it doesn't exist.
107#codegroup and lastcheckdate are redundant but make it easier to look at the db contents.
#Opens the cache database and makes sure the linkcheck_cache table exists.
#  $dbfile: path of the sqlite file; any non-string value (e.g. an already opened connection) is returned unchanged.
#  returns: an SQLite3 connection (or the non-string value that was passed in).
function linkcheck_db($dbfile){
    if(!is_string($dbfile)) return $dbfile;

    #create missing parent directories too, not only the last path component.
    $dir=dirname($dbfile);
    if(!is_dir($dir)) mkdir($dir,0755,true);

    $db = new SQLite3($dbfile);
    #always run the (idempotent) create statement: an existing but empty db file would otherwise have no cache table.
    $db->exec("CREATE TABLE IF NOT EXISTS linkcheck_cache (url TEXT PRIMARY KEY, codegroup TEXT, code INTEGER, msg TEXT, lastcheck INTEGER, lastcheckdate DATETIME, pages TEXT)");
    return $db;
}
#Runs a query and returns all of its rows.
#  $db:    database connection providing query() and lastErrorMsg() (as SQLite3 does).
#  $query: the sql text to run.
#  returns: a list of rows, each an associative array keyed by column name (empty list when there are no rows).
#  throws:  Exception when the query fails.
function linkcheck_db_query($db,$query){
    $res = $db->query($query);
    #query() returns false on failure; fail with the database's message instead of a fatal "member function on bool".
    if($res===false){
        throw new Exception("Query failed: ".$db->lastErrorMsg()." [ $query ]");
    }
    $rows=[];
    while($row=$res->fetchArray(SQLITE3_ASSOC)){
        $rows[]=$row;
    }
    $res->finalize(); #release the result set.
    return $rows;
}
126
127#maps to one of valid, invalid, error.
#Classifies a status code into one of three groups:
#  'valid'   for codes 200..399,
#  'invalid' for codes 400..499,
#  'error'   for everything else.
function linkcheck_code2group($code){
    $group='error';
    if($code>=200 && $code<=399){
        $group='valid';
    }
    elseif($code>=400 && $code<=499){
        $group='invalid';
    }
    return $group;
}
#Looks up a short description for a status code.
#  returns: the description, or an empty string for codes that are not in the table.
function linkcheck_code2msg($code){
    $msgs=[
        400=>'Bad Request',
        401=>'Unauthorized',
        402=>'Payment Required',
        403=>'Forbidden',
        404=>'Not Found',
        405=>'Method Not Allowed',
        406=>'Not Acceptable',
        407=>'Proxy Authentication Required (RFC 7235)',
        408=>'Request Timeout',
        495=>'SSL Certificate Error',
    ];
    $msg=$msgs[$code]??'';
    return $msg;
}
137
#Checks a url, reusing a cached result from the linkcheck_cache table while it is still fresh.
#  $url: the url to check.
#  $o:   options (also passed on to linkcheck_checkurl):
#          dbfile          - sqlite file (or open connection) holding the cache; without it the url is checked directly.
#          cacheexpiry     - how long a cached result stays valid (default 604800 seconds = 1 week).
#          cacheexpirytime - alternative to cacheexpiry: results checked before this timestamp are refreshed.
#          requireexists   - only check urls that already have a row in the cache table (default true).
#  returns: the status code, or [code,msg] when there is a message;
#           [500,msg] when requireexists is on and the url has no row in the table.
function linkcheck_checkurl_withcache($url,$o=[]) {
    $o=array_merge([
        'dbfile'=>NULL,
        'cacheexpiry'=>604800, #when not given, defaults to 604800, #'1 week',
        'cacheexpirytime'=>null, #alternative to cacheexpiry time length, provide a timepoint.
        'requireexists'=>true, #whether to restrict checks to the urls that have previously been inserted into the database.
    ],$o);
    if(!$o['dbfile']) return linkcheck_checkurl($url,$o);

    $db=linkcheck_db($o['dbfile']);

    #look up the cached result. values are bound as parameters rather than spliced into the sql text.
    #a failing lookup is treated like a missing row, so cache problems do not prevent the check itself.
    $row=false;
    $select=$db->prepare("SELECT code,msg,lastcheck FROM linkcheck_cache WHERE url=:url");
    if($select){
        $select->bindValue(':url',$url,SQLITE3_TEXT);
        $res=$select->execute();
        if($res) $row=$res->fetchArray(SQLITE3_ASSOC);
        $select->close();
    }

    if($o['requireexists']&&!$row) return [500,"url [ $url ] not in database"];
    if(!isset($o['cacheexpirytime'])) $o['cacheexpirytime']=expiry_totime($o['cacheexpiry']);

    #a row that was never checked (lastcheck is NULL) has no result to reuse, whatever the expiry cutoff is.
    $needscheck=!$row || $row['lastcheck']===null || $row['lastcheck']<$o['cacheexpirytime'];
    if($needscheck){
        $r=linkcheck_checkurl($url,$o);
        $code=(int)(is_array($r)?$r[0]:$r);
        $msg=is_array($r)?$r[1]:'';
        $lastcheck=time();

        if(!$o['requireexists']){
            $insert=$db->prepare("INSERT OR IGNORE INTO linkcheck_cache(url) VALUES (:url)");
            if($insert){
                $insert->bindValue(':url',$url,SQLITE3_TEXT);
                $insert->execute();
                $insert->close();
            }
        }
        #codegroup and lastcheckdate are redundant but make it easier to look at the db contents.
        $update=$db->prepare("UPDATE linkcheck_cache SET codegroup=:codegroup, code=:code, msg=:msg, lastcheck=:lastcheck, lastcheckdate=:lastcheckdate WHERE url=:url");
        if($update){
            $update->bindValue(':codegroup',linkcheck_code2group($code),SQLITE3_TEXT);
            $update->bindValue(':code',$code,SQLITE3_INTEGER);
            $update->bindValue(':msg',$msg,SQLITE3_TEXT);
            $update->bindValue(':lastcheck',$lastcheck,SQLITE3_INTEGER);
            $update->bindValue(':lastcheckdate',date('Y-m-d H:i:s',$lastcheck),SQLITE3_TEXT);
            $update->bindValue(':url',$url,SQLITE3_TEXT);
            $update->execute();
            $update->close();
        }
        $row=['code'=>$code,'msg'=>$msg,'lastcheck'=>$lastcheck];
    }
    return $row['msg'] ? [$row['code'],$row['msg']] : $row['code'];
}
171
172
173
174#just some testing in the command line...
175/*
176if(php_sapi_name() === 'cli' && getenv('AHMETLIBPHP')){
177    require_once getenv('AHMETLIBPHP').'/ahmet.php';
178    #ve(linkcheck_checkurl_withcache('https://localhost/',['cacertfile'=>'C:/downloads/cacert.pem','verifypeer'=>false,'dbg'=>true,'dbfile'=>'C:/downloads/linkcheck_cache.sqlite','requireexists'=>false]));
179    #oxford doesn't like programmatic access. it returns 403. nothing we can do.
180    #ve(linkcheck_checkurl('https://academic.oup.com/bioinformatics/article/24/24/2872/196843',['dbg'=>1,'nobody'=>0]));
181}
182*/
183