<?php

require_once(HTML2PS_DIR.'fetcher._interface.class.php');

define('HTTP_OK',200);

/**
 * Fetcher that retrieves data addressed by http://, https:// and file:// URLs.
 *
 * For HTTP(S) the remote resource is first probed with a HEAD request (falling
 * back to GET when the server answers 405); the status code of that probe
 * decides whether to download the body, follow a redirect or give up. The body
 * itself is downloaded with file_get_contents().
 *
 * @TODO send authorization headers only if they have been required by the server;
 */
class FetcherUrl extends Fetcher {
  // Initialised to an empty array by the constructor; not used by any code in this file
  var $_connections;

  // Components of the URL being fetched (filled in by fetch())
  var $protocol;
  var $host;
  var $port;
  var $path;   // path INCLUDING the "?query" suffix, if any
  var $query;  // always reset to "" by fetch(); kept for backward compatibility

  var $url;

  // Result of the last fetch
  var $headers;
  var $content;
  var $code;

  var $redirects;

  var $error_message;
  var $user_agent;

  // Authorization

  var $user;
  var $pass;

  // True when the URL passed to fetch() contained an explicit port number
  var $_explicit_port;

  // ---------------------------------------------
  // FetcherURL - PUBLIC methods
  // ---------------------------------------------

  // "Fetcher" interface implementation

  /**
   * @return string URL of the last resource passed to fetch() (after redirects this
   *                is the final URL, as fetch() is re-entered for every redirect)
   */
  function get_base_url() {
    return $this->url;
  }

  /**
   * Fetches the resource identified by $data_id.
   *
   * On failure an HTML error fragment rendered from one of the error templates is
   * appended to the error message (see error_message()) and a line is written to
   * the PHP error log.
   *
   * @param string $data_id URL of the resource
   * @return FetchedDataURL|null fetched data, or null if the resource could not be retrieved
   */
  function get_data($data_id) {
    $this->redirects = 0;

    if ($this->fetch($data_id)) {
      if ($this->code != HTTP_OK) {
        $this->_report_error('templates/error._http.tpl',
                             $data_id,
                             "Cannot open $data_id, HTTP result code is: ".$this->code,
                             $this->code);
        return null;
      };

      return new FetchedDataURL($this->content,
                                explode("\r\n",$this->headers),
                                $this->url);
    };

    if ($this->redirects > MAX_REDIRECTS) {
      $this->_report_error('templates/error._redirects.tpl',
                           $data_id,
                           sprintf("Cannot open %s, too many redirects", $data_id));
      return null;
    };

    $this->_report_error('templates/error._connection.tpl',
                         $data_id,
                         sprintf("Cannot open %s", $data_id));
    return null;
  }

  /**
   * @return string accumulated error text (may contain HTML rendered from the error templates)
   */
  function error_message() {
    return $this->error_message;
  }

  // FetcherURL - constructor

  /**
   * PHP 4 style constructor, kept so that existing explicit calls keep working.
   * PHP 8 no longer treats a method named after the class as a constructor, so the
   * real initialisation lives in __construct().
   *
   * NOTE: this method is intentionally declared BEFORE __construct(); some PHP 5
   * releases emit an E_STRICT notice when the declaration order is reversed.
   */
  function FetcherURL() {
    $this->__construct();
  }

  /**
   * Initialises the fetcher with an empty error message, no redirects followed,
   * port 80 and the default user agent string.
   */
  function __construct() {
    $this->_connections = array();

    $this->error_message = "";

    $this->redirects = 0;
    $this->port = 80;
    $this->headers = "";
    $this->_explicit_port = false;

    // Default encoding
    // $this->encoding = "iso-8859-1";

    $this->user_agent = DEFAULT_USER_AGENT;
  }

  // ---------------------------------------------
  // FetcherURL - PRIVATE methods
  // ---------------------------------------------

  /**
   * Renders an error template into the error message and logs a message.
   *
   * The local variables $_server_response, $_url, $_http_error and $data_id are
   * defined under exactly these names because the included template runs in this
   * method's scope (templates are not visible from this file; presumably they read
   * these variables).
   *
   * @access private
   * @param string $template    path of the template to include
   * @param string $data_id     URL which could not be fetched
   * @param string $log_message text to write to the PHP error log
   * @param mixed  $http_error  HTTP status code, when the failure is an HTTP error
   */
  function _report_error($template, $data_id, $log_message, $http_error = null) {
    $_server_response = $this->headers;
    $_url             = htmlspecialchars($data_id);
    if (!is_null($http_error)) {
      $_http_error = $http_error;
    };

    ob_start();
    include($template);
    $this->error_message .= ob_get_contents();
    ob_end_clean();

    error_log($log_message);
  }

  /**
   * Connects to the target host using either HTTP or HTTPS protocol;
   * returns handle to connection socket or 'null' in case connection failed.
   *
   * @access private
   * @final
   * @return resource
   */
  function _connect() {
    // Connect to the target host
    if ($this->protocol == "https") {
      return $this->_connect_ssl();
    };

    $fp = @fsockopen($this->host,$this->port,$errno,$errstr,HTML2PS_CONNECTION_TIMEOUT);

    if (!$fp) {
      $message = sprintf("Cannot connect to %s:%d - (%d) %s",
                         $this->host,
                         $this->port,
                         $errno,
                         $errstr);
      error_log($message);
      $this->error_message = $message;
      return null;
    };

    return $fp;
  }

  /**
   * Opens an SSL connection to the current host/port.
   *
   * @access private
   * @return resource|null socket handle, or null when the openssl extension is
   *                       missing or the connection could not be established
   */
  function _connect_ssl() {
    /**
     * Check if there's SSL support library loaded
     *
     * Note that in certain situations (e.g. Windows + PHP 4.4.0 + Apache 2 on my development box)
     * openssl extension IS present, but fsockopen still complains "No SSL support in this build".
     * (probably PHP bug?)
     */
    if (!extension_loaded('openssl')) {
      $message = sprintf("Cannot connect to %s:%d. SSL Extension missing",
                         $this->host,
                         $this->port);
      error_log($message);
      $this->error_message .= $message;
      return null;
    };

    // Use the same configurable timeout as plain HTTP connections
    $fp = @fsockopen("ssl://".$this->host, $this->port, $errno, $errstr, HTML2PS_CONNECTION_TIMEOUT);

    if (!$fp) {
      $message = sprintf("Cannot connect to %s:%d - (%d) %s<br/>Missing SSL support?",
                         $this->host,
                         $this->port,
                         $errno,
                         $errstr);
      error_log($message);
      $this->error_message = $message;
      return null;
    };

    return $fp;
  }

  /**
   * Extracts the HTTP status code from a raw response.
   *
   * @access private
   * @param string $res raw response text
   * @return string first whitespace-delimited number found in $res, or "200" when
   *                there is none (the response does not always contain a status line)
   */
  function _extract_code($res) {
    if (preg_match('/\s(\d+)\s/',$res,$matches)) {
      return $matches[1];
    };

    return "200";
  }

  /**
   * Returns the current host, followed by ":port" when the port is not the default
   * one for the current protocol (443 for https, 80 otherwise).
   *
   * @access private
   * @return string
   */
  function _host_with_port() {
    $default_port = ($this->protocol == "https") ? 443 : 80;

    if ($this->port == $default_port) {
      return $this->host;
    };

    return $this->host.":".$this->port;
  }

  /**
   * Converts the value of a "Location" header into an absolute URL, resolving it
   * against the URL currently being fetched.
   *
   * @access private
   * @param string $location raw Location value
   * @return string absolute URL
   */
  function _fix_location($location) {
    // Already absolute
    if (substr($location, 0, 7) == "http://") { return $location; };
    if (substr($location, 0, 8) == "https://") { return $location; };

    // Protocol-relative reference: "//host/path"
    if (substr($location, 0, 2) == "//") {
      return $this->protocol.":".$location;
    };

    // Host-relative reference: "/path"
    if (substr($location, 0, 1) == "/") {
      return $this->protocol."://".$this->_host_with_port().$location;
    };

    // Document-relative reference: resolve against the DIRECTORY of the current
    // path; $this->path also carries the file name and the query string, which
    // must not become part of the new URL.
    $base = $this->path;

    $query_pos = strpos($base, '?');
    if ($query_pos !== false) {
      $base = substr($base, 0, $query_pos);
    };

    $slash_pos = strrpos($base, '/');
    if ($slash_pos === false) {
      $base = "/";
    } else {
      $base = substr($base, 0, $slash_pos + 1);
    };

    return $this->protocol."://".$this->_host_with_port().$base.$location;
  }

  /**
   * Fetches the given URL, dispatching on its scheme (http, https or file).
   *
   * Results are stored in $this->content, $this->headers and $this->code.
   *
   * @param string $url URL to fetch
   * @return bool|null true when the fetch produced a result (check $this->code),
   *                   false or null otherwise
   */
  function fetch($url) {
    /**
     * Handle empty $url value; unfortunately, parse_url will treat empty value as valid
     * URL, so fetcher will attempt to fetch something from the localhost instead of
     * passing control to subsequent user-defined fetchers (which probably will know
     * how to handle this).
     */
    if ($url === "") {
      return null;
    }

    $this->url = $url;

    // Do not let headers of a previous fetch leak into this one
    $this->headers = "";

    $parts = @parse_url($this->url);

    /**
     * If a malformed URL has been specified, add a message to the log file and
     * continue processing (as such URLs may be found in otherwise good HTML file -
     * for example, invalid image or CSS reference)
     */
    if ($parts === false) {
      error_log(sprintf("The URL '%s' could not be parsed", $this->url));

      $this->content = '';
      $this->code = HTTP_OK;
      return true;
    };

    /**
     * Take URL components, falling back to default values. The scheme is
     * lower-cased so that later comparisons against "https" also work for
     * URLs written as "HTTPS://...".
     */
    $this->protocol = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
    $this->host     = isset($parts['host'])   ? $parts['host']               : 'localhost';
    $this->user     = isset($parts['user'])   ? $parts['user']               : "";
    $this->pass     = isset($parts['pass'])   ? $parts['pass']               : "";
    $this->port     = isset($parts['port'])   ? $parts['port']               : 80;
    $this->path     = isset($parts['path'])   ? $parts['path']               : "/";
    $this->query    = "";

    // Remembered for fetch_https(), which must not override an explicit port
    $this->_explicit_port = isset($parts['port']);

    if (isset($parts['query'])) {
      $this->path .= '?'.$parts['query'];
    };

    switch ($this->protocol) {
    case 'http':
      return $this->fetch_http();
    case 'https':
      return $this->fetch_https();
    case 'file':
      $this->host = "";
      return $this->fetch_file();
    default:
      $message = sprintf("Unsupported protocol: %s", $this->protocol);
      error_log($message);
      $this->error_message .= $message;
      return null;
    }
  }

  /**
   * Probes the current URL with a HEAD request and acts on the status code.
   *
   * @return bool|null see _process_code(); null when the connection failed
   */
  function fetch_http() {
    $res = $this->_head();

    if (is_null($res)) { return null; };
    $this->code = $this->_extract_code($res);

    return $this->_process_code($res);
  }

  /**
   * Same as fetch_http(), but defaults the port to 443 unless the URL given to
   * fetch() specified a port explicitly.
   *
   * @return bool|null see _process_code(); null when the connection failed
   */
  function fetch_https() {
    /**
     * SSL works via port 443
     */
    if ($this->protocol == "https" && !$this->_explicit_port) {
      $this->port = 443;
    }

    return $this->fetch_http();
  }

  /**
   * Reads a local file addressed by a file:// URL.
   *
   * Access is limited to resolved paths starting with FILE_PROTOCOL_RESTRICT.
   * A denied or unresolvable path yields EMPTY content with code HTTP_OK rather
   * than a failure, so that processing of the referencing document continues.
   *
   * NOTE(review): the restriction is a plain string-prefix comparison; unless
   * FILE_PROTOCOL_RESTRICT ends with a directory separator, sibling directories
   * sharing the prefix would pass as well - confirm how the constant is defined.
   *
   * @return bool always true
   */
  function fetch_file() {
    // Strip the leading "file://"
    $path = substr($this->url, 7);

    // On Windows "file:///C:/dir" leaves "/C:/dir"; drop the leading slash
    if (PHP_OS == "WINNT" && substr($path, 0, 1) == "/") {
      $path = substr($path, 1);
    };

    $normalized_path = realpath(urldecode($path));

    // realpath() returns false when the path cannot be resolved (e.g. missing file)
    if ($normalized_path === false) {
      error_log(sprintf("Access denied to file '%s'", $path));

      $this->content = "";
      $this->code = HTTP_OK;
      return true;
    };

    $normalized_path_part = substr($normalized_path, 0, strlen(FILE_PROTOCOL_RESTRICT));
    if ($normalized_path_part !== FILE_PROTOCOL_RESTRICT) {
      error_log(sprintf("Access denied to file '%s'", $normalized_path));

      $this->content = "";
      $this->code = HTTP_OK;
      return true;
    }

    $this->content = @file_get_contents($normalized_path);
    $this->code = HTTP_OK;

    return true;
  }

  /**
   * Sends a single HTTP request for the current URL and returns the beginning of
   * the response.
   *
   * The response is read with ONE fread() call instead of the usual
   * "while (!feof(...))" loop: that loop hangs indefinitely on servers which do
   * not close the connection on their side (www.searchscout.com, for example).
   *
   * We're saying we're just a browser as some pages don't like non-standard
   * user-agents.
   *
   * @access private
   * @param string $method        HTTP method ("GET" or "HEAD")
   * @param string $extra_headers additional, CRLF-terminated header lines
   * @param int    $max_bytes     upper limit for the single read
   * @return string|null raw response text; null when the connection failed
   */
  function _request($method, $extra_headers, $max_bytes) {
    $socket = $this->_connect();
    if (is_null($socket)) { return null; };

    $header  = $method." ".$this->path." HTTP/1.1\r\n";
    // The Host header must carry the port when it is not the default one
    $header .= "Host: ".$this->_host_with_port()."\r\n";
    $header .= "Accept: */*\r\n";
    $header .= "User-Agent: ".$this->user_agent."\r\n";
    $header .= "Connection: keep-alive\r\n";
    $header .= $extra_headers;
    $header .= "Referer: ".$this->protocol."://".$this->host.$this->path."\r\n";
    $header .= $this->_header_basic_authorization();
    $header .= "\r\n";

    // Send the request
    fputs($socket, $header);

    // Get the response
    $res = fread($socket, $max_bytes);
    if ($res === false) {
      // Keep the best-effort behaviour: an unreadable response is treated as empty
      $res = "";
    };

    // Close connection handle, we do not need it anymore
    fclose($socket);

    return $res;
  }

  /**
   * Issues a GET request for the current URL; reads at most 1 Mb of the response.
   *
   * @access private
   * @return string|null raw response text; null when the connection failed
   */
  function _get() {
    return $this->_request("GET", "", 1024*1024);
  }

  /**
   * Issues a HEAD request for the current URL; reads at most 4096 bytes of the response.
   *
   * @access private
   * @return string|null raw response text; null when the connection failed
   */
  function _head() {
    return $this->_request("HEAD", "Accept: text/html\r\n", 4096);
  }

  /**
   * Stores the header part of a raw response (everything before the first empty
   * line) in $this->headers.
   *
   * @access private
   * @param string $res raw response text
   * @return bool false when $res contains no header/body separator
   */
  function _store_headers($res) {
    if (!preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) {
      return false;
    };

    $this->headers = $matches[1];
    return true;
  }

  /**
   * Downloads the current URL into $this->content after remembering the response
   * headers of an error response.
   *
   * @access private
   * @param string $res raw response text
   * @return bool false when the response could not be split into headers and body
   */
  function _store_error_response($res) {
    if (!$this->_store_headers($res)) {
      error_log("Unrecognized HTTP response");
      return false;
    };

    $this->content = @file_get_contents($this->url);
    return true;
  }

  /**
   * Follows a redirect response by fetching the URL named in its Location header.
   *
   * @access private
   * @param string $res        raw response text
   * @param bool   $log_target whether to log the redirect target
   * @return bool|null result of the nested fetch(); false when the redirect limit
   *                   is exceeded or the response has no Location header
   */
  function _follow_redirect($res, $log_target = false) {
    $this->redirects++;
    if ($this->redirects > MAX_REDIRECTS) { return false; };

    // Anchored at line start so that "Content-Location:" is not mistaken for "Location:"
    if (!preg_match('/^Location:[ \t]*(\S+)/im',$res,$matches)) {
      error_log("Redirect response without Location header");
      return false;
    };

    if ($log_target) {
      error_log('Redirected to:'.$matches[1]);
    };

    return $this->fetch($this->_fix_location($matches[1]));
  }

  /**
   * Acts on the status code stored in $this->code.
   *
   * @access private
   * @param string $res      raw response text of the probe request
   * @param bool   $used_get true when $res was obtained with GET rather than HEAD
   * @return bool|null true when content has been stored; false/null on failure
   */
  function _process_code($res, $used_get = false) {
    switch ($this->code) {
    case '200': // OK
      // Headers are optional here: content is downloaded even if they cannot be parsed
      $this->_store_headers($res);

      /**
       * @todo add error processing here
       *
       * Note: file_get_contents is smart enough to use basic authorization headers provided
       * user name / password are given in the URL.
       */
      $this->content = @file_get_contents($this->url);

      return true;
    case '301': // Moved Permanently
    case '303': // See Other
    case '307': // Temporary Redirect
    case '308': // Permanent Redirect
      return $this->_follow_redirect($res);
    case '302': // Found
      return $this->_follow_redirect($res, true);
    case '400': // Bad request
    case '401': // Unauthorized
    case '402': // Payment required
    case '403': // Forbidden
    case '404': // Not found - but should return some html content - error page
    case '406': // Not acceptable
      return $this->_store_error_response($res);
    case '405': // Method not allowed; some sites (like MSN.COM) do not like "HEAD" HTTP requests
      // Try to get URL information using GET request (if we haven't tried it before)
      if (!$used_get) {
        $res = $this->_get();
        if (is_null($res)) { return null; };
        $this->code = $this->_extract_code($res);
        return $this->_process_code($res, true);
      };

      return $this->_store_error_response($res);
    default:
      error_log("Unrecognized HTTP result code:".$this->code);
      return false;
    };
  }

  /**
   * Builds the HTTP Basic "Authorization" header line for the credentials taken
   * from the URL.
   *
   * @access private
   * @return string CRLF-terminated header line, or an empty string when no user name is set
   */
  function _header_basic_authorization() {
    if (is_null($this->user) || $this->user == "") {
      return "";
    };

    return sprintf("Authorization: Basic %s\r\n", base64_encode($this->user.":".$this->pass));
  }
}
?>