1<?php
2
3require_once(HTML2PS_DIR.'fetcher._interface.class.php');
4
// HTTP status code of a successful response; FetcherUrl compares the fetched
// result code against it. Guarded so that loading this file when the constant
// is already defined does not raise an "already defined" diagnostic.
if (!defined('HTTP_OK')) {
  define('HTTP_OK', 200);
}
6
/**
 * Fetcher implementation which retrieves data by URL; the 'http', 'https' and
 * 'file' protocols are supported.
 *
 * For HTTP(S) URLs a HEAD request (or a GET request, if the server answers HEAD
 * with the 405 code) is sent over a raw socket in order to get the result code
 * and the response headers and to follow redirects; the document body itself is
 * then read using file_get_contents.
 *
 * @TODO send authorization headers only if they have been required by the server;
 */
class FetcherUrl extends Fetcher {
  var $_connections;

  // Components of the URL being fetched (filled by the 'fetch' method).
  // Note that $path also includes the query string, if any.
  var $protocol;
  var $host;
  var $port;
  var $path;
  var $query;

  var $url;

  // Result of the last request: raw response headers, body and result code
  var $headers;
  var $content;
  var $code;

  // Number of redirects followed while processing the current 'get_data' call
  var $redirects;

  // Accumulated error messages (plain text or HTML rendered from the error templates)
  var $error_message;

  // Value of the "User-Agent" request header
  var $user_agent;

  // Authorization

  var $user;
  var $pass;

  // ---------------------------------------------
  // FetcherURL - PUBLIC methods
  // ---------------------------------------------

  // "Fetcher" interface implementation

  /**
   * @return string URL passed to the most recent 'fetch' call; if redirects
   * have been followed, this is the URL of the last redirect target
   */
  function get_base_url() {
    return $this->url;
  }

  /**
   * Fetches the data referred by the given URL.
   *
   * On failure an error page is rendered from one of the 'templates/error._*.tpl'
   * templates and appended to the error message buffer (see 'error_message');
   * a short message is also written to the error log.
   *
   * @param string $data_id URL to be fetched
   * @return FetchedDataURL fetched data object or 'null' if URL could not be fetched
   */
  function get_data($data_id) {
    $this->redirects = 0;

    if ($this->fetch($data_id)) {
      if ($this->code != HTTP_OK) {
        // Local variables below are visible to the included template, as
        // 'include' shares the variable scope of the calling line
        // (presumably the template renders them - verify against the template)
        $_server_response = $this->headers;
        $_http_error = $this->code;
        $_url = htmlspecialchars($data_id);

        ob_start();
        include('templates/error._http.tpl');
        $this->error_message .= ob_get_contents();
        ob_end_clean();

        error_log("Cannot open $data_id, HTTP result code is: ".$this->code);

        return null;
      }

      return new FetchedDataURL($this->content,
                                explode("\r\n",$this->headers),
                                $this->url);
    } elseif ($this->redirects > MAX_REDIRECTS) {
      $_server_response = $this->headers;
      $_url = htmlspecialchars($data_id);

      ob_start();
      include('templates/error._redirects.tpl');
      $this->error_message .= ob_get_contents();
      ob_end_clean();

      error_log(sprintf("Cannot open %s, too many redirects",
                        $data_id));

      return null;
    } else {
      $_server_response = $this->headers;
      $_url = htmlspecialchars($data_id);

      ob_start();
      include('templates/error._connection.tpl');
      $this->error_message .= ob_get_contents();
      ob_end_clean();

      error_log(sprintf("Cannot open %s",
                        $data_id));

      return null;
    }
  }

  /**
   * @return string error messages accumulated by this fetcher
   */
  function error_message() {
    return $this->error_message;
  }

  // FetcherURL - constructor

  /**
   * Initializes the fetcher state.
   *
   * Note: PHP 8 does not treat a method named after the class as a constructor
   * anymore, so initialization lives here; the old-style constructor below is
   * kept for older PHP versions and for code calling it explicitly.
   */
  function __construct() {
    $this->_connections = array();

    $this->error_message = "";

    $this->redirects = 0;
    $this->port = 80;

    // Default encoding
    //    $this->encoding = "iso-8859-1";

    $this->user_agent = DEFAULT_USER_AGENT;
  }

  /**
   * Old-style (PHP 4) constructor; delegates to '__construct'.
   */
  function FetcherURL() {
    $this->__construct();
  }

  // ---------------------------------------------
  // FetcherURL - PRIVATE methods
  // ---------------------------------------------

  /**
   * Connects to the target host using either HTTP or HTTPS protocol;
   * returns handle to connection socket or 'null' in case connection failed.
   *
   * @access private
   * @final
   * @return resource
   */
  function _connect() {
    // Connect to the target host
    if ($this->protocol == "https") {
      return $this->_connect_ssl();
    }

    $fp = @fsockopen($this->host,$this->port,$errno,$errstr,HTML2PS_CONNECTION_TIMEOUT);

    if (!$fp) {
      $message = sprintf("Cannot connect to %s:%d - (%d) %s",
                         $this->host,
                         $this->port,
                         $errno,
                         $errstr);
      error_log($message);
      $this->error_message = $message;
      return null;
    }

    return $fp;
  }

  /**
   * Connects to the target host using SSL transport; returns handle to
   * connection socket or 'null' in case connection failed or no SSL
   * support is available.
   *
   * @access private
   * @return resource
   */
  function _connect_ssl() {
    /**
     * Check if there's SSL support library loaded
     *
     * Note that in certain situations (e.g. Windows + PHP 4.4.0 + Apache 2 on my development box)
     * openssl extension IS present, but fsockopen still complains "No SSL support in this build".
     * (probably PHP bug?)
     */
    if (!extension_loaded('openssl')) {
      $message = sprintf("Cannot connect to %s:%d. SSL Extension missing",
                         $this->host,
                         $this->port);
      error_log($message);
      $this->error_message .= $message;
      return null;
    }

    // Use the same configurable timeout as plain HTTP connections do
    $fp = @fsockopen("ssl://$this->host", $this->port, $errno, $errstr, HTML2PS_CONNECTION_TIMEOUT);

    if (!$fp) {
      $message = sprintf("Cannot connect to %s:%d - (%d) %s<br/>Missing SSL support?",
                         $this->host,
                         $this->port,
                         $errno,
                         $errstr);
      error_log($message);
      $this->error_message = $message;
      return null;
    }

    return $fp;
  }

  /**
   * Extracts the result code from the raw server response.
   *
   * @access private
   * @param string $res raw server response
   * @return string first whitespace-delimited number found in the response,
   * or "200" if the response contains no such number
   */
  function _extract_code($res) {
    // Check return code
    // Note the return code will always be contained in the response, so
    // the we may not check the result of 'preg_match' - it matches always.
    //
    // A month later: nope, not always.
    //
    if (preg_match('/\s(\d+)\s/',$res,$matches)) {
      return $matches[1];
    }

    return "200";
  }

  /**
   * Returns the "host[:port]" part of the current URL; the port is included
   * only if it differs from the default port of the current protocol
   * (443 for https, 80 otherwise).
   *
   * @access private
   * @return string
   */
  function _url_authority() {
    $default_port = ($this->protocol == "https") ? 443 : 80;

    if (!$this->port || $this->port == $default_port) {
      return $this->host;
    }

    return $this->host.":".$this->port;
  }

  /**
   * Converts the value of the "Location" response header to an absolute URL,
   * using the URL being fetched as the base.
   *
   * @access private
   * @param string $location value of the "Location" header
   * @return string absolute URL
   */
  function _fix_location($location) {
    // Absolute URL: use as is
    if (preg_match('/^https?:\/\//i', $location)) { return $location; }

    // Protocol-relative URL ("//host/path"): only the protocol is missing
    if (substr($location, 0, 2) == "//") {
      return $this->protocol.":".$location;
    }

    $base = $this->protocol."://".$this->_url_authority();

    // Host-relative URL
    if (substr($location, 0, 1) == "/") {
      return $base.$location;
    }

    // Path-relative URL: resolve it against the directory of the current
    // path; $this->path may contain both file name and query string, so
    // these should be removed first
    $path = $this->path;

    $query_pos = strpos($path, '?');
    if ($query_pos !== false) {
      $path = substr($path, 0, $query_pos);
    }

    $slash_pos = strrpos($path, '/');
    $directory = ($slash_pos === false) ? "/" : substr($path, 0, $slash_pos + 1);

    return $base.$directory.$location;
  }

  /**
   * Parses the URL and fetches its content using the protocol-specific method;
   * result is stored in the 'content', 'headers' and 'code' properties.
   *
   * @param string $url URL to be fetched
   * @return mixed 'true' if fetching is complete (note that result code still
   * may differ from HTTP_OK), 'false' or 'null' otherwise
   */
  function fetch($url) {
    /**
     * Handle empty $url value; unfortunately, parse_url will treat empty value as valid
     * URL, so fetcher will attempt to fetch something from the localhost instead of
     * passing control to subsequent user-defined fetchers (which probably will know
     * how to handle this).
     */
    if ($url === "") {
      return null;
    }

    $this->url = $url;

    // Do not let headers of some previous request leak into the result of
    // this one (a 'file' fetch or a failed connection never sets them)
    $this->headers = "";

    $parts = @parse_url($this->url);

    /**
     * If a malformed URL has been specified, add a message to the log file and
     * continue processing (as such URLs may be found in otherwise good HTML file -
     * for example, invalid image or CSS reference)
     */
    if ($parts == false) {
      error_log(sprintf("The URL '%s' could not be parsed", $this->url));

      $this->content = '';
      $this->code = HTTP_OK;
      return true;
    }

    /**
     * Setup default values
     */
    $this->protocol = 'http';
    $this->host = 'localhost';
    $this->user = "";
    $this->pass = "";
    $this->port = 80;
    $this->path = "/";
    $this->query = "";

    // Protocol name is case-insensitive; store it in lower case, as
    // the rest of the class compares it with lower-case literals
    if (isset($parts['scheme'])) { $this->protocol = strtolower($parts['scheme']); }
    if (isset($parts['host']))   { $this->host     = $parts['host'];               }
    if (isset($parts['user']))   { $this->user     = $parts['user'];               }
    if (isset($parts['pass']))   { $this->pass     = $parts['pass'];               }
    if (isset($parts['port']))   { $this->port     = $parts['port'];               }
    if (isset($parts['path']))   { $this->path     = $parts['path'];               }
    if (isset($parts['query']))  { $this->path    .= '?'.$parts['query'];          }

    switch ($this->protocol) {
    case 'http':
      return $this->fetch_http();
    case 'https':
      return $this->fetch_https();
    case 'file':
      $this->host = "";
      return $this->fetch_file();
    default:
      // The 'scheme' part is always present here, as the default protocol is supported
      $message = sprintf("Unsupported protocol: %s", $parts['scheme']);
      error_log($message);
      $this->error_message .= $message;
      return null;
    }
  }

  /**
   * Fetches the current URL via HTTP.
   *
   * @return mixed 'null' if connection failed, result of '_process_code' otherwise
   */
  function fetch_http() {
    $res = $this->_head();

    if (is_null($res)) { return null; }
    $this->code = $this->_extract_code($res);

    return $this->_process_code($res);
  }

  /**
   * Fetches the current URL via HTTPS.
   *
   * @return mixed 'null' if connection failed, result of '_process_code' otherwise
   */
  function fetch_https() {
    /**
     * SSL works via port 443, unless another port has been given in the URL.
     * The URL is parsed again here, as 'fetch' does not keep track of
     * whether the port has been specified explicitly.
     */
    $parts = @parse_url($this->url);
    if (!is_array($parts) || !isset($parts['port'])) {
      $this->port = 443;
    }

    // The rest is the same as for HTTP; '_connect' chooses the transport
    return $this->fetch_http();
  }

  /**
   * Reads the local file referred by the "file://" URL.
   *
   * Only files having real path starting with FILE_PROTOCOL_RESTRICT may be
   * read; for other files an empty content is returned.
   *
   * @return bool always 'true'
   */
  function fetch_file() {
    // Remove the "file://" prefix
    $path = substr($this->url, 7);

    // On Windows the path looks like "/C:/dir/file"; remove the leading slash
    if (PHP_OS == "WINNT" && substr($path, 0, 1) == "/") {
      $path = substr($path, 1);
    }

    // NOTE(review): this is a plain string prefix comparison; if
    // FILE_PROTOCOL_RESTRICT does not end with a directory separator,
    // sibling directories sharing the same prefix will pass the check too
    $normalized_path = realpath(urldecode($path));
    $normalized_path_part = substr($normalized_path, 0, strlen(FILE_PROTOCOL_RESTRICT));
    if ($normalized_path_part !== FILE_PROTOCOL_RESTRICT) {
      error_log(sprintf("Access denied to file '%s'", $normalized_path));

      $this->content = "";
      $this->code = HTTP_OK;
      return true;
    }

    $this->content = @file_get_contents($normalized_path);
    $this->code = HTTP_OK;

    return true;
  }

  /**
   * Sends a request to the current URL and reads the response.
   *
   * @access private
   * @param string $method HTTP method name
   * @param int $max_length maximal number of response bytes to read
   * @param string $extra_headers additional request header lines, each terminated with CRLF
   * @return string raw server response or 'null' if connection failed
   */
  function _send_request($method, $max_length, $extra_headers = "") {
    $socket = $this->_connect();
    if (is_null($socket)) { return null; }

    $authority = $this->_url_authority();

    // Build the request header (we're saying we're just a browser as some pages don't like non-standard user-agents)
    $header  = $method." ".$this->path." HTTP/1.1\r\n";
    $header .= "Host: ".$authority."\r\n";
    $header .= "Accept: */*\r\n";
    $header .= "User-Agent: ".$this->user_agent."\r\n";
    $header .= "Connection: keep-alive\r\n";
    $header .= $extra_headers;
    $header .= "Referer: ".$this->protocol."://".$authority.$this->path."\r\n";
    $header .= $this->_header_basic_authorization();
    $header .= "\r\n";

    // Send the header
    fputs($socket, $header);

    // Get the response
    //
    // The PHP-recommended construction
    //    while (!feof($fp)) { $res .= fread($fp, 4096); };
    // hangs indefinitely on www.searchscout.com, for example.
    // seems that they do not close connection on their side or somewhat similar;
    // so only one limited read is done
    $res = fread($socket, $max_length);

    // Close connection handle, we do not need it anymore
    fclose($socket);

    return $res;
  }

  /**
   * Sends the GET request to the current URL.
   *
   * @access private
   * @return string raw server response or 'null' if connection failed
   */
  function _get() {
    // let's assume that there will be no HTML pages greater than 1 Mb
    return $this->_send_request("GET", 1024*1024);
  }

  /**
   * Sends the HEAD request to the current URL.
   *
   * @access private
   * @return string raw server response or 'null' if connection failed
   */
  function _head() {
    // Only the headers are expected in the response, so a small read limit is used
    return $this->_send_request("HEAD", 4096, "Accept: text/html\r\n");
  }

  /**
   * Returns the headers part of the raw server response (everything
   * preceding the first empty line).
   *
   * @access private
   * @param string $res raw server response
   * @return string response headers or 'null' if response contains no empty line
   */
  function _response_headers($res) {
    $end = strpos($res, "\r\n\r\n");
    if ($end === false) { return null; }

    return substr($res, 0, $end);
  }

  /**
   * Stores the headers of an error response and reads the error page content.
   *
   * @access private
   * @param string $res raw server response
   * @return bool 'false' if the response could not be recognized, 'true' otherwise
   */
  function _store_error_response($res) {
    $headers = $this->_response_headers($res);
    if (is_null($headers)) {
      error_log("Unrecognized HTTP response");
      return false;
    }

    $this->headers = $headers;
    $this->content = @file_get_contents($this->url);
    return true;
  }

  /**
   * Completes the request according to the result code stored in the 'code'
   * property: reads the content, follows the redirect or retries using GET.
   *
   * @access private
   * @param string $res raw server response
   * @param bool $used_get 'true' if $res is a response to a GET request (used
   * to avoid retrying GET forever on the 405 code)
   * @return mixed 'true' if content and headers have been stored; 'false'
   * in case of redirect limit excess, unusable or unrecognized response;
   * 'null' if connection failed
   */
  function _process_code($res, $used_get = false) {
    switch ($this->code) {
    case '200': // OK
      $headers = $this->_response_headers($res);
      if (!is_null($headers)) {
        $this->headers = $headers;
      }

      /**
       * @todo add error processing here
       *
       * Note: file_get_contents is smart enough to use basic authorization headers provided
       * user name / password are given in the URL.
       */
      $this->content = @file_get_contents($this->url);

      return true;
    case '301': // Moved Permanently
    case '302': // Found
    case '303': // See Other
    case '307': // Temporary Redirect
    case '308': // Permanent Redirect
      $this->redirects++;
      if ($this->redirects > MAX_REDIRECTS) { return false; }

      // Header name is anchored to the beginning of the line, so that
      // "Content-Location" header is not taken for "Location"
      if (!preg_match('/^Location:[ \t]*(\S+)/im',$res,$matches)) {
        error_log("Redirect response without Location header");
        return false;
      }
      error_log('Redirected to:'.$matches[1]);

      return $this->fetch($this->_fix_location($matches[1]));
    case '400': // Bad request
    case '401': // Unauthorized
    case '402': // Payment required
    case '403': // Forbidden
    case '404': // Not found - but should return some html content - error page
    case '406': // Not acceptable
      return $this->_store_error_response($res);
    case '405': // Method not allowed; some sites (like MSN.COM) do not like "HEAD" HTTP requests
      // Try to get URL information using GET request (if we didn't try it before)
      if (!$used_get) {
        $res = $this->_get();
        if (is_null($res)) { return null; }
        $this->code = $this->_extract_code($res);
        return $this->_process_code($res, true);
      }

      return $this->_store_error_response($res);
    default:
      error_log("Unrecognized HTTP result code:".$this->code);
      return false;
    }
  }

  /**
   * Builds the basic authorization request header.
   *
   * @access private
   * @return string "Authorization" header line terminated with CRLF, or an
   * empty string if no user name has been specified in the URL
   */
  function _header_basic_authorization() {
    if (is_null($this->user) || $this->user == "") {
      return "";
    }

    return sprintf("Authorization: Basic %s\r\n", base64_encode($this->user.":".$this->pass));
  }
}
460?>