<?php
/*
 ======================================================================
 lastRSS 0.9.1

 Simple yet powerful PHP class to parse RSS files.

 by Vojtech Semecky, webmaster @ webdot . cz

 Latest version, features, manual and examples:
        http://lastrss.webdot.cz/

 ----------------------------------------------------------------------
 LICENSE

 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License (GPL)
 as published by the Free Software Foundation; either version 2
 of the License, or (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 To read the license please visit http://www.gnu.org/copyleft/gpl.html
 ======================================================================
*/

// DokuWiki extension: load the DokuWiki environment so that DokuHTTPClient
// (used by lastRSS::Parse() to download feeds) is available.
//
// Every constant is only defined when it does not exist yet, so including
// this file from an already initialised DokuWiki does not raise
// "constant already defined" warnings.

// Root of the DokuWiki installation, three directories above this file.
if (!defined('DOKU_INC')) define('DOKU_INC', realpath(dirname(__FILE__).'/../../../').'/');
// NOTE(review): presumably tells DokuWiki's init not to start a session - confirm.
if (!defined('NOSESSION')) define('NOSESSION', true);
// NOTE(review): presumably switches off DokuWiki's gzip output handler - confirm.
if (!defined('DOKU_DISABLE_GZIP_OUTPUT')) define('DOKU_DISABLE_GZIP_OUTPUT', 1);

require_once(DOKU_INC.'inc/init.php');
require_once(DOKU_INC.'inc/auth.php');
require_once(DOKU_INC.'inc/HTTPClient.php');

/**
 * lastRSS
 *
 * Simple yet powerful PHP class to parse RSS files.
 *
 * Usage: set the public properties you need, then call Get($url).
 * The feed is downloaded with DokuHTTPClient and picked apart with regular
 * expressions; no XML parser is involved.
 */
class lastRSS {
    // -------------------------------------------------------------------
    // Public properties
    // -------------------------------------------------------------------
    // Charset assumed for a feed that does not declare an encoding.
    var $default_cp = 'UTF-8';
    // CDATA handling in my_preg_match(): 'nochange', 'content' or 'strip'.
    var $CDATA = 'nochange';
    // Target charset for extracted fields; '' disables the conversion.
    var $cp = '';
    // Maximum number of items to parse; 0 means no limit.
    var $items_limit = 0;
    // When true, HTML is stripped from item titles and descriptions.
    var $stripHTML = false;
    // strftime() format applied to lastBuildDate / pubDate; '' keeps them as-is.
    var $date_format = '';
    // Directory holding the cache files; '' disables caching.
    var $cache_dir = '';
    // Maximum age of a cache file in seconds.
    var $cache_time = 0;

    // -------------------------------------------------------------------
    // Private variables
    // -------------------------------------------------------------------
    // Source charset of the feed being parsed (set by Parse(), read by my_preg_match()).
    var $rsscp = '';
    var $channeltags = array ('title', 'link', 'description', 'language', 'copyright', 'managingEditor', 'webMaster', 'lastBuildDate', 'rating', 'docs');
    var $itemtags = array('title', 'link', 'description', 'author', 'category', 'comments', 'enclosure', 'guid', 'pubDate', 'source');
    var $imagetags = array('title', 'url', 'link', 'width', 'height');
    var $textinputtags = array('title', 'description', 'name', 'link');

    // -------------------------------------------------------------------
    // Parse RSS file and returns associative array.
    //
    // $rss_url  URL of the feed.
    // Returns the array built by Parse() plus a 'cached' key (1 when it was
    // read from the cache, 0 when freshly downloaded), or false on failure.
    // -------------------------------------------------------------------
    function Get ($rss_url) {
        // CACHE DISABLED >> load and parse the file directly
        if ($this->cache_dir == '') {
            $result = $this->Parse($rss_url);
            if ($result) $result['cached'] = 0;
            return $result;
        }

        // CACHE ENABLED
        $cache_file = $this->cache_dir . '/rsscache_' . md5($rss_url);
        $mtime = file_exists($cache_file) ? filemtime($cache_file) : false;

        if ($mtime !== false && (time() - $mtime) < $this->cache_time) {
            // Cached file is fresh enough, return the cached array.
            // NOTE: unserialize() is only safe because the cache files are
            // written by this class; never point cache_dir at untrusted data.
            $raw = file_get_contents($cache_file);
            $result = ($raw === false) ? false : unserialize($raw);
            // set 'cached' to 1 only if cached file is correct
            if ($result) $result['cached'] = 1;
            return $result;
        }

        // Cached file is missing or too old, create a new one.
        $result = $this->Parse($rss_url);
        $serialized = serialize($result);
        if ($f = @fopen($cache_file, 'w')) {
            fwrite($f, $serialized, strlen($serialized));
            fclose($f);
        }
        if ($result) $result['cached'] = 0;

        $this->_purge_cache();

        return $result;
    }

    // -------------------------------------------------------------------
    // Private helper of Get(): delete cache files that have not been
    // accessed for about one week (604800 seconds).
    //
    // Only regular files carrying this class's own 'rsscache_' prefix are
    // removed, so unrelated files sharing the directory are left alone.
    // -------------------------------------------------------------------
    function _purge_cache () {
        if (!is_dir($this->cache_dir)) return;
        $dh = opendir($this->cache_dir);
        if (!$dh) return;

        $oldest_allowed = time() - 604800;
        // Compare against false explicitly: a file named "0" must not end the loop.
        while (($file = readdir($dh)) !== false) {
            if (strpos($file, 'rsscache_') !== 0) continue;
            $path = $this->cache_dir . '/' . $file;
            if (!is_file($path)) continue;
            $atime = fileatime($path);
            if ($atime !== false && $atime <= $oldest_allowed) {
                unlink($path);
            }
        }
        closedir($dh);
    }

    // -------------------------------------------------------------------
    // Modification of preg_match(); return trimmed field with index 1
    // from 'classic' preg_match() array output
    //
    // $pattern  PCRE pattern with at least one capturing group.
    // $subject  Text to search.
    // Returns the trimmed first group (after the CDATA and charset
    // processing below) or '' when there is no match.
    // -------------------------------------------------------------------
    function my_preg_match ($pattern, $subject) {
        if (!preg_match($pattern, (string) $subject, $out) || !isset($out[1])) {
            // if there is NO result, return empty string
            return '';
        }
        $value = $out[1];

        // Process CDATA (if present): remove the CDATA markers, keep the content.
        // NOTE(review): 'content' and 'strip' have always done exactly the same
        // thing here; if 'strip' was meant to drop the content as well, that was
        // never implemented. Behaviour is kept as-is for existing callers.
        if ($this->CDATA == 'content' || $this->CDATA == 'strip') {
            $value = strtr($value, array('<![CDATA['=>'', ']]>'=>''));
        }

        // If code page is set convert character encoding to required
        if ($this->cp != '') {
            // Fall back to the default charset when no feed charset is known yet.
            $from = ($this->rsscp != '') ? $this->rsscp : $this->default_cp;
            $value = iconv($from, $this->cp.'//TRANSLIT', $value);
        }

        // iconv() yields false on failure; the cast turns that into ''.
        return trim((string) $value);
    }

    // -------------------------------------------------------------------
    // Replace HTML entities &something; by real characters
    // -------------------------------------------------------------------
    function unhtmlentities ($string) {
        // Get HTML entities table and flip it to entity => character
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_QUOTES));
        // Add support for &apos; entity (missing in HTML_ENTITIES)
        $trans_tbl += array('&apos;' => "'");
        // Replace entities by values
        return strtr($string, $trans_tbl);
    }

    // -------------------------------------------------------------------
    // Private helper of Parse(): reformat a date string with date_format.
    //
    // Returns $value untouched when no date_format is set or when
    // strtotime() cannot parse it (false; -1 on very old PHP versions).
    // NOTE: strftime() is deprecated as of PHP 8.1 but is kept because
    // date_format is defined as a strftime() format string.
    // -------------------------------------------------------------------
    function _format_date ($value) {
        if ($this->date_format == '') return $value;
        $timestamp = strtotime($value);
        if ($timestamp === false || $timestamp === -1) return $value;
        return strftime($this->date_format, $timestamp);
    }

    // -------------------------------------------------------------------
    // Parse() is private method used by Get() to load and parse RSS file.
    // Don't use Parse() in your scripts - use Get($rss_file) instead.
    //
    // Returns an array with 'encoding', the non-empty channel tags,
    // 'textinput_*' / 'image_*' entries, 'items' (list of arrays keyed by
    // item tag) and 'items_count', or false when the download fails.
    // -------------------------------------------------------------------
    function Parse ($rss_url) {
        // Open and load RSS file
        $http = new DokuHTTPClient();
        $rss_content = $http->get($rss_url);
        if (!$rss_content) {
            // Error in opening return false
            return false;
        }

        $result = array();

        // Start from the default codepage so the encoding lookup below is not
        // converted with a charset left over from a previously parsed feed.
        $this->rsscp = $this->default_cp;
        // Parse document encoding
        $result['encoding'] = $this->my_preg_match("'encoding=[\'\"](.*?)[\'\"]'si", $rss_content);
        // if document codepage is specified, use it (read by my_preg_match())
        if ($result['encoding'] != '') {
            $this->rsscp = $result['encoding'];
        }

        // Parse CHANNEL info
        $channel = '';
        if (preg_match("'<channel.*?>(.*?)</channel>'si", $rss_content, $out_channel)) {
            $channel = $out_channel[1];
        }
        foreach ($this->channeltags as $channeltag) {
            $temp = $this->my_preg_match("'<$channeltag.*?>(.*?)</$channeltag>'si", $channel);
            if ($temp != '') $result[$channeltag] = $temp; // Set only if not empty
        }
        // If date_format is specified and lastBuildDate is valid, convert it
        if (isset($result['lastBuildDate'])) {
            $result['lastBuildDate'] = $this->_format_date($result['lastBuildDate']);
        }

        // Parse TEXTINPUT info
        // This a little strange regexp means:
        // Look for tag <textinput> with or without any attributes, but skip
        // truncated version <textinput /> (it's not beginning tag)
        preg_match("'<textinput(|[^>]*[^/])>(.*?)</textinput>'si", $rss_content, $out_textinfo);
        if (isset($out_textinfo[2])) {
            foreach ($this->textinputtags as $textinputtag) {
                $temp = $this->my_preg_match("'<$textinputtag.*?>(.*?)</$textinputtag>'si", $out_textinfo[2]);
                if ($temp != '') $result['textinput_'.$textinputtag] = $temp; // Set only if not empty
            }
        }

        // Parse IMAGE info
        preg_match("'<image.*?>(.*?)</image>'si", $rss_content, $out_imageinfo);
        if (isset($out_imageinfo[1])) {
            foreach ($this->imagetags as $imagetag) {
                $temp = $this->my_preg_match("'<$imagetag.*?>(.*?)</$imagetag>'si", $out_imageinfo[1]);
                if ($temp != '') $result['image_'.$imagetag] = $temp; // Set only if not empty
            }
        }

        // Parse ITEMS
        preg_match_all("'<item(| .*?)>(.*?)</item>'si", $rss_content, $items);
        $i = 0;
        $result['items'] = array(); // create array even if there are no items
        foreach ($items[2] as $rss_item) {
            // Stop as soon as the limit is reached (0 means unlimited)
            if ($this->items_limit != 0 && $i >= $this->items_limit) break;

            // Encoding check: if the item is valid UTF-8 and its ISO-8859-1
            // decoding is not, continue with the decoded bytes; otherwise the
            // item is left untouched.
            // NOTE(review): intent inferred from the code only - confirm.
            // utf8_decode() is deprecated as of PHP 8.2.
            $decoded = utf8_decode($rss_item);
            if (!$this->check_utf8($decoded) && $this->check_utf8($rss_item)) {
                $rss_item = $decoded;
            }

            // Parse one item; an item without any known tag stays an empty array
            $item = array();
            foreach ($this->itemtags as $itemtag) {
                $temp = $this->my_preg_match("'<$itemtag.*?>(.*?)</$itemtag>'si", $rss_item);
                if ($temp != '') $item[$itemtag] = $temp; // Set only if not empty
            }

            // Strip HTML tags and entities from DESCRIPTION and TITLE
            if ($this->stripHTML) {
                foreach (array('description', 'title') as $field) {
                    if (!empty($item[$field])) {
                        $item[$field] = strip_tags($this->unhtmlentities(strip_tags($item[$field])));
                    }
                }
            }

            // If date_format is specified and pubDate is valid, convert it
            if (isset($item['pubDate'])) {
                $item['pubDate'] = $this->_format_date($item['pubDate']);
            }

            $result['items'][$i] = $item;
            // Item counter
            $i++;
        }

        $result['items_count'] = $i;
        return $result;
    }

    // -------------------------------------------------------------------
    // Encoding check: tells whether $str is a structurally valid UTF-8
    // byte sequence (as opposed to e.g. ISO-8859-1).
    //
    // Returns true when every byte >= 0x80 starts or continues a complete
    // 2-4 byte sequence, false otherwise. Overlong forms are not rejected.
    // -------------------------------------------------------------------
    function check_utf8($str) {
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $c = ord($str[$i]);
            // 0x00-0x7F is plain ASCII; everything from 0x80 up needs checking
            if ($c > 127) {
                if ($c > 247) return false;
                elseif ($c > 239) $bytes = 4;
                elseif ($c > 223) $bytes = 3;
                elseif ($c > 191) $bytes = 2;
                else return false; // continuation byte without a lead byte
                // The sequence must not run past the end of the string
                if (($i + $bytes) > $len) return false;
                while ($bytes > 1) {
                    $i++;
                    $b = ord($str[$i]);
                    // Continuation bytes must lie in 0x80-0xBF
                    if ($b < 128 || $b > 191) return false;
                    $bytes--;
                }
            }
        }
        return true;
    }
}