<?php
/**
 * DokuWiki indexer
 *
 * Background task runner. It is requested by the browser as an image; it
 * answers with a 1x1 GIF and then runs at most one maintenance job
 * (indexing, metadata rendering, sitemap creation or changelog trimming).
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
if(!defined('DOKU_INC')) define('DOKU_INC',dirname(__FILE__).'/../../');
define('DOKU_DISABLE_GZIP_OUTPUT',1);
require_once(DOKU_INC.'inc/init.php');
require_once(DOKU_INC.'inc/auth.php');
require_once(DOKU_INC.'inc/events.php');
session_write_close();  //close session
if(!defined('NL')) define('NL',"\n");

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 2);

// keep running after browser closes connection
@ignore_user_abort(true);

// check if user abort worked, if yes send output early
// (empty() avoids a notice when the 'broken_iua' option is not configured)
if(@ignore_user_abort() && empty($conf['broken_iua'])){
    sendGIF(); // send gif
    $defer = false;
}else{
    $defer = true;
}

// a missing id parameter is treated as "no page", not as an error
$ID = cleanID(isset($_REQUEST['id']) ? $_REQUEST['id'] : '');

// Catch any possible output (e.g. errors)
if(empty($_REQUEST['debug'])) ob_start();

// run one of the jobs
// The "or" chain short-circuits: evaluation stops at the first job that
// returns true, so at most one job does work per request. The after-event is
// only advised when none of the built-in jobs did anything.
$tmp = array();
$evt = new Doku_Event('INDEXER_TASKS_RUN', $tmp);
if ($evt->advise_before()) {
    runIndexer() or
    metaUpdate() or
    runSitemapper() or
    runTrimRecentChanges() or
    runTrimRecentChanges(true) or
    $evt->advise_after();
}
if($defer) sendGIF();

if(empty($_REQUEST['debug'])) ob_end_clean();
exit;

// --------------------------------------------------------------------

/**
 * Trims the recent changes cache (or imports the old changelog) as needed.
 *
 * The changelog is reduced to the entries of the last $conf['recent_days']
 * days or to the newest $conf['recent'] entries, whichever is larger.
 * Trimming is attempted at most once a day (based on the file's ctime) and
 * is skipped while a "<changelog>_tmp" file exists.
 *
 * @param bool $media_changes If the media changelog shall be trimmed instead
 *                            of the page changelog
 * @return bool true if the changelog was rewritten, false if nothing was done
 *
 * @author Ben Coburn <btcoburn@silicodon.net>
 */
function runTrimRecentChanges($media_changes = false) {
    global $conf;

    $fn = ($media_changes ? $conf['media_changelog'] : $conf['changelog']);

    // only act on an existing changelog that was not changed within the last
    // day and that no other run is currently trimming
    if (!@file_exists($fn) ||
        (filectime($fn)+86400) >= time() ||
        @file_exists($fn.'_tmp')) {
        // nothing done
        return false;
    }

    io_lock($fn);
    $lines = file($fn);
    if (count($lines)<=$conf['recent']) {
        // nothing to trim
        io_unlock($fn);
        return false;
    }

    io_saveFile($fn.'_tmp', '');          // presave tmp as 2nd lock
    $trim_time = time() - $conf['recent_days']*86400;
    $out_lines = array();
    // must start as an array: when junk lines are discarded there may be no
    // old entries at all while $extra below is still positive
    $old_lines = array();

    for ($i=0; $i<count($lines); $i++) {
        $log = parseChangelogLine($lines[$i]);
        if ($log === false) continue;                      // discard junk
        if ($log['date'] < $trim_time) {
            $old_lines[$log['date'].".$i"] = $lines[$i];   // keep old lines for now (append .$i to prevent key collisions)
        } else {
            $out_lines[$log['date'].".$i"] = $lines[$i];   // definitely keep these lines
        }
    }

    // sort the final result, it shouldn't be necessary,
    //   however the extra robustness in making the changelog cache self-correcting is worth it
    ksort($out_lines);
    $extra = $conf['recent'] - count($out_lines);        // do we need extra lines to bring us up to minimum
    if ($extra > 0) {
        ksort($old_lines);
        // take the newest of the old entries to fill up to the minimum
        $out_lines = array_merge(array_slice($old_lines,-$extra),$out_lines);
    }

    // save trimmed changelog
    io_saveFile($fn.'_tmp', implode('', $out_lines));
    @unlink($fn);
    if (!rename($fn.'_tmp', $fn)) {
        // rename failed so try another way...
        io_unlock($fn);
        io_saveFile($fn, implode('', $out_lines));
        @unlink($fn.'_tmp');
    } else {
        io_unlock($fn);
    }
    return true;
}

/**
 * Runs the indexer for the current page
 *
 * Skips the work when the page's ".indexed" tag file carries at least the
 * current INDEXER_VERSION and is newer than the page file. A lock directory
 * below $conf['lockdir'] ensures only one indexer runs at a time.
 *
 * @return bool true if the page was (re)indexed, false otherwise
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function runIndexer(){
    global $ID;
    global $conf;
    print "runIndexer(): started".NL;

    // Move index files (if needed)
    // Uses the importoldindex plugin to upgrade the index automatically.
    // FIXME: Remove this from runIndexer when it is no longer needed.
    if (@file_exists($conf['cachedir'].'/page.idx') &&
        (!@file_exists($conf['indexdir'].'/page.idx') ||
         !filesize($conf['indexdir'].'/page.idx'))  &&
        !@file_exists($conf['indexdir'].'/index_importing')) {
        echo "trigger TEMPORARY_INDEX_UPGRADE_EVENT\n";
        $tmp = array(); // no event data
        trigger_event('TEMPORARY_INDEX_UPGRADE_EVENT', $tmp);
    }

    if(!$ID) return false;

    // check if indexing needed
    $idxtag = metaFN($ID,'.indexed');
    if(@file_exists($idxtag)){
        if(io_readFile($idxtag) >= INDEXER_VERSION){
            $last = @filemtime($idxtag);
            if($last > @filemtime(wikiFN($ID))){
                print "runIndexer(): index for $ID up to date".NL;
                return false;
            }
        }
    }

    // try to acquire a lock (mkdir fails while the directory exists)
    $lock = $conf['lockdir'].'/_indexer.lock';
    while(!@mkdir($lock,$conf['dmode'])){
        usleep(50);
        if(time()-@filemtime($lock) > 60*5){
            // looks like a stale lock - remove it and retry
            @rmdir($lock);
            print "runIndexer(): stale lock removed".NL;
        }else{
            print "runIndexer(): indexer locked".NL;
            return false;
        }
    }
    if(!empty($conf['dperm'])) chmod($lock, $conf['dperm']);

    require_once(DOKU_INC.'inc/indexer.php');

    // upgrade to version 2
    if (!@file_exists($conf['indexdir'].'/pageword.idx'))
        idx_upgradePageWords();

    // do the work
    idx_addPage($ID);

    // we're finished - save and free lock
    io_saveFile($idxtag,INDEXER_VERSION);
    @rmdir($lock);
    print "runIndexer(): finished".NL;
    return true;
}

/**
 * Will render the metadata for the page if not exists yet
 *
 * This makes sure pages which are created from outside DokuWiki will
 * gain their data when viewed for the first time.
 *
 * @return bool true if a new ".meta" file was written, false otherwise
 */
function metaUpdate(){
    global $ID;
    global $conf;
    print "metaUpdate(): started".NL;

    if(!$ID) return false;
    $file = metaFN($ID, '.meta');
    echo "meta file: $file".NL;

    // rendering needed? only when the page exists but has no meta file yet
    if (@file_exists($file)) return false;
    if (!@file_exists(wikiFN($ID))) return false;

    require_once(DOKU_INC.'inc/common.php');
    require_once(DOKU_INC.'inc/parserutils.php');

    // gather some additional info from changelog
    // captured groups: 1 = leading number (used as date), 2 = dotted quad,
    // 3 and 4 = the two tab separated fields following the page id
    // NOTE(review): 3/4 are presumably user and summary - confirm against the
    // changelog format
    $info = io_grep($conf['changelog'],
                    '/^(\d+)\t(\d+\.\d+\.\d+\.\d+)\t'.preg_quote($ID,'/').'\t([^\t]+)\t([^\t\n]+)/',
                    0,true);

    $meta = array();
    if(!empty($info)){
        $meta['date']['created'] = $info[0][1];
        foreach($info as $item){
            // entries whose last field is '*' do not count as modification
            if($item[4] != '*'){
                $meta['date']['modified'] = $item[1];
                if($item[3]){
                    $meta['contributor'][$item[3]] = $item[3];
                }
            }
        }
    }

    $meta = p_render_metadata($ID, $meta);
    io_saveFile($file, serialize($meta));

    echo "metaUpdate(): finished".NL;
    return true;
}

/**
 * Builds a Google Sitemap of all public pages known to the indexer
 *
 * The map is placed in the root directory named sitemap.xml.gz - This
 * file needs to be writable!
 *
 * $conf['sitemap'] is the regeneration interval in days; a falsy value
 * disables the sitemap. After writing the file, search engines are pinged.
 *
 * @return bool true if a new sitemap was written, false otherwise
 *
 * @author Andreas Gohr
 * @link   https://www.google.com/webmasters/sitemaps/docs/en/about.html
 */
function runSitemapper(){
    global $conf;
    print "runSitemapper(): started".NL;
    if(!$conf['sitemap']) return false;

    if($conf['compression'] == 'bz2' || $conf['compression'] == 'gz'){
        $sitemap = 'sitemap.xml.gz';
    }else{
        $sitemap = 'sitemap.xml';
    }
    print "runSitemapper(): using $sitemap".NL;

    // we need to be able to write either the existing file or the directory
    if(@file_exists(DOKU_INC.$sitemap)){
        if(!is_writable(DOKU_INC.$sitemap)) return false;
    }else{
        if(!is_writable(DOKU_INC)) return false;
    }

    if(@filesize(DOKU_INC.$sitemap) &&
       @filemtime(DOKU_INC.$sitemap) > (time()-($conf['sitemap']*60*60*24))){
        print 'runSitemapper(): Sitemap up to date'.NL;
        return false;
    }

    // without a readable page index there is nothing to publish; do not
    // overwrite the sitemap with an empty one or ping anybody
    $pages = @file($conf['indexdir'].'/page.idx');
    if($pages === false){
        print 'runSitemapper(): page index not readable'.NL;
        return false;
    }
    print 'runSitemapper(): creating sitemap using '.count($pages).' pages'.NL;

    // build the sitemap
    $data  = '<?xml version="1.0" encoding="UTF-8"?>'.NL;
    $data .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'.NL;
    foreach($pages as $id){
        $id   = trim($id);
        $file = wikiFN($id);

        //skip hidden, non existing and restricted files
        if(isHiddenPage($id)) continue;
        $date = @filemtime($file);
        if(!$date) continue;
        if(auth_aclcheck($id,'','') < AUTH_READ) continue;

        $data .= ' <url>'.NL;
        $data .= ' <loc>'.wl($id,'',true).'</loc>'.NL;
        $data .= ' <lastmod>'.date_iso8601($date).'</lastmod>'.NL;
        $data .= ' </url>'.NL;
    }
    $data .= '</urlset>'.NL;

    //save the new sitemap
    io_saveFile(DOKU_INC.$sitemap,$data);

    //ping search engines...
    $http = new DokuHTTPClient();
    $http->timeout = 8;

    // engine name => ping endpoint, the encoded sitemap URL gets appended
    $engines = array(
        'google'    => 'http://www.google.com/webmasters/sitemaps/ping?sitemap=',
        'yahoo'     => 'http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=dokuwiki&url=',
        'microsoft' => 'http://webmaster.live.com/webmaster/ping.aspx?sitemap=',
    );
    foreach($engines as $engine => $endpoint){
        print 'runSitemapper(): pinging '.$engine.NL;
        $url  = $endpoint.urlencode(DOKU_URL.$sitemap);
        $resp = $http->get($url);
        if($http->error) print 'runSitemapper(): '.$http->error.NL;
        print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL;
    }

    print 'runSitemapper(): finished'.NL;
    return true;
}

/**
 * Formats a timestamp as ISO 8601 date
 *
 * The result looks like "2004-02-12T15:19:21+00:00": date('O') yields the
 * offset as "+0000", the colon is inserted between hours and minutes.
 *
 * @param int $int_date UNIX timestamp to format
 * @return string the formatted date
 *
 * @author <ungu at terong dot com>
 * @link http://www.php.net/manual/en/function.date.php#54072
 */
function date_iso8601($int_date) {
    $date_mod     = date('Y-m-d\TH:i:s', $int_date);
    $pre_timezone = date('O', $int_date);
    $time_zone    = substr($pre_timezone, 0, 3).":".substr($pre_timezone, 3, 2);
    return $date_mod.$time_zone;
}

/**
 * Just send a 1x1 pixel blank gif to the browser
 *
 * When the request carries a truthy "debug" parameter only a text/plain
 * content type header is sent, so the jobs' messages can be read instead.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Harry Fuecks <fuecks@gmail.com>
 */
function sendGIF(){
    if(!empty($_REQUEST['debug'])){
        header('Content-Type: text/plain');
        return;
    }
    $img = base64_decode('R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7');
    header('Content-Type: image/gif');
    header('Content-Length: '.strlen($img));
    header('Connection: Close');
    print $img;
    flush();
    // Browser should drop connection after this
    // Thinks it's got the whole image
}

//Setup VIM: ex: et ts=4 enc=utf-8 :
// No trailing PHP closing tag - no output please!
// See Note at http://www.php.net/manual/en/language.basic-syntax.instruction-separation.php