<?php
/**
 * DokuWiki indexer
 *
 * Runs background maintenance jobs and answers the request with a 1x1 blank
 * GIF. The jobs are chained with "or", so the chain stops at the first job
 * that reports (by returning true) that it did some work.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
if(!defined('DOKU_INC')) define('DOKU_INC',dirname(__FILE__).'/../../');
define('DOKU_DISABLE_GZIP_OUTPUT',1);
require_once(DOKU_INC.'inc/init.php');
session_write_close(); //close session
if(!defined('NL')) define('NL',"\n");

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 2);

// keep running after browser closes connection
@ignore_user_abort(true);

// check if user abort worked, if yes send output early
$defer = !@ignore_user_abort() || $conf['broken_iua'];
if(!$defer){
    sendGIF(); // send gif
}

// a missing id parameter is treated as an empty page id (no notice raised)
$ID = cleanID(isset($_REQUEST['id']) ? $_REQUEST['id'] : '');

// Catch any possible output (e.g. errors)
$output = isset($_REQUEST['debug']) && $conf['allowdebug'];
if(!$output) ob_start();

// run one of the jobs
// Note: advise_after() is the last link of the "or" chain, so it is only
// reached when none of the jobs before it returned true.
$tmp = array(); // No event data
$evt = new Doku_Event('INDEXER_TASKS_RUN', $tmp);
if ($evt->advise_before()) {
    runIndexer() or
    metaUpdate() or
    runSitemapper() or
    sendDigest() or
    runTrimRecentChanges() or
    runTrimRecentChanges(true) or
    $evt->advise_after();
}

// Throw away the captured job output BEFORE sending the deferred image.
// sendGIF() prints into the active output buffer and flush() does not touch
// user-level buffers, so cleaning the buffer afterwards would discard the
// image body while its headers had already been set.
if(!$output) ob_end_clean();
if($defer) sendGIF();
exit;

// --------------------------------------------------------------------

/**
 * Trims the recent changes cache (or imports the old changelog) as needed.
 *
 * Lines newer than $conf['recent_days'] days are always kept. If fewer than
 * $conf['recent'] lines remain, the newest of the older lines are kept as
 * well to fill up to that minimum. Unparsable lines are dropped.
 *
 * @param bool $media_changes If the media changelog shall be trimmed instead
 *                            of the page changelog
 * @return bool true if the changelog was rewritten, false otherwise
 *
 * @author Ben Coburn <btcoburn@silicodon.net>
 */
function runTrimRecentChanges($media_changes = false) {
    global $conf;

    $fn = ($media_changes ? $conf['media_changelog'] : $conf['changelog']);

    // Trim the Recent Changes
    // Trims the recent changes cache to the last $conf['recent_days'] days of
    // changes or $conf['recent'] items, which ever is larger.
    // The trimming is only done once a day.
    if (@file_exists($fn) &&
        (@filemtime($fn.'.trimmed')+86400)<time() &&
        !@file_exists($fn.'_tmp')) {
        @touch($fn.'.trimmed');
        io_lock($fn);
        $lines = file($fn);
        if ($lines === false || count($lines)<=$conf['recent']) {
            // unreadable or nothing to trim
            io_unlock($fn);
            return false;
        }

        io_saveFile($fn.'_tmp', ''); // presave tmp as 2nd lock
        $trim_time = time() - $conf['recent_days']*86400;
        $out_lines = array();
        // must start as an array: it is sorted and sliced below even when no
        // line turned out to be older than the trim window
        $old_lines = array();

        for ($i=0; $i<count($lines); $i++) {
            $log = parseChangelogLine($lines[$i]);
            if ($log === false) continue; // discard junk
            if ($log['date'] < $trim_time) {
                $old_lines[$log['date'].".$i"] = $lines[$i]; // keep old lines for now (append .$i to prevent key collisions)
            } else {
                $out_lines[$log['date'].".$i"] = $lines[$i]; // definitely keep these lines
            }
        }

        if (count($lines)==count($out_lines)) {
            // nothing to trim
            @unlink($fn.'_tmp');
            io_unlock($fn);
            return false;
        }

        // sort the final result, it shouldn't be necessary,
        // however the extra robustness in making the changelog cache self-correcting is worth it
        ksort($out_lines);
        $extra = $conf['recent'] - count($out_lines); // do we need extra lines to bring us up to minimum
        if ($extra > 0) {
            ksort($old_lines);
            $out_lines = array_merge(array_slice($old_lines,-$extra),$out_lines);
        }

        // save trimmed changelog
        io_saveFile($fn.'_tmp', implode('', $out_lines));
        @unlink($fn);
        if (!rename($fn.'_tmp', $fn)) {
            // rename failed so try another way...
            io_unlock($fn);
            io_saveFile($fn, implode('', $out_lines));
            @unlink($fn.'_tmp');
        } else {
            io_unlock($fn);
        }
        return true;
    }

    // nothing done
    return false;
}

/**
 * Runs the indexer for the current page
 *
 * The page is (re)indexed when its '.indexed' tag file is missing, holds a
 * version lower than INDEXER_VERSION, or is not newer than the page file.
 * A lock directory below $conf['lockdir'] guards the indexing itself.
 *
 * @return bool true if the page was indexed, false if nothing was done
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function runIndexer(){
    global $ID;
    global $conf;
    print "runIndexer(): started".NL;

    if(!$ID) return false;

    // check if indexing needed
    $idxtag = metaFN($ID,'.indexed');
    if(@file_exists($idxtag)){
        if(io_readFile($idxtag) >= INDEXER_VERSION){
            $last = @filemtime($idxtag);
            if($last > @filemtime(wikiFN($ID))){
                print "runIndexer(): index for $ID up to date".NL;
                return false;
            }
        }
    }

    // try to acquire a lock
    $lock = $conf['lockdir'].'/_indexer.lock';
    while(!@mkdir($lock,$conf['dmode'])){
        usleep(50);
        if(time()-@filemtime($lock) > 60*5){
            // looks like a stale lock - remove it
            @rmdir($lock);
            print "runIndexer(): stale lock removed".NL;
        }else{
            print "runIndexer(): indexer locked".NL;
            return false;
        }
    }
    if($conf['dperm']) chmod($lock, $conf['dperm']);

    // do the work
    idx_addPage($ID);

    // we're finished - save and free lock
    io_saveFile(metaFN($ID,'.indexed'),INDEXER_VERSION);
    @rmdir($lock);
    print "runIndexer(): finished".NL;
    return true;
}

/**
 * Will render the metadata for the page if not exists yet
 *
 * This makes sure pages which are created from outside DokuWiki will
 * gain their data when viewed for the first time.
 *
 * @return bool true if a new '.meta' file was written, false otherwise
 */
function metaUpdate(){
    global $ID;
    global $conf;
    print "metaUpdate(): started".NL;

    if(!$ID) return false;
    $file = metaFN($ID, '.meta');
    echo "meta file: $file".NL;

    // rendering needed?
    if (@file_exists($file)) return false;
    if (!@file_exists(wikiFN($ID))) return false;

    // gather some additional info from changelog
    // NOTE(review): the pattern expects date, dotted IP, page id, user and
    // summary as tab separated columns - confirm this matches the changelog
    // format actually written by this installation
    $info = io_grep($conf['changelog'],
                    '/^(\d+)\t(\d+\.\d+\.\d+\.\d+)\t'.preg_quote($ID,'/').'\t([^\t]+)\t([^\t\n]+)/',
                    0,true);

    $meta = array();
    if(!empty($info)){
        $meta['date']['created'] = $info[0][1];
        foreach($info as $item){
            if($item[4] != '*'){
                $meta['date']['modified'] = $item[1];
                if($item[3]){
                    $meta['contributor'][$item[3]] = $item[3];
                }
            }
        }
    }

    $meta = p_render_metadata($ID, $meta);
    io_saveFile($file, serialize($meta));

    echo "metaUpdate(): finished".NL;
    return true;
}

/**
 * Builds a Google Sitemap of all public pages known to the indexer
 *
 * The map is placed in the root directory named sitemap.xml.gz (or
 * sitemap.xml when $conf['compression'] is neither 'bz2' nor 'gz') - This
 * file needs to be writable!
 *
 * The map is only rebuilt when it is empty or older than $conf['sitemap']
 * days. After saving, the search engines listed below are pinged.
 *
 * @return bool true if a new sitemap was written, false otherwise
 *
 * @author Andreas Gohr
 * @link https://www.google.com/webmasters/sitemaps/docs/en/about.html
 */
function runSitemapper(){
    global $conf;
    print "runSitemapper(): started".NL;
    if(!$conf['sitemap']) return false;

    if($conf['compression'] == 'bz2' || $conf['compression'] == 'gz'){
        $sitemap = 'sitemap.xml.gz';
    }else{
        $sitemap = 'sitemap.xml';
    }
    print "runSitemapper(): using $sitemap".NL;

    if(@file_exists(DOKU_INC.$sitemap)){
        if(!is_writable(DOKU_INC.$sitemap)) return false;
    }else{
        if(!is_writable(DOKU_INC)) return false;
    }

    if(@filesize(DOKU_INC.$sitemap) &&
       @filemtime(DOKU_INC.$sitemap) > (time()-($conf['sitemap']*60*60*24))){
        print 'runSitemapper(): Sitemap up to date'.NL;
        return false;
    }

    $pages = idx_getIndex('page', '');
    print 'runSitemapper(): creating sitemap using '.count($pages).' pages'.NL;

    // build the sitemap (captured in a nested output buffer)
    ob_start();
    print '<?xml version="1.0" encoding="UTF-8"?>'.NL;
    print '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'.NL;
    foreach($pages as $id){
        $id = trim($id);
        $file = wikiFN($id);

        //skip hidden, non existing and restricted files
        if(isHiddenPage($id)) continue;
        $date = @filemtime($file);
        if(!$date) continue;
        if(auth_aclcheck($id,'','') < AUTH_READ) continue;

        print ' <url>'.NL;
        print ' <loc>'.wl($id,'',true).'</loc>'.NL;
        print ' <lastmod>'.date_iso8601($date).'</lastmod>'.NL;
        print ' </url>'.NL;
    }
    print '</urlset>'.NL;
    $data = ob_get_contents();
    ob_end_clean();

    //save the new sitemap
    io_saveFile(DOKU_INC.$sitemap,$data);

    //ping search engines...
    // NOTE(review): the ping endpoints below are hard coded - verify that
    // each service still accepts these requests
    $http = new DokuHTTPClient();
    $http->timeout = 8;

    //ping google
    print 'runSitemapper(): pinging google'.NL;
    $url = 'http://www.google.com/webmasters/sitemaps/ping?sitemap=';
    $url .= urlencode(DOKU_URL.$sitemap);
    $resp = $http->get($url);
    if($http->error) print 'runSitemapper(): '.$http->error.NL;
    print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL;

    //ping yahoo
    print 'runSitemapper(): pinging yahoo'.NL;
    $url = 'http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=dokuwiki&url=';
    $url .= urlencode(DOKU_URL.$sitemap);
    $resp = $http->get($url);
    if($http->error) print 'runSitemapper(): '.$http->error.NL;
    print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL;

    //ping microsoft
    print 'runSitemapper(): pinging microsoft'.NL;
    $url = 'http://www.bing.com/webmaster/ping.aspx?siteMap=';
    $url .= urlencode(DOKU_URL.$sitemap);
    $resp = $http->get($url);
    if($http->error) print 'runSitemapper(): '.$http->error.NL;
    print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL;

    print 'runSitemapper(): finished'.NL;
    return true;
}

/**
 * Send digest and list mails for all subscriptions which are in effect for the
 * current page
 *
 * While mails are prepared, $USERINFO and $_SERVER['REMOTE_USER'] are
 * temporarily switched to the subscriber and restored at the end.
 *
 * @return bool always false, so the jobs chained after this one still run
 *
 * @author Adrian Lang <lang@cosmocode.de>
 */
function sendDigest() {
    echo 'sendDigest(): start'.NL;
    global $ID;
    global $conf;
    global $auth;
    global $USERINFO;

    if (!$conf['subscribers']) {
        return false;
    }
    $subscriptions = subscription_find($ID, array('style' => '(digest|list)',
                                                  'escaped' => true));

    // remember current user info (REMOTE_USER may be unset for anonymous requests)
    $olduinfo = $USERINFO;
    $olduser = isset($_SERVER['REMOTE_USER']) ? $_SERVER['REMOTE_USER'] : null;

    foreach($subscriptions as $id => $users) {
        if (!subscription_lock($id)) {
            continue;
        }
        foreach($users as $data) {
            list($user, $style, $lastupdate) = $data;
            $lastupdate = (int) $lastupdate;
            if ($lastupdate + $conf['subscribe_time'] > time()) {
                // Less than the configured time period passed since last
                // update.
                continue;
            }

            // Work as the user to make sure ACLs apply correctly
            $USERINFO = $auth->getUserData($user);
            $_SERVER['REMOTE_USER'] = $user;
            if ($USERINFO === false) {
                continue;
            }

            if (substr($id, -1, 1) === ':') {
                // The subscription target is a namespace
                $changes = getRecentsSince($lastupdate, null, getNS($id));
            } else {
                if(auth_quickaclcheck($id) < AUTH_READ) continue;

                $meta = p_get_metadata($id);
                $changes = array($meta['last_change']);
            }

            // Filter out pages only changed in small and own edits
            // NOTE(review): the loop reads date/user/type/id keys from the
            // value taken out of getRevisions() - confirm that getRevisions()
            // really returns change info arrays and not plain revision ids
            $change_ids = array();
            foreach($changes as $rev) {
                $n = 0;
                while (!is_null($rev) && $rev['date'] >= $lastupdate &&
                       ($_SERVER['REMOTE_USER'] === $rev['user'] ||
                        $rev['type'] === DOKU_CHANGE_TYPE_MINOR_EDIT)) {
                    $rev = getRevisions($rev['id'], $n++, 1);
                    $rev = (count($rev) > 0) ? $rev[0] : null;
                }

                if (!is_null($rev) && $rev['date'] >= $lastupdate) {
                    // Some change was not a minor one and not by myself
                    $change_ids[] = $rev['id'];
                }
            }

            if ($style === 'digest') {
                foreach($change_ids as $change_id) {
                    subscription_send_digest($USERINFO['mail'], $change_id,
                                             $lastupdate);
                }
            } elseif ($style === 'list') {
                subscription_send_list($USERINFO['mail'], $change_ids, $id);
            }
            // TODO: Handle duplicate subscriptions.

            // Update notification time.
            subscription_set($user, $id, $style, time(), true);
        }
        subscription_unlock($id);
    }

    // restore current user info
    $USERINFO = $olduinfo;
    $_SERVER['REMOTE_USER'] = $olduser;

    return false;
}

/**
 * Formats a timestamp as ISO 8601 date
 *
 * Produces e.g. 2004-02-12T15:19:21+00:00 - the built-in 'c' format yields
 * the same string the former manual assembly from 'Y-m-d\TH:i:s' and 'O' did.
 *
 * @param int $int_date UNIX timestamp
 * @return string the formatted date in the current default timezone
 *
 * @author <ungu at terong dot com>
 * @link http://www.php.net/manual/en/function.date.php#54072
 */
function date_iso8601($int_date) {
    return date('c', $int_date);
}

/**
 * Just send a 1x1 pixel blank gif to the browser
 *
 * When a 'debug' request parameter is present, only a text/plain content
 * type header is sent and no image.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Harry Fuecks <fuecks@gmail.com>
 */
function sendGIF(){
    if(isset($_REQUEST['debug'])){
        header('Content-Type: text/plain');
        return;
    }
    $img = base64_decode('R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7');
    header('Content-Type: image/gif');
    header('Content-Length: '.strlen($img));
    header('Connection: Close');
    print $img;
    flush();
    // Browser should drop connection after this
    // Thinks it's got the whole image
}

//Setup VIM: ex: et ts=4 enc=utf-8 :
// No trailing PHP closing tag - no output please!
// See Note at http://www.php.net/manual/en/language.basic-syntax.instruction-separation.php