<?php
/**
 * DokuWiki indexer
 *
 * Background task runner: it is requested by the browser as a 1x1 GIF and
 * uses the request to run at most one maintenance job (indexing, metadata
 * rendering, sitemap generation, subscription digests, changelog trimming).
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
if(!defined('DOKU_INC')) define('DOKU_INC',dirname(__FILE__).'/../../');
define('DOKU_DISABLE_GZIP_OUTPUT',1);
require_once(DOKU_INC.'inc/init.php');
session_write_close();  //close session
if(!defined('NL')) define('NL',"\n");

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 2);

// keep running after browser closes connection
@ignore_user_abort(true);

// check if user abort worked, if yes send output early
$defer = !@ignore_user_abort() || $conf['broken_iua'];
if(!$defer){
    sendGIF(); // send gif
}

// Guard against a missing 'id' parameter: a notice raised here would be
// emitted before output buffering starts and could corrupt a deferred GIF.
$ID = cleanID(isset($_REQUEST['id']) ? $_REQUEST['id'] : '');

// Catch any possible output (e.g. errors)
if(!isset($_REQUEST['debug'])) ob_start();

// run one of the jobs: the "or" chain stops at the first job that reports
// (by returning true) that it did some work; the after-event is only
// signalled when none of the built-in jobs did anything
$tmp = array(); // No event data
$evt = new Doku_Event('INDEXER_TASKS_RUN', $tmp);
if ($evt->advise_before()) {
    runIndexer() or
    metaUpdate() or
    runSitemapper() or
    sendDigest() or
    runTrimRecentChanges() or
    runTrimRecentChanges(true) or
    $evt->advise_after();
}
if($defer) sendGIF();

if(!isset($_REQUEST['debug'])) ob_end_clean();
exit;

// --------------------------------------------------------------------

/**
 * Trims the recent changes cache (or imports the old changelog) as needed.
 *
 * Keeps all entries newer than $conf['recent_days'] days and, if that leaves
 * fewer than $conf['recent'] entries, tops the result up with the most recent
 * older entries. The trimming is only attempted once a day (tracked through
 * the modification time of the "<changelog>.trimmed" marker file).
 *
 * @param bool $media_changes If the media changelog shall be trimmed instead
 *                            of the page changelog
 * @return bool true if the changelog was rewritten, false otherwise
 *
 * @author Ben Coburn <btcoburn@silicodon.net>
 */
function runTrimRecentChanges($media_changes = false) {
    global $conf;

    $fn = ($media_changes ? $conf['media_changelog'] : $conf['changelog']);

    // nothing to do without a changelog
    if (!@file_exists($fn)) return false;
    // already handled within the last 24 hours
    if ((@filemtime($fn.'.trimmed')+86400) >= time()) return false;
    // a tmp file signals that another trim is in progress
    if (@file_exists($fn.'_tmp')) return false;

    @touch($fn.'.trimmed');
    io_lock($fn);
    $lines = file($fn);
    if ($lines === false || count($lines)<=$conf['recent']) {
        // unreadable or nothing to trim
        io_unlock($fn);
        return false;
    }

    io_saveFile($fn.'_tmp', '');          // presave tmp as 2nd lock
    $trim_time = time() - $conf['recent_days']*86400;
    $out_lines = array();
    // must be initialised: it stays empty when every non-kept line is junk,
    // yet it is sorted and sliced below
    $old_lines = array();

    foreach ($lines as $i => $line) {
        $log = parseChangelogLine($line);
        if ($log === false) continue;     // discard junk
        // append .$i to the key to prevent collisions of equal timestamps
        $key = $log['date'].".$i";
        if ($log['date'] < $trim_time) {
            $old_lines[$key] = $line;     // keep old lines for now
        } else {
            $out_lines[$key] = $line;     // definitely keep these lines
        }
    }

    if (count($lines)==count($out_lines)) {
        // nothing to trim
        @unlink($fn.'_tmp');
        io_unlock($fn);
        return false;
    }

    // sort the final result, it shouldn't be necessary,
    // however the extra robustness in making the changelog cache
    // self-correcting is worth it
    ksort($out_lines);
    // do we need extra lines to bring us up to the minimum?
    $extra = $conf['recent'] - count($out_lines);
    if ($extra > 0) {
        ksort($old_lines);
        $out_lines = array_merge(array_slice($old_lines,-$extra),$out_lines);
    }

    // save trimmed changelog
    $trimmed = implode('', $out_lines);
    io_saveFile($fn.'_tmp', $trimmed);
    @unlink($fn);
    if (!rename($fn.'_tmp', $fn)) {
        // rename failed so try another way...
        io_unlock($fn);
        io_saveFile($fn, $trimmed);
        @unlink($fn.'_tmp');
    } else {
        io_unlock($fn);
    }
    return true;
}

/**
 * Runs the indexer for the current page
 *
 * The page is (re)indexed when its ".indexed" tag file is missing, was
 * written by an older INDEXER_VERSION, or is not newer than the page file.
 * A lock directory in $conf['lockdir'] serialises indexer runs.
 *
 * @return bool true if the page was indexed, false if there is no page ID,
 *              the index is up to date or the indexer is locked
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function runIndexer(){
    global $ID;
    global $conf;
    print "runIndexer(): started".NL;

    if(!$ID) return false;

    // check if indexing needed
    $idxtag = metaFN($ID,'.indexed');
    if(@file_exists($idxtag) && io_readFile($idxtag) >= INDEXER_VERSION){
        $last = @filemtime($idxtag);
        if($last > @filemtime(wikiFN($ID))){
            print "runIndexer(): index for $ID up to date".NL;
            return false;
        }
    }

    // try to acquire a lock
    $lock = $conf['lockdir'].'/_indexer.lock';
    while(!@mkdir($lock,$conf['dmode'])){
        usleep(50);
        if(time()-@filemtime($lock) > 60*5){
            // looks like a stale lock (older than 5 minutes) - remove it
            // and retry the mkdir
            @rmdir($lock);
            print "runIndexer(): stale lock removed".NL;
        }else{
            print "runIndexer(): indexer locked".NL;
            return false;
        }
    }
    if($conf['dperm']) chmod($lock, $conf['dperm']);

    // upgrade to version 2
    if (!@file_exists($conf['indexdir'].'/pageword.idx')){
        idx_upgradePageWords();
    }

    // do the work
    idx_addPage($ID);

    // we're finished - save and free lock
    io_saveFile($idxtag,INDEXER_VERSION);
    @rmdir($lock);
    print "runIndexer(): finished".NL;
    return true;
}

/**
 * Will render the metadata for the page if not exists yet
 *
 * This makes sure pages which are created from outside DokuWiki will
 * gain their data when viewed for the first time.
 *
 * @return bool true if a metadata file was written, false if there is no
 *              page ID, the metadata already exists or the page is missing
 */
function metaUpdate(){
    global $ID;
    global $conf;
    print "metaUpdate(): started".NL;

    if(!$ID) return false;
    $file = metaFN($ID, '.meta');
    echo "meta file: $file".NL;

    // rendering needed?
    if (@file_exists($file)) return false;
    if (!@file_exists(wikiFN($ID))) return false;

    // gather some additional info from changelog
    // captured groups: 1 = date, 2 = IP, 3 and 4 = the two fields following
    // the page ID
    $info = io_grep($conf['changelog'],
                    '/^(\d+)\t(\d+\.\d+\.\d+\.\d+)\t'.preg_quote($ID,'/').'\t([^\t]+)\t([^\t\n]+)/',
                    0,true);

    $meta = array();
    if(!empty($info)){
        // the first matching entry provides the creation date
        $meta['date']['created'] = $info[0][1];
        foreach($info as $item){
            // entries whose 4th group is '*' are skipped
            if($item[4] == '*') continue;
            $meta['date']['modified'] = $item[1];
            if($item[3]){
                $meta['contributor'][$item[3]] = $item[3];
            }
        }
    }

    $meta = p_render_metadata($ID, $meta);
    io_saveFile($file, serialize($meta));

    echo "metaUpdate(): finished".NL;
    return true;
}

/**
 * Builds a Google Sitemap of all public pages known to the indexer
 *
 * The map is placed in the root directory named sitemap.xml.gz (or
 * sitemap.xml when no compression is configured) - This file needs to be
 * writable! After writing, several search engines are pinged.
 *
 * @return bool true if a new sitemap was written, false if sitemaps are
 *              disabled, the target is not writable or the map is still
 *              fresh ($conf['sitemap'] is the maximum age in days)
 *
 * @author Andreas Gohr
 * @link   https://www.google.com/webmasters/sitemaps/docs/en/about.html
 */
function runSitemapper(){
    global $conf;
    print "runSitemapper(): started".NL;
    if(!$conf['sitemap']) return false;

    if($conf['compression'] == 'bz2' || $conf['compression'] == 'gz'){
        $sitemap = 'sitemap.xml.gz';
    }else{
        $sitemap = 'sitemap.xml';
    }
    print "runSitemapper(): using $sitemap".NL;

    // either the existing file or the directory it will be created in
    // has to be writable
    $target = DOKU_INC.$sitemap;
    if(@file_exists($target)){
        if(!is_writable($target)) return false;
    }else{
        if(!is_writable(DOKU_INC)) return false;
    }

    if(@filesize($target) &&
       @filemtime($target) > (time()-($conf['sitemap']*60*60*24))){
        print 'runSitemapper(): Sitemap up to date'.NL;
        return false;
    }

    $pages = idx_getIndex('page', '');
    print 'runSitemapper(): creating sitemap using '.count($pages).' pages'.NL;

    // build the sitemap
    ob_start();
    print '<?xml version="1.0" encoding="UTF-8"?>'.NL;
    print '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'.NL;
    foreach($pages as $id){
        $id = trim($id);
        $file = wikiFN($id);

        //skip hidden, non existing and restricted files
        if(isHiddenPage($id)) continue;
        $date = @filemtime($file);
        if(!$date) continue;
        if(auth_aclcheck($id,'','') < AUTH_READ) continue;

        print '  <url>'.NL;
        print '    <loc>'.wl($id,'',true).'</loc>'.NL;
        print '    <lastmod>'.date_iso8601($date).'</lastmod>'.NL;
        print '  </url>'.NL;
    }
    print '</urlset>'.NL;
    $data = ob_get_contents();
    ob_end_clean();

    //save the new sitemap
    io_saveFile($target,$data);

    //ping search engines...
    $http = new DokuHTTPClient();
    $http->timeout = 8;

    // engine name => ping URL prefix (the encoded sitemap URL is appended)
    $engines = array(
        'google'    => 'http://www.google.com/webmasters/sitemaps/ping?sitemap=',
        'yahoo'     => 'http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=dokuwiki&url=',
        'microsoft' => 'http://www.bing.com/webmaster/ping.aspx?siteMap=',
    );
    $encoded = urlencode(DOKU_URL.$sitemap);
    foreach($engines as $engine => $prefix){
        print 'runSitemapper(): pinging '.$engine.NL;
        $resp = $http->get($prefix.$encoded);
        if($http->error) print 'runSitemapper(): '.$http->error.NL;
        print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL;
    }

    print 'runSitemapper(): finished'.NL;
    return true;
}

/**
 * Send digest and list mails for all subscriptions which are in effect for the
 * current page
 *
 * For every subscriber the global user context ($USERINFO and
 * $_SERVER['REMOTE_USER']) is temporarily switched to that user so ACL checks
 * apply to the recipient; the original context is restored afterwards.
 *
 * @return bool true if at least one subscription was handled (mail sent and
 *              notification time updated), false otherwise - like the other
 *              jobs, so the task chain can tell whether work was done
 *
 * @author Adrian Lang <lang@cosmocode.de>
 */
function sendDigest() {
    echo 'sendDigest(): start'.NL;
    global $ID;
    global $conf;
    global $auth;
    global $lang;
    global $USERINFO;

    if (!$conf['subscribers']) {
        return false;
    }
    $subscriptions = subscription_find($ID, array('style' => '(digest|list)',
                                                  'escaped' => true));

    // remember current user info (REMOTE_USER is not set for anonymous requests)
    $olduinfo = $USERINFO;
    $olduser = isset($_SERVER['REMOTE_USER']) ? $_SERVER['REMOTE_USER'] : null;

    $sent = false;
    foreach($subscriptions as $id => $users) {
        if (!subscription_lock($id)) {
            continue;
        }
        foreach($users as $data) {
            list($user, $style, $lastupdate) = $data;
            $lastupdate = (int) $lastupdate;
            if ($lastupdate + $conf['subscribe_time'] > time()) {
                // Less than the configured time period passed since last
                // update.
                continue;
            }

            // Work as the user to make sure ACLs apply correctly
            $USERINFO = $auth->getUserData($user);
            $_SERVER['REMOTE_USER'] = $user;
            if ($USERINFO === false) {
                continue;
            }

            if (substr($id, -1, 1) === ':') {
                // The subscription target is a namespace
                $changes = getRecentsSince($lastupdate, null, getNS($id));
                if (count($changes) === 0) {
                    continue;
                }
                if ($style === 'digest') {
                    foreach($changes as $change) {
                        subscription_send_digest($USERINFO['mail'], $change,
                                                 $lastupdate);
                    }
                } elseif ($style === 'list') {
                    subscription_send_list($USERINFO['mail'], $changes, $id);
                }
                // TODO: Handle duplicate subscriptions.
            } else {
                if(auth_quickaclcheck($id) < AUTH_READ) continue;

                $meta = p_get_metadata($id);
                $rev = $meta['last_change']['date'];
                if ($rev < $lastupdate) {
                    // There is no new revision.
                    continue;
                }
                subscription_send_digest($USERINFO['mail'], $meta['last_change'],
                                         $lastupdate);
            }
            // Update notification time.
            subscription_set($user, $id, $style, time(), true);
            $sent = true;
        }
        subscription_unlock($id);
    }

    // restore current user info
    $USERINFO = $olduinfo;
    $_SERVER['REMOTE_USER'] = $olduser;

    return $sent;
}

/**
 * Formats a timestamp as ISO 8601 date
 *
 * The result looks like "2005-08-15T15:52:01+00:00": date('O') yields the
 * offset as "+hhmm", which is split to insert the colon.
 *
 * @param int $int_date UNIX timestamp
 * @return string the formatted date
 *
 * @author <ungu at terong dot com>
 * @link http://www.php.net/manual/en/function.date.php#54072
 */
function date_iso8601($int_date) {
    $offset = date('O', $int_date);
    $time_zone = substr($offset, 0, 3).":".substr($offset, 3, 2);
    return date('Y-m-d\TH:i:s', $int_date).$time_zone;
}

/**
 * Just send a 1x1 pixel blank gif to the browser
 *
 * In debug mode (request parameter "debug" set) no image is sent; only a
 * text/plain content type header is set so the job output can be read.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Harry Fuecks <fuecks@gmail.com>
 */
function sendGIF(){
    if(isset($_REQUEST['debug'])){
        header('Content-Type: text/plain');
        return;
    }
    $img = base64_decode('R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7');
    header('Content-Type: image/gif');
    header('Content-Length: '.strlen($img));
    header('Connection: Close');
    print $img;
    flush();
    // Browser should drop connection after this
    // Thinks it's got the whole image
}

//Setup VIM: ex: et ts=4 enc=utf-8 :
// No trailing PHP closing tag - no output please!
// See Note at http://www.php.net/manual/en/language.basic-syntax.instruction-separation.php