xref: /dokuwiki/lib/exe/indexer.php (revision bb4866bd74ec6b55bf41e75c158d940dced91f2f)
1<?php
2/**
3 * DokuWiki indexer
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../../').'/');
9define('DOKU_DISABLE_GZIP_OUTPUT',1);
10require_once(DOKU_INC.'inc/init.php');
11require_once(DOKU_INC.'inc/auth.php');
12require_once(DOKU_INC.'inc/events.php');
session_write_close();  // close the session, nothing below needs it
if(!defined('NL')) define('NL',"\n");

// keep running after browser closes connection
@ignore_user_abort(true);

// Check if user abort worked, if yes send output early.
// In debug mode sendGIF() only emits the text/plain header, which has to go
// out before the jobs print anything, so it is always sent up front then.
if(@ignore_user_abort() || !empty($_REQUEST['debug'])){
    sendGIF(); // send gif (or the debug content type)
    $defer = false;
}else{
    $defer = true;
}

// Catch any possible output (e.g. errors)
// empty() avoids an "undefined index" message when no debug parameter is given
if(empty($_REQUEST['debug'])) ob_start();

// run one of the jobs - the first one that reports work done ends the chain
runIndexer() or metaUpdate() or runSitemapper() or runTrimRecentChanges();

// throw away whatever the jobs printed
if(empty($_REQUEST['debug'])) ob_end_clean();

// The deferred image must be sent after the buffer was discarded, otherwise
// it would be thrown away together with the captured job output.
if($defer) sendGIF();
exit;
36
37// --------------------------------------------------------------------
38
/**
 * Trims the recent changes cache (or imports the old changelog) as needed.
 *
 * Returns true when the changelog import event was triggered or when the
 * changelog was due for its daily trim check, false when nothing was done.
 *
 * @author Ben Coburn <btcoburn@silicodon.net>
 */
function runTrimRecentChanges() {
    global $conf;

    // Import old changelog (if needed)
    // Uses the importoldchangelog plugin to upgrade the changelog automatically.
    // FIXME: Remove this from runTrimRecentChanges when it is no longer needed.
    if (isset($conf['changelog_old']) &&
        @file_exists($conf['changelog_old']) && !@file_exists($conf['changelog']) &&
        !@file_exists($conf['changelog'].'_importing') && !@file_exists($conf['changelog'].'_tmp')) {
            $tmp = array(); // no event data
            trigger_event('TEMPORARY_CHANGELOG_UPGRADE_EVENT', $tmp);
            return true;
    }

    // Trim the Recent Changes
    // Trims the recent changes cache to the last $conf['recent_days'] days of
    // changes or $conf['recent'] items, whichever is larger.
    // The trimming is only done once a day.
    if (@file_exists($conf['changelog']) &&
        (filectime($conf['changelog'])+86400)<time() &&
        !@file_exists($conf['changelog'].'_tmp')) {
            io_lock($conf['changelog']);
            $lines = file($conf['changelog']);
            if (count($lines)<$conf['recent']) {
                // nothing to trim
                io_unlock($conf['changelog']);
                return true;
            }
            // trim changelog
            io_saveFile($conf['changelog'].'_tmp', ''); // presave tmp as 2nd lock
            $kept = 0;
            $trim_time = time() - $conf['recent_days']*86400;
            $out_lines = array();
            // check lines from newest (end of file) to oldest
            for ($i = count($lines)-1; $i >= 0; $i--) {
                $tmp = parseChangelogLine($lines[$i]);
                if ($tmp===false) { continue; } // skip lines that do not parse
                if ($tmp['date']>$trim_time || $kept<$conf['recent']) {
                    $out_lines[] = implode("\t", $tmp)."\n";
                    $kept++;
                } else {
                    // no more lines worth keeping
                    break;
                }
            }
            // The lines were collected newest first; restore the file's
            // oldest-first order so the newest entry stays on the last line.
            $out_lines = array_reverse($out_lines);

            io_saveFile($conf['changelog'].'_tmp', implode('', $out_lines));
            @unlink($conf['changelog']);
            if (!rename($conf['changelog'].'_tmp', $conf['changelog'])) {
                // rename failed so try another way...
                io_unlock($conf['changelog']);
                io_saveFile($conf['changelog'], implode('', $out_lines));
                @unlink($conf['changelog'].'_tmp');
            } else {
                io_unlock($conf['changelog']);
            }
            return true;
    }

    // nothing done
    return false;
}
105
/**
 * Runs the indexer for the current page
 *
 * The page is taken from the 'id' request parameter. Returns true when the
 * page was indexed, false when there was no page, the index was already up
 * to date, or the indexer lock could not be obtained.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function runIndexer(){
    global $conf;
    print "runIndexer(): started".NL;

    // no id given means nothing to index
    $ID = isset($_REQUEST['id']) ? cleanID($_REQUEST['id']) : '';
    if(!$ID) return false;

    // check if indexing needed
    $last = @filemtime(metaFN($ID,'.indexed'));
    if($last > @filemtime(wikiFN($ID))){
        print "runIndexer(): index for $ID up to date".NL;
        return false;
    }

    // try to acquire a lock
    $lock = $conf['lockdir'].'/_indexer.lock';
    while(!@mkdir($lock,$conf['dmode'])){
        usleep(50);
        if(time()-@filemtime($lock) > 60*5){
            // looks like a stale lock - remove it
            if(!@rmdir($lock)){
                // Without this exit the loop would spin forever when the
                // lock cannot be removed (or could never be created).
                print "runIndexer(): removing the stale lock failed".NL;
                return false;
            }
            print "runIndexer(): stale lock removed".NL;
        }else{
            print "runIndexer(): indexer locked".NL;
            return false;
        }
    }
    if($conf['dperm']) chmod($lock, $conf['dperm']);

    require_once(DOKU_INC.'inc/indexer.php');

    // do the work
    idx_addPage($ID);

    // we're finished - save the marker file and free the lock
    io_saveFile(metaFN($ID,'.indexed'),' ');
    @rmdir($lock);
    print "runIndexer(): finished".NL;
    return true;
}
151
/**
 * Renders and stores the metadata of the requested page if it has none yet
 *
 * Pages that were created from outside DokuWiki get their metadata this way
 * when they are viewed for the first time. Returns true when a metadata file
 * was written, false otherwise.
 */
function metaUpdate(){
    print "metaUpdate(): started".NL;

    $ID = cleanID($_REQUEST['id']);
    if(!$ID) return false;
    $file = metaFN($ID, '.meta');
    echo "meta file: $file".NL;

    // rendering is only needed for an existing page without a metadata file
    if (@file_exists($file) || !@file_exists(wikiFN($ID))) return false;

    require_once(DOKU_INC.'inc/common.php');
    require_once(DOKU_INC.'inc/parserutils.php');
    global $conf;

    // collect the changelog entries of this page
    // captures: 1 = date, 2 = IP, 3 and 4 = the two fields following the page id
    $pattern = '/^(\d+)\t(\d+\.\d+\.\d+\.\d+)\t'.preg_quote($ID,'/').'\t([^\t]+)\t([^\t\n]+)/';
    $entries = io_grep($conf['changelog'], $pattern, 0, true);

    $meta = array();
    if(count($entries) > 0){
        // the first matching entry supplies the creation date
        $meta['date']['created'] = $entries[0][1];
        foreach($entries as $entry){
            // entries whose last field is just '*' are left out
            if($entry[4] == '*') continue;
            $meta['date']['modified'] = $entry[1];
            if(!$entry[3]) continue;
            $meta['contributor'][$entry[3]] = $entry[3];
        }
    }

    $rendered = p_render_metadata($ID, $meta);
    io_saveFile($file, serialize($rendered));

    echo "metaUpdate(): finished".NL;
    return true;
}
199
/**
 * Builds a Google Sitemap of all public pages known to the indexer
 *
 * The map is placed in the root directory named sitemap.xml.gz (or
 * sitemap.xml when $conf['usegzip'] is off) - This file needs to exist
 * and be writable!
 *
 * $conf['sitemap'] is the number of days after which the sitemap is rebuilt;
 * an empty value disables the sitemap. Returns true when a new sitemap was
 * written, false otherwise.
 *
 * @author Andreas Gohr
 * @link   https://www.google.com/webmasters/sitemaps/docs/en/about.html
 */
function runSitemapper(){
    global $conf;
    print "runSitemapper(): started".NL;
    if(!$conf['sitemap']) return false;

    if($conf['usegzip']){
        $sitemap = 'sitemap.xml.gz';
    }else{
        $sitemap = 'sitemap.xml';
    }
    print "runSitemapper(): using $sitemap".NL;

    if(!is_writable(DOKU_INC.$sitemap)) return false;
    // a non-empty sitemap younger than $conf['sitemap'] days is still good
    if(@filesize(DOKU_INC.$sitemap) &&
       @filemtime(DOKU_INC.$sitemap) > (time()-($conf['sitemap']*60*60*24))){
       print 'runSitemapper(): Sitemap up to date'.NL;
       return false;
    }

    // file() returns false when the page index is missing or unreadable;
    // without a page list there is nothing to build a sitemap from
    $pages = @file($conf['cachedir'].'/page.idx');
    if($pages === false){
        print 'runSitemapper(): page index not readable'.NL;
        return false;
    }
    print 'runSitemapper(): creating sitemap using '.count($pages).' pages'.NL;

    // build the sitemap in a nested output buffer
    ob_start();
    print '<?xml version="1.0" encoding="UTF-8"?>'.NL;
    print '<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">'.NL;
    foreach($pages as $id){
        $id = trim($id);
        $file = wikiFN($id);

        //skip hidden, non existing and restricted files
        if(isHiddenPage($id)) continue;
        $date = @filemtime($file);
        if(!$date) continue;
        if(auth_aclcheck($id,'','') < AUTH_READ) continue;

        print '  <url>'.NL;
        print '    <loc>'.wl($id,'',true).'</loc>'.NL;
        print '    <lastmod>'.date_iso8601($date).'</lastmod>'.NL;
        print '  </url>'.NL;
    }
    print '</urlset>'.NL;
    $data = ob_get_contents();
    ob_end_clean();

    //save the new sitemap
    io_saveFile(DOKU_INC.$sitemap,$data);

    print 'runSitemapper(): pinging google'.NL;
    //ping google
    $url  = 'http://www.google.com/webmasters/sitemaps/ping?sitemap=';
    $url .= urlencode(DOKU_URL.$sitemap);
    $http = new DokuHTTPClient();
    $http->get($url);
    if($http->error) print 'runSitemapper(): '.$http->error.NL;

    print 'runSitemapper(): finished'.NL;
    return true;
}
268
/**
 * Formats a UNIX timestamp as an ISO 8601 date with a colon separated
 * timezone offset, e.g. 2001-09-09T01:46:40+00:00
 *
 * The offset is the one of PHP's current default timezone at that time.
 *
 * @author <ungu at terong dot com>
 * @link http://www.php.net/manual/en/function.date.php#54072
 */
function date_iso8601($int_date) {
   // 'O' gives the offset without a colon (like +0200), so split it into
   // the signed hours and the minutes and put the colon in between
   $offset  = date('O', $int_date);
   $hours   = substr($offset, 0, 3);
   $minutes = substr($offset, 3, 2);

   return date('Y-m-d\TH:i:s', $int_date).$hours.':'.$minutes;
}
283
/**
 * Just send a 1x1 pixel blank gif to the browser
 *
 * When the 'debug' request parameter is set to a non-empty value no image
 * is sent; only a text/plain content type header is emitted instead.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Harry Fuecks <fuecks@gmail.com>
 */
function sendGIF(){
    // empty() keeps PHP from printing an "undefined index" message in front
    // of the headers and the image data when no debug parameter was passed
    if(!empty($_REQUEST['debug'])){
        header('Content-Type: text/plain');
        return;
    }
    $img = base64_decode('R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7');
    header('Content-Type: image/gif');
    header('Content-Length: '.strlen($img));
    header('Connection: Close');
    print $img;
    flush();
    // Browser should drop connection after this
    // Thinks it's got the whole image
}
304
305//Setup VIM: ex: et ts=4 enc=utf-8 :
306// No trailing PHP closing tag - no output please!
307// See Note at http://www.php.net/manual/en/language.basic-syntax.instruction-separation.php
308