xref: /dokuwiki/lib/exe/indexer.php (revision 71726d7801bdcbf41dfdc79d244f09a0988529c0)
1<?php
2/**
3 * DokuWiki indexer
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../../').'/');
require_once(DOKU_INC.'inc/init.php');
require_once(DOKU_INC.'inc/auth.php');
require_once(DOKU_INC.'inc/events.php');
session_write_close();  //close session
if(!defined('NL')) define('NL',"\n");

// keep running after browser closes connection
@ignore_user_abort(true);

// check if user abort worked, if yes send output early
if(@ignore_user_abort()){
    sendGIF(); // send gif
    $defer = false;
}else{
    // the response has to wait until the job has finished
    $defer = true;
}

// Catch any possible output (e.g. errors) unless debugging was requested.
// empty() is used so that a missing 'debug' parameter raises no notice.
if(empty($_REQUEST['debug'])) ob_start();

// run one of the jobs - the 'or' chain stops at the first job returning true
runIndexer() or metaUpdate() or runSitemapper() or runTrimRecentChanges();

// Throw away whatever the jobs printed *before* sending a deferred image:
// sending it while the buffer is still active would discard the image
// together with the captured output.
if(empty($_REQUEST['debug'])) ob_end_clean();
if($defer) sendGIF();
exit;
35
36// --------------------------------------------------------------------
37
/**
 * Trims the recent changes cache (or imports the old changelog) as needed.
 *
 * At most one of two tasks is handled per call:
 *  - if only an old changelog exists, TEMPORARY_CHANGELOG_UPGRADE_EVENT is
 *    triggered so that it can be imported
 *  - otherwise, if the changelog has not been changed for a day, it is cut
 *    down to the entries of the last $conf['recent_days'] days or the newest
 *    $conf['recent'] entries, whichever is larger
 *
 * @author Ben Coburn <btcoburn@silicodon.net>
 * @return bool true if one of the tasks was handled, false if nothing was done
 */
function runTrimRecentChanges() {
    global $conf;

    // Import old changelog (if needed)
    // Uses the importoldchangelog plugin to upgrade the changelog automatically.
    // FIXME: Remove this from runTrimRecentChanges when it is no longer needed.
    if (isset($conf['changelog_old']) &&
        file_exists($conf['changelog_old']) && !file_exists($conf['changelog']) &&
        !file_exists($conf['changelog'].'_importing') && !file_exists($conf['changelog'].'_tmp')) {
            $tmp = array(); // no event data
            trigger_event('TEMPORARY_CHANGELOG_UPGRADE_EVENT', $tmp);
            return true;
    }

    // Trim the Recent Changes
    // Trims the recent changes cache to the last $conf['recent_days'] days of
    // changes or $conf['recent'] items, which ever is larger.
    // The trimming is only done once a day.
    if (file_exists($conf['changelog']) &&
        (filectime($conf['changelog'])+86400)<time() &&
        !file_exists($conf['changelog'].'_tmp')) {
            io_lock($conf['changelog']);
            $lines = file($conf['changelog']);
            if (!is_array($lines) || count($lines)<$conf['recent']) {
                // unreadable or nothing to trim
                io_unlock($conf['changelog']);
                return true;
            }
            // trim changelog
            io_saveFile($conf['changelog'].'_tmp', ''); // presave tmp as 2nd lock
            $kept = 0;
            $trim_time = time() - $conf['recent_days']*86400;
            $out_lines = array();
            // check lines from newest to oldest
            for ($i = count($lines)-1; $i >= 0; $i--) {
                $tmp = parseChangelogLine($lines[$i]);
                if ($tmp===false) { continue; } // skip lines that can not be parsed
                if ($tmp['date']>$trim_time || $kept<$conf['recent']) {
                    array_push($out_lines, implode("\t", $tmp)."\n");
                    $kept++;
                } else {
                    // no more lines worth keeping
                    break;
                }
            }
            // The lines were collected newest first. Put them back into the
            // oldest-first order of the file, otherwise the next run would
            // start its newest-to-oldest scan at the wrong end.
            $data = implode('', array_reverse($out_lines));
            io_saveFile($conf['changelog'].'_tmp', $data);
            unlink($conf['changelog']);
            if (!rename($conf['changelog'].'_tmp', $conf['changelog'])) {
                // rename failed so try another way...
                io_unlock($conf['changelog']);
                io_saveFile($conf['changelog'], $data);
                unlink($conf['changelog'].'_tmp');
            } else {
                io_unlock($conf['changelog']);
            }
            return true;
    }

    // nothing done
    return false;
}
104
/**
 * Runs the indexer for the current page
 *
 * The page is taken from the 'id' request parameter. It is only indexed when
 * its '.indexed' marker file is not newer than the page file and when the
 * indexer lock directory could be created.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @return bool true if the page was indexed, false otherwise
 */
function runIndexer(){
    global $conf;
    print "runIndexer(): started".NL;

    // a missing id parameter is handled like an empty one
    $ID = isset($_REQUEST['id']) ? cleanID($_REQUEST['id']) : '';
    if(!$ID) return false;

    // check if indexing needed
    $last = @filemtime(metaFN($ID,'.indexed'));
    if($last > @filemtime(wikiFN($ID))){
        print "runIndexer(): index for $ID up to date".NL;
        return false;
    }

    // try to acquire a lock
    $lock = $conf['lockdir'].'/_indexer.lock';
    while(!@mkdir($lock,$conf['dmode'])){
        usleep(50);
        if(time()-@filemtime($lock) > 60*5){
            // looks like a stale lock - remove it
            if(!@rmdir($lock)){
                // give up instead of retrying forever when the lock can
                // neither be created nor removed
                print "runIndexer(): stale lock could not be removed".NL;
                return false;
            }
            print "runIndexer(): stale lock removed".NL;
        }else{
            print "runIndexer(): indexer locked".NL;
            return false;
        }
    }
    if($conf['dperm']) chmod($lock, $conf['dperm']);

    require_once(DOKU_INC.'inc/indexer.php');

    // do the work
    idx_addPage($ID);

    // we're finished - save and free lock
    io_saveFile(metaFN($ID,'.indexed'),' ');
    @rmdir($lock);
    print "runIndexer(): finished".NL;
    return true;
}
150
/**
 * Will render the metadata for the page if not exists yet
 *
 * This makes sure pages which are created from outside DokuWiki will
 * gain their data when viewed for the first time.
 *
 * The page is taken from the 'id' request parameter. Nothing is done when the
 * '.meta' file already exists or when the page file itself is missing.
 *
 * @return bool true if the metadata was rendered and saved, false otherwise
 */
function metaUpdate(){
    print "metaUpdate(): started".NL;

    // a missing id parameter is handled like an empty one
    $ID = isset($_REQUEST['id']) ? cleanID($_REQUEST['id']) : '';
    if(!$ID) return false;
    $file = metaFN($ID, '.meta');
    echo "meta file: $file".NL;

    // rendering needed?
    if (@file_exists($file)) return false;
    if (!@file_exists(wikiFN($ID))) return false;

    require_once(DOKU_INC.'inc/common.php');
    require_once(DOKU_INC.'inc/parserutils.php');
    global $conf;


    // gather some additional info from changelog
    // (the pattern only accepts dotted-quad addresses in the second column)
    $info = io_grep($conf['changelog'],
                    '/^(\d+)\t(\d+\.\d+\.\d+\.\d+)\t'.preg_quote($ID,'/').'\t([^\t]+)\t([^\t\n]+)/',
                    0,true);

    $meta = array();
    // !empty() also copes with a non-array result - TODO confirm what
    // io_grep() returns when the changelog can not be read
    if(!empty($info)){
        // the first matching line provides the creation date
        $meta['date']['created'] = $info[0][1];
        foreach($info as $item){
            // lines whose last captured column is '*' are skipped
            // (presumably minor edits - verify against the changelog writer)
            if($item[4] != '*'){
                $meta['date']['modified'] = $item[1];
                if($item[3]){
                    $meta['contributor'][$item[3]] = $item[3];
                }
            }
        }
    }

    $meta = p_render_metadata($ID, $meta);
    io_saveFile($file, serialize($meta));

    echo "metaUpdate(): finished".NL;
    return true;
}
198
/**
 * Builds a Google Sitemap of all public pages known to the indexer
 *
 * The map is placed in the root directory named sitemap.xml.gz (or
 * sitemap.xml when $conf['usegzip'] is off) - This file needs to be
 * writable!
 *
 * The map is only rebuilt when the existing file is empty or older than
 * $conf['sitemap'] days. After saving, Google is pinged with its URL.
 *
 * @author Andreas Gohr
 * @link   https://www.google.com/webmasters/sitemaps/docs/en/about.html
 * @return bool true if a new sitemap was written, false otherwise
 */
function runSitemapper(){
    global $conf;
    print "runSitemapper(): started".NL;
    if(!$conf['sitemap']) return false;

    if($conf['usegzip']){
        $sitemap = 'sitemap.xml.gz';
    }else{
        $sitemap = 'sitemap.xml';
    }
    print "runSitemapper(): using $sitemap".NL;

    if(!is_writable(DOKU_INC.$sitemap)) return false;
    if(@filesize(DOKU_INC.$sitemap) &&
       @filemtime(DOKU_INC.$sitemap) > (time()-($conf['sitemap']*60*60*24))){
       print 'runSitemapper(): Sitemap up to date'.NL;
       return false;
    }

    // an unreadable page index is handled like an empty one
    $pages = @file($conf['cachedir'].'/page.idx');
    if(!is_array($pages)) $pages = array();
    print 'runSitemapper(): creating sitemap using '.count($pages).' pages'.NL;

    // build the sitemap
    // The XML is collected in a string rather than through output buffering,
    // so anything printed by the helpers called below can not end up in it.
    $data  = '<?xml version="1.0" encoding="UTF-8"?>'.NL;
    $data .= '<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">'.NL;
    foreach($pages as $id){
        $id = trim($id);
        $file = wikiFN($id);

        //skip hidden, non existing and restricted files
        if(isHiddenPage($id)) continue;
        $date = @filemtime($file);
        if(!$date) continue;
        if(auth_aclcheck($id,'','') < AUTH_READ) continue;

        $data .= '  <url>'.NL;
        $data .= '    <loc>'.wl($id,'',true).'</loc>'.NL;
        $data .= '    <lastmod>'.date_iso8601($date).'</lastmod>'.NL;
        $data .= '  </url>'.NL;
    }
    $data .= '</urlset>'.NL;

    //save the new sitemap
    io_saveFile(DOKU_INC.$sitemap,$data);

    print 'runSitemapper(): pinging google'.NL;
    //ping google
    $url  = 'http://www.google.com/webmasters/sitemaps/ping?sitemap=';
    $url .= urlencode(DOKU_URL.$sitemap);
    $http = new DokuHTTPClient();
    $http->get($url);
    if($http->error) print 'runSitemapper(): '.$http->error.NL;

    print 'runSitemapper(): finished'.NL;
    return true;
}
267
/**
 * Formats a timestamp as ISO 8601 date
 *
 * The result looks like 2005-08-15T15:52:01+00:00 and is expressed in the
 * timezone PHP's date() function is currently using.
 *
 * @author <ungu at terong dot com>
 * @link http://www.php.net/manual/en/function.date.php#54072
 * @param  int    $int_date UNIX timestamp to format
 * @return string the formatted date
 */
function date_iso8601($int_date) {
   // date('O') gives the offset as +hhmm, the colon has to be added by hand
   $offset = date('O', $int_date);
   $hours   = substr($offset, 0, 3);
   $minutes = substr($offset, 3, 2);
   return date('Y-m-d\TH:i:s', $int_date).$hours.":".$minutes;
}
282
/**
 * Just send a 1x1 pixel blank gif to the browser
 *
 * When the 'debug' request parameter is set, no image is sent. Only a
 * text/plain content type header is set, so the debug messages stay readable.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Harry Fuecks <fuecks@gmail.com>
 */
function sendGIF(){
    // empty() avoids a notice for a missing parameter - such a notice would
    // be printed ahead of the header() calls below
    if(!empty($_REQUEST['debug'])){
        header('Content-Type: text/plain');
        return;
    }
    $img = base64_decode('R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7');
    header('Content-Type: image/gif');
    header('Content-Length: '.strlen($img));
    header('Connection: Close');
    print $img;
    flush();
    // Browser should drop connection after this
    // Thinks it's got the whole image
}
303
304//Setup VIM: ex: et ts=4 enc=utf-8 :
305// No trailing PHP closing tag - no output please!
306// See Note at http://www.php.net/manual/en/language.basic-syntax.instruction-separation.php
307