xref: /dokuwiki/inc/search.php (revision 4d1e8edfed1d8b8025e900215ae05d0633b2a27f)
1<?php
2/**
3 * DokuWiki search functions
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
// abort immediately unless DOKU_INC is defined (presumably set by the DokuWiki bootstrap - TODO confirm)
if(!defined('DOKU_INC')) die('meh.');
10
/**
 * Walk a directory tree and hand every entry to a callback
 *
 * Reads the directory $base/$dir. Entries whose name starts with a dot or
 * an underscore are skipped. All remaining directories are given to the
 * callback first (type 'd'); a directory is only descended into when the
 * callback returns a truthy value. Afterwards the files of the current
 * directory are given to the callback (type 'f'); the callback's return
 * value is not evaluated for files.
 *
 * Nothing happens when the directory can not be opened.
 *
 * @param   array    &$data The results of the search are stored here
 * @param   string    $base Where to start the search
 * @param   callback  $func Callback (function name or array with object,method)
 * @param   array     $opts option array will be given to the Callback
 * @param   string    $dir  Current directory beyond $base
 * @param   int       $lvl  Recursion Level
 * @param   mixed     $sort 'natural' to use natural order sorting (default); 'date' to sort by filemtime; leave empty to skip sorting.
 * @author  Andreas Gohr <andi@splitbrain.org>
 */
function search(&$data,$base,$func,$opts,$dir='',$lvl=1,$sort='natural'){
    $handle = @opendir($base.'/'.$dir);
    if(!$handle) return;

    $subdirs   = array(); // directories, relative to $base
    $entries   = array(); // files, relative to $base
    $fullpaths = array(); // the same files including $base (needed for sorting by date)

    // collect directories and files
    while(($name = readdir($handle)) !== false){
        if(preg_match('/^[\._]/',$name)) continue; // hidden files, '.' and '..'

        $relative = $dir.'/'.$name;
        $absolute = $base.'/'.$relative;
        if(is_dir($absolute)){
            $subdirs[] = $relative;
        }else{
            $entries[]   = $relative;
            $fullpaths[] = $absolute;
        }
    }
    closedir($handle);

    if(!empty($sort)){
        if($sort == 'date'){
            // newest files first
            @array_multisort(array_map('filemtime', $fullpaths), SORT_NUMERIC, SORT_DESC, $entries);
        }else{
            // every other non-empty value means natural order
            natsort($entries);
        }
        // directories are always sorted naturally
        natsort($subdirs);
    }

    // directories first: ask the callback, then descend if it agrees
    foreach($subdirs as $subdir){
        $descend = call_user_func_array($func, array(&$data,$base,$subdir,'d',$lvl,$opts));
        if($descend){
            search($data,$base,$func,$opts,$subdir,$lvl+1,$sort);
        }
    }

    // files last
    foreach($entries as $entry){
        call_user_func_array($func, array(&$data,$base,$entry,'f',$lvl,$opts));
    }
}
64
65/**
66 * The following functions are userfunctions to use with the search
67 * function above. This function is called for every found file or
68 * directory. When a directory is given to the function it has to
69 * decide if this directory should be traversed (true) or not (false)
70 * The function has to accept the following parameters:
71 *
72 * array &$data  - Reference to the result data structure
73 * string $base  - Base usually $conf['datadir']
74 * string $file  - current file or directory relative to $base
75 * string $type  - Type either 'd' for directory or 'f' for file
 * int    $lvl   - Current recursion depth
77 * array  $opts  - option array as given to search()
78 *
79 * return values for files are ignored
80 *
 * All functions should check the ACL for document READ rights.
 * Namespaces (directories) are NOT checked (when sneaky_index is 0) as this
 * would break the recursion (you can have a nonreadable dir above a readable
 * one nested deeper). Also make sure to check the file type (for example
 * in case of lockfiles).
86 */
87
/**
 * Searches for pages beginning with the given query
 *
 * Callback for search(). Replaces the given options with a fixed setup for
 * search_universal(): only pages are listed, and only those whose ID (or
 * one of its colon separated parts) starts with $opts['query'].
 *
 * $opts['query'] is the search query; a missing query is treated as an
 * empty string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function search_qsearch(&$data,$base,$file,$type,$lvl,$opts){
    $query = isset($opts['query']) ? (string) $opts['query'] : '';

    $opts = array(
            // search_universal() wraps this pattern in '/' delimiters itself,
            // so no delimiter may be appended here - a trailing '/' would
            // produce an invalid expression that never matches
            'idmatch'   => '(^|:)'.preg_quote($query,'/'),
            'listfiles' => true,
            'pagesonly' => true,
            );
    return search_universal($data,$base,$file,$type,$lvl,$opts);
}
101
/**
 * Build the browsable index of pages
 *
 * Callback for search(). Replaces the given options with a fixed setup for
 * search_universal(): pages and namespaces are listed, the sneaky_index
 * configuration setting is passed on as 'sneakyacl'.
 *
 * $opts['ns']      is the currently viewed namespace, given as a path
 *                  using '/' as separator (defaults to the root namespace)
 * $opts['nofiles'] when set, only namespaces are listed
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 */
function search_index(&$data,$base,$file,$type,$lvl,$opts){
    global $conf;

    // the viewed namespace is optional, fall back to the root namespace
    $ns = isset($opts['ns']) ? $opts['ns'] : '';

    $opts = array(
        'pagesonly' => true,
        'listdirs' => true,
        'listfiles' => empty($opts['nofiles']),
        'sneakyacl' => $conf['sneaky_index'],
        // Hacky, should rather use recmatch:
        // when $file is the viewed namespace or one of its parents the depth
        // is unlimited (0), otherwise -1 which search_universal() always
        // treats as "depth reached", so other branches are not descended into
        'depth' => preg_match('#^'.preg_quote($file, '#').'(/|$)#','/'.$ns) ? 0 : -1
    );

    return search_universal($data, $base, $file, $type, $lvl, $opts);
}
122
/**
 * List all namespaces
 *
 * Callback for search(). Whatever options were given are discarded;
 * search_universal() is called with directory listing switched on and
 * nothing else.
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 */
function search_namespaces(&$data,$base,$file,$type,$lvl,$opts){
    $dirsOnly = array('listdirs' => true);
    return search_universal($data,$base,$file,$type,$lvl,$dirsOnly);
}
134
/**
 * List all mediafiles in a namespace
 *
 * Callback for search(). Directories are never added to the result, they
 * are only checked against the recursion limit. Each accepted file is added
 * to $data as an array with the keys id, perm, file, size, mtime, writable,
 * isimg and, depending on the file and the options, meta and hash.
 *
 * All options are optional:
 *   $opts['depth']     recursion level, 0 for all
 *   $opts['showmsg']   shows message if invalid media id is used
 *   $opts['skipacl']   skip acl checking
 *   $opts['pattern']   check given pattern
 *   $opts['hash']      add hashes to result list
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 */
function search_media(&$data,$base,$file,$type,$lvl,$opts){

    //we do nothing with directories
    if($type == 'd') {
        if(empty($opts['depth'])) return true; // recurse forever
        // the number of slashes in the relative path is the nesting level
        $depth = substr_count($file,'/');
        if($depth >= $opts['depth']) return false; // depth reached
        return true;
    }

    $info         = array();
    $info['id']   = pathID($file,true);
    if($info['id'] != cleanID($info['id'])){
        // showmsg is optional, so it must not be read unguarded
        if(!empty($opts['showmsg']))
            msg(hsc($info['id']).' is not a valid file name for DokuWiki - skipped',-1);
        return false; // skip non-valid files
    }

    //check ACL for namespace (we have no ACL for mediafiles)
    $info['perm'] = auth_quickaclcheck(getNS($info['id']).':*');
    if(empty($opts['skipacl']) && $info['perm'] < AUTH_READ){
        return false;
    }

    //check pattern filter (errors of a broken pattern are suppressed, the file is skipped then)
    if(!empty($opts['pattern']) && !@preg_match($opts['pattern'], $info['id'])){
        return false;
    }

    $info['file']     = utf8_basename($file);
    $info['size']     = filesize($base.'/'.$file);
    $info['mtime']    = filemtime($base.'/'.$file);
    $info['writable'] = is_writable($base.'/'.$file);
    if(preg_match("/\.(jpe?g|gif|png)$/",$file)){
        $info['isimg'] = true;
        $info['meta']  = new JpegMeta($base.'/'.$file);
    }else{
        $info['isimg'] = false;
    }
    if(!empty($opts['hash'])){
        $info['hash'] = md5(io_readFile(mediaFN($info['id']),false));
    }

    $data[] = $info;

    return false;
}
192
/**
 * This function just lists documents (for RSS namespace export)
 *
 * Callback for search(). Always returns false, so directories are never
 * descended into. Readable .txt files are added to $data as
 * array('id' => page id); everything else is ignored.
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 */
function search_list(&$data,$base,$file,$type,$lvl,$opts){
    // directories and anything that is not a txt file are of no interest
    if($type == 'd' || substr($file,-4) != '.txt') return false;

    // only list what the current user may read
    $id = pathID($file);
    if(auth_quickaclcheck($id) >= AUTH_READ){
        $data[]['id'] = $id;
    }
    return false;
}
212
/**
 * Quicksearch for searching matching pagenames
 *
 * Callback for search(). Directories are always descended into. A .txt file
 * whose path (relative to the base directory) contains the query string is
 * added to $data as array('id' => page id) if the current user may read it.
 *
 * $opts['query'] is the search query
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 */
function search_pagename(&$data,$base,$file,$type,$lvl,$opts){
    // recurse into every directory, ignore everything that is not a txt file
    if($type == 'd' || substr($file,-4) != '.txt') return true;

    // nothing to match against
    if(empty($opts['query'])) return true;

    // simple string matching on the file path
    if(strpos($file,$opts['query']) === false) return true;

    // check ACL
    $id = pathID($file);
    if(auth_quickaclcheck($id) < AUTH_READ) return false;

    $data[]['id'] = $id;
    return true;
}
239
/**
 * Just lists all documents
 *
 * Callback for search(). Each accepted .txt file is added to $data as an
 * array with the keys id, rev, mtime, size and optionally hash.
 *
 * All options are optional:
 * $opts['depth']   recursion level, 0 for all
 * $opts['hash']    do md5 sum of content?
 * $opts['skipacl'] list everything regardless of ACL
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 */
function search_allpages(&$data,$base,$file,$type,$lvl,$opts){
    if(isset($opts['depth']) && $opts['depth']){
        // a directory on the deepest allowed level must not be descended
        // into, a file may sit on that level but not below it
        $parts = explode('/',ltrim($file,'/'));
        if(($type == 'd' && count($parts) >= $opts['depth'])
          || ($type != 'd' && count($parts) > $opts['depth'])){
            return false; // depth reached
        }
    }

    //we do nothing with directories
    if($type == 'd'){
        return true;
    }

    //only search txt files
    if(substr($file,-4) != '.txt') return true;

    $item = array();
    $item['id']   = pathID($file);
    // skipacl is optional, so it must not be read unguarded
    if(empty($opts['skipacl']) && auth_quickaclcheck($item['id']) < AUTH_READ){
        return false;
    }

    $item['rev']   = filemtime($base.'/'.$file);
    $item['mtime'] = $item['rev'];
    $item['size']  = filesize($base.'/'.$file);
    // hash is optional as well
    if(!empty($opts['hash'])){
        $item['hash'] = md5(trim(rawWiki($item['id'])));
    }

    $data[] = $item;
    return true;
}
282
283/* ------------- helper functions below -------------- */
284
/**
 * fulltext sort
 *
 * Comparison callback for usort(). Entries with a higher 'count' come
 * first; entries with the same count are ordered by a string comparison
 * of their 'id'.
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 */
function sort_search_fulltext($a,$b){
    // bigger count wins
    if($b['count'] < $a['count']) return -1;
    if($b['count'] > $a['count']) return 1;

    // same count: fall back to the id
    return strcmp($a['id'],$b['id']);
}
302
/**
 * translates a document path to an ID
 *
 * The path is run through utf8_decodeFN(), slashes become colons and
 * leading or trailing colons are removed. A trailing '.txt' is stripped
 * unless $keeptxt is set.
 *
 * @param   string $path    path of the document
 * @param   bool   $keeptxt keep a trailing .txt extension
 * @return  string the resulting ID
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 * @todo    move to pageutils
 */
function pathID($path,$keeptxt=false){
    $id = str_replace('/',':',utf8_decodeFN($path));
    if(!$keeptxt){
        $id = preg_replace('#\.txt$#','',$id);
    }
    return trim($id, ':');
}
316
317
/**
 * A configurable general purpose callback for the search() function. It
 * covers what many of the former individual callbacks did, at the price
 * of a more complex setup.
 *
 * The behaviour is controlled by the $opts array. All settings are
 * optional, the following ones are recognized:
 *
 * depth      int     recursion depth. 0 for unlimited                       (default: 0)
 * keeptxt    bool    keep .txt extension for IDs                            (default: false)
 * listfiles  bool    include files in listing                               (default: false)
 * listdirs   bool    include namespaces in listing                          (default: false)
 * pagesonly  bool    restrict files to pages                                (default: false)
 * skipacl    bool    do not check for READ permission                       (default: false)
 * sneakyacl  bool    don't recurse into nonreadable dirs                    (default: false)
 * hash       bool    create MD5 hash for files                              (default: false)
 * meta       bool    return file metadata                                   (default: false)
 * filematch  string  match files against this regexp                        (default: '', so accept everything)
 * idmatch    string  match full ID against this regexp                      (default: '', so accept everything)
 * dirmatch   string  match directory against this regexp when adding        (default: '', so accept everything)
 * nsmatch    string  match namespace against this regexp when adding        (default: '', so accept everything)
 * recmatch   string  match directory against this regexp when recursing     (default: '', so accept everything)
 * showmsg    bool    warn about non-ID files                                (default: false)
 * showhidden bool    show hidden files(e.g. by hidepages config) too        (default: false)
 * firsthead  bool    return first heading for pages                         (default: false)
 *
 * All regular expressions are given without delimiters, they are wrapped
 * in '/' by this function.
 *
 * @param array &$data  - Reference to the result data structure
 * @param string $base  - Base usually $conf['datadir']
 * @param string $file  - current file or directory relative to $base
 * @param string $type  - Type either 'd' for directory or 'f' for file
 * @param int    $lvl   - Current recursion depth
 * @param array  $opts  - option array as given to search()
 * @return bool if this directory should be traversed (true) or not (false)
 *              return value is ignored for files
 *
 * @author Andreas Gohr <gohr@cosmocode.de>
 */
function search_universal(&$data,$base,$file,$type,$lvl,$opts){
    $isDir   = ($type == 'd');
    $descend = true;

    // build the ID (directories never lose a .txt suffix) and make sure it is a clean one
    $id = pathID($file,($isDir || !empty($opts['keeptxt'])));
    if($id != cleanID($id)){
        if(!empty($opts['showmsg'])){
            msg(hsc($id).' is not a valid file name for DokuWiki - skipped',-1);
        }
        return false; // invalid names are neither listed nor descended into
    }

    $item = array(
        'id' => $id,
        'ns' => getNS($id),
    );

    // directories: find out whether recursion is wanted at all
    if($isDir){
        if(!empty($opts['depth'])){
            // the number of slashes in the path is the nesting level
            $descend = !(substr_count($file,'/') >= $opts['depth']);
        }
        if($descend && !empty($opts['recmatch']) && !preg_match('/'.$opts['recmatch'].'/',$file)){
            return false; // recursion pattern does not match
        }
    }

    // determine the permission of the current user
    if(!empty($opts['skipacl'])){
        $item['perm'] = AUTH_DELETE;
    }elseif($isDir){
        $item['perm'] = auth_quickaclcheck($id.':*');
    }else{
        $item['perm'] = auth_quickaclcheck($id); //FIXME check namespace for media files
    }

    // apply the listing filters, an unlisted directory may still be descended into
    if($isDir){
        if(empty($opts['listdirs'])) return $descend;

        // sneaky mode: neither list nor recurse into unreadable namespaces
        if(empty($opts['skipacl']) && !empty($opts['sneakyacl']) && $item['perm'] < AUTH_READ) return false;

        $filtered = (!empty($opts['dirmatch']) && !preg_match('/'.$opts['dirmatch'].'/',$file))
                 || (!empty($opts['nsmatch']) && !preg_match('/'.$opts['nsmatch'].'/',$item['ns']));
        if($filtered) return $descend;
    }else{
        $filtered = empty($opts['listfiles'])
                 || (empty($opts['skipacl']) && $item['perm'] < AUTH_READ)
                 || (!empty($opts['pagesonly']) && (substr($file,-4) != '.txt'))
                 || (empty($opts['showhidden']) && isHiddenPage($id))
                 || (!empty($opts['filematch']) && !preg_match('/'.$opts['filematch'].'/',$file))
                 || (!empty($opts['idmatch']) && !preg_match('/'.$opts['idmatch'].'/',$id));
        if($filtered) return $descend;
    }

    // the entry passed all filters, complete its data
    $item['type']  = $type;
    $item['level'] = $lvl;
    $item['open']  = $descend;

    if(!empty($opts['meta'])){
        $path = $base.'/'.$file;
        $item['file']       = utf8_basename($file);
        $item['size']       = filesize($path);
        $item['mtime']      = filemtime($path);
        $item['rev']        = $item['mtime'];
        $item['writable']   = is_writable($path);
        $item['executable'] = is_executable($path);
    }

    // content based extras are only available for files
    if($type == 'f'){
        if(!empty($opts['hash'])){
            $item['hash'] = md5(io_readFile($base.'/'.$file,false));
        }
        if(!empty($opts['firsthead'])){
            $item['title'] = p_get_first_heading($id,METADATA_DONT_RENDER);
        }
    }

    // add the entry to the result
    $data[] = $item;
    return $descend;
}
439
440//Setup VIM: ex: et ts=4 :
441