xref: /dokuwiki/inc/search.php (revision f3f0262c480d7e509b008d37c90aed884532bba8)
1*f3f0262cSandi<?
2*f3f0262cSandi
3*f3f0262cSandirequire_once("inc/common.php");
4*f3f0262cSandi
/**
 * Walks a directory tree below $base and hands every entry to a
 * userfunction.
 *
 * Entries whose name starts with a dot are skipped. Within one
 * directory the subdirectories are processed first (sorted), then the
 * files (sorted). A subdirectory is only descended into when the
 * userfunction returns a true value for it.
 *
 * &$data - result structure, passed through to the userfunction
 * $base  - base directory the walk is relative to
 * $func  - userfunction called as $func($data,$base,$path,$type,$lvl,$opts)
 * $opts  - option array, passed through to the userfunction
 * $dir   - directory to read, relative to $base
 * $lvl   - depth reported to the userfunction, increased per recursion
 *
 * Nothing is returned; a directory that can't be opened is silently
 * ignored.
 */
function search(&$data,$base,$func,$opts,$dir='',$lvl=1){
  $handle = @opendir($base.'/'.$dir);
  if(!$handle) return;

  //collect the entries, separated into directories and files
  $subdirs = array();
  $entries = array();
  while(false !== ($name = readdir($handle))){
    //skip hidden files and upper dirs
    if(preg_match('/^\./',$name)) continue;

    $rel = $dir.'/'.$name;
    if(is_dir($base.'/'.$rel)){
      $subdirs[] = $rel;
    }else{
      $entries[] = $rel;
    }
  }
  closedir($handle);
  sort($entries);
  sort($subdirs);

  //directories first: ask the userfunction, recurse if it agrees
  foreach($subdirs as $subdir){
    if($func($data,$base,$subdir,'d',$lvl,$opts)){
      search($data,$base,$func,$opts,$subdir,$lvl+1);
    }
  }

  //files afterwards, their return value is ignored
  foreach($entries as $entry){
    $func($data,$base,$entry,'f',$lvl,$opts);
  }
}
39*f3f0262cSandi
/**
 * The following functions are userfunctions to use with the search
 * function above. A userfunction is called for every file or
 * directory found. When a directory is given to the function, it has
 * to decide if this directory should be traversed (true) or not (false).
 * The function has to accept the following parameters:
 *
 * &$data - Reference to the result data structure
 * $base  - Base, usually $conf['datadir']
 * $file  - Current file or directory relative to $base
 * $type  - Type, either 'd' for directory or 'f' for file
 * $lvl   - Current recursion depth
 * $opts  - Option array as given to search()
 *
 * Return values for files are ignored.
 *
 * All functions should check the ACL for document READ rights.
 * Namespaces (directories) are NOT checked as this would break
 * the recursion (you can have a non-readable dir above a readable
 * one nested deeper).
 */
61*f3f0262cSandi
/**
 * This function builds the browsable index of pages
 *
 * Directories are always added to $data, but search() is only told to
 * descend into those that lie on the path to the current namespace.
 * Files are added when they end in .txt and the ACL check grants at
 * least AUTH_READ.
 *
 * $opts['ns'] is the current namespace. It is compared against the
 * directory path, so its parts have to be separated by '/'.
 *
 * Each added item is array('id' => ..., 'type' => ..., 'level' => ...).
 *
 * Returns true when a directory should be traversed, false otherwise.
 */
function search_index(&$data,$base,$file,$type,$lvl,$opts){
  $return = true;

  if($type == 'd'){
    $ns = isset($opts['ns']) ? $opts['ns'] : '';
    //the directory name is used literally, so it has to be quoted:
    //names like "c++" or "a.b" would otherwise act as regexp syntax
    if(!preg_match('#^'.preg_quote($file,'#').'(/|$)#','/'.$ns)){
      //add but don't recurse
      $return = false;
    }
  }elseif($type == 'f' && !preg_match('#\.txt$#',$file)){
    //don't add
    return false;
  }

  //check ACL (files only, directories are always listed)
  $id = pathID($file);
  if($type=='f' && auth_quickaclcheck($id) < AUTH_READ){
    return false;
  }

  $data[]=array( 'id'    => $id,
                 'type'  => $type,
                 'level' => $lvl );
  return $return;
}
89*f3f0262cSandi
/**
 * Collects every namespace (directory) below the base
 *
 * Files are ignored. Each directory is appended to $data as
 * array('id' => ..., 'type' => ..., 'level' => ...). The function
 * always returns true, so search() descends into every directory.
 */
function search_namespaces(&$data,$base,$file,$type,$lvl,$opts){
  //only non-file entries are recorded
  if($type != 'f'){
    $data[] = array(
      'id'    => pathID($file),
      'type'  => $type,
      'level' => $lvl
    );
  }
  return true;
}
102*f3f0262cSandi
/**
 * Collects the mediafiles of a single namespace
 *
 * Directories are neither listed nor traversed. A file is listed when
 * the ACL check on its namespace grants at least AUTH_READ. Each entry
 * appended to $data holds 'id', 'file' (basename), 'size' and 'isimg';
 * files ending in jpg/jpeg/gif/png additionally get 'info' with the
 * result of getimagesize().
 *
 * Always returns false.
 */
function search_media(&$data,$base,$file,$type,$lvl,$opts){
  //directories are skipped and not descended into
  if($type == 'd') return false;

  $id = pathID($file);

  //there is no ACL for mediafiles, so the namespace is checked instead
  if(auth_quickaclcheck(getNS($id).':*') < AUTH_READ){
    return false;
  }

  $path  = $base.'/'.$file;
  $isimg = (bool) preg_match("/\.(jpe?g|gif|png)$/",$file);

  $info = array(
    'id'    => $id,
    'file'  => basename($file),
    'size'  => filesize($path),
    'isimg' => $isimg
  );
  if($isimg){
    $info['info'] = getimagesize($path);
  }
  $data[] = $info;

  return false;
}
130*f3f0262cSandi
/**
 * Plain listing of the documents in one directory (used for the RSS
 * namespace export)
 *
 * Only .txt files passing the ACL check (AUTH_READ or better) are
 * appended to $data as array('id' => ...). The function always returns
 * false, so search() never descends into subdirectories.
 */
function search_list(&$data,$base,$file,$type,$lvl,$opts){
  //directories are neither listed nor traversed
  if($type == 'd') return false;
  //anything but a page file is of no interest
  if(!preg_match('#\.txt$#',$file)) return false;

  //add the page when the ACL allows reading it
  $id = pathID($file);
  if(auth_quickaclcheck($id) >= AUTH_READ){
    $data[] = array('id' => $id);
  }
  return false;
}
147*f3f0262cSandi
/**
 * Quicksearch for searching matching pagenames
 *
 * $opts['query'] is the search query. It is looked up as a plain
 * substring in the file path relative to the base (the path still
 * carries its '/' separators and the '.txt' extension). An empty or
 * missing query matches nothing.
 *
 * Matching pages that pass the ACL check (AUTH_READ or better) are
 * appended to $data as array('id' => ...).
 *
 * Returns true for directories, so the whole tree is searched; the
 * return value for files is ignored by search().
 */
function search_pagename(&$data,$base,$file,$type,$lvl,$opts){
  //we do nothing with directories
  if($type == 'd') return true;
  //only search txt files
  if(!preg_match('#\.txt$#',$file)) return true;

  //an empty needle would either trigger a warning or match every page
  $query = isset($opts['query']) ? (string) $opts['query'] : '';
  if($query === '') return true;

  //simple stringmatching
  if(strpos($file,$query) !== false){
    //check ACL
    $id = pathID($file);
    if(auth_quickaclcheck($id) < AUTH_READ){
      return false;
    }
    $data[]['id'] = $id;
  }

  return true;
}
171*f3f0262cSandi
/**
 * Search for backlinks to a given page
 *
 * Every readable .txt page is scanned for [[...]] links. When one of
 * them resolves to the searched page, the linking page is appended to
 * $data as array('id' => ...) - once per page, further links in the
 * same page are not looked at.
 *
 * $opts['ns']    namespace of the page
 * $opts['name']  name of the page without namespace
 *
 * Returns true for directories, so the whole tree is searched; the
 * return value for files is ignored by search().
 */
function search_backlinks(&$data,$base,$file,$type,$lvl,$opts){
  //we do nothing with directories
  if($type == 'd') return true;
  //only search txt files
  if(!preg_match('#\.txt$#',$file)) return true;

  //construct current id and namespace
  $cid = pathID($file);
  $cns = getNS($cid);

  //check ACL first, pages that may not be read are never loaded
  if(auth_quickaclcheck($cid) < AUTH_READ){
    return false;
  }

  //get text
  $text = io_readfile($base.'/'.$file);

  //absolute search id
  $sid = cleanID($opts['ns'].':'.$opts['name']);

  //match all links
  //FIXME may be incorrect because of code blocks
  //      CamelCase isn't supported, too
  $matches = array();
  preg_match_all('#\[\[(.+?)\]\]#si',$text,$matches,PREG_SET_ORDER);
  foreach($matches as $match){
    //get ID from link (the part before an optional "|title")
    //and discard most non wikilinks
    list($mid) = explode('|',$match[1],2);
    if(preg_match("#^(https?|telnet|gopher|file|wais|ftp|ed2k|irc)://#",$mid)) continue;
    if(preg_match("#\w+>#",$mid)) continue;

    $mns = getNS($mid);
    if($mns===false){
      //no namespace in link? add current
      $mid = "$cns:$mid";
    }elseif(strpos($mns,'.')===0){
      //namespace starting with "." - prepend current namespace
      $mid = $cns.":".substr($mid,1);
    }
    $mid = cleanID($mid);

    if ($mid == $sid){
      $data[]['id'] = $cid;
      break;
    }
  }

  return true;
}
225*f3f0262cSandi
/**
 * Fulltextsearch
 *
 * $opts['query'] is the search query. It is split at whitespace into
 * tokens; a page is a hit when at least one token occurs in its text
 * (case-insensitive). A query without any token matches nothing.
 *
 * For every hit an entry is appended to $data:
 *   'id'      - id of the page
 *   'count'   - number of token occurrences in the page
 *   'snippet' - HTML excerpt around the first occurrence, with the
 *               occurrences wrapped in <span class="search_hit">
 *
 * Only .txt pages passing the ACL check (AUTH_READ or better) are read.
 *
 * Returns true for directories, so the whole tree is searched; the
 * return value for files is ignored by search().
 */
function search_fulltext(&$data,$base,$file,$type,$lvl,$opts){
  //we do nothing with directories
  if($type == 'd') return true;
  //only search txt files
  if(!preg_match('#\.txt$#',$file)) return true;

  //split the query into tokens - empty tokens are dropped because an
  //empty alternative in the regexp would match at every position
  $query  = isset($opts['query']) ? (string) $opts['query'] : '';
  $tokens = preg_split('/\s+/',$query,-1,PREG_SPLIT_NO_EMPTY);
  if(!$tokens) return true;

  //check ACL
  $id = pathID($file);
  if(auth_quickaclcheck($id) < AUTH_READ){
    return false;
  }

  //get text
  $text = io_readfile($base.'/'.$file);

  //create regexp from the tokens
  $quoted = array();
  foreach($tokens as $token){
    $quoted[] = preg_quote($token,'#');
  }
  $qpreg = '('.join('|',$quoted).')';

  //do the fulltext search, remembering where the hits are
  $matches = array();
  $cnt = preg_match_all('#'.$qpreg.'#si',$text,$matches,PREG_OFFSET_CAPTURE);
  if($cnt){
    //cut out the surroundings of the first hit: up to 100 bytes
    //before it, the hit itself and the bytes following it
    $first = $matches[0][0];
    $f = $first[1] - 100;
    if($f < 0) $f = 0;
    $l = strlen($first[0]) + 200;
    $raw = substr($text,$f,$l);

    //highlight on the raw text and escape piece by piece, this way the
    //query can never match inside the generated markup or an entity
    $snippet = '';
    $parts = preg_split('#'.$qpreg.'#si',$raw,-1,PREG_SPLIT_DELIM_CAPTURE);
    foreach($parts as $i => $part){
      $part = htmlspecialchars($part);
      if($i % 2){
        //odd positions hold the captured hits
        $snippet .= '<span class="search_hit">'.$part.'</span>';
      }else{
        $snippet .= $part;
      }
    }
    $snippet = '<span class="search_sep"> ... </span>'.
               $snippet.
               '<span class="search_sep"> ... </span>';

    $data[] = array(
      'id'      => $id,
      'count'   => $cnt,
      'snippet' => $snippet,
    );
  }

  return true;
}
275*f3f0262cSandi
/**
 * usort() callback for the result list built by search_fulltext
 *
 * Entries with the higher 'count' come first; entries with the same
 * count are ordered by comparing their 'id' with strcmp().
 */
function sort_search_fulltext($a,$b){
  $hitsA = $a['count'];
  $hitsB = $b['count'];

  if($hitsA < $hitsB) return 1;
  if($hitsA > $hitsB) return -1;

  //same number of hits, fall back to the id
  return strcmp($a['id'],$b['id']);
}
289*f3f0262cSandi
/**
 * Turns a document path into an ID
 *
 * Slashes become colons, a trailing ".txt" is removed and colons at
 * the start and at the end are stripped.
 */
function pathID($path){
  $id = strtr($path,'/',':');

  //the patterns are applied one after another, in this order
  $strip = array('#\.txt$#','#^:+#','#:+$#');
  return preg_replace($strip,'',$id);
}
300*f3f0262cSandi
301*f3f0262cSandi?>
302