<?php
/**
 * DokuWiki search functions
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  require_once(DOKU_INC.'inc/common.php');

/**
 * recurse directory
 *
 * This function recurses into a given base directory
 * and calls the supplied function for each file and directory.
 *
 * Entries whose name starts with a dot or an underscore are skipped.
 * Within one directory all subdirectories are handled (in sorted order)
 * before the files (also in sorted order). A subdirectory is only
 * descended into when the callback returns a true value for it.
 *
 * @param array  &$data  result structure, handed through to the callback
 * @param string $base   base directory
 * @param string $func   name of the callback function (see below)
 * @param array  $opts   options, handed through to the callback
 * @param string $dir    current directory relative to $base
 * @param int    $lvl    current recursion depth, starts at 1
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function search(&$data,$base,$func,$opts,$dir='',$lvl=1){
  $dirs   = array();
  $files  = array();

  //read in directories and files
  $dh = @opendir($base.'/'.$dir);
  if(!$dh) return;
  while(($file = readdir($dh)) !== false){
    if(preg_match('/^[\._]/',$file)) continue; //skip hidden files and upper dirs
    if(is_dir($base.'/'.$dir.'/'.$file)){
      $dirs[] = $dir.'/'.$file;
      continue;
    }
    $files[] = $dir.'/'.$file;
  }
  closedir($dh);
  sort($files);
  sort($dirs);

  //give directories to userfunction then recurse
  //(separate loop variable so the $dir parameter is not overwritten)
  foreach($dirs as $subdir){
    if ($func($data,$base,$subdir,'d',$lvl,$opts)){
      search($data,$base,$func,$opts,$subdir,$lvl+1);
    }
  }
  //now handle the files
  foreach($files as $file){
    $func($data,$base,$file,'f',$lvl,$opts);
  }
}

/**
 * The following functions are userfunctions to use with the search
 * function above. Such a function is called for every found file or
 * directory. When a directory is given to the function it has to
 * decide if this directory should be traversed (true) or not (false).
 * The function has to accept the following parameters:
 *
 * &$data - Reference to the result data structure
 * $base  - Base usually $conf['datadir']
 * $file  - current file or directory relative to $base
 * $type  - Type either 'd' for directory or 'f' for file
 * $lvl   - Current recursion depth
 * $opts  - option array as given to search()
 *
 * return values for files are ignored
 *
 * All functions should check the ACL for document READ rights.
 * Namespaces (directories) are NOT checked as this would break
 * the recursion (you can have a nonreadable dir over a readable
 * one deeper nested)
 */

/**
 * Searches for pages beginning with the given query
 *
 * $opts['query'] is the prefix the page ID has to start with.
 * Directories are never descended into (no handling yet).
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function search_qsearch(&$data,$base,$file,$type,$lvl,$opts){
  if($type == 'd'){
    return false; //no handling yet
  }

  //get id
  $id = pathID($file);

  //check if it matches the query
  if(!preg_match('/^'.preg_quote($opts['query'],'/').'/u',$id)){
    return false;
  }

  //check ACL
  if(auth_quickaclcheck($id) < AUTH_READ){
    return false;
  }

  $data[]=array( 'id'    => $id,
                 'type'  => $type,
                 'level' => 1,
                 'open'  => true);
  return true;
}

/**
 * Build the browsable index of pages
 *
 * $opts['ns'] is the current namespace
 *
 * Directories are always added; they are only marked open (and
 * recursed into) when they are on the path to $opts['ns'].
 * Files are only added when they end in .txt and are readable.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function search_index(&$data,$base,$file,$type,$lvl,$opts){
  $return = true;

  //the directory name is quoted so that regexp special chars in it
  //are matched literally against the namespace path
  if($type == 'd' && !preg_match('#^'.preg_quote($file,'#').'(/|$)#','/'.$opts['ns'])){
    //add but don't recurse
    $return = false;
  }elseif($type == 'f' && !preg_match('#\.txt$#',$file)){
    //don't add
    return false;
  }

  //check ACL
  $id = pathID($file);
  if($type=='f' && auth_quickaclcheck($id) < AUTH_READ){
    return false;
  }

  $data[]=array( 'id'    => $id,
                 'type'  => $type,
                 'level' => $lvl,
                 'open'  => $return );
  return $return;
}

/**
 * List all namespaces
 *
 * Adds an entry for every directory and always recurses.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function search_namespaces(&$data,$base,$file,$type,$lvl,$opts){
  if($type == 'f') return true; //nothing to do on files

  $id = pathID($file);
  $data[]=array( 'id'    => $id,
                 'type'  => $type,
                 'level' => $lvl );
  return true;
}

/**
 * List all mediafiles in a namespace
 *
 * Never recurses into directories. Each file is added with its id,
 * basename, size and - for jpg, gif and png files - its image info.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function search_media(&$data,$base,$file,$type,$lvl,$opts){
  //we do nothing with directories
  if($type == 'd') return false;

  $info         = array();
  $info['id']   = pathID($file);

  //check ACL for namespace (we have no ACL for mediafiles)
  if(auth_quickaclcheck(getNS($info['id']).':*') < AUTH_READ){
    return false;
  }

  $info['file'] = basename($file);
  $info['size'] = filesize($base.'/'.$file);
  if(preg_match("/\.(jpe?g|gif|png)$/",$file)){
    $info['isimg'] = true;
    $info['info']  = getimagesize($base.'/'.$file);
  }else{
    $info['isimg'] = false;
  }
  $data[] = $info;

  return false;
}

/**
 * This function just lists documents (for RSS namespace export)
 *
 * Never recurses into directories. Only readable .txt files are added.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function search_list(&$data,$base,$file,$type,$lvl,$opts){
  //we do nothing with directories
  if($type == 'd') return false;
  if(preg_match('#\.txt$#',$file)){
    //check ACL
    $id = pathID($file);
    if(auth_quickaclcheck($id) < AUTH_READ){
      return false;
    }
    $data[]['id'] = $id;
  }
  return false;
}

/**
 * Quicksearch for searching matching pagenames
 *
 * $opts['query'] is the search query
 *
 * The query is matched as a plain substring of the file path.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function search_pagename(&$data,$base,$file,$type,$lvl,$opts){
  //we do nothing with directories
  if($type == 'd') return true;
  //only search txt files
  if(!preg_match('#\.txt$#',$file)) return true;

  //simple stringmatching
  if(strpos($file,$opts['query']) !== false){
    //check ACL
    $id = pathID($file);
    if(auth_quickaclcheck($id) < AUTH_READ){
      return false;
    }
    $data[]['id'] = $id;
  }

  return true;
}

/**
 * Search for backlinks to a given page
 *
 * $opts['ns']    namespace of the page
 * $opts['name']  name of the page without namespace
 *
 * A page is added (once) when one of its internal links - or CamelCase
 * links if $conf['camelcase'] is enabled - resolves to the searched page.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function search_backlinks(&$data,$base,$file,$type,$lvl,$opts){
  //needed for the camelcase setting checked below
  global $conf;

  //we do nothing with directories
  if($type == 'd') return true;
  //only search txt files
  if(!preg_match('#\.txt$#',$file)) return true;

  //absolute search id
  $sid = cleanID($opts['ns'].':'.$opts['name']);

  //current id and namespace
  $cid = pathID($file);
  $cns = getNS($cid);

  //check ACL
  if(auth_quickaclcheck($cid) < AUTH_READ){
    return false;
  }

  //fetch instructions
  //NOTE: $file is appended without a separator here; paths handed in
  //by search() already start with a slash
  require_once(DOKU_INC.'inc/parserutils.php');
  $instructions = p_cached_instructions($base.$file,true);
  if(is_null($instructions)) return false;

  $camelcase = !empty($conf['camelcase']);

  //check all links for match
  foreach($instructions as $ins){
    if($ins[0] == 'internallink' || ($camelcase && $ins[0] == 'camelcaselink') ){
      $mid = $ins[1][0];
      resolve_pageid($cns,$mid,$exists); //exists is not used
      if($mid == $sid){
        //we have a match - finish
        $data[]['id'] = $cid;
        break;
      }
    }
  }

  return false;
}

/**
 * Fulltextsearch
 *
 * $opts['query'] is the search query
 *
 * The query is split on whitespace; a page matches when at least one
 * of the words occurs in it. Each match is added with its id, the
 * number of hits and an HTML snippet with the hits highlighted.
 * An empty query matches nothing.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function search_fulltext(&$data,$base,$file,$type,$lvl,$opts){
  //we do nothing with directories
  if($type == 'd') return true;
  //only search txt files
  if(!preg_match('#\.txt$#',$file)) return true;

  //check ACL
  $id = pathID($file);
  if(auth_quickaclcheck($id) < AUTH_READ){
    return false;
  }

  //split query into words - empty words would create an empty
  //alternative in the regexp which matches everywhere
  $tokens = preg_split('/\s+/',$opts['query'],-1,PREG_SPLIT_NO_EMPTY);
  if(!$tokens) return true;

  //get text
  $text = io_readfile($base.'/'.$file);
  //lowercase text (u modifier does not help with case)
  $lctext = utf8_strtolower($text);

  //create regexp from queries
  $quoted = array();
  foreach($tokens as $token){
    $quoted[] = preg_quote($token,'#');
  }
  $qpreg = '('.join('|',$quoted).')';

  //do the fulltext search
  $matches = array();
  if($cnt = preg_match_all('#'.$qpreg.'#usi',$lctext,$matches)){
    //this is not the best way for snippet generation but the fastest I could find
    //locate the first query word that occurs in the text - the word has
    //to be lowercased as well because it is looked up in the lowercased text
    $p = false;
    $q = $tokens[0];
    foreach($tokens as $token){
      $p = utf8_strpos($lctext,utf8_strtolower($token));
      if($p !== false){
        $q = $token;
        break;
      }
    }
    $f = ($p === false) ? 0 : $p - 100;
    $l = utf8_strlen($q) + 200;
    if($f < 0) $f = 0;
    $raw = utf8_substr($text,$f,$l);

    //highlight the hits in the text part only, so that a query can never
    //match inside the separator markup or inside an escaped entity
    $parts = preg_split('#'.$qpreg.'#usi',$raw,-1,PREG_SPLIT_DELIM_CAPTURE);
    if($parts === false){
      $body = htmlspecialchars($raw);
    }else{
      $body = '';
      foreach($parts as $idx => $part){
        //odd indexes hold the captured hits
        if($idx % 2){
          $body .= '<span class="search_hit">'.htmlspecialchars($part).'</span>';
        }else{
          $body .= htmlspecialchars($part);
        }
      }
    }
    $snippet = '<span class="search_sep"> ... </span>'.
               $body.
               '<span class="search_sep"> ... </span>';

    $data[] = array(
      'id'      => $id,
      'count'   => $cnt,
      'snippet' => $snippet,
    );
  }

  return true;
}

/**
 * fulltext sort
 *
 * Callback sort function for use with usort to sort the data
 * structure created by search_fulltext. Sorts descending by count,
 * entries with the same count are ordered by id.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function sort_search_fulltext($a,$b){
  if($a['count'] > $b['count']){
    return -1;
  }elseif($a['count'] < $b['count']){
    return 1;
  }else{
    return strcmp($a['id'],$b['id']);
  }
}

/**
 * translates a document path to an ID
 *
 * Slashes become colons, a trailing .txt is removed and leading
 * and trailing colons are stripped.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @todo   move to pageutils
 */
function pathID($path){
  $id = utf8_decodeFN($path);
  $id = str_replace('/',':',$id);
  $id = preg_replace('#\.txt$#','',$id);
  $id = preg_replace('#^:+#','',$id);
  $id = preg_replace('#:+$#','',$id);
  return $id;
}


//Setup VIM: ex: et ts=2 enc=utf-8 :