#!/usr/bin/php
<?php

/**
 * Command line indexer for the docsearch plugin.
 *
 * Wipes the "docsearch" folder below the save dir, converts every media file
 * that has a configured converter into a text file and feeds the result to the
 * DokuWiki indexer, using a separate set of data/index/cache/meta/lock folders.
 *
 * Usage: cron.php [animal]
 */

// ensure that the request comes from the cli
if('cli' !== php_sapi_name()) die();

error_reporting(E_ALL & ~E_NOTICE);

// allow setting an animal as first commandline parameter for use in farming
if(isset($argv[1])) {
    $_SERVER['animal'] = $argv[1];
}

if(!defined('DOKU_INC')) define('DOKU_INC', realpath(dirname(__FILE__) . '/../../../') . '/');
require_once(DOKU_INC . 'inc/init.php');

/**
 * Walks recursively through a directory and reports all files to inspect()
 *
 * Unreadable paths, non-directories and directories that cannot be opened
 * are silently skipped.
 *
 * @param string $dir the folder to walk through
 */
function walk($dir) {

    if(!is_readable($dir)) return;
    if(!is_dir($dir)) return;

    $handle = opendir($dir);
    if(!$handle) return;

    while(false !== ($entry = readdir($handle))) {
        if($entry == '.' || $entry == '..') continue;

        $path = "$dir/$entry";
        if(is_file($path)) {
            inspect($path);
            continue;
        }

        if(is_dir($path)) {
            walk($path);
            continue;
        }
    }

    // release the handle, otherwise one handle per visited directory stays open
    closedir($handle);
}

/**
 * Try to convert a given file to text and add it to the DocSearch index
 *
 * Files without an extension, or with an extension that has no converter
 * configured in $conf['docsearch'], are ignored. The converter command is
 * taken from $conf['docsearch'][<extension>]; its %in% and %out% placeholders
 * are replaced by the shell-escaped input and output paths.
 *
 * Reads the globals $input (media base dir), $output (text output base dir)
 * and $conf, and sets the global $ID to the id of the page being indexed.
 *
 * @param string $file File to inspect
 */
function inspect($file) {
    global $input;
    global $output;
    global $conf;
    global $ID;

    // pathinfo() only reports what follows the last dot of the file name, so a
    // dot-less name such as "xpdf" is not mistaken for a pdf file
    $extension = pathinfo($file, PATHINFO_EXTENSION);

    // no file extension -> nothing we could pick a converter for
    if($extension === '') {
        return;
    }

    // unknown extension -> return
    if(!isset($conf['docsearch'][$extension])) {
        return;
    }

    // prepare folder and paths: strip the media dir prefix to get the relative name
    $inputPath = preg_quote($input, '/');
    $abstract  = preg_replace('/^' . $inputPath . '/', '', $file, 1);
    $out       = $output . $abstract . '.txt';
    $id        = str_replace('/', ':', $abstract);
    io_mkdir_p(dirname($out));

    // prepare command
    $cmd = $conf['docsearch'][$extension];
    $cmd = str_replace('%in%', escapeshellarg($file), $cmd);
    $cmd = str_replace('%out%', escapeshellarg($out), $cmd);

    // Run command
    $exitCode = 0;
    system($cmd, $exitCode);
    if($exitCode != 0) fwrite(STDERR, "Command failed: $cmd\n");

    // a failing converter may not have written anything - there is nothing to index then
    $text = is_file($out) ? file_get_contents($out) : false;
    if($text === false) {
        fwrite(STDERR, "No text output for: $file\n");
        return;
    }

    // check file encoding for bad utf8 characters - if a bad thing is found
    // convert assuming latin1 as source encoding
    if(!utf8_check($text)) {
        $text = utf8_encode($text);
        file_put_contents($out, $text);
    }

    // add the page to the index
    $ID = cleanID($id);
    idx_addPage($ID);
}

/**
 * Delete a file, or a folder and its contents (recursive algorithm)
 *
 * Symbolic links are removed themselves and never followed.
 *
 * @author  Aidan Lister <aidan@php.net>
 * @version 1.0.3
 * @link    http://aidanlister.com/repos/v/function.rmdirr.php
 * @param   string $dirname Directory to delete
 * @return  bool   Returns TRUE on success, FALSE on failure
 */
function rmdirr($dirname) {
    // Links first: file_exists() follows links and reports FALSE for dangling
    // ones, which would leave them behind and make the parent rmdir() fail
    if(is_link($dirname)) {
        return unlink($dirname);
    }

    // Sanity check
    if(!file_exists($dirname)) {
        return false;
    }

    // Simple delete for anything that is not a folder
    if(!is_dir($dirname)) {
        return unlink($dirname);
    }

    // Loop through the folder
    $dir = dir($dirname);
    if(!$dir) {
        return false;
    }

    $ok = true;
    while(false !== ($entry = $dir->read())) {
        // Skip pointers
        if($entry == '.' || $entry == '..') {
            continue;
        }

        // Recurse, remembering failures but still trying the remaining entries
        if(!rmdirr($dirname . DIRECTORY_SEPARATOR . $entry)) {
            $ok = false;
        }
    }

    // Clean up
    $dir->close();
    return rmdir($dirname) && $ok;
}

/******************************************************************************
 ********************************** Script ************************************
 ******************************************************************************/

$ID = '';

// load the plugin converter settings.

$converter_conf    = DOKU_INC . 'lib/plugins/docsearch/conf/converter.php';
$conf['docsearch'] = confToHash($converter_conf);

// no converters == no work ;-)
if(empty($conf['docsearch'])) {
    fwrite(STDERR, "No converters found in $converter_conf\n");
    exit(1);
}

$conf['docsearchext'] = array_keys($conf['docsearch']);

// build the data paths

// the base "data" dir; a save dir starting with a dot is relative to DOKU_INC
$base = '';

if(strpos((string) $conf['savedir'], '.') === 0) {
    $base = DOKU_INC;
}
$base .= $conf['savedir'] . '/';

// cleanup old data
rmdirr($base . 'docsearch');

// build the important paths
$input  = $conf['mediadir'];
$output = $base . 'docsearch/pages';
$index  = $base . 'docsearch/index';
$cache  = $base . 'docsearch/cache';
$meta   = $base . 'docsearch/meta';
$locks  = $base . 'docsearch/locks';

// create output dir
io_mkdir_p($output);
io_mkdir_p($index);
io_mkdir_p($cache);
io_mkdir_p($meta);
io_mkdir_p($locks);

// change the data folders so the indexer works on the converted text files
$conf['datadir']  = $output;
$conf['indexdir'] = $index;
$conf['cachedir'] = $cache;
$conf['metadir']  = $meta;
$conf['lockdir']  = $locks;

// walk through the media dir and convert every file that has a converter
walk($input);