1#!/usr/bin/php
2<?php
3
// this script may only be started from the command line
if(php_sapi_name() != 'cli') {
    die();
}

// report everything except notices
error_reporting(E_ALL & ~E_NOTICE);

// farming support: an optional first commandline parameter selects the animal
if(isset($argv[1])) $_SERVER['animal'] = $argv[1];

// unless already defined, the DokuWiki root is expected three levels above this file
if(!defined('DOKU_INC')) {
    define('DOKU_INC', realpath(dirname(__FILE__) . '/../../../') . '/');
}
require_once(DOKU_INC . 'inc/init.php');
16
/**
 * Recursively traverses a directory tree and hands every regular file to inspect()
 *
 * Paths that are not readable, are not directories or cannot be opened are
 * skipped silently.
 *
 * @param string $dir the folder to walk through
 */
function walk($dir) {
    if(!is_readable($dir) || !is_dir($dir)) return;

    $dh = opendir($dir);
    if(!$dh) return;

    while(($entry = readdir($dh)) !== false) {
        // skip the self and parent pointers
        if($entry === '.' || $entry === '..') continue;

        $path = "$dir/$entry";
        if(is_file($path)) {
            // regular file: let inspect() decide what to do with it
            inspect($path);
        } elseif(is_dir($path)) {
            // sub folder: descend
            walk($path);
        }
    }
}
45
/**
 * Try to convert a given file to text and add it to the DocSearch index
 *
 * Files whose extension is not listed in $conf['docsearchext'] are ignored.
 * For all others the converter command from $conf['docsearch'] is executed
 * with %in% and %out% replaced by the shell escaped source and target path.
 * The target path is the source path relative to the global $input, placed
 * below the global $output with '.txt' appended. Output that is not valid
 * UTF-8 is re-encoded assuming latin1, then the page is passed to the indexer.
 *
 * When the converter leaves no readable output file behind, the file is
 * skipped instead of indexing a page that does not exist.
 *
 * Sets the global $ID to the cleaned page id of the indexed document.
 *
 * @param string $file File to inspect
 */
function inspect($file) {
    global $input;
    global $output;
    global $conf;
    global $ID;

    // a file without an extension can not be mapped to a converter
    $extension = pathinfo($file, PATHINFO_EXTENSION);
    if($extension === '') {
        return;
    }

    // unknown extension -> return
    // (loose comparison on purpose: purely numeric array keys are integers)
    if(!in_array($extension, $conf['docsearchext'])) {
        return;
    }

    // prepare folder and paths
    $inputPath = preg_quote($input, '/');
    $abstract = preg_replace('/^' . $inputPath . '/', '', $file, 1);
    $out = $output . $abstract . '.txt';
    $id = str_replace('/', ':', $abstract);
    io_mkdir_p(dirname($out));

    // prepare command
    $cmd = $conf['docsearch'][$extension];
    $cmd = str_replace('%in%', escapeshellarg($file), $cmd);
    $cmd = str_replace('%out%', escapeshellarg($out), $cmd);

    // Run command
    $exitCode = 0;
    system($cmd, $exitCode);
    if($exitCode != 0) fwrite(STDERR, "Command failed: $cmd\n");

    // a converter that left no readable output behind gives us nothing to index;
    // output written despite a non-zero exit code is still used
    if(!is_file($out) || !is_readable($out)) {
        if($exitCode == 0) fwrite(STDERR, "No output produced: $cmd\n");
        return;
    }

    $text = file_get_contents($out);
    if($text === false) {
        return;
    }

    // check file encoding for bad utf8 characters - if a bad thing is found convert assuming latin1 as source encoding
    if(!utf8_check($text)) {
        $text = utf8_encode($text);
        file_put_contents($out, $text);
    }

    // add the page to the index
    $ID = cleanID($id);
    idx_addPage($ID);
}
103
/**
 * Delete a file, or a folder and its contents (recursive algorithm)
 *
 * Symbolic links are removed themselves and never followed; this includes
 * dangling links whose target no longer exists. Failures while deleting
 * entries inside a folder are not reported individually, they surface as a
 * failing rmdir() of the folder itself.
 *
 * @author      Aidan Lister <aidan@php.net>
 * @version     1.0.3
 * @link        http://aidanlister.com/repos/v/function.rmdirr.php
 * @param       string $dirname    Directory to delete
 * @return      bool     Returns TRUE on success, FALSE on failure
 */
function rmdirr($dirname) {
    // Sanity check - file_exists() resolves symlinks and reports FALSE for a
    // dangling link, so links have to be tested separately
    if(!file_exists($dirname) && !is_link($dirname)) {
        return false;
    }

    // Simple delete for everything that is not a real directory
    // (is_link() goes first so a link pointing to a folder is not descended into)
    if(is_link($dirname) || !is_dir($dirname)) {
        return unlink($dirname);
    }

    // Loop through the folder
    $dir = dir($dirname);
    if(!$dir) {
        // folder could not be opened, so it can not be emptied either
        return false;
    }
    while(false !== ($entry = $dir->read())) {
        // Skip pointers
        if($entry == '.' || $entry == '..') {
            continue;
        }

        // Recurse
        rmdirr($dirname . DIRECTORY_SEPARATOR . $entry);
    }

    // Clean up
    $dir->close();
    return rmdir($dirname);
}
140
141/******************************************************************************
142 ********************************** Script ************************************
143 ******************************************************************************/
144
// start without a current page id
$ID = '';

// load the converter settings shipped with the plugin
$converter_conf = DOKU_INC . 'lib/plugins/docsearch/conf/converter.php';
$conf['docsearch'] = confToHash($converter_conf);

// no converters == no work ;-)
if(empty($conf['docsearch'])) {
    fwrite(STDERR, "No converters found in $converter_conf\n");
    exit(1);
}

// the handled file extensions are the keys of the converter table
$conf['docsearchext'] = array_keys($conf['docsearch']);

// build the base "data" dir: a savedir starting with a dot is prefixed with DOKU_INC
$base = ($conf['savedir'][0] === '.' ? DOKU_INC : '') . $conf['savedir'] . '/';

// cleanup old data
rmdirr($base . 'docsearch');

// build the important paths
$input  = $conf['mediadir'];
$output = $base . 'docsearch/pages';
$index  = $base . 'docsearch/index';
$cache  = $base . 'docsearch/cache';
$meta   = $base . 'docsearch/meta';
$locks  = $base . 'docsearch/locks';

// create the output dirs
foreach(array($output, $index, $cache, $meta, $locks) as $folder) {
    io_mkdir_p($folder);
}
unset($folder);

// point the data folders at the docsearch directories created above
$conf['datadir']  = $output;
$conf['indexdir'] = $index;
$conf['cachedir'] = $cache;
$conf['metadir']  = $meta;
$conf['lockdir']  = $locks;

// walk through the media dir and hand every file to the converters
walk($input);
197