1<?php
2/**
3 * CSV Plugin helper plugin
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <gohr@cosmocode.de>
7 */
8
9if(!defined('DOKU_INC')) die('meh');
10
/**
 * Helper component of the CSV plugin
 *
 * Bundles static routines for parsing the plugin's option string, loading CSV
 * content and splitting that content into rows and cells
 */
class helper_plugin_csv extends DokuWiki_Plugin {

    /**
     * Returns the default options
     *
     * Keys used by the methods of this class:
     *  - hdr_rows: number of leading rows that are always kept by prepareData()
     *  - offset:   number of data rows (after the header rows) skipped by prepareData()
     *  - maxlines: when non-zero, prepareData() stops at row index maxlines + hdr_rows
     *  - file:     media ID or http(s) URL of the CSV source
     *  - delim, enclosure, escape: single characters handed to csv_explode_row()
     *  - filter:   map of zero based column index => regular expression (without delimiters)
     *  - output:   "column,row" string as given by the user, split into outc/outr by parseOptions()
     *
     * The remaining keys (hdr_cols, span_empty_cols, content) are not read in this class;
     * presumably they are consumed by the syntax/render components.
     *
     * @return array
     */
    public static function getDefaultOpt() {
        return array(
            'hdr_rows' => 1,
            'hdr_cols' => 0,
            'span_empty_cols' => 0,
            'maxlines' => 0,
            'offset' => 0,
            'file' => '',
            'delim' => ',',
            'enclosure' => '"',
            'escape' => '"',
            'content' => '',
            'filter' => array(),
            'output' => '',
            'outc' => 0,
            'outr' => 0,
        );
    }

    /**
     * Parse the options given in the syntax
     *
     * Recognized tokens (separated by spaces):
     *  - name=value or name="quoted value": sets option "name"
     *  - filter[col][type]=value: adds a filter for the one based column "col" (default 1);
     *    type "r" takes the value as regular expression, any other type (default "g") treats
     *    it as literal text in which "*" is a wildcard
     *  - a bare word without value: taken as the file to load
     *
     * @param string $optstr the raw option string
     * @return array the default options overridden by the parsed ones
     */
    public static function parseOptions($optstr) {
        global $INFO;

        // defaults
        $opt = self::getDefaultOpt();

        $filters = array();
        // parse options - https://regex101.com/r/tNdS9P/3/
        preg_match_all(
            '/([^ =\[\]]+)(?:\[(\d+)\](?:\[(\w)\])?)?(?:=((?:".*?")|(?:[^ ]+)))?/',
            (string) $optstr,
            $matches,
            PREG_SET_ORDER
        );
        foreach($matches as $set) {
            $option = $set[1];
            $value = isset($set[4]) ? $set[4] : '';
            $value = trim($value, '"');

            if($option == 'filter') {
                // Groups that did not take part in the match but precede a matched group are
                // reported as empty strings, so isset() alone would never apply the defaults
                $col = (isset($set[2]) && $set[2] !== '') ? (int) $set[2] : 1;
                $typ = (isset($set[3]) && $set[3] !== '') ? $set[3] : 'g';
                $filters[$col] = array($value, $typ);
            } elseif($value === '') {
                // a token without a value names the file
                $opt['file'] = $option;
            } else {
                $opt[$option] = $value;
            }
        }

        // fix tab delimiter
        if($opt['delim'] == 'tab') $opt['delim'] = "\t";

        // resolve local files
        if($opt['file'] !== '' && !preg_match('/^https?:\/\//i', $opt['file'])) {
            $opt['file'] = cleanID($opt['file']);
            if(!strlen(getNS($opt['file']))) {
                // no namespace given: look in the namespace of the current page
                $opt['file'] = $INFO['namespace'] . ':' . $opt['file'];
            }
        }

        // create regexp filters
        foreach($filters as $col => $filter) {
            list($text, $type) = $filter;
            if($type != 'r') {
                // literal text: quote it, then turn the quoted "*" into a wildcard and anchor it
                $text = preg_quote_cb($text);
                $text = str_replace('\*', '.*?', $text);
                $text = '^' . $text . '$';
            }

            // Try the expression against an empty subject to find out if it compiles. The @ is
            // intentional: a broken user supplied expression is reported via msg() instead
            if(@preg_match("/$text/", '') === false) {
                msg("Invalid filter for column $col");
            } else {
                $opt['filter'][$col - 1] = $text; // use zero based index internally
            }
        }

        // prepare the value output - pad so that an empty or partial "output" yields zeros
        $parts = array_pad(explode(',', (string) $opt['output']), 2, 0);
        $opt['outc'] = (int) $parts[0];
        $opt['outr'] = (int) $parts[1];
        // the user counts from one, internally zero based indexes are used
        if($opt['outc']) $opt['outc'] -= 1;
        if($opt['outr']) $opt['outr'] -= 1;

        return $opt;
    }

    /**
     * Load CSV data from the given file or remote address
     *
     * Anything starting with http:// or https:// is fetched via DokuHTTPClient, everything
     * else is treated as a media ID and needs at least read permission on its namespace.
     *
     * @param string $file media ID or http(s) URL
     * @return string the CSV data as UTF-8
     * @throws Exception when the data can not be fetched, read or accessed
     */
    public static function loadContent($file) {
        // load file data
        if(preg_match('/^https?:\/\//i', $file)) {
            $http = new DokuHTTPClient();
            $content = $http->get($file);
            if($content === false) throw new \Exception('Failed to fetch remote CSV data');

        } else {
            if(auth_quickaclcheck(getNS($file) . ':*') < AUTH_READ) {
                throw new \Exception('Access denied to CSV data');
            }
            $file = mediaFN($file);
            if(!file_exists($file)) {
                throw new \Exception('requested local CSV file does not exist');
            }
            $content = io_readFile($file);
        }
        // if not valid UTF-8 is given we assume ISO-8859-1
        // NOTE(review): utf8_encode() is deprecated as of PHP 8.2 - replace it when the
        // supported PHP versions allow for it
        if(!utf8_check($content)) $content = utf8_encode($content);

        return $content;
    }

    /**
     * Split the CSV content into rows and apply header, offset, maxlines and filter options
     *
     * The first row defines the minimum number of columns, shorter rows are padded with
     * empty cells. Header rows are always kept, filters only apply to the data rows.
     *
     * @param string $content the CSV data
     * @param array $opt options as returned by parseOptions(), missing keys fall back to the defaults
     * @return array list of rows, each row being a list of cell strings
     */
    public static function prepareData($content, $opt) {
        $data = array();

        $opt = array_merge(self::getDefaultOpt(), (array) $opt);
        $hdrRows = (int) $opt['hdr_rows'];
        $offset = (int) $opt['offset'];
        $maxLines = (int) $opt['maxlines'];

        // get the first row - it will define the structure
        $row = self::csv_explode_row($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        if($row === false) return $data; // no rows at all
        $maxcol = count($row);
        $line = 0;

        while($row !== false) {
            // make sure we have enough columns
            $row = array_pad($row, $maxcol, '');

            if($line < $hdrRows) {
                // if headers are wanted, always add them
                $data[] = $row;
            } elseif($offset && $line < $offset + $hdrRows) {
                // ignore the line
            } elseif($maxLines && $line >= $maxLines + $hdrRows) {
                // we're done
                break;
            } else {
                // check filters - all of them have to match
                $filterok = true;
                foreach($opt['filter'] as $col => $filter) {
                    // a filter on a column this row does not have is checked against an empty cell
                    $cell = isset($row[$col]) ? $row[$col] : '';
                    if(!preg_match("/$filter/i", $cell)) {
                        $filterok = false;
                        break;
                    }
                }

                // add the line
                if($filterok) {
                    $data[] = $row;
                }
            }

            $line++;
            $row = self::csv_explode_row($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        }

        return $data;
    }

    /**
     * Reads one CSV line from the given string
     *
     * Should handle embedded new lines, escapes, quotes and whatever else CSVs tend to have
     *
     * Note $delim, $enc, $esc have to be one ASCII character only! The encoding of the content is not
     * handled here but is read byte by byte - if you need conversions do it on the output
     *
     * Empty lines are skipped. Spaces in front of a field are dropped. When $esc and $enc are
     * the same character, a doubled enclosure inside an enclosed field yields one literal
     * enclosure character.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @param string $str Input string, first CSV line will be removed
     * @param string $delim Delimiter character
     * @param string $enc Enclosing character
     * @param string $esc Escape character
     * @return array|boolean fields found on the line, false when no more lines could be found
     */
    public static function csv_explode_row(&$str, $delim = ',', $enc = '"', $esc = '\\') {
        $str = (string) $str;
        $len = strlen($str);

        $infield = false;
        $inenc = false;

        $fields = array();
        $word = '';

        for($i = 0; $i < $len; $i++) {
            // convert to unix line endings
            if($str[$i] == "\015") {
                if($i + 1 < $len && $str[$i + 1] == "\012") {
                    // CRLF: skip the CR and go on with the LF
                    $i++;
                } else {
                    // a lone CR (also as very last byte) is handled like a LF
                    $str[$i] = "\012";
                }
            }

            // simple escape that is not an enclosure
            if($str[$i] == $esc && $esc != $enc) {
                $infield = true; // we are obviously in a field
                if($i + 1 < $len) {
                    $i++; // skip this char and take next as is
                    $word .= $str[$i];
                }
                // an escape char as very last byte has nothing to escape and is dropped
                continue;
            }

            /*
             * Now decide special cases depending on current field and enclosure state
             */
            if(!$infield) { // not in field

                // we hit a delimiter even though we're not in a field - an empty field
                if($str[$i] == $delim) {
                    $fields[] = $word;
                    $word = '';
                    $infield = false;
                    $inenc = false;
                    continue;
                }

                // a newline - an empty field as well, but we're done with this line
                if($str[$i] == "\n") {
                    $infield = false;
                    $inenc = false;

                    //we saw no fields or content yet? empty line! skip it.
                    if(!count($fields) && $word === '') continue;

                    // otherwise add field
                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // we skip leading whitespace when we're not in a field yet
                if($str[$i] === ' ') {
                    continue;
                }

                // cell starts with an enclosure
                if($str[$i] == $enc) {
                    // skip this one but open an enclosed field
                    $infield = true;
                    $inenc = true;
                    continue;
                }

                // still here? whatever is here, is content and starts a field
                $word .= $str[$i];
                $infield = true;
                $inenc = false;

            } elseif($inenc) { // in field and enclosure

                // we have an escape char that is an enclosure and the next char is an enclosure, too
                // (the length check keeps us from peeking behind the end of the input)
                if($str[$i] == $esc && $esc == $enc && $i + 1 < $len && $str[$i + 1] == $esc) {
                    $i++; // skip this char and take next as is
                    $word .= $str[$i];
                    continue;
                }

                // we have an enclosure char
                if($str[$i] == $enc) {
                    // skip this one but close the enclosure
                    $infield = true;
                    $inenc = false;
                    continue;
                }

                // still here? just add more content
                $word .= $str[$i];

            } else { // in field but no enclosure

                // a delimiter - next field please
                if($str[$i] == $delim) {
                    $fields[] = $word;
                    $word = '';
                    $infield = false;
                    $inenc = false;
                    continue;
                }

                // EOL - we're done with the line
                if($str[$i] == "\n") {
                    $infield = false;
                    $inenc = false;

                    //we saw no fields or content yet? empty line! skip it.
                    if(!count($fields) && $word === '') continue;

                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // still here? just add more content
                $word .= $str[$i];
            }
        }

        // did we hit the end? compare against '' explicitly - a lone "0" is valid content
        if($infield && ($word !== '' || count($fields))) {
            $fields[] = $word;
        }

        // shorten the string by the stuff we read, the cast keeps $str a string even
        // when substr() signals "nothing left" with false
        $str = (string) substr($str, $i + 1);

        if(!count($fields)) return false;
        return $fields;
    }
}
342
343