1<?php
2/**
3 * CSV Plugin helper plugin
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <gohr@cosmocode.de>
7 */
8
/**
 * Implement CSV parser and other helpers
 *
 * All helpers are static: option parsing for the plugin syntax, loading of
 * the raw CSV data, turning the raw data into a filtered list of rows and the
 * low level line parser itself.
 */
class helper_plugin_csv extends DokuWiki_Plugin
{

    /**
     * Returns the default options
     *
     * Within this class the keys are used as follows:
     *  - hdr_rows: number of leading rows prepareData() always keeps
     *  - maxlines/offset: row window applied by prepareData()
     *  - file: media id or http(s) URL of the CSV data
     *  - delim/enclosure/escape: characters handed to csv_explode_row()
     *  - filter: zero based column index => regular expression (no delimiters)
     *  - output: "column,row" string that parseOptions() converts into outc/outr
     *
     * hdr_cols, span_empty_cols and content are only initialised here; they are
     * not interpreted anywhere in this class.
     *
     * @return array
     */
    public static function getDefaultOpt()
    {
        return [
            'hdr_rows' => 1,
            'hdr_cols' => 0,
            'span_empty_cols' => 0,
            'maxlines' => 0,
            'offset' => 0,
            'file' => '',
            'delim' => ',',
            'enclosure' => '"',
            'escape' => '"',
            'content' => '',
            'filter' => [],
            'output' => '',
            'outc' => 0,
            'outr' => 0,
        ];
    }

    /**
     * Parse the options given in the syntax
     *
     * The option string is a space separated list of `name=value` pairs. Values
     * may be wrapped in double quotes. A word without a value is taken as the
     * file to load. Filters are given as `filter[<column>]=<glob>` or
     * `filter[<column>][r]=<regexp>` where the column is counted from 1; a
     * filter without a column applies to column 1.
     *
     * @param string $optstr the raw option string
     * @return array the default options overridden by the given ones; the
     *               'output' key is replaced by zero based 'outc' and 'outr'
     */
    public static function parseOptions($optstr)
    {
        global $INFO;

        // defaults
        $opt = self::getDefaultOpt();

        $filters = [];
        // parse options - https://regex101.com/r/tNdS9P/3/
        preg_match_all(
            '/([^ =\[\]]+)(?:\[(\d+)\](?:\[(\w)\])?)?(?:=((?:".*?")|(?:[^ ]+)))?/',
            $optstr,
            $matches,
            PREG_SET_ORDER
        );
        foreach ($matches as $set) {
            $option = $set[1];
            $value = trim($set[4] ?? '', '"');

            if ($option === 'filter') {
                // Groups that did not take part in the match are only missing from
                // the set when they are trailing; otherwise they are empty strings.
                // Both cases have to fall back to the defaults.
                $col = (isset($set[2]) && $set[2] !== '') ? (int)$set[2] : 1;
                $typ = (isset($set[3]) && $set[3] !== '') ? $set[3] : 'g';
                $filters[$col] = [$value, $typ];
            } elseif ($value === '') {
                // a bare word is the file to load
                $opt['file'] = $option;
            } else {
                $opt[$option] = $value;
            }
        }

        // fix tab delimiter
        if ($opt['delim'] === 'tab') $opt['delim'] = "\t";

        // resolve local files
        if ($opt['file'] !== '' && !preg_match('/^https?:\/\//i', $opt['file'])) {
            // NOTE(review): presumably rewrites $opt['file'] in place relative to the
            // current namespace - confirm against DokuWiki's resolve_mediaid()
            resolve_mediaid($INFO['namespace'] ?? '', $opt['file'], $exists);
        }

        // create regexp filters
        foreach ($filters as $col => $filter) {
            list($text, $type) = $filter;
            if ($type != 'r') {
                // glob style filter: quote everything, then turn * into a wildcard
                $text = preg_quote_cb($text);
                $text = str_replace('\*', '.*?', $text);
                $text = '^' . $text . '$';
            }

            // probe the expression once; the warning for a broken pattern is silenced
            // on purpose and reported through msg() instead
            if (@preg_match("/$text/", '') === false) {
                msg("Invalid filter for column $col");
            } else {
                $opt['filter'][$col - 1] = $text; // use zero based index internally
            }
        }

        // prepare the value output
        list($c, $r) = array_pad(explode(',', $opt['output']), 2, 0);
        $opt['outc'] = (int)$c;
        $opt['outr'] = (int)$r;
        // positions are given one based, 0 stays 0
        if ($opt['outc']) $opt['outc'] -= 1;
        if ($opt['outr']) $opt['outr'] -= 1;
        unset($opt['output']);

        return $opt;
    }

    /**
     * Load CSV data from the given file or remote address
     *
     * Anything starting with http:// or https:// is fetched through
     * DokuHTTPClient, everything else is treated as a media id and read from
     * the media directory after an ACL check on its namespace.
     *
     * @param string $file media id or URL
     * @return string the CSV data; re-encoded from ISO-8859-1 if it was not valid UTF-8
     * @throws Exception when the data can not be fetched, read or accessed
     */
    public static function loadContent($file)
    {
        $isRemote = (bool)preg_match('/^https?:\/\//i', $file);

        if ($isRemote) {
            $http = new DokuHTTPClient();
            $content = $http->get($file);
            if ($content === false) {
                throw new \Exception('Failed to fetch remote CSV data');
            }
        } else {
            // the user needs read permission on the namespace of the media file
            if (auth_quickaclcheck(getNS($file) . ':*') < AUTH_READ) {
                throw new \Exception('Access denied to CSV data');
            }
            $path = mediaFN($file);
            if (!file_exists($path)) {
                throw new \Exception('requested local CSV file does not exist');
            }
            $content = io_readFile($path);
        }

        // if not valid UTF-8 is given we assume ISO-8859-1
        if (!utf8_check($content)) {
            $content = utf8_encode($content);
        }

        return $content;
    }

    /**
     * Turn raw CSV content into a list of rows
     *
     * The first row defines the minimum number of columns, shorter rows are
     * padded with empty strings. The first `hdr_rows` rows are always kept.
     * After that `offset` rows are skipped and reading stops once the line
     * counter reaches `maxlines` + `hdr_rows`. Remaining rows are only kept
     * when all filters match (case insensitive).
     *
     * @param string $content the raw CSV data
     * @param array $opt options as returned by parseOptions()
     * @return array list of rows, each row being a list of cell strings;
     *               empty when the content holds no rows at all
     */
    public static function prepareData($content, $opt)
    {
        $data = [];

        // get the first row - it will define the structure
        $row = self::csv_explode_row($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        if ($row === false) {
            // nothing to parse at all
            return $data;
        }
        $maxcol = count($row);
        $line = 0;

        while ($row !== false) {
            // make sure we have enough columns
            $row = array_pad($row, $maxcol, '');

            if ($line < $opt['hdr_rows']) {
                // if headers are wanted, always add them
                $data[] = $row;
            } elseif ($opt['offset'] && $line < $opt['offset'] + $opt['hdr_rows']) {
                // ignore the line
            } elseif ($opt['maxlines'] && $line >= $opt['maxlines'] + $opt['hdr_rows']) {
                // we're done
                break;
            } else {
                // check filters
                $filterok = true;
                foreach ($opt['filter'] as $col => $filter) {
                    // a filter on a column the row does not have sees an empty cell
                    $cell = $row[$col] ?? '';
                    if (!preg_match("/$filter/i", $cell)) {
                        $filterok = false;
                        break;
                    }
                }

                // add the line
                if ($filterok) {
                    $data[] = $row;
                }
            }

            $line++;
            $row = self::csv_explode_row($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        }

        return $data;
    }

    /**
     * Reads one CSV line from the given string
     *
     * Should handle embedded new lines, escapes, quotes and whatever else CSVs tend to have
     *
     * Note $delim, $enc, $esc have to be one ASCII character only! The encoding of the content is not
     * handled here but is read byte by byte - if you need conversions do it on the output
     *
     * Empty lines are skipped. Spaces in front of a field are dropped. A carriage return
     * directly followed by a line feed is dropped, a lone carriage return is treated as
     * a line feed.
     *
     * @param string $str Input string, first CSV line will be removed
     * @param string $delim Delimiter character
     * @param string $enc Enclosing character
     * @param string $esc Escape character
     * @return array|boolean fields found on the line, false when no more lines could be found
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function csv_explode_row(&$str, $delim = ',', $enc = '"', $esc = '\\')
    {
        $len = strlen($str);

        $infield = false;
        $inenc = false;

        $fields = [];
        $word = '';

        for ($i = 0; $i < $len; $i++) {
            // convert to unix line endings
            if ($str[$i] == "\015") {
                if (isset($str[$i + 1]) && $str[$i + 1] == "\012") {
                    // CRLF: drop the CR and continue with the LF
                    $i++;
                } else {
                    // lone CR (possibly the very last byte) acts as a line feed
                    $str[$i] = "\012";
                }
            }

            // simple escape that is not an enclosure
            if ($str[$i] == $esc && $esc != $enc) {
                $i++; // skip this char and take next as is
                if ($i < $len) {
                    // an escape char at the very end has nothing to escape
                    $word .= $str[$i];
                }
                $infield = true; // we are obviously in a field
                continue;
            }

            $char = $str[$i];

            /*
             * Now decide special cases depending on current field and enclosure state
             */
            if (!$infield) { // not in field

                // we hit a delimiter even though we're not in a field - an empty field
                if ($char == $delim) {
                    $fields[] = $word;
                    $word = '';
                    $inenc = false;
                    continue;
                }

                // a newline - an empty field as well, but we're done with this line
                if ($char == "\n") {
                    $inenc = false;

                    //we saw no fields or content yet? empty line! skip it.
                    if (!count($fields) && $word === '') continue;

                    // otherwise add field
                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // we skip leading whitespace when we're not in a field yet
                if ($char === ' ') {
                    continue;
                }

                // cell starts with an enclosure
                if ($char == $enc) {
                    // skip this one but open an enclosed field
                    $infield = true;
                    $inenc = true;
                    continue;
                }

                // still here? whatever is here, is content and starts a field
                $word .= $char;
                $infield = true;
                $inenc = false;

            } elseif ($inenc) { // in field and enclosure

                // we have an escape char that is an enclosure and the next char is an enclosure, too
                if ($char == $esc && $esc == $enc && isset($str[$i + 1]) && $str[$i + 1] == $esc) {
                    $i++; // skip this char and take next as is
                    $word .= $str[$i];
                    continue;
                }

                // we have an enclosure char
                if ($char == $enc) {
                    // skip this one but close the enclosure, we stay in the field
                    $inenc = false;
                    continue;
                }

                // still here? just add more content
                $word .= $char;

            } else { // in field but no enclosure

                // a delimiter - next field please
                if ($char == $delim) {
                    $fields[] = $word;
                    $word = '';
                    $infield = false;
                    continue;
                }

                // EOL - we're done with the line
                if ($char == "\n") {
                    $infield = false;

                    //we saw no fields or content yet? empty line! skip it.
                    if (!count($fields) && $word === '') continue;

                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // still here? just add more content
                $word .= $char;
            }
        }

        // did we hit the end? Compare against the empty string explicitly, a
        // field holding just "0" is valid content and must not be dropped
        if ($infield && ($word !== '' || count($fields))) {
            $fields[] = $word;
        }

        // shorten the string by the stuff we read; the cast keeps $str a string
        // on PHP versions where substr() returns false past the end
        $str = (string)substr($str, $i + 1);

        if (!count($fields)) return false;
        return $fields;
    }
}
343
344