1<?php
2
3use dokuwiki\Extension\Plugin;
4use dokuwiki\File\MediaResolver;
5use dokuwiki\HTTP\DokuHTTPClient;
6use dokuwiki\Utf8\Clean;
7
8/**
9 * CSV Plugin helper plugin
10 *
11 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
12 * @author     Andreas Gohr <gohr@cosmocode.de>
13 */
14/**
15 * Implement CSV parser and other helpers
16 */
class helper_plugin_csv extends Plugin
{
    /**
     * Returns the default options
     *
     * The returned array contains every option key this helper knows about together with the
     * value that is used when the option is not given in the syntax.
     *
     * @return array option name => default value
     */
    public static function getDefaultOpt()
    {
        return [
            // number of leading rows that prepareData() always keeps, regardless of offset/filters
            'hdr_rows' => 1,
            // not read inside this class
            'hdr_cols' => 0,
            // not read inside this class
            'span_empty_cols' => 0,
            // line limit checked in prepareData(), 0 disables the check
            'maxlines' => 0,
            // number of rows after the header rows that prepareData() ignores
            'offset' => 0,
            // media id or http(s) URL of the CSV data
            'file' => '',
            // delimiter, enclosure and escape characters handed to csvExplodeRow()
            'delim' => ',',
            'enclosure' => '"',
            'escape' => '"',
            // not read inside this class
            'content' => '',
            // zero based column index => regular expression (without delimiters)
            'filter' => [],
            // "column,row" as given by the user; replaced by outc/outr in parseOptions()
            'output' => '',
            'outc' => 0,
            'outr' => 0
        ];
    }

    /**
     * Parse the options given in the syntax
     *
     * Options are space separated `name=value` pairs, values may be wrapped in double quotes.
     * A token without a value is taken as the file to load. Filters are given as
     * `filter[<column>][<type>]=<expression>`; the column is one based and defaults to 1, the
     * type defaults to 'g' (wildcard match where `*` matches anything), 'r' marks a regular
     * expression. Option values are stored as the strings that were given.
     *
     * @param string $optstr the raw option string
     * @return array the defaults merged with the given options; 'output' is replaced by the
     *               zero based 'outc' and 'outr', 'filter' holds zero based column => regexp
     */
    public static function parseOptions($optstr)
    {
        global $INFO;

        // defaults
        $opt = self::getDefaultOpt();

        $filters = [];
        // parse options - https://regex101.com/r/tNdS9P/3/
        preg_match_all(
            '/([^ =\[\]]+)(?:\[(\d+)\](?:\[(\w)\])?)?(?:=((?:".*?")|(?:[^ ]+)))?/',
            (string)$optstr,
            $matches,
            PREG_SET_ORDER
        );
        foreach ($matches as $set) {
            $option = $set[1];
            $value = trim($set[4] ?? '', '"');

            if ($option === 'filter') {
                // Optional groups that did not take part in the match are reported as '' (not
                // as null) whenever a later group matched, so `??` alone never applies a default.
                $col = ($set[2] ?? '') === '' ? 1 : (int)$set[2];
                $typ = ($set[3] ?? '') === '' ? 'g' : $set[3];
                $filters[$col] = [$value, $typ];
            } elseif ($value === '') {
                // a token without a value is the file to load
                $opt['file'] = $option;
            } else {
                $opt[$option] = $value;
            }
        }

        // fix tab delimiter
        if ($opt['delim'] === 'tab') $opt['delim'] = "\t";

        // resolve local files
        if ($opt['file'] !== '' && !preg_match('/^https?:\/\//i', $opt['file'])) {
            $resolver = new MediaResolver($INFO['id'] ?? '');
            $opt['file'] = $resolver->resolveId($opt['file']);
        }

        // create regexp filters
        foreach ($filters as $col => $filter) {
            [$text, $type] = $filter;

            // columns are counted from one, anything below would end up at a negative index
            if ($col < 1) {
                msg("Invalid filter for column $col");
                continue;
            }

            if ($type !== 'r') {
                // wildcard filter: quote everything, then turn the quoted * back into a wildcard
                $text = preg_quote($text, '/');
                $text = str_replace('\*', '.*?', $text);
                $text = '^' . $text . '$';
            }

            // test-compile the expression; the warning PCRE emits for a broken pattern is
            // suppressed because the problem is reported through msg() instead
            if (@preg_match("/$text/", '') === false) {
                msg("Invalid filter for column $col");
            } else {
                $opt['filter'][$col - 1] = $text; // use zero based index internally
            }
        }

        // prepare the value output
        [$c, $r] = array_pad(explode(',', (string)$opt['output']), 2, 0);
        $opt['outc'] = (int)$c;
        $opt['outr'] = (int)$r;
        if ($opt['outc']) $opt['outc'] -= 1;
        if ($opt['outr']) $opt['outr'] -= 1;
        unset($opt['output']);

        return $opt;
    }

    /**
     * Load CSV data from the given file or remote address
     *
     * Anything starting with http:// or https:// is fetched via DokuHTTPClient, everything else
     * is treated as a media id for which the current user needs at least read permission on the
     * namespace. Data that is not valid UTF-8 is assumed to be ISO-8859-1 and converted.
     *
     * @param string $file media id or URL
     * @return string the CSV data as UTF-8
     * @throws Exception when the data can not be fetched, read or accessed
     */
    public static function loadContent($file)
    {
        // load file data
        if (preg_match('/^https?:\/\//i', $file)) {
            $http = new DokuHTTPClient();
            $content = $http->get($file);
            if ($content === false) throw new \Exception('Failed to fetch remote CSV data');
        } else {
            if (auth_quickaclcheck(getNS($file) . ':*') < AUTH_READ) {
                throw new \Exception('Access denied to CSV data');
            }
            $file = mediaFN($file);
            // is_file() instead of file_exists(): a directory is not a readable CSV file either
            if (!is_file($file)) {
                throw new \Exception('requested local CSV file does not exist');
            }
            $content = io_readFile($file);
        }

        // if not valid UTF-8 is given we assume ISO-8859-1
        if (!Clean::isUtf8($content)) {
            // utf8_encode() is deprecated as of PHP 8.2, prefer the extension based conversions
            if (function_exists('mb_convert_encoding')) {
                $content = mb_convert_encoding($content, 'UTF-8', 'ISO-8859-1');
            } elseif (function_exists('iconv')) {
                $content = iconv('ISO-8859-1', 'UTF-8', $content);
            } else {
                $content = utf8_encode($content);
            }
        }

        return $content;
    }

    /**
     * Split the CSV content into rows and apply header, offset, line limit and filter options
     *
     * The first row defines the number of columns, shorter rows are padded with empty strings.
     * The first 'hdr_rows' rows are always part of the result.
     *
     * @param string $content the CSV data
     * @param array $opt options as returned by parseOptions()
     * @return array list of rows, each row being a list of cell strings; empty when the content
     *               holds no row at all
     */
    public static function prepareData($content, $opt)
    {
        $data = [];
        $content = (string)$content;

        // options given in the syntax arrive as strings, make sure we calculate with numbers
        $hdrRows = (int)($opt['hdr_rows'] ?? 0);
        $offset = (int)($opt['offset'] ?? 0);
        $maxLines = (int)($opt['maxlines'] ?? 0);
        $filters = $opt['filter'] ?? [];

        // get the first row - it will define the structure
        $row = self::csvExplodeRow($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        if ($row === false) return $data; // no rows at all
        $maxcol = count($row);
        $line = 0;

        while ($row !== false) {
            // make sure we have enough columns
            $row = array_pad($row, $maxcol, '');

            if ($line < $hdrRows) {
                // if headers are wanted, always add them
                $data[] = $row;
            } elseif ($offset && $line < $offset + $hdrRows) {
                // ignore the line
            } elseif ($maxLines && $line >= $maxLines + $hdrRows) {
                // we're done
                // NOTE(review): the limit is counted from the first data row, not from the
                // offset, so offset >= maxlines yields header rows only - confirm this is intended
                break;
            } else {
                // check filters
                $filterok = true;
                foreach ($filters as $col => $filter) {
                    // a filter on a column this row does not have is matched against ''
                    if (!preg_match("/$filter/i", $row[$col] ?? '')) {
                        $filterok = false;
                        break;
                    }
                }

                // add the line
                if ($filterok) {
                    $data[] = $row;
                }
            }

            $line++;
            $row = self::csvExplodeRow($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        }

        return $data;
    }

    /**
     * Reads one CSV line from the given string
     *
     * Should handle embedded new lines, escapes, quotes and whatever else CSVs tend to have.
     * Empty lines are skipped, leading spaces in front of a field are dropped and CR or CRLF
     * line endings are treated like LF.
     *
     * Note $delim, $enc, $esc have to be one ASCII character only! The encoding of the content is not
     * handled here but is read byte by byte - if you need conversions do it on the output
     *
     * @param string $str Input string, first CSV line will be removed
     * @param string $delim Delimiter character
     * @param string $enc Enclosing character
     * @param string $esc Escape character
     * @return array|boolean fields found on the line, false when no more lines could be found
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function csvExplodeRow(&$str, $delim = ',', $enc = '"', $esc = '\\')
    {
        // work on strings only so the byte comparisons below can be strict
        $str = (string)$str;
        $delim = (string)$delim;
        $enc = (string)$enc;
        $esc = (string)$esc;

        $len = strlen($str);

        $infield = false;
        $inenc = false;

        $fields = [];
        $word = '';

        for ($i = 0; $i < $len; $i++) {
            // convert to unix line endings
            if ($str[$i] === "\015") {
                if (isset($str[$i + 1]) && $str[$i + 1] === "\012") {
                    // CRLF: drop the CR and go on with the LF
                    $i++;
                } else {
                    // lone CR, possibly as the very last byte
                    $str[$i] = "\012";
                }
            }
            $char = $str[$i];

            // simple escape that is not an enclosure
            if ($char === $esc && $esc !== $enc) {
                $i++; // skip this char and take next as is
                if ($i < $len) $word .= $str[$i]; // an escape at the very end escapes nothing
                $infield = true; // we are obviously in a field
                continue;
            }

            /*
             * Now decide special cases depending on current field and enclosure state
             */
            if (!$infield) { // not in field
                // we hit a delimiter even though we're not in a field - an empty field
                if ($char === $delim) {
                    $fields[] = $word;
                    $word = '';
                    $infield = false;
                    $inenc = false;
                    continue;
                }

                // a newline - an empty field as well, but we're done with this line
                if ($char === "\n") {
                    $infield = false;
                    $inenc = false;

                    //we saw no fields or content yet? empty line! skip it.
                    if ($fields === [] && $word === '') continue;

                    // otherwise add field
                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // we skip leading whitespace when we're not in a field yet
                if ($char === ' ') {
                    continue;
                }

                // cell starts with an enclosure
                if ($char === $enc) {
                    // skip this one but open an enclosed field
                    $infield = true;
                    $inenc = true;
                    continue;
                }

                // still here? whatever is here, is content and starts a field
                $word .= $char;
                $infield = true;
                $inenc = false;
            } elseif ($inenc) { // in field and enclosure
                // we have an escape char that is an enclosure and the next char is an enclosure, too
                if ($char === $esc && $esc === $enc && isset($str[$i + 1]) && $str[$i + 1] === $esc) {
                    $i++; // skip this char and take next as is
                    $word .= $str[$i];
                    continue;
                }

                // we have an enclosure char
                if ($char === $enc) {
                    // skip this one but close the enclosure
                    $infield = true;
                    $inenc = false;
                    continue;
                }

                // still here? just add more content
                $word .= $char;
            } else { // in field but no enclosure
                // a delimiter - next field please
                if ($char === $delim) {
                    $fields[] = $word;
                    $word = '';
                    $infield = false;
                    $inenc = false;
                    continue;
                }

                // EOL - we're done with the line
                if ($char === "\n") {
                    $infield = false;
                    $inenc = false;

                    //we saw no fields or content yet? empty line! skip it.
                    if ($fields === [] && $word === '') continue;

                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // still here? just add more content
                $word .= $char;
            }
        }

        // did we hit the end? compare against '' explicitly - a field holding "0" is content, too
        if ($infield && ($word !== '' || $fields !== [])) {
            $fields[] = $word;
        }

        // shorten the string by the stuff we read
        $str = (string)substr($str, $i + 1);

        if ($fields === []) return false;
        return $fields;
    }
}
342