1 <?php
2 /**
3  * CSV Plugin helper plugin
4  *
5  * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6  * @author     Andreas Gohr <gohr@cosmocode.de>
7  */
8 
/**
 * Implement CSV parser and other helpers
 *
 * All helpers are static: option parsing, loading of the raw CSV data,
 * row filtering and a byte-wise CSV line reader.
 */
class helper_plugin_csv extends DokuWiki_Plugin
{

    /**
     * Returns the default options
     *
     * 'outc' and 'outr' are derived from 'output' by parseOptions().
     *
     * @return array
     */
    public static function getDefaultOpt()
    {
        return array(
            'hdr_rows' => 1,
            'hdr_cols' => 0,
            'span_empty_cols' => 0,
            'maxlines' => 0,
            'offset' => 0,
            'file' => '',
            'delim' => ',',
            'enclosure' => '"',
            'escape' => '"',
            'content' => '',
            'filter' => array(),
            'output' => '',
            'outc' => 0,
            'outr' => 0,
        );
    }

    /**
     * Parse the options given in the syntax
     *
     * Options are space separated `name=value` pairs, the value may be wrapped in
     * double quotes. A bare word without a value is taken as the file to load.
     * Filters are given as `filter[<column>]=...` or `filter[<column>][<type>]=...`,
     * where type `r` means the value is already a regular expression; anything
     * else is treated as a literal text in which `*` matches any run of characters.
     *
     * @param string $optstr the raw option string
     * @return array the defaults merged with the parsed options; 'output' is replaced
     *               by the zero based 'outc' and 'outr' and 'filter' maps zero based
     *               column indexes to regular expressions (without delimiters)
     */
    public static function parseOptions($optstr)
    {
        global $INFO;

        // defaults
        $opt = self::getDefaultOpt();

        $filters = array();
        // parse options - https://regex101.com/r/tNdS9P/3/
        preg_match_all(
            '/([^ =\[\]]+)(?:\[(\d+)\](?:\[(\w)\])?)?(?:=((?:".*?")|(?:[^ ]+)))?/',
            (string)$optstr,
            $matches,
            PREG_SET_ORDER
        );
        foreach ($matches as $set) {
            $option = $set[1];
            $value = trim($set[4] ?? '', '"');

            if ($option == 'filter') {
                // groups that did not participate but precede a matched group are
                // reported as empty strings, so isset() alone can not detect them
                $col = (isset($set[2]) && $set[2] !== '') ? $set[2] : 1;
                $typ = (isset($set[3]) && $set[3] !== '') ? $set[3] : 'g';
                $filters[$col] = array($value, $typ);
            } elseif ($value === '') {
                // an option without a value is the file name
                $opt['file'] = $option;
            } else {
                $opt[$option] = $value;
            }
        }

        // fix tab delimiter
        if ($opt['delim'] == 'tab') $opt['delim'] = "\t";

        // resolve local files
        if ($opt['file'] !== '' && !preg_match('/^https?:\/\//i', $opt['file'])) {
            resolve_mediaid($INFO['namespace'] ?? '', $opt['file'], $exists);
        }

        // create regexp filters
        foreach ($filters as $col => $filter) {
            list($text, $type) = $filter;
            if ($type != 'r') {
                // literal text: quote it, then turn the quoted '*' back into a wildcard
                $text = preg_quote_cb($text);
                $text = str_replace('\*', '.*?', $text);
                $text = '^' . $text . '$';
            }

            // test-compile the expression against an empty subject; warnings about
            // a broken pattern are suppressed, the false return value reports them
            if (@preg_match("/$text/", '') === false) {
                msg("Invalid filter for column $col");
            } else {
                $opt['filter'][$col - 1] = $text; // use zero based index internally
            }
        }

        // prepare the value output
        list($c, $r) = array_pad(explode(',', $opt['output']), 2, 0);
        $opt['outc'] = (int)$c;
        $opt['outr'] = (int)$r;
        if ($opt['outc']) $opt['outc'] -= 1;
        if ($opt['outr']) $opt['outr'] -= 1;
        unset($opt['output']);

        return $opt;
    }

    /**
     * Load CSV data from the given file or remote address
     *
     * Anything starting with http:// or https:// is fetched via DokuHTTPClient,
     * everything else is treated as a media ID and read from the media directory
     * after an ACL check on its namespace.
     *
     * @param string $file media ID or URL
     * @return string the CSV data as UTF-8
     * @throws Exception when the data can not be fetched, read access is denied
     *                   or the local file does not exist
     */
    public static function loadContent($file)
    {
        // load file data
        if (preg_match('/^https?:\/\//i', $file)) {
            $http = new DokuHTTPClient();
            $content = $http->get($file);
            if ($content === false) throw new \Exception('Failed to fetch remote CSV data');

        } else {
            if (auth_quickaclcheck(getNS($file) . ':*') < AUTH_READ) {
                throw new \Exception('Access denied to CSV data');
            }
            $file = mediaFN($file);
            if (!file_exists($file)) {
                throw new \Exception('requested local CSV file does not exist');
            }
            $content = io_readFile($file);
        }
        // if not valid UTF-8 is given we assume ISO-8859-1
        // NOTE(review): utf8_encode() is deprecated as of PHP 8.2 - consider switching
        // to an equivalent ISO-8859-1 to UTF-8 conversion once the minimum requirements allow
        if (!utf8_check($content)) $content = utf8_encode($content);

        return $content;
    }

    /**
     * Split the CSV content into rows and apply offset, line limit and filters
     *
     * The first row defines the minimum number of columns, shorter rows are padded
     * with empty cells. Header rows are always kept and are never filtered.
     *
     * @param string $content the raw CSV data
     * @param array $opt options as returned by parseOptions()
     * @return array list of rows, each row being a list of cell values;
     *               empty when the content holds no data
     */
    public static function prepareData($content, $opt)
    {
        $data = array();

        // get the first row - it will define the structure
        $row = self::csv_explode_row($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        if ($row === false) return $data; // no data at all

        $maxcol = count($row);
        $line = 0;

        // options arrive as strings from the syntax, make them safe for arithmetic
        $hdrRows = (int)$opt['hdr_rows'];
        $offset = (int)$opt['offset'];
        $maxlines = (int)$opt['maxlines'];

        while ($row !== false) {
            // make sure we have enough columns
            $row = array_pad($row, $maxcol, '');

            if ($line < $hdrRows) {
                // if headers are wanted, always add them
                $data[] = $row;
            } elseif ($offset && $line < $offset + $hdrRows) {
                // ignore the line
            } elseif ($maxlines && $line >= $maxlines + $hdrRows) {
                // we're done
                break;
            } else {
                // check filters - all of them have to match
                $filterok = true;
                foreach ($opt['filter'] as $col => $filter) {
                    // a filter on a column the row does not have is matched against an empty cell
                    $cell = $row[$col] ?? '';
                    if (!preg_match("/$filter/i", $cell)) {
                        $filterok = false;
                        break;
                    }
                }

                // add the line
                if ($filterok) {
                    $data[] = $row;
                }
            }

            $line++;
            $row = self::csv_explode_row($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        }

        return $data;
    }

    /**
     * Reads one CSV line from the given string
     *
     * Should handle embedded new lines, escapes, quotes and whatever else CSVs tend to have
     *
     * Note $delim, $enc, $esc have to be one ASCII character only! The encoding of the content is not
     * handled here but is read byte by byte - if you need conversions do it on the output
     *
     * Empty lines are skipped. Spaces in front of a field are dropped.
     *
     * @param string $str Input string, first CSV line will be removed
     * @param string $delim Delimiter character
     * @param string $enc Enclosing character
     * @param string $esc Escape character
     * @return array|boolean fields found on the line, false when no more lines could be found
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function csv_explode_row(&$str, $delim = ',', $enc = '"', $esc = '\\')
    {
        $len = strlen($str);

        $infield = false;
        $inenc = false;

        $fields = array();
        $word = '';

        for ($i = 0; $i < $len; $i++) {
            // convert to unix line endings
            if ($str[$i] == "\015") {
                if (!isset($str[$i + 1]) || $str[$i + 1] != "\012") {
                    // lone CR (possibly the very last byte): treat it as a line feed
                    $str[$i] = "\012";
                } else {
                    // CRLF: skip the CR and continue with the LF
                    $i++;
                }
            }

            // simple escape that is not an enclosure
            if ($str[$i] == $esc && $esc != $enc) {
                $i++; // skip this char and take next as is
                if ($i < $len) $word .= $str[$i]; // a trailing escape has nothing to escape
                $infield = true; // we are obviously in a field
                continue;
            }

            /*
             * Now decide special cases depending on current field and enclosure state
             */
            if (!$infield) { // not in field

                // we hit a delimiter even though we're not in a field - an empty field
                if ($str[$i] == $delim) {
                    $fields[] = $word;
                    $word = '';
                    $infield = false;
                    $inenc = false;
                    continue;
                }

                // a newline - an empty field as well, but we're done with this line
                if ($str[$i] == "\n") {
                    $infield = false;
                    $inenc = false;

                    //we saw no fields or content yet? empty line! skip it.
                    if (!count($fields) && $word === '') continue;

                    // otherwise add field
                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // we skip leading whitespace when we're not in a field yet
                if ($str[$i] === ' ') {
                    continue;
                }

                // cell starts with an enclosure
                if ($str[$i] == $enc) {
                    // skip this one but open an enclosed field
                    $infield = true;
                    $inenc = true;
                    continue;
                }

                // still here? whatever is here, is content and starts a field
                $word .= $str[$i];
                $infield = true;
                $inenc = false;

            } elseif ($inenc) { // in field and enclosure

                // we have an escape char that is an enclosure and the next char is an enclosure, too
                if ($str[$i] == $esc && $esc == $enc && isset($str[$i + 1]) && $str[$i + 1] == $esc) {
                    $i++; // skip this char and take next as is
                    $word .= $str[$i];
                    continue;
                }

                // we have an enclosure char
                if ($str[$i] == $enc) {
                    // skip this one but close the enclosure
                    $infield = true;
                    $inenc = false;
                    continue;
                }

                // still here? just add more content
                $word .= $str[$i];

            } else { // in field but no enclosure

                // a delimiter - next field please
                if ($str[$i] == $delim) {
                    $fields[] = $word;
                    $word = '';
                    $infield = false;
                    $inenc = false;
                    continue;
                }

                // EOL - we're done with the line
                if ($str[$i] == "\n") {
                    $infield = false;
                    $inenc = false;

                    //we saw no fields or content yet? empty line! skip it.
                    if (!count($fields) && $word === '') continue;

                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // still here? just add more content
                $word .= $str[$i];
            }
        }

        // did we hit the end while still inside a field?
        // compare against the empty string explicitly: a field holding just "0" is content, too
        if ($infield && ($word !== '' || count($fields))) {
            $fields[] = $word;
        }

        // shorten the string by the stuff we read
        // (the cast keeps $str a string where substr() reports an out of range start with false)
        $str = (string)substr($str, $i + 1);

        if (!count($fields)) return false;
        return $fields;
    }
}
343 
344