<?php

use dokuwiki\Extension\Plugin;
use dokuwiki\File\MediaResolver;
use dokuwiki\HTTP\DokuHTTPClient;
use dokuwiki\Utf8\Clean;

/**
 * CSV Plugin helper plugin
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <gohr@cosmocode.de>
 */
/**
 * Implement CSV parser and other helpers
 *
 * All public methods are static: option parsing, loading of the raw CSV text,
 * slicing/filtering of the parsed rows and the low level row tokenizer.
 */
class helper_plugin_csv extends Plugin
{
    /**
     * Returns the default options
     *
     * Within this class 'hdr_rows', 'offset', 'maxlines' and 'filter' are consumed by
     * prepareData(), 'delim', 'enclosure' and 'escape' are handed to csvExplodeRow() and
     * 'output' is translated into 'outc'/'outr' by parseOptions(). The remaining keys are
     * not read here - presumably they are used by the syntax/renderer components.
     *
     * @return array
     */
    public static function getDefaultOpt()
    {
        return [
            'hdr_rows' => 1,
            'hdr_cols' => 0,
            'span_empty_cols' => 0,
            'maxlines' => 0,
            'offset' => 0,
            'file' => '',
            'delim' => ',',
            'enclosure' => '"',
            'escape' => '"',
            'content' => '',
            'filter' => [],
            'output' => '',
            'outc' => 0,
            'outr' => 0
        ];
    }

    /**
     * Parse the options given in the syntax
     *
     * Options have the form name=value or name="quoted value". A token without a value
     * is taken as the file to load. Filters are given as filter[col]=text or
     * filter[col][r]=regexp, where col is a one based column number; without the 'r' type
     * the text is a glob in which '*' matches anything.
     *
     * @param string $optstr the raw option string
     * @return array the defaults from getDefaultOpt() overridden by the given options;
     *               'output' is replaced by the zero based 'outc' and 'outr'
     */
    public static function parseOptions($optstr)
    {
        global $INFO;

        // defaults
        $opt = self::getDefaultOpt();

        $filters = [];
        // parse options - https://regex101.com/r/tNdS9P/3/
        preg_match_all(
            '/([^ =\[\]]+)(?:\[(\d+)\](?:\[(\w)\])?)?(?:=((?:".*?")|(?:[^ ]+)))?/',
            (string)$optstr,
            $matches,
            PREG_SET_ORDER
        );
        foreach ($matches as $set) {
            $option = $set[1];
            $value = trim($set[4] ?? '', '"');

            if ($option === 'filter') {
                // groups that did not match but precede a matching group are reported as
                // empty strings, so an isset/?? check alone would never apply the defaults
                $col = ($set[2] ?? '') === '' ? 1 : (int)$set[2];
                $typ = ($set[3] ?? '') === '' ? 'g' : $set[3];
                $filters[$col] = [$value, $typ];
            } elseif ($value === '') {
                // a bare token is the file name
                $opt['file'] = $option;
            } else {
                $opt[$option] = $value;
            }
        }

        // fix tab delimiter
        if ($opt['delim'] === 'tab') $opt['delim'] = "\t";

        // resolve local files
        if ($opt['file'] !== '' && !preg_match('/^https?:\/\//i', $opt['file'])) {
            $resolver = new MediaResolver($INFO['id'] ?? '');
            $opt['file'] = $resolver->resolveId($opt['file']);
        }

        // create regexp filters
        foreach ($filters as $col => $filter) {
            [$text, $type] = $filter;
            if ($type != 'r') {
                // glob: quote everything, then turn the quoted '*' into a lazy wildcard
                $text = preg_quote_cb($text);
                $text = str_replace('\*', '.*?', $text);
                $text = '^' . $text . '$';
            }

            // columns are one based; the pattern is test-compiled against an empty subject
            // (warnings of a broken pattern are suppressed, only the result matters)
            if ($col < 1 || @preg_match("/$text/", '') === false) {
                msg("Invalid filter for column $col");
            } else {
                $opt['filter'][$col - 1] = $text; // use zero based index internally
            }
        }

        // prepare the value output
        [$c, $r] = array_pad(explode(',', $opt['output']), 2, 0);
        $opt['outc'] = (int)$c;
        $opt['outr'] = (int)$r;
        if ($opt['outc']) $opt['outc'] -= 1;
        if ($opt['outr']) $opt['outr'] -= 1;
        unset($opt['output']);

        return $opt;
    }

    /**
     * Load CSV data from the given file or remote address
     *
     * Anything starting with http:// or https:// is fetched via DokuHTTPClient, everything
     * else is treated as a media ID which requires at least AUTH_READ on its namespace.
     * Content that is not valid UTF-8 is assumed to be ISO-8859-1 and converted.
     *
     * @param string $file URL or media ID
     * @return string the CSV data as UTF-8
     * @throws Exception when the remote fetch fails, access is denied or the file is missing
     */
    public static function loadContent($file)
    {
        // load file data
        if (preg_match('/^https?:\/\//i', $file)) {
            $http = new DokuHTTPClient();
            $content = $http->get($file);
            if ($content === false) throw new \Exception('Failed to fetch remote CSV data');
        } else {
            if (auth_quickaclcheck(getNS($file) . ':*') < AUTH_READ) {
                throw new \Exception('Access denied to CSV data');
            }
            $file = mediaFN($file);
            if (!file_exists($file)) {
                throw new \Exception('requested local CSV file does not exist');
            }
            $content = io_readFile($file);
        }
        // if not valid UTF-8 is given we assume ISO-8859-1
        if (!Clean::isUtf8($content)) $content = self::latin1ToUtf8($content);

        return $content;
    }

    /**
     * Convert an ISO-8859-1 encoded string to UTF-8
     *
     * utf8_encode() is deprecated as of PHP 8.2, so it is only used as a last resort when
     * neither mbstring nor iconv is available.
     *
     * @param string $content ISO-8859-1 encoded data
     * @return string UTF-8 encoded data
     */
    private static function latin1ToUtf8($content)
    {
        if (function_exists('mb_convert_encoding')) {
            return mb_convert_encoding($content, 'UTF-8', 'ISO-8859-1');
        }
        if (function_exists('iconv')) {
            $converted = iconv('ISO-8859-1', 'UTF-8', $content);
            if ($converted !== false) return $converted;
        }
        return utf8_encode($content);
    }

    /**
     * Parse the CSV content and select the rows to show
     *
     * The first row defines the table width; shorter rows are padded with empty strings.
     * Rows are counted from 0 in reading order (empty lines are skipped by the tokenizer
     * and not counted):
     *  - the first 'hdr_rows' rows are always kept
     *  - following rows with an index below 'offset' + 'hdr_rows' are skipped
     *  - reading stops once the index reaches 'maxlines' + 'hdr_rows' (if 'maxlines' is set)
     *  - any other row is kept if every filter matches its column (case insensitive)
     *
     * @param string $content the raw CSV data
     * @param array $opt options as returned by parseOptions()
     * @return array list of rows, each a list of cell strings; empty if no row was found
     */
    public static function prepareData($content, $opt)
    {
        $data = [];

        // get the first row - it will define the structure
        $row = self::csvExplodeRow($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        if ($row === false) return $data; // nothing to parse at all

        $maxcol = count($row);
        $line = 0;

        // option values arrive as strings from the syntax, make them safe for arithmetic
        $hdrRows = (int)$opt['hdr_rows'];
        $offset = (int)$opt['offset'];
        $maxlines = (int)$opt['maxlines'];

        while ($row !== false) {
            // make sure we have enough columns
            $row = array_pad($row, $maxcol, '');

            if ($line < $hdrRows) {
                // if headers are wanted, always add them
                $data[] = $row;
            } elseif ($offset && $line < $offset + $hdrRows) {
                // ignore the line
            } elseif ($maxlines && $line >= $maxlines + $hdrRows) {
                // we're done
                break;
            } else {
                // check filters
                $filterok = true;
                foreach ($opt['filter'] as $col => $filter) {
                    // a filter on a column the row does not have is matched against ''
                    if (!preg_match("/$filter/i", $row[$col] ?? '')) {
                        $filterok = false;
                        break;
                    }
                }

                // add the line
                if ($filterok) {
                    $data[] = $row;
                }
            }

            $line++;
            $row = self::csvExplodeRow($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        }

        return $data;
    }

    /**
     * Reads one CSV line from the given string
     *
     * Should handle embedded new lines, escapes, quotes and whatever else CSVs tend to have
     *
     * Note $delim, $enc, $esc have to be one ASCII character only! The encoding of the content is not
     * handled here but is read byte by byte - if you need conversions do it on the output
     *
     * Empty lines are skipped. Spaces in front of a field are dropped. CR and CRLF are
     * both treated as LF.
     *
     * @param string $str Input string, first CSV line will be removed
     * @param string $delim Delimiter character
     * @param string $enc Enclosing character
     * @param string $esc Escape character
     * @return array|boolean fields found on the line, false when no more lines could be found
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function csvExplodeRow(&$str, $delim = ',', $enc = '"', $esc = '\\')
    {
        $str = (string)$str;
        $len = strlen($str);

        $infield = false;
        $inenc = false;

        $fields = [];
        $word = '';

        for ($i = 0; $i < $len; $i++) {
            // convert to unix line endings
            if ($str[$i] == "\015") {
                if (!isset($str[$i + 1]) || $str[$i + 1] != "\012") {
                    // lone CR (possibly the very last byte): treat it as LF
                    $str[$i] = "\012";
                } else {
                    // CRLF: step over the CR and continue with the LF
                    $i++;
                }
            }

            // simple escape that is not an enclosure
            if ($str[$i] == $esc && $esc != $enc) {
                $i++; // skip this char and take next as is
                if ($i < $len) $word .= $str[$i]; // a trailing escape has nothing to protect
                $infield = true; // we are obviously in a field
                continue;
            }

            /*
             * Now decide special cases depending on current field and enclosure state
             */
            if (!$infield) { // not in field
                // we hit a delimiter even though we're not in a field - an empty field
                if ($str[$i] == $delim) {
                    $fields[] = $word;
                    $word = '';
                    $infield = false;
                    $inenc = false;
                    continue;
                }

                // a newline - an empty field as well, but we're done with this line
                if ($str[$i] == "\n") {
                    $infield = false;
                    $inenc = false;

                    //we saw no fields or content yet? empty line! skip it.
                    if ($fields === [] && $word === '') continue;

                    // otherwise add field
                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // we skip leading whitespace when we're not in a field yet
                if ($str[$i] === ' ') {
                    continue;
                }

                // cell starts with an enclosure
                if ($str[$i] == $enc) {
                    // skip this one but open an enclosed field
                    $infield = true;
                    $inenc = true;
                    continue;
                }

                // still here? whatever is here, is content and starts a field
                $word .= $str[$i];
                $infield = true;
                $inenc = false;
            } elseif ($inenc) { // in field and enclosure
                // we have an escape char that is an enclosure and the next char is an enclosure, too
                if ($str[$i] == $esc && $esc == $enc && isset($str[$i + 1]) && $str[$i + 1] == $esc) {
                    $i++; // skip this char and take next as is
                    $word .= $str[$i];
                    continue;
                }

                // we have an enclosure char
                if ($str[$i] == $enc) {
                    // skip this one but close the enclosure
                    $infield = true;
                    $inenc = false;
                    continue;
                }

                // still here? just add more content
                $word .= $str[$i];
            } else { // in field but no enclosure
                // a delimiter - next field please
                if ($str[$i] == $delim) {
                    $fields[] = $word;
                    $word = '';
                    $infield = false;
                    $inenc = false;
                    continue;
                }

                // EOL - we're done with the line
                if ($str[$i] == "\n") {
                    $infield = false;
                    $inenc = false;

                    //we saw no fields or content yet? empty line! skip it.
                    if ($fields === [] && $word === '') continue;

                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // still here? just add more content
                $word .= $str[$i];
            }
        }

        // did we hit the end? compare against '' explicitly, a plain truthiness check
        // would drop a final single field containing "0"
        if ($infield && ($word !== '' || $fields !== [])) {
            $fields[] = $word;
        }

        // shorten the string by the stuff we read (substr may return false before PHP 8)
        $str = (string)substr($str, $i + 1);

        if ($fields === []) return false;
        return $fields;
    }
}