<?php
/**
 * CSV Plugin helper plugin
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <gohr@cosmocode.de>
 */

/**
 * Implement CSV parser and other helpers
 *
 * All methods are static: option parsing, loading of the raw CSV text,
 * splitting that text into rows and selecting the rows to keep.
 */
class helper_plugin_csv extends DokuWiki_Plugin
{

    /**
     * Returns the default options
     *
     * The keys read by this class are: hdr_rows, offset, maxlines and filter
     * (row selection in prepareData()), delim, enclosure and escape (handed to
     * csv_explode_row()), file (media id or URL) and output (split into
     * outc/outr by parseOptions()). The remaining keys are only defined here
     * and are not read by this class.
     *
     * @return array
     */
    public static function getDefaultOpt()
    {
        return [
            'hdr_rows' => 1,
            'hdr_cols' => 0,
            'span_empty_cols' => 0,
            'maxlines' => 0,
            'offset' => 0,
            'file' => '',
            'delim' => ',',
            'enclosure' => '"',
            'escape' => '"',
            'content' => '',
            'filter' => [],
            'output' => '',
            'outc' => 0,
            'outr' => 0,
        ];
    }

    /**
     * Parse the options given in the syntax
     *
     * Recognized tokens are `name=value`, `name="quoted value"`,
     * `filter[<column>]=<expr>`, `filter[<column>][<type>]=<expr>` and a bare
     * word, which is taken as the file to load. Filters of any type other than
     * 'r' are treated as globs where `*` matches any run of characters; type
     * 'r' filters are used as regular expressions as given.
     *
     * The returned array has the keys of getDefaultOpt() except 'output', which
     * is replaced by the zero based 'outc' and 'outr'. 'filter' maps zero based
     * column indexes to regular expression bodies (without delimiters).
     *
     * @param string $optstr the raw option string
     * @return array
     */
    public static function parseOptions($optstr)
    {
        global $INFO;

        // defaults
        $opt = self::getDefaultOpt();

        $filters = [];
        // parse options - https://regex101.com/r/tNdS9P/3/
        preg_match_all(
            '/([^ =\[\]]+)(?:\[(\d+)\](?:\[(\w)\])?)?(?:=((?:".*?")|(?:[^ ]+)))?/',
            $optstr,
            $matches,
            PREG_SET_ORDER
        );
        foreach ($matches as $set) {
            $option = $set[1];
            $value = isset($set[4]) ? $set[4] : '';
            $value = trim($value, '"');

            if ($option === 'filter') {
                // A group that did not take part in the match is reported as '' as soon as a
                // later group matched, so isset() alone cannot detect a missing column or type.
                $col = (isset($set[2]) && $set[2] !== '') ? $set[2] : 1;
                $typ = (isset($set[3]) && $set[3] !== '') ? $set[3] : 'g';
                $filters[$col] = [$value, $typ];
            } elseif ($value === '') {
                // a bare word without a value names the file
                $opt['file'] = $option;
            } else {
                $opt[$option] = $value;
            }
        }

        // fix tab delimiter
        if ($opt['delim'] === 'tab') {
            $opt['delim'] = "\t";
        }

        // resolve local files
        if ($opt['file'] !== '' && !preg_match('/^https?:\/\//i', $opt['file'])) {
            resolve_mediaid($INFO['namespace'] ?? '', $opt['file'], $exists);
        }

        // create regexp filters
        foreach ($filters as $col => $filter) {
            list($text, $type) = $filter;
            if ($type !== 'r') {
                // glob: quote everything, then turn the quoted '*' back into a wildcard
                $text = preg_quote_cb($text);
                $text = str_replace('\*', '.*?', $text);
                $text = '^' . $text . '$';
            }

            // compile the expression once against an empty subject to validate it;
            // the warning PCRE raises for a broken pattern is suppressed on purpose
            if (@preg_match("/$text/", '') === false) {
                msg("Invalid filter for column $col");
            } else {
                $opt['filter'][$col - 1] = $text; // use zero based index internally
            }
        }

        // prepare the value output
        list($c, $r) = array_pad(explode(',', $opt['output']), 2, 0);
        $opt['outc'] = (int)$c;
        $opt['outr'] = (int)$r;
        if ($opt['outc']) {
            $opt['outc'] -= 1;
        }
        if ($opt['outr']) {
            $opt['outr'] -= 1;
        }
        unset($opt['output']);

        return $opt;
    }

    /**
     * Load CSV data from the given file or remote address
     *
     * Anything starting with http:// or https:// is fetched with DokuHTTPClient.
     * Everything else is treated as a media id: read access to its namespace is
     * checked before the file is read from disk.
     *
     * @param string $file media id or URL
     * @return string the CSV data as UTF-8
     * @throws Exception when the remote fetch fails, access is denied or the local file is missing
     */
    public static function loadContent($file)
    {
        // load file data
        if (preg_match('/^https?:\/\//i', $file)) {
            $http = new DokuHTTPClient();
            $content = $http->get($file);
            if ($content === false) {
                throw new \Exception('Failed to fetch remote CSV data');
            }
        } else {
            if (auth_quickaclcheck(getNS($file) . ':*') < AUTH_READ) {
                throw new \Exception('Access denied to CSV data');
            }
            $file = mediaFN($file);
            if (!file_exists($file)) {
                throw new \Exception('requested local CSV file does not exist');
            }
            $content = io_readFile($file);
        }

        // if not valid UTF-8 is given we assume ISO-8859-1
        if (!utf8_check($content)) {
            // utf8_encode() is deprecated as of PHP 8.2; use mbstring where it is available
            $content = function_exists('mb_convert_encoding')
                ? mb_convert_encoding($content, 'UTF-8', 'ISO-8859-1')
                : utf8_encode($content);
        }

        return $content;
    }

    /**
     * Split the CSV text into rows and select the rows to keep
     *
     * The first row defines the column count; shorter rows are padded with
     * empty strings. The first hdr_rows rows are always kept. After that,
     * rows are skipped until offset rows have passed, reading stops once the
     * running line counter reaches maxlines + hdr_rows (skipped and filtered
     * rows count towards that limit), and the remaining rows are kept only if
     * every filter matches its column case-insensitively.
     *
     * @param string $content the CSV text
     * @param array $opt options as returned by parseOptions()
     * @return array list of rows, each a list of cell strings; empty when $content holds no rows
     */
    public static function prepareData($content, $opt)
    {
        $data = [];

        // get the first row - it will define the structure
        $row = self::csv_explode_row($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        if ($row === false) {
            // no rows at all - count(false) below would be a TypeError on PHP 8
            return $data;
        }
        $maxcol = count($row);
        $line = 0;

        while ($row !== false) {
            // make sure we have enough columns
            $row = array_pad($row, $maxcol, '');

            if ($line < $opt['hdr_rows']) {
                // if headers are wanted, always add them
                $data[] = $row;
            } elseif ($opt['offset'] && $line < $opt['offset'] + $opt['hdr_rows']) {
                // ignore the line
            } elseif ($opt['maxlines'] && $line >= $opt['maxlines'] + $opt['hdr_rows']) {
                // we're done
                break;
            } else {
                // check filters
                $filterok = true;
                foreach ($opt['filter'] as $col => $filter) {
                    // a filter on a column this row does not have is matched against ''
                    if (!preg_match("/$filter/i", $row[$col] ?? '')) {
                        $filterok = false;
                        break;
                    }
                }

                // add the line
                if ($filterok) {
                    $data[] = $row;
                }
            }

            $line++;
            $row = self::csv_explode_row($content, $opt['delim'], $opt['enclosure'], $opt['escape']);
        }

        return $data;
    }

    /**
     * Reads one CSV line from the given string
     *
     * Should handle embedded new lines, escapes, quotes and whatever else CSVs tend to have
     *
     * Note $delim, $enc, $esc have to be one ASCII character only! The encoding of the content is not
     * handled here but is read byte by byte - if you need conversions do it on the output
     *
     * Empty lines in front of the row are skipped, as are spaces in front of a field.
     *
     * @param string $str Input string, first CSV line will be removed
     * @param string $delim Delimiter character
     * @param string $enc Enclosing character
     * @param string $esc Escape character
     * @return array|boolean fields found on the line, false when no more lines could be found
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function csv_explode_row(&$str, $delim = ',', $enc = '"', $esc = '\\')
    {
        $len = strlen($str);

        $infield = false;
        $inenc = false;

        $fields = [];
        $word = '';

        for ($i = 0; $i < $len; $i++) {
            // convert to unix line endings
            if ($str[$i] == "\015") {
                if ($i + 1 >= $len || $str[$i + 1] != "\012") {
                    // lone CR (also as the very last byte): treat it as a newline
                    $str[$i] = "\012";
                } else {
                    // CRLF: step onto the LF
                    $i++;
                }
            }

            // simple escape that is not an enclosure
            if ($str[$i] == $esc && $esc != $enc) {
                $i++; // skip this char and take next as is
                if ($i < $len) {
                    // an escape as the last byte has nothing to escape
                    $word .= $str[$i];
                }
                $infield = true; // we are obviously in a field
                continue;
            }

            /*
             * Now decide special cases depending on current field and enclosure state
             */
            if (!$infield) { // not in field

                // we hit a delimiter even though we're not in a field - an empty field
                if ($str[$i] == $delim) {
                    $fields[] = $word;
                    $word = '';
                    $infield = false;
                    $inenc = false;
                    continue;
                }

                // a newline - an empty field as well, but we're done with this line
                if ($str[$i] == "\n") {
                    $infield = false;
                    $inenc = false;

                    // we saw no fields or content yet? empty line! skip it.
                    if (!count($fields) && $word === '') {
                        continue;
                    }

                    // otherwise add field
                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // we skip leading whitespace when we're not in a field yet
                if ($str[$i] === ' ') {
                    continue;
                }

                // cell starts with an enclosure
                if ($str[$i] == $enc) {
                    // skip this one but open an enclosed field
                    $infield = true;
                    $inenc = true;
                    continue;
                }

                // still here? whatever is here, is content and starts a field
                $word .= $str[$i];
                $infield = true;
                $inenc = false;

            } elseif ($inenc) { // in field and enclosure

                // we have an escape char that is an enclosure and the next char is an enclosure, too
                if ($str[$i] == $esc && $esc == $enc && isset($str[$i + 1]) && $str[$i + 1] == $esc) {
                    $i++; // skip this char and take next as is
                    $word .= $str[$i];
                    continue;
                }

                // we have an enclosure char
                if ($str[$i] == $enc) {
                    // skip this one but close the enclosure
                    $infield = true;
                    $inenc = false;
                    continue;
                }

                // still here? just add more content
                $word .= $str[$i];

            } else { // in field but no enclosure

                // a delimiter - next field please
                if ($str[$i] == $delim) {
                    $fields[] = $word;
                    $word = '';
                    $infield = false;
                    $inenc = false;
                    continue;
                }

                // EOL - we're done with the line
                if ($str[$i] == "\n") {
                    $infield = false;
                    $inenc = false;

                    // we saw no fields or content yet? empty line! skip it.
                    if (!count($fields) && $word === '') {
                        continue;
                    }

                    $fields[] = $word;
                    $word = '';
                    break;
                }

                // still here? just add more content
                $word .= $str[$i];
            }
        }

        // did we hit the end?
        // compare against '' explicitly: a final field of "0" is content, not emptiness
        if ($infield && ($word !== '' || count($fields))) {
            $fields[] = $word;
        }

        // shorten the string by the stuff we read
        // (cast: before PHP 8 substr() returns false when the start lies behind the end)
        $str = (string)substr($str, $i + 1);

        if (!count($fields)) {
            return false;
        }
        return $fields;
    }
}