Qore CsvUtil Module Reference  1.5
 All Classes Namespaces Functions Variables Groups Pages
CsvUtil.qm.dox.h
1 // -*- mode: c++; indent-tabs-mode: nil -*-
2 // @file CsvUtil.qm Qore user module for working with CSV files
3 
4 /* CsvUtil.qm Copyright 2012 - 2016 Qore Technologies, sro
5 
6  Permission is hereby granted, free of charge, to any person obtaining a
7  copy of this software and associated documentation files (the "Software"),
8  to deal in the Software without restriction, including without limitation
9  the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  and/or sell copies of the Software, and to permit persons to whom the
11  Software is furnished to do so, subject to the following conditions:
12 
13  The above copyright notice and this permission notice shall be included in
14  all copies or substantial portions of the Software.
15 
16  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22  DEALINGS IN THE SOFTWARE.
23 */
24 
25 // minimum required Qore version
26 
27 // assume local var scope, do not use "$" for vars, members, and method calls
28 
29 
30 /* see release notes below for version history
31 */
32 
// CsvHelper: helper class that AbstractCsvIterator and AbstractCsvWriter (declared
// further down in this listing) both inherit from privately. Only declarations are
// visible here; gaps in the listing line numbers are where lines were dropped.
263 class CsvHelper {
264 
265 public:
266  private :
 // bit flags; the Options hashes of the iterator and writer classes tag every option
 // key with C_OPT1, C_OPT2 or both
 // NOTE(review): presumably C_OPT1 = old API (fields passed in options) and
 // C_OPT2 = new spec-based API, judging by the getSpec() comment below -- confirm
267  const C_OPT1 = 0x1;
268  const C_OPT2 = 0x2;
 // hash keyed by field type name (plain and "*"-prefixed variants); every value is True
270  const Types = (
271  "int": True,
272  "*int": True,
273  "float": True,
274  "*float": True,
275  "number": True,
276  "*number": True,
277  "string": True,
278  "*string": True,
279  "date": True,
280  "*date": True,
281  );
282 
 // list of field attribute names
 // NOTE(review): presumably the keys allowed in a single field description -- confirm
284  const FieldAttrs = ("type", "format", "timezone", "code", "header");
285 
 // NOTE(review): matches the "tolwr" option key of AbstractCsvIterator; the name
 // suggests lower-casing (of header names?) -- confirm against the implementation
287  bool tolwr = False;
288 
 // NOTE(review): presumably set from the "date_format" / "date-format" options -- confirm
290  string date_format;
291 
293  hash m_specs;
294 
 // NOTE(review): presumably initialized from the constructor's n_errname argument -- confirm
296  string errname;
297 
298  // reorder data according to headers set by options.headers or read from CsvHeader
299  bool headerReorder = True;
300 
301 
302 public:
303 
 // takes the error name string (see errname above)
305  constructor (string n_errname);
306 
307 
309  private bool isMultiType();
310 
311 
313  private checkType(string fld_errs, string key, string value);
314 
315 
316  // get spec from options.fields for old Csv. Check spec param for new Csv
 // C_OPTx is one of the C_OPT1 / C_OPT2 flags declared above (assumption based on the name -- confirm)
317  private hash getSpec(*hash fields, string fld_errs, int C_OPTx);
318 
319 
 // NOTE(review): getSpec1() takes the optional fields hash and getSpec2() a spec hash;
 // presumably the old-API and new-API variants behind getSpec() -- confirm
320  private hash getSpec1(*hash fields);
321 
322 
323  private hash getSpec2(hash spec);
324 
325 
330  private list adjustFieldsFromHeaders(string type, *list headers);
331 
332 
333 }; // class CsvHelper
334 
336 namespace CsvUtil {
// Unix end of line character sequence (for new OS X too)
338  const EOL_UNIX = "\n";
 // MS DOS/Windows end of line character sequence
340  const EOL_WIN = "\r\n";
 // Old (pre-OSX) Macintosh end of line character sequence
342  const EOL_MACINTOSH = "\r";
343 
344  // helper list of end of line values
345  const EOLS = (EOL_UNIX, EOL_WIN, EOL_MACINTOSH, );
346 
 // record type used when a record does not match any type
348  const CSV_TYPE_UNKNOWN = "<unknown>";
 // record type used when multi-type is disabled
350  const CSV_TYPE_SINGLE = "<single>";
351 
352 
354 
// AbstractCsvIterator: abstract base class for iterating CSV data record by record.
// Subclasses supply the line source through lineNumberImpl(), getLineValueImpl() and
// nextLineImpl(). Inherits publicly from Qore::AbstractIterator and privately from CsvHelper.
592 class AbstractCsvIterator : public Qore::AbstractIterator, private CsvHelper {
593 
594 public:
595  private :
 // valid options for the object (a hash for quick lookups of valid keys); each value is a
 // combination of the C_OPT1 / C_OPT2 flags inherited from CsvHelper
597  const Options = (
598  "date_format": C_OPT1|C_OPT2,
599  "date-format": C_OPT1|C_OPT2,
600  "encoding": C_OPT1|C_OPT2,
601  "eol": C_OPT1|C_OPT2,
602  "extended_record": C_OPT2,
603  "fields": C_OPT1,
604  "header-lines": C_OPT1|C_OPT2,
605  "header_lines": C_OPT1|C_OPT2,
606  "header-names": C_OPT1|C_OPT2,
607  "header_names": C_OPT1|C_OPT2,
608  "header_reorder": C_OPT1|C_OPT2,
609  "headers": C_OPT1,
610  "ignore-empty": C_OPT1|C_OPT2,
611  "ignore_empty": C_OPT1|C_OPT2,
612  "ignore-whitespace": C_OPT1|C_OPT2,
613  "ignore_whitespace": C_OPT1|C_OPT2,
614  "quote": C_OPT1|C_OPT2,
615  "separator": C_OPT1|C_OPT2,
616  "timezone": C_OPT1|C_OPT2,
617  "tolwr": C_OPT1|C_OPT2,
618  "verify-columns": C_OPT1|C_OPT2,
619  "verify_columns": C_OPT1|C_OPT2,
620  );
621 
622  // field separator
623  string separator = ",";
624 
625  // field content delimiter
626  string quote = "\"";
627 
628  // number of header lines
629  softint headerLines = 0;
630 
631  // flag to use string names from the first header row if possible
632  bool headerNames = False;
633 
634  // True if empty lines should be ignored
635  bool ignoreEmptyLines = True;
636 
637  // Flag to trim the field content (trim leading and trailing whitespace) from unquoted fields
638  bool ignoreWhitespace = True;
639 
640  // the @ref Qore::TimeZone to use when parsing dates (default: current time zone)
641  *TimeZone timezone;
642 
643  // verify the column count for every row; if a row does not match, then throw a \c CSVFILEITERATOR-DATA-ERROR exception
644  bool checkElementCounts = False;
645 
646  // getRecord/getValue returns extended hash
647  bool extendedRecord = False;
648 
649  // column count for verifying column counts
650  int cc;
651 
652  // current record count for the index() method
653  int rc = 0;
654 
655  // to resolve record type by rules
656  hash m_resolve_by_rule;
657 
658  // to resolve record type by number of fields
659  hash m_resolve_by_count;
660 
661  // list of idx to field transformations, in order of spec
662  hash m_resolve_by_idx;
663 
664  // fake specs based on the first non-header row
665  bool fakeHeaderNames;
666 
667 
668 public:
669 
671 
 // creates the AbstractCsvIterator with an option hash in single-type mode
675  constructor(*hash opts);
676 
677 
679 
683  // NOTE: when declared as *hash then always calls this constructor
684  constructor(hash spec, hash opts);
685 
686 
 // process common options and assign internal fields
688  private processCommonOptions(*hash opts, int C_OPTx);
689 
690 
 // process specification and assign internal data for resolving
692  private processSpec(hash spec);
693 
694 
 // match headers provided at Csv header or in options
696  private prepareFieldsFromHeaders(*list headers);
697 
698 
 // returns the name of the input data
700  private *string getDataName();
701 
702 
 // returns the current line number; implemented by subclasses
704  private abstract int lineNumberImpl();
705 
 // returns the current line; implemented by subclasses
707  private abstract string getLineValueImpl();
708 
 // moves the current line / record position to the next line / record; implemented by subclasses
710  private abstract bool nextLineImpl();
711 
713 
 // moves the current line / record position to the next line / record
718  bool next();
719 
720 
722 
 // returns the given column value for the current row
729  any memberGate(string name);
730 
731 
733 
 // returns the current record as a hash
744  hash getValue();
745 
746 
748 
 // NOTE(review): presumably `extended` selects the extended record format
 // (cf. extendedRecord above) -- confirm
761  hash getRecord(bool extended);
762 
763 
765 
 // returns the current record as a hash
776  hash getRecord();
777 
778 
780 
 // returns the current record as a list
792  any getRecordList();
793 
794 
796 
 // returns the current separator string
803  string getSeparator();
804 
805 
807 
 // returns the current quote string
814  string getQuote();
815 
816 
818 
 // returns the current record headers or NOTHING if no headers have been detected or saved yet
825  *list getHeaders();
826 
827 
829 
 // NOTE(review): presumably the headers for the given record type -- confirm
834  *list getHeaders(string type);
835 
836 
838 
 // returns the row index being iterated; this does not necessarily correspond to the line number
849  int index();
850 
851 
853 
 // returns the current iterator line number in the file (the first line is line 1)
868  int lineNumber();
869 
870 
871  private any handleType(hash fh, *string val);
872 
873 
 // read line split by separator/quote into list
875  private list getLineAndSplit();
876 
877 
879 
 // identify the record type using identifyTypeImpl(); may be overridden if necessary
886  string identifyType(list rec);
887 
888 
890 
 // identify an input record
897  private *string identifyTypeImpl(list rec);
898 
899 
 // parses a line and returns the processed fields
901  private hash parseLine();
902 
903  };
904 
906 
912 
// Members of CsvFileIterator, which allows CSV files to be iterated on a record basis.
// The class header (listing line 911) is missing from this extraction.
913 public:
915 
 // creates the CsvFileIterator in single-type mode with the path of the file to read and an option hash
920  constructor(string path, *hash opts) ;
921 
923 
 // NOTE(review): presumably the spec-based counterpart of the constructor above -- confirm
927  constructor(string path, hash spec, hash opts) ;
928 
929 
 // returns the given column value for the current row
930  any memberGate(string name);
931 
932 
 // returns the name of the input data
934  private *string getDataName();
935 
936 
 // returns the current line number
938  private int lineNumberImpl();
939 
940 
 // returns the current line trimmed of the EOL character(s)
942  private string getLineValueImpl();
943 
944 
 // moves the current line / record position to the next line / record
946  private bool nextLineImpl();
947 
948  }; // CsvFileIterator class
949 
951 
// CsvDataIterator: allows arbitrary CSV string data to be iterated on a record basis.
// Inherits from CsvUtil::AbstractCsvIterator and DataLineIterator.
956 class CsvDataIterator : public CsvUtil::AbstractCsvIterator,public DataLineIterator {
957 
958 public:
959 
961 
 // creates the CsvDataIterator with the input data and optionally an option hash
966  constructor(string data, *hash opts) ;
967 
968 
970 
 // NOTE(review): presumably the spec-based counterpart of the constructor above -- confirm
974  constructor(string data, hash spec, hash opts) ;
975 
976 
 // returns the given column value for the current row
977  any memberGate(string name);
978 
979 
 // returns the current line number
981  private int lineNumberImpl();
982 
983 
 // returns the current line trimmed of the EOL character(s)
985  private string getLineValueImpl();
986 
987 
 // moves the current line / record position to the next line / record
989  private bool nextLineImpl();
990 
991  };
992 
994 
// AbstractCsvWriter: provides a parent for all CSV writers. Subclasses implement
// writeRawLine() to provide the output. Inherits privately from CsvHelper.
// NOTE: this listing drops several member declarations; the index at the end of the page
// places checkElementCounts (1146), baseTemplate (1155), write_headers (1158),
// optimal_quotes (1161), m_out_by_name (1167) and m_out_by_idx (1170) in the gaps below.
1103 class AbstractCsvWriter : private CsvHelper {
1104 
1105 public:
1106  private :
 // valid options for the object (a hash for quick lookups of valid keys); each value is a
 // combination of the C_OPT1 / C_OPT2 flags inherited from CsvHelper
1108  const Options = (
1109  "block": C_OPT1|C_OPT2,
1110  "datamap": C_OPT1,
1111  "date_format": C_OPT1|C_OPT2,
1112  "date-format": C_OPT1|C_OPT2,
1113  "encoding": C_OPT1|C_OPT2,
1114  "eol": C_OPT1|C_OPT2,
1115  "fields": C_OPT1,
1116  "headers": C_OPT1,
1117  "header_reorder": C_OPT1,
1118  "info_log": C_OPT1|C_OPT2,
1119  "optimal_quotes": C_OPT1|C_OPT2,
1120  "optimal-quotes": C_OPT1|C_OPT2,
1121  "quote": C_OPT1|C_OPT2,
1122  "quote_escape": C_OPT1|C_OPT2,
1123  "separator": C_OPT1|C_OPT2,
1124  "verify_columns": C_OPT1|C_OPT2,
1125  "verify-columns": C_OPT1|C_OPT2,
1126  "write_headers": C_OPT1|C_OPT2,
1127  "write-headers": C_OPT1|C_OPT2,
1128  );
1129 
 // output file character encoding
1131  string encoding;
1132 
 // field separator
1134  string separator = ",";
1135 
 // field content delimiter
1137  string quote = "\"";
1138 
 // quote escape character
1140  string m_quoteEscapeChar = "\\";
1141 
 // end of line sequence
1143  string eol = EOL_UNIX;
1144 
1147 
 // the latest line number
1149  int lineNo = 0;
1150 
 // block size for bulk DML
1152  int block = 1000;
1153 
1156 
1159 
1162 
 // a closure/call reference for informational logging when using write(SQLStatement)
1164  *code info_log;
1165 
1168 
1171 
1172 public:
1173 
1175 
 // creates the AbstractCsvWriter in single-type mode
1181  constructor(string n_errname, *hash n_opts);
1182 
1183 
1185 
 // NOTE(review): presumably the spec-based counterpart of the constructor above -- confirm
1193  constructor(string n_errname, hash spec, hash n_opts);
1194 
1195 
 // process options and set internal variables
1197  private processCommonOptions(*hash n_opts, int C_OPTx);
1198 
1199 
 // process specification and set internal variable for mapping
1201  private processSpec();
1202 
1203 
 // write csv headers
1205  private writeHeaders();
1206 
1207 
1209 
 // write a line with a list of values; data are checked against column rules
1214  writeLine(list values);
1215 
1216 
1218 
 // NOTE(review): presumably as writeLine(list) but with the values given as a hash -- confirm
1223  writeLine(hash values);
1224 
1225 
1227 
 // NOTE(review): presumably writes a line for the given record type -- confirm
1233  writeLine(string type, list values);
1234 
1235 
1237 
1243  writeLine(string type, hash values);
1244 
1245 
1247 
 // stream an iterator into the output
1254  write(Qore::AbstractIterator iterator);
1255 
1256 
1258 
 // NOTE(review): the info_log member above is documented as used by this overload
1265  write(Qore::SQL::SQLStatement iterator);
1266 
1267 
1269 
1276  write(list l);
1277 
1278 
 // this method must be overridden in child classes to provide the output implementation
1280  abstract private writeRawLine(list values);
1281 
1283 
 // prepare a string (one output line) with formatting and escaping
1287  private string prepareRawLine(list values);
1288 
1289 
1290  }; // AbstractCsvWriter class
1291 
1294 
// Members of CsvFileWriter, the class for safe CSV file creation.
// The class header (listing line 1293) is missing from this extraction.
1295 public:
1296 
1297  private :
1298  // a file to write
1299  File file;
1300 
1301 public:
1302 
1304 
 // creates the CsvFileWriter in single-type mode from a file path and an optional option hash
1312  constructor(string path, *hash opts) ;
1313 
1314 
1316 
 // NOTE(review): presumably the spec-based counterpart of the constructor above -- confirm
1325  constructor(string path, hash spec, hash opts) ;
1326 
1327 
1328  private openFile(string path);
1329 
1330 
 // provides the output implementation required by the parent writer class
1331  private writeRawLine(list values);
1332 
1333 
1334  }; // CsvFileWriter
1335 
1338 
// Members of CsvStringWriter, the class for in-memory string CSV creation.
// The class header (listing line 1337) is missing from this extraction.
1339 public:
1340 
1341  private :
1342  // a csv content
1343  string content;
1344 
1345 public:
1346 
1348 
 // creates the CsvStringWriter in single-type mode with the content held in memory
1353  constructor(*hash opts) ;
1354 
1355 
1357 
 // NOTE(review): presumably the spec-based counterpart of the constructor above -- confirm
1363  constructor(hash spec, hash opts) ;
1364 
1365 
1366  private initContent();
1367 
1368 
 // provides the output implementation required by the parent writer class
1369  private writeRawLine(list values);
1370 
1371 
1373 
 // stream iterator and return a CSV-formatted output string
1382  string write(Qore::AbstractIterator iterator);
1383 
1384 
1386 
 // NOTE(review): presumably as write(AbstractIterator) but for a list of rows -- confirm
1395  string write(list l);
1396 
1397 
 // get the current in-memory content as a string
1399  string getContent();
1400 
1401 
1402  }; // CsvStringWriter
1403 
1404 }; // CsvUtil namespace
private writeRawLine(list values)
This method must be overridden in child classes to provide the output implementation.
bool write_headers
this flag determines if any stored headers are output
Definition: CsvUtil.qm.dox.h:1158
constructor(string data, *hash opts)
Creates the CsvDataIterator with the input data and optionally an option hash.
hash m_out_by_name
mapping output field by name
Definition: CsvUtil.qm.dox.h:1167
constructor(*hash opts)
creates the CsvStringWriter in single-type mode with content in memory
private string prepareRawLine(list values)
Prepare a string (line with EOL) with formatting and escaping.
int index()
Returns the row index being iterated, which does not necessarily correspond to the line number when t...
string write(Qore::AbstractIterator iterator)
Stream iterator and return a CSV-formatted output string.
the AbstractCsvIterator class is an abstract base class that allows abstract CSV data to be iterated ...
Definition: CsvUtil.qm.dox.h:592
private *string getDataName()
Returns the name of the input data.
private bool nextLineImpl()
Moves the current line / record position to the next line / record; returns False if there are no mor...
const True
const Options
valid options for the object (a hash for quick lookups of valid keys)
Definition: CsvUtil.qm.dox.h:597
hash m_out_by_idx
mapping output field by index
Definition: CsvUtil.qm.dox.h:1170
private processCommonOptions(*hash opts, int C_OPTx)
process common options and assign internal fields
string getQuote()
Returns the current quote string.
private processSpec()
Process specification and set internal variable for mapping.
abstract private int lineNumberImpl()
Returns the current line number.
private writeRawLine(list values)
This method must be overridden in child classes to provide the output implementation.
bool checkElementCounts
verify the column count for every row; if a row does not match, then throw a CSVFILEITERATOR-DATA-ERR...
Definition: CsvUtil.qm.dox.h:1146
private int lineNumberImpl()
Returns the current line number.
constructor(string path, *hash opts)
Creates the CsvFileIterator in single-type mode with the path of the file to read and an option hash...
private hash parseLine()
Parses a line in the file and returns a processed list of the fields.
const False
private list getLineAndSplit()
Read line split by separator/quote into list.
string m_quoteEscapeChar
quote escape character
Definition: CsvUtil.qm.dox.h:1140
private *string getDataName()
Returns the name of the input data.
*code info_log
a closure/call reference for informational logging when using write(SQLStatement) ...
Definition: CsvUtil.qm.dox.h:1164
list list(...)
const Options
valid options for the object (a hash for quick lookups of valid keys)
Definition: CsvUtil.qm.dox.h:1108
write(Qore::AbstractIterator iterator)
Stream an iterator into the output.
The CsvFileIterator class allows CSV files to be iterated on a record basis.
Definition: CsvUtil.qm.dox.h:911
string getContent()
Get the current in-memory content as a string.
string eol
end of line sequence
Definition: CsvUtil.qm.dox.h:1143
string getSeparator()
Returns the current separator string.
any getRecordList()
Returns the current record as a list.
private processCommonOptions(*hash n_opts, int C_OPTx)
Process options and set internal variables.
The CsvStringWriter class for in-memory string CSV creation.
Definition: CsvUtil.qm.dox.h:1337
string separator
field separator
Definition: CsvUtil.qm.dox.h:1134
string quote
field content delimiter
Definition: CsvUtil.qm.dox.h:1137
any memberGate(string name)
Returns the given column value for the current row.
The AbstractCsvWriter class provides a parent for all CSV writers.
Definition: CsvUtil.qm.dox.h:1103
private processSpec(hash spec)
process specification and assign internal data for resolving
private writeHeaders()
Write csv headers.
string baseTemplate
base template for value format
Definition: CsvUtil.qm.dox.h:1155
const EOL_MACINTOSH
Old (pre-OSX) Macintosh end of line character sequence.
Definition: CsvUtil.qm.dox.h:342
const CSV_TYPE_UNKNOWN
Record type when not matching any type.
Definition: CsvUtil.qm.dox.h:348
abstract private string getLineValueImpl()
Returns the current line.
string type(any arg)
private bool nextLineImpl()
Moves the current line / record position to the next line / record; returns False if there are no mor...
const EOL_UNIX
Unix end of line character sequence (for new OS X too)
Definition: CsvUtil.qm.dox.h:338
*list getHeaders()
Returns the current record headers or NOTHING if no headers have been detected or saved yet...
private *string identifyTypeImpl(list rec)
Identify an input record, given the raw line string. This method performs a lookup to a precalculated ...
constructor(string n_errname, *hash n_opts)
Creates the AbstractCsvWriter in single-type mode.
private prepareFieldsFromHeaders(*list headers)
match headers provided at Csv header or in options, never called for multi-type because header_names ...
private int lineNumberImpl()
Returns the current line number; returns 0 if not pointing at any data.
int lineNo
the latest line number
Definition: CsvUtil.qm.dox.h:1149
writeLine(list values)
Write a line with a list of values; data are checked against column rules.
hash getValue()
Returns the current record as a hash.
private string getLineValueImpl()
Returns the current line trimmed of the EOL character(s)
const EOL_WIN
MS DOS/Windows end of line character sequence.
Definition: CsvUtil.qm.dox.h:340
constructor(*hash opts)
creates the AbstractCsvIterator with an option hash in single-type mode
The CsvDataIterator class allows arbitrary CSV string data to be iterated on a record basis...
Definition: CsvUtil.qm.dox.h:956
constructor(string path, *hash opts)
creates the CsvFileWriter in single-type mode with the path of the file to read and an optional optio...
int block
block size for bulk DML
Definition: CsvUtil.qm.dox.h:1152
string identifyType(list rec)
Identify a fixed-length line type using identifyTypeImpl(); may be overridden if necessary.
bool optimal_quotes
stores the optimal quotes option
Definition: CsvUtil.qm.dox.h:1161
string encoding
output file character encoding
Definition: CsvUtil.qm.dox.h:1131
abstract private bool nextLineImpl()
Moves the current line / record position to the next line / record; returns False if there are no mor...
The CsvFileWriter class for safe CSV file creation.
Definition: CsvUtil.qm.dox.h:1293
hash hash(object obj)
private string getLineValueImpl()
Returns the current line trimmed of the EOL character(s)
abstract private writeRawLine(list values)
This method must be overridden in child classes to provide the output implementation.
bool next()
Moves the current line / record position to the next line / record; returns False if there are no mor...
const CSV_TYPE_SINGLE
Record type when multi-type is disabled.
Definition: CsvUtil.qm.dox.h:350
hash getRecord()
Returns the current record as a hash.
int lineNumber()
Returns the current iterator line number in the file (the first line is line 1) or 0 if not pointing ...