-
Notifications
You must be signed in to change notification settings - Fork 5
/
csvstrm.h
528 lines (476 loc) · 14.9 KB
/
csvstrm.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
/* Include guard. The macro has to be defined right after it is tested;
   without the #define the #ifndef stays true forever and a second
   #include of this header redefines every type declared below. */
#ifndef CSV_STREAM_H
#define CSV_STREAM_H
#ifdef __cplusplus
extern "C" {
#endif
/**
* # CSV stream reader
*
* Single header library to read a [CSV file][wiki] row-by-row.
*
* To use this library, define `CSV_IMPLEMENTATION` before including
* **csvstrm.h** in _one_ of your C files (other C files include
* **csvstrm.h** normally), like so:
*
* ```
* #include <stdio.h>
*
* #define CSV_IMPLEMENTATION
* #include "csvstrm.h"
* ```
*
* It will parse CSV documents as specified by [RFC4180][RFC], but it
* follows the rule of _"be liberal in what you accept from others"_,
* so there are a couple of deviations:
*
* * Leading and trailing whitespaces in each field are trimmed by default.
* * This behaviour can be changed by defining `CSV_TRIM` as 0.
* * You can have spaces before and after the quotes in a quoted field.
* * Double quotes inside unquoted fields are allowed.
* * Records can end with CRLF or with LF character sequences.
* * It does not enforce that all records (rows) have the same number of
* fields. That is an application concern.
* * It does not specify whether the first row contains headers. That is
* left up to the application.
*
* It also took some ideas from the [Repici][] document cited by the [RFC][].
*
* ## Basic usage example
*
* Here is a simple usage example. Some error handling code has been omitted.
*
* ```
* CsvContext csv;
* FILE *f = fopen(argv[1], "r");
*
* // Call csv_context_file() to initialise the CsvContext object
* // to read CSV data from an open file.
* csv_context_file(&csv, f);
*
* // csv_read_record() reads a row from the file.
* // It will return 0 when it reaches the end of the file
* while(csv_read_record(&csv)) {
* int j;
* // You can use csv_count() to retrieve the number of fields
* // read from the file.
* // csv_field() can then be used to access an individual field.
* for(j = 0; j < csv_count(&csv); j++) {
* printf("[%s]", csv_field(&csv,j));
* }
* printf("\n");
* }
* fclose(f);
* ```
*
* ## License
*
* Author: Werner Stoop
*
* This is free and unencumbered software released into the public domain.
*
* See <http://unlicense.org/> for details
*
* [wiki]: https://en.wikipedia.org/wiki/Comma-separated_values
* [RFC]: https://datatracker.ietf.org/doc/html/rfc4180
* [Repici]: https://www.creativyst.com/Doc/Articles/CSV/CSV01.shtml
*/
/**
* ## Configuration
*
* These macros can be defined before including **csvstrm.h** in your
* C file to control the behaviour of the library.
*
* * `CSV_DELIMITER` -
* The delimiter to separate fields (columns) in each record (row)
* It defaults to `','`.
* * `CSV_BUFFER_SIZE` -
* While each record is being read, the characters from the file
* are copied to an internal buffer. This controls the size of
* that internal buffer.
* * `CSV_READ_BUFFER_SIZE` -
* This controls the size of the second internal buffer that
* stores raw bytes as they are read from the input before they're
* processed.
* * `CSV_MAX_FIELDS` -
* The maximum number of fields expected per record.
* * `CSV_TRIM` -
* Determines whether leading and trailing whitespace characters will
* be trimmed from fields by the parser. \
* For example, consider a CSV section `..., foo ,...`: \
* If `CSV_TRIM` is non-zero the field will be returned as `"foo"`. \
* If it is 0 then the whitespace will be left intact, so it will be
* returned as `" foo "`
*
* These macros _must_ be the same in all files that include **csvstrm.h**.
*/
/* Each default below only applies when the macro was not defined
   before this header was included. */
/* Character that separates two fields within a record. */
# ifndef CSV_DELIMITER
# define CSV_DELIMITER ','
# endif
/* Size in bytes of CsvContext.buffer, which holds the text of all
   fields of the record currently being parsed. */
# ifndef CSV_BUFFER_SIZE
# define CSV_BUFFER_SIZE 256
# endif
/* Size in bytes of CsvContext.raw_buffer, the staging area that the
   read callback fills with unprocessed input. */
# ifndef CSV_READ_BUFFER_SIZE
# define CSV_READ_BUFFER_SIZE 64
# endif
/* Number of entries in CsvContext.fields, i.e. the most fields a
   single record may have. */
# ifndef CSV_MAX_FIELDS
# define CSV_MAX_FIELDS 32
# endif
/* Non-zero: strip spaces, tabs, vertical tabs and form feeds around
   each field. Zero: keep them. */
# ifndef CSV_TRIM
# define CSV_TRIM 1
# endif
/**
* ## Definitions
*
* ### `csv_read_data_fun`
*
* `typedef int (*csv_read_data_fun)(char *b, int n, void *d);`
*
* Prototype for functions that can read CSV data.
*
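* `b` is a pointer to a buffer that will be filled with chars from
* the input files. `n` is the maximum number of chars to store in it;
* the function must terminate the stored data with a `'\0'` (the buffer
* has room for `n + 1` bytes), because the parser treats `'\0'` as the
* end of the data that was read.
* `d` is a pointer to some structure where the data is read from.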
*
* For example, when reading a CSV file from a ZIP archive, `d` might
* point to the structure that the ZIP library uses to encapsulate the archive.
*
* The function should return 0 if it reaches the end of the input data,
* non-zero otherwise.
*
* See `csv_context_custom()` in section [initialising the csvcontext](#initialising-the-csvcontext)
*/
/* Reader callback type: b = destination buffer, n = its capacity as
   passed by the parser, d = the user pointer given to
   csv_context_custom(). Returns 0 at end of input, non-zero otherwise. */
typedef int (*csv_read_data_fun)(char *b, int n, void *d);
/**
* ### `enum csv_error_code`
*
* * `CSV_OK` -
* No error
* * `CSV_ERR_BUFFER` -
* The buffer used to store field data internally is full.
* It is too small for the record (row) you're reading. \
* Increase `CSV_BUFFER_SIZE`.
* * `CSV_ERR_FIELDS` -
* There are too many fields (columns) in the record. \
* Increase `CSV_MAX_FIELDS`.
* * `CSV_ERR_BAD_QUOTE` -
* A quoted field is incorrectly formatted.
* * `CSV_ERR_LINE_END` -
* There is a problem with a line ending.
*/
/* Status codes stored in a CsvContext by csv_read_record() and
   retrieved with csv_get_error(). */
enum csv_error_code {
    CSV_OK            = 0, /* no error */
    CSV_ERR_BUFFER    = 1, /* field data does not fit; increase CSV_BUFFER_SIZE */
    CSV_ERR_FIELDS    = 2, /* too many fields; increase CSV_MAX_FIELDS */
    CSV_ERR_BAD_QUOTE = 3, /* a quoted field is incorrectly formatted */
    CSV_ERR_LINE_END  = 4  /* a '\r' was not followed by '\n' */
};
/**
* ### `typedef struct CsvContext CsvContext;`
*
* Structure that contains the state of the CSV stream parser.
*
* The fields in the structure should not be manipulated directly,
* but these are some members of interest:
*
* * `char *fields[CSV_MAX_FIELDS]` -
* The array of pointers that contain the fields after parsing a record.
* Rather use `csv_field()` to access the individual fields.
* * `int nf` -
* The number of fields parsed from a record.
* Rather use `csv_count()` to read this value.
* * `enum csv_error_code err` -
* An error code that may have resulted from parsing the record.
* Rather use `csv_get_error()` to retrieve this value.
*
* Section [initialising the csvcontext](#initialising-the-csvcontext)
* below describes how to initialise the structure to read CSV data.
*/
/* Complete parser state for one CSV input. Initialised by
   csv_context_custom() (or the file based wrappers around it) and then
   updated by every call to csv_read_record(). */
typedef struct CsvContext {
/* Determines where the data is read from: `get_data` is called to
   refill `raw_buffer`, and `data` is passed to it unchanged. */
csv_read_data_fun get_data;
void *data;
/* The internal buffer, where bytes are read into
from the file, before they're processed.
`in_pos` is the index of the next unread byte in `raw_buffer`.
`last_char` is a one-character push-back slot: 0 means empty and EOF
means the end of the input has been reached. */
char raw_buffer[CSV_READ_BUFFER_SIZE];
int in_pos;
int last_char;
/* Where the data for the fields is stored.
The values in `fields` are pointers into this buffer */
char buffer[CSV_BUFFER_SIZE];
/* The fields that have been parsed from the file;
`nf` is how many entries of `fields` are valid. */
char *fields[CSV_MAX_FIELDS];
int nf;
/* Error code set by the most recent csv_read_record() call
(CSV_OK when there was no error). */
enum csv_error_code err;
} CsvContext;
#ifdef EOF /* EOF will be defined if <stdio.h> is #included */
/**
* ## Initialising the `CsvContext`
*
* ### `void csv_context_file(CsvContext *csv, FILE *file)`
*
* Initialises a `CsvContext` structure to read data from a file
* pointed to by `file`.
*/
void csv_context_file(CsvContext *csv, FILE *file);
/**
* ### `void csv_context_file_limit(CsvContext *csv, struct csv_read_limit *ll)`
*
* Initialises a `CsvContext` structure to read data from a file, but it will
* only read a limited number of bytes from the file.
*
* (The intended use-case is where a CSV file has been concatenated with other
* files into an archive file)
*
* The `csv_read_limit` structure `ll` is defined as follows:
*
* ```
* struct csv_read_limit {
* FILE *f;
* int limit;
* };
* ```
*
* where `f` is the file to read from and `limit` is the maximum
* number of bytes that will be read from the file.
*/
/* Pairs an open file with a byte budget, for use with
   csv_context_file_limit(). */
struct csv_read_limit {
/* The file to read from. */
FILE *f;
/* Number of bytes that may still be read from `f`; it is decreased
   as data is read. */
int limit;
};
void csv_context_file_limit(CsvContext *csv, struct csv_read_limit *ll);
#endif
/**
* ### `void csv_context_custom(CsvContext *csv, csv_read_data_fun fun, void *data)`
*
* Initialises a `CsvContext` with a custom function `fun` that will read bytes
* from an object `data`.
*/
void csv_context_custom(CsvContext *csv, csv_read_data_fun fun, void *data);
/**
* ## Reading records
*
* ### `int csv_read_record(CsvContext *csv)`
*
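* Reads a record from the CSV file.
*
* It returns the number of fields that were read from the record, or 0
* when the end of the input has been reached.
* If parsing stops because of an error, only the fields completed before
* the error are counted (possibly 0), so use `csv_get_error()` to tell
* an error apart from the end of the input.
* If the number of fields does not match the number of fields expected
* then `csv_get_error()` can be used to retrieve the error code.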
*
* ### `int csv_count(CsvContext *csv)`
*
* Get the number of fields in the last record that was read by
* `csv_read_record()`.
*
* ### `const char *csv_field(CsvContext *csv, int i)`
*
* Get the `i`'th field of the last record that was read by
* `csv_read_record()`.
*
* ### `enum csv_error_code csv_get_error(CsvContext *csv)`
*
* Retrieves an error code (if any) from the `CsvContext`.
* The error codes are described in Subsection [enum csv_error_code](#enum-csv_error_code).
*/
int csv_read_record(CsvContext *csv);
int csv_count(CsvContext *csv);
const char *csv_field(CsvContext *csv, int i);
enum csv_error_code csv_get_error(CsvContext *csv);
/* *********************************************************************** */
# ifdef CSV_IMPLEMENTATION
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
/* CAST(type, expr): converts a `void *` expression to `type`. C++ needs
   an explicit cast, C converts implicitly. The argument and the whole
   expansion are parenthesised so that the macro behaves like a single
   primary expression whatever is passed in or surrounds it. */
#ifdef __cplusplus
# define CAST(x, y) ((x)(y))
#else
# define CAST(x, y) (y)
#endif
/*
 * Returns the next input character as an `unsigned char` converted to
 * `int` (the same convention as getc()), or EOF at the end of the input.
 *
 * Going through `unsigned char` matters: with a plain `char` the byte
 * 0xFF becomes -1 on platforms where `char` is signed, which is
 * indistinguishable from EOF and silently ends parsing.
 *
 * Order of precedence:
 *  1. once EOF has been seen it is returned forever;
 *  2. a character saved by _csv_unget_char() is returned next;
 *  3. otherwise the next byte of `raw_buffer` is returned, refilling the
 *     buffer through the `get_data` callback when it is used up.
 */
static int _csv_get_char(CsvContext *csv) {
    int c;

    if(csv->last_char == EOF)
        return EOF;

    if(csv->last_char) {
        c = csv->last_char;
        csv->last_char = 0;
        return c;
    }

    if(csv->in_pos < CSV_READ_BUFFER_SIZE) {
        c = (unsigned char)csv->raw_buffer[csv->in_pos++];
        /* '\0' marks the end of the data the callback delivered */
        if(c != '\0')
            return c;
    }

    /* Buffer used up: ask for more. One byte is held back so that the
       callback has room for its terminating '\0'. */
    if(!csv->get_data(csv->raw_buffer, CSV_READ_BUFFER_SIZE - 1, csv->data)) {
        csv->last_char = EOF;
        return EOF;
    }
    csv->in_pos = 0;
    return (unsigned char)csv->raw_buffer[csv->in_pos++];
}

/*
 * Pushes `c` back so that the next _csv_get_char() returns it again.
 * There is a single slot, and the value 0 means "empty", so a 0 cannot
 * be pushed back. Pushing back EOF makes the end of input permanent.
 */
static void _csv_unget_char(CsvContext *csv, int c) {
    csv->last_char = c;
}

/*
 * Parses one record into csv->fields and returns the number of fields.
 *
 * Returns 0 at the end of the input. When an error is detected,
 * csv->err is set and the number of fields completed so far is returned.
 *
 * The parser is a small state machine:
 *   RECORD_START  detect end of input before starting a record
 *   FIELD_START   skip leading blanks, decide between quoted/unquoted
 *   FIELD         collect an unquoted field
 *   QUOTE         collect a quoted field ("" is an escaped quote)
 *   FIELD_END     terminate the field, continue with the next one
 *   RECORD_END    terminate the field, return the record
 */
int csv_read_record(CsvContext *csv) {
    /* _csv_get_char() yields values in 0..255, so the delimiter has to
       be compared in that same domain */
    const int delim = (unsigned char)CSV_DELIMITER;
    int c = 0;
    /* `bump` is the next free position in csv->buffer, `start` is where
       the current unquoted field began (limit for trailing trimming) */
    size_t start = 0, bump = 0;
    enum parse_state {
        RECORD_START, FIELD_START, FIELD, QUOTE, FIELD_END, RECORD_END
    } state = RECORD_START;

    (void)start; /* only read when CSV_TRIM is enabled */

    if(csv->last_char == EOF) return 0;

    csv->nf = 0;
    csv->err = CSV_OK;

    for(;;) {
        switch(state) {
        case RECORD_START:
            c = _csv_get_char(csv);
            if(c == EOF)
                return 0;
            state = FIELD_START;
            _csv_unget_char(csv, c);
            break;
        case FIELD_START:
            if(csv->nf == CSV_MAX_FIELDS) {
                csv->err = CSV_ERR_FIELDS;
                return csv->nf;
            }
            c = _csv_get_char(csv);
#if CSV_TRIM
            while(strchr(" \t\v\f", c))
                c = _csv_get_char(csv);
#endif
            csv->fields[csv->nf] = &csv->buffer[bump];
            if(c == '\"')
                state = QUOTE;
            else {
                _csv_unget_char(csv, c);
                start = bump;
                state = FIELD;
            }
            break;
        case FIELD:
            c = _csv_get_char(csv);
            if(c == '\r') {
                /* a CR is only valid as the first half of CRLF */
                c = _csv_get_char(csv);
                if(c != '\n') {
                    csv->err = CSV_ERR_LINE_END;
                    return csv->nf;
                }
            }
            if(c == EOF || c == '\n' || c == delim) {
#if CSV_TRIM
                while(bump > start && strchr(" \t\v\f", csv->buffer[bump-1]))
                    bump--;
#endif
                state = c == delim ? FIELD_END : RECORD_END;
            } else {
                if(bump == CSV_BUFFER_SIZE - 1) {
                    csv->err = CSV_ERR_BUFFER;
                    return csv->nf;
                }
                csv->buffer[bump++] = c;
            }
            break;
        case QUOTE:
            c = _csv_get_char(csv);
            if(c == EOF) {
                /* input ended inside the quotes */
                csv->err = CSV_ERR_BAD_QUOTE;
                return csv->nf;
            }
            if(c == '\"') {
                c = _csv_get_char(csv);
                if(c != '\"') {
                    /* that was the closing quote */
#if CSV_TRIM
                    while(strchr(" \t\v\f", c))
                        c = _csv_get_char(csv);
#endif
                    if(c == '\r') {
                        /* accept CRLF after a quoted field, exactly
                           like after an unquoted one */
                        c = _csv_get_char(csv);
                        if(c != '\n') {
                            csv->err = CSV_ERR_LINE_END;
                            return csv->nf;
                        }
                    }
                    if(c == EOF || c == '\n') {
                        state = RECORD_END;
                    } else if(c == delim) {
                        state = FIELD_END;
                    } else {
                        csv->err = CSV_ERR_BAD_QUOTE;
                        return csv->nf;
                    }
                    break;
                }
                /* "" inside quotes: fall through and store one quote */
            }
            if(bump == CSV_BUFFER_SIZE - 1) {
                csv->err = CSV_ERR_BUFFER;
                return csv->nf;
            }
            csv->buffer[bump++] = c;
            break;
        case FIELD_END:
        case RECORD_END:
            if(bump == CSV_BUFFER_SIZE - 1) {
                csv->err = CSV_ERR_BUFFER;
                return csv->nf;
            }
            csv->buffer[bump++] = '\0';
            csv->nf++;
            if(state == RECORD_END)
                return csv->nf;
            else
                state = FIELD_START;
            break;
        }
    }
}
/*
 * Prepares `csv` to read through the callback `fun`, which will be
 * handed `data` on every call. Any previous parse state is discarded.
 */
void csv_context_custom(CsvContext *csv, csv_read_data_fun fun, void *data) {
    /* no record parsed yet, no error pending */
    csv->err = CSV_OK;
    csv->nf = 0;

    /* nothing pushed back; placing the read position at the end of the
       raw buffer makes the first read fetch data from the callback */
    csv->last_char = 0;
    csv->in_pos = CSV_READ_BUFFER_SIZE;

    /* where the input comes from */
    csv->data = data;
    csv->get_data = fun;
}
/*
 * Read callback used by csv_context_file(): copies up to `num` bytes
 * from the FILE passed in `data` into `str` and terminates them with
 * '\0' (so `str` needs room for `num + 1` bytes).
 * Returns 1 when at least one byte was read, 0 otherwise.
 */
static int _csv_file_input_get_line(char *str, int num, void *data) {
    FILE *in = CAST(FILE*, data);
    size_t got;

    if(feof(in))
        return 0;

    got = fread(str, 1, num, in);
    str[got] = '\0';
    return got != 0;
}
/* Initialises `csv` to read from the open file `file` by installing
   _csv_file_input_get_line() as the read callback. `file` must not be
   NULL (checked with assert). */
void csv_context_file(CsvContext *csv, FILE *file) {
assert(file != NULL);
csv_context_custom(csv, _csv_file_input_get_line, file);
}
/*
 * Read callback used by csv_context_file_limit(): like the plain file
 * reader, but never reads more than `ll->limit` bytes in total.
 *
 * `data` points to a `struct csv_read_limit`. The bytes read are stored
 * in `str` and terminated with '\0'.
 * Returns 1 when at least one byte was read, 0 when the budget is used
 * up or nothing more could be read.
 */
static int _csv_file_input_get_line_limit(char *str, int num, void *data) {
    size_t got;
    struct csv_read_limit *ll = CAST(struct csv_read_limit *, data);

    /* `<= 0` rather than `== 0`: a negative budget must never reach
       fread(), where it would turn into an enormous size_t count */
    if(ll->limit <= 0)
        return 0;

    num--;
    if(num > ll->limit)
        num = ll->limit;
    if(num <= 0)
        return 0;

    got = fread(str, 1, (size_t)num, ll->f);
    str[got] = '\0';
    if(!got)
        return 0;

    /* Charge the bytes actually consumed from the file. strlen(str)
       under-counts when the data contains a '\0', which would allow
       reading past the limit. */
    ll->limit -= (int)got;
    return 1;
}
/* Initialises `csv` to read at most `ll->limit` bytes from `ll->f`.
   The parser keeps the pointer `ll` and updates `ll->limit` while
   reading, so the structure must stay valid for as long as `csv` is
   used. Both `ll->f != NULL` and `ll->limit > 0` are checked with
   assert. */
void csv_context_file_limit(CsvContext *csv, struct csv_read_limit *ll) {
assert(ll->f != NULL);
assert(ll->limit > 0);
csv_context_custom(csv, _csv_file_input_get_line_limit, ll);
}
/* Returns the number of fields stored by the most recent
   csv_read_record() call. */
int csv_count(CsvContext *csv) {
return csv->nf;
}
/*
 * Returns field number `i` (counting from 0) of the record read last.
 * An index outside 0 .. csv_count()-1 yields an empty string rather
 * than an invalid pointer.
 */
const char *csv_field(CsvContext *csv, int i) {
    return (i >= 0 && i < csv->nf) ? csv->fields[i] : "";
}
/* Returns the error code recorded in `csv`; it is CSV_OK after
   initialisation and is reset at the start of each record that
   csv_read_record() parses. */
enum csv_error_code csv_get_error(CsvContext *csv) {
return csv->err;
}
# endif /* CSV_IMPLEMENTATION */
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* CSV_STREAM_H */