diff --git a/.clusterfuzzlite/Dockerfile b/.clusterfuzzlite/Dockerfile new file mode 100644 index 0000000..11570c3 --- /dev/null +++ b/.clusterfuzzlite/Dockerfile @@ -0,0 +1,6 @@ +FROM gcr.io/oss-fuzz-base/base-builder +RUN apt-get update && apt-get install -y make autoconf automake libtool + +COPY . $SRC/fast-cpp-csv-parser +COPY .clusterfuzzlite/build.sh $SRC/build.sh +WORKDIR $SRC/fast-cpp-csv-parser \ No newline at end of file diff --git a/.clusterfuzzlite/README.md b/.clusterfuzzlite/README.md new file mode 100644 index 0000000..c1b96f8 --- /dev/null +++ b/.clusterfuzzlite/README.md @@ -0,0 +1,19 @@ +# ClusterFuzzLite set up + +This folder contains a fuzzing setup for [ClusterFuzzLite](https://google.github.io/clusterfuzzlite). + +## Running fuzzing locally + +To reproduce this set up the way ClusterFuzzLite does it (by way of [OSS-Fuzz](https://github.com/google/oss-fuzz)) you can do: + +```sh +git clone https://github.com/google/oss-fuzz +git clone https://github.com/ben-strasser/fast-cpp-csv-parser +cd fast-cpp-csv-parser + +# Build the fuzzers in .clusterfuzzlite +python3 ../oss-fuzz/infra/helper.py build_fuzzers --external $PWD + +# Run the fuzzer for 180 seconds +python3 ../oss-fuzz/infra/helper.py run_fuzzer --external $PWD parse_fuzzer -- -max_total_time=180 +``` diff --git a/.clusterfuzzlite/build.sh b/.clusterfuzzlite/build.sh new file mode 100644 index 0000000..df4c246 --- /dev/null +++ b/.clusterfuzzlite/build.sh @@ -0,0 +1,16 @@ +#!/bin/bash -eu +# Supply build instructions +# Use the following environment variables to build the code +# $CXX: c++ compiler +# $CC: c compiler +# CFLAGS: compiler flags for C files +# CXXFLAGS: compiler flags for CPP files +# LIB_FUZZING_ENGINE: linker flag for fuzzing harnesses + +# Copy all fuzzer executables to $OUT/ +# CXXFLAGS holds variables necessary to instrument fuzzing, e.g. sanitizer and +# fuzzing flags. 
+$CXX $CXXFLAGS $LIB_FUZZING_ENGINE \ + $SRC/fast-cpp-csv-parser/.clusterfuzzlite/parse_fuzzer.cpp \ + -o $OUT/parse_fuzzer \ + -I$SRC/fast-cpp-csv-parser diff --git a/.clusterfuzzlite/parse_fuzzer.cpp b/.clusterfuzzlite/parse_fuzzer.cpp new file mode 100644 index 0000000..2812f2a --- /dev/null +++ b/.clusterfuzzlite/parse_fuzzer.cpp @@ -0,0 +1,27 @@ + +#include "csv.h" +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + char filename[256]; + sprintf(filename, "/tmp/libfuzzer.%d.csv", getpid()); + FILE *fp = fopen(filename, "wb"); + if (!fp) + return 0; + fwrite(data, size, 1, fp); + fclose(fp); + + io::CSVReader<3> in(filename); + try { + in.read_header(io::ignore_extra_column, "vendor", "col2", "col3"); + std::string vendor; + int col2; + double col3; + while (in.read_row(vendor, col2, col3)) { + } + } catch (...) { + } + + unlink(filename); + return 0; +} \ No newline at end of file diff --git a/.clusterfuzzlite/project.yaml b/.clusterfuzzlite/project.yaml new file mode 100644 index 0000000..7f563eb --- /dev/null +++ b/.clusterfuzzlite/project.yaml @@ -0,0 +1 @@ +language: c++ \ No newline at end of file diff --git a/.github/workflows/cflite_pr.yml b/.github/workflows/cflite_pr.yml new file mode 100644 index 0000000..a6ddd01 --- /dev/null +++ b/.github/workflows/cflite_pr.yml @@ -0,0 +1,30 @@ +name: ClusterFuzzLite PR fuzzing +on: + workflow_dispatch: + pull_request: + branches: [ master ] +permissions: read-all +jobs: + PR: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + sanitizer: [address] + steps: + - name: Build Fuzzers (${{ matrix.sanitizer }}) + id: build + uses: google/clusterfuzzlite/actions/build_fuzzers@v1 + with: + sanitizer: ${{ matrix.sanitizer }} + language: c++ + bad-build-check: false + - name: Run Fuzzers (${{ matrix.sanitizer }}) + id: run + uses: google/clusterfuzzlite/actions/run_fuzzers@v1 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + fuzz-seconds: 100 + mode: 'code-change' + 
report-unreproducible-crashes: false + sanitizer: ${{ matrix.sanitizer }} diff --git a/README.md b/README.md index 0b1d2c8..f8f2021 100644 --- a/README.md +++ b/README.md @@ -45,11 +45,9 @@ Remember that the library makes use of C++11 features and therefore you have to The library was developed and tested with GCC 4.6.1 -Note that VS2013 is not C++11 compilant and will therefore not work out of the box. See [here](https://code.google.com/p/fast-cpp-csv-parser/issues/detail?id=6) for what needs to be adjusted to make the code work. - ## Documentation -The libary provides two classes: +The library provides two classes: * `LineReader`: A class to efficiently read large files line by line. * `CSVReader`: A class that efficiently reads large CSV files. @@ -71,6 +69,7 @@ public: char*next_line(); // File Location + // (These only affect the content of the error message) void set_file_line(unsigned); unsigned get_file_line()const; void set_file_name(some_string_type file_name); @@ -90,7 +89,7 @@ public: }; ``` -The read function should fill the provided buffer with at most `size` bytes from the data source. It should return the number of bytes actually written to the buffer. If data source has run out of bytes (because for example an end of file was reached) then the function should return 0. If a fatal error occures then you can throw an exception. Note that the function can be called both from the main and the worker thread. However, it is guarenteed that they do not call the function at the same time. +The read function should fill the provided buffer with at most `size` bytes from the data source. It should return the number of bytes actually written to the buffer. If data source has run out of bytes (because for example an end of file was reached) then the function should return 0. If a fatal error occurs then you can throw an exception. Note that the function can be called both from the main and the worker thread. 
However, it is guaranteed that they do not call the function at the same time. Lines are read by calling the `next_line` function. It returns a pointer to a null terminated C-string that contains the line. If the end of file is reached a null pointer is returned. The newline character is not included in the string. You may modify the string as long as you do not write past the null terminator. The string stays valid until the destructor is called or until next_line is called again. Windows and `*`nix newlines are handled transparently. UTF-8 BOMs are automatically ignored and missing newlines at the end of the file are no problem. @@ -191,7 +190,7 @@ Examples: The constructors and the file location functions are exactly the same as for `LineReader`. See its documentation for details. -There are three methods that deal with headers. The `read_header` methods reads a line from the file and rearranges the columns to match that order. It also checks whether all necessary columns are present. The `set_header` method does *not* read any input. Use it if the file does not have any header. Obviously it is impossible to rearrange columns or check for their availability when using it. The order in the file and in the program must match when using `set_header`. The `has_column` method checks whether a column is present in the file. The first argument of `read_header` is a bitfield that determines how the function should react to column mismatches. The default behavior is to throw an `error::extra_column_in_header` exception if the file contains more columns than expected and an `error::missing_column_in_header` when there are not enough. This behavior can be altered using the following flags. +There are three methods that deal with headers. The `read_header` methods reads a line from the file and rearranges the columns to match that order. It also checks whether all necessary columns are present. The `set_header` method does *not* read any input. 
Use it if the file does not have any header. Obviously it is impossible to rearrange columns or check for their availability when using it. The order in the file and in the program must match when using `set_header`. The `has_column` method checks whether a column is present in the file. The first argument of `read_header` is a bit field that determines how the function should react to column mismatches. The default behavior is to throw an `error::extra_column_in_header` exception if the file contains more columns than expected and an `error::missing_column_in_header` when there are not enough. This behavior can be altered using the following flags. * `ignore_no_column`: The default behavior, no flags are set * `ignore_extra_column`: If a column with a name is in the file but not in the argument list, then it is silently ignored. @@ -257,7 +256,7 @@ A: Read a `char*` and parse the string. At first this seems expensive but it is Q: I get lots of compiler errors when compiling the header! Please fix it. :( -A: Have you enabled the C++11 mode of your compiler? If you use GCC you have to add -std=c++0x to the commandline. If this does not resolve the problem, then please open a ticket. +A: Have you enabled the C++11 mode of your compiler? If you use GCC you have to add -std=c++0x to the command line. If this does not resolve the problem, then please open a ticket. Q: The library crashes when parsing large files! Please fix it. :( @@ -273,3 +272,8 @@ A: The library has basic UTF-8 support, or to be more precise it does not break Q: Does the library support string fields that span multiple lines? A: No. This feature has been often requested in the past, however, it is difficult to make it work with the current design without breaking something else. + + +Q: Can this library handle a variable number of columns? + +A: You can read a compile-time known constant number of columns from a file with a variable number of columns. 
Which columns will be read depends on the strings in the header line. There is no way to read a variable number of columns. You can think of the provided functionality as a SQL `select col1,col2,col3 from my_file.csv` statement and the CSV file as table. You can change the number of columns in the table without affecting the result of the select as long as the queried columns remain. diff --git a/csv.h b/csv.h index d1cf7f7..9a1919a 100644 --- a/csv.h +++ b/csv.h @@ -32,1241 +32,1168 @@ #ifndef CSV_H #define CSV_H -#include -#include -#include #include -#include #include +#include #include +#include +#include +#include #ifndef CSV_IO_NO_THREAD +#include #include #include -#include #endif -#include #include #include #include +#include +#include + +namespace io { +//////////////////////////////////////////////////////////////////////////// +// LineReader // +//////////////////////////////////////////////////////////////////////////// + +namespace error { +struct base : std::exception { + virtual void format_error_message() const = 0; + + const char *what() const noexcept override { + format_error_message(); + return error_message_buffer; + } + + mutable char error_message_buffer[2048]; +}; + +// this only affects the file name in the error message +const int max_file_name_length = 1024; + +struct with_file_name { + with_file_name() { std::memset(file_name, 0, sizeof(file_name)); } + + void set_file_name(const char *file_name) { + if (file_name != nullptr) { + // This call to strncpy has parenthesis around it + // to silence the GCC -Wstringop-truncation warning + (strncpy(this->file_name, file_name, sizeof(this->file_name))); + this->file_name[sizeof(this->file_name) - 1] = '\0'; + } else { + this->file_name[0] = '\0'; + } + } + + char file_name[max_file_name_length + 1]; +}; + +struct with_file_line { + with_file_line() { file_line = -1; } + + void set_file_line(int file_line) { this->file_line = file_line; } + + int file_line; +}; + +struct with_errno { + 
with_errno() { errno_value = 0; } + + void set_errno(int errno_value) { this->errno_value = errno_value; } + + int errno_value; +}; + +struct can_not_open_file : base, with_file_name, with_errno { + void format_error_message() const override { + if (errno_value != 0) + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Can not open file \"%s\" because \"%s\".", file_name, + std::strerror(errno_value)); + else + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Can not open file \"%s\".", file_name); + } +}; + +struct line_length_limit_exceeded : base, with_file_name, with_file_line { + void format_error_message() const override { + std::snprintf( + error_message_buffer, sizeof(error_message_buffer), + "Line number %d in file \"%s\" exceeds the maximum length of 2^24-1.", + file_line, file_name); + } +}; +} // namespace error + +class ByteSourceBase { +public: + virtual int read(char *buffer, int size) = 0; + virtual ~ByteSourceBase() {} +}; + +namespace detail { + +class OwningStdIOByteSourceBase : public ByteSourceBase { +public: + explicit OwningStdIOByteSourceBase(FILE *file) : file(file) { + // Tell the std library that we want to do the buffering ourself. 
+ std::setvbuf(file, 0, _IONBF, 0); + } + + int read(char *buffer, int size) { return std::fread(buffer, 1, size, file); } + + ~OwningStdIOByteSourceBase() { std::fclose(file); } + +private: + FILE *file; +}; + +class NonOwningIStreamByteSource : public ByteSourceBase { +public: + explicit NonOwningIStreamByteSource(std::istream &in) : in(in) {} + + int read(char *buffer, int size) { + in.read(buffer, size); + return in.gcount(); + } + + ~NonOwningIStreamByteSource() {} + +private: + std::istream ∈ +}; + +class NonOwningStringByteSource : public ByteSourceBase { +public: + NonOwningStringByteSource(const char *str, long long size) + : str(str), remaining_byte_count(size) {} + + int read(char *buffer, int desired_byte_count) { + int to_copy_byte_count = desired_byte_count; + if (remaining_byte_count < to_copy_byte_count) + to_copy_byte_count = remaining_byte_count; + std::memcpy(buffer, str, to_copy_byte_count); + remaining_byte_count -= to_copy_byte_count; + str += to_copy_byte_count; + return to_copy_byte_count; + } + + ~NonOwningStringByteSource() {} + +private: + const char *str; + long long remaining_byte_count; +}; -namespace io{ - //////////////////////////////////////////////////////////////////////////// - // LineReader // - //////////////////////////////////////////////////////////////////////////// - - namespace error{ - struct base : std::exception{ - virtual void format_error_message()const = 0; - - const char*what()const noexcept override{ - format_error_message(); - return error_message_buffer; - } - - mutable char error_message_buffer[512]; - }; - - const int max_file_name_length = 255; - - struct with_file_name{ - with_file_name(){ - std::memset(file_name, 0, sizeof(file_name)); - } - - void set_file_name(const char*file_name){ - if(file_name != nullptr){ - // This call to strncpy has parenthesis around it - // to silence the GCC -Wstringop-truncation warning - (strncpy(this->file_name, file_name, sizeof(this->file_name))); - 
this->file_name[sizeof(this->file_name)-1] = '\0'; - }else{ - this->file_name[0] = '\0'; - } - } - - char file_name[max_file_name_length+1]; - }; - - struct with_file_line{ - with_file_line(){ - file_line = -1; - } - - void set_file_line(int file_line){ - this->file_line = file_line; - } - - int file_line; - }; - - struct with_errno{ - with_errno(){ - errno_value = 0; - } - - void set_errno(int errno_value){ - this->errno_value = errno_value; - } - - int errno_value; - }; - - struct can_not_open_file : - base, - with_file_name, - with_errno{ - void format_error_message()const override{ - if(errno_value != 0) - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - "Can not open file \"%s\" because \"%s\"." - , file_name, std::strerror(errno_value)); - else - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - "Can not open file \"%s\"." - , file_name); - } - }; - - struct line_length_limit_exceeded : - base, - with_file_name, - with_file_line{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - "Line number %d in file \"%s\" exceeds the maximum length of 2^24-1." - , file_line, file_name); - } - }; +#ifndef CSV_IO_NO_THREAD +class AsynchronousReader { +public: + void init(std::unique_ptr arg_byte_source) { + std::unique_lock guard(lock); + byte_source = std::move(arg_byte_source); + desired_byte_count = -1; + termination_requested = false; + worker = std::thread([&] { + std::unique_lock guard(lock); + try { + for (;;) { + read_requested_condition.wait(guard, [&] { + return desired_byte_count != -1 || termination_requested; + }); + if (termination_requested) + return; + + read_byte_count = byte_source->read(buffer, desired_byte_count); + desired_byte_count = -1; + if (read_byte_count == 0) + break; + read_finished_condition.notify_one(); + } + } catch (...) 
{ + read_error = std::current_exception(); + } + read_finished_condition.notify_one(); + }); + } + + bool is_valid() const { return byte_source != nullptr; } + + void start_read(char *arg_buffer, int arg_desired_byte_count) { + std::unique_lock guard(lock); + buffer = arg_buffer; + desired_byte_count = arg_desired_byte_count; + read_byte_count = -1; + read_requested_condition.notify_one(); + } + + int finish_read() { + std::unique_lock guard(lock); + read_finished_condition.wait( + guard, [&] { return read_byte_count != -1 || read_error; }); + if (read_error) + std::rethrow_exception(read_error); + else + return read_byte_count; + } + + ~AsynchronousReader() { + if (byte_source != nullptr) { + { + std::unique_lock guard(lock); + termination_requested = true; + } + read_requested_condition.notify_one(); + worker.join(); + } + } + +private: + std::unique_ptr byte_source; + + std::thread worker; + + bool termination_requested; + std::exception_ptr read_error; + char *buffer; + int desired_byte_count; + int read_byte_count; + + std::mutex lock; + std::condition_variable read_finished_condition; + std::condition_variable read_requested_condition; +}; +#endif + +class SynchronousReader { +public: + void init(std::unique_ptr arg_byte_source) { + byte_source = std::move(arg_byte_source); + } + + bool is_valid() const { return byte_source != nullptr; } + + void start_read(char *arg_buffer, int arg_desired_byte_count) { + buffer = arg_buffer; + desired_byte_count = arg_desired_byte_count; + } + + int finish_read() { return byte_source->read(buffer, desired_byte_count); } + +private: + std::unique_ptr byte_source; + char *buffer; + int desired_byte_count; +}; +} // namespace detail + +class LineReader { +private: + static const int block_len = 1 << 20; + std::unique_ptr buffer; // must be constructed before (and thus + // destructed after) the reader! 
+#ifdef CSV_IO_NO_THREAD + detail::SynchronousReader reader; +#else + detail::AsynchronousReader reader; +#endif + int data_begin; + int data_end; + + char file_name[error::max_file_name_length + 1]; + unsigned file_line; + + static std::unique_ptr open_file(const char *file_name) { + // We open the file in binary mode as it makes no difference under *nix + // and under Windows we handle \r\n newlines ourself. + FILE *file = std::fopen(file_name, "rb"); + if (file == 0) { + int x = errno; // store errno as soon as possible, doing it after + // constructor call can fail. + error::can_not_open_file err; + err.set_errno(x); + err.set_file_name(file_name); + throw err; + } + return std::unique_ptr( + new detail::OwningStdIOByteSourceBase(file)); + } + + void init(std::unique_ptr byte_source) { + file_line = 0; + + buffer = std::unique_ptr(new char[3 * block_len]); + data_begin = 0; + data_end = byte_source->read(buffer.get(), 2 * block_len); + + // Ignore UTF-8 BOM + if (data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && + buffer[2] == '\xBF') + data_begin = 3; + + if (data_end == 2 * block_len) { + reader.init(std::move(byte_source)); + reader.start_read(buffer.get() + 2 * block_len, block_len); + } + } + +public: + LineReader() = delete; + LineReader(const LineReader &) = delete; + LineReader &operator=(const LineReader &) = delete; + + explicit LineReader(const char *file_name) { + set_file_name(file_name); + init(open_file(file_name)); + } + + explicit LineReader(const std::string &file_name) { + set_file_name(file_name.c_str()); + init(open_file(file_name.c_str())); + } + + LineReader(const char *file_name, + std::unique_ptr byte_source) { + set_file_name(file_name); + init(std::move(byte_source)); + } + + LineReader(const std::string &file_name, + std::unique_ptr byte_source) { + set_file_name(file_name.c_str()); + init(std::move(byte_source)); + } + + LineReader(const char *file_name, const char *data_begin, + const char *data_end) { + 
set_file_name(file_name); + init(std::unique_ptr(new detail::NonOwningStringByteSource( + data_begin, data_end - data_begin))); + } + + LineReader(const std::string &file_name, const char *data_begin, + const char *data_end) { + set_file_name(file_name.c_str()); + init(std::unique_ptr(new detail::NonOwningStringByteSource( + data_begin, data_end - data_begin))); + } + + LineReader(const char *file_name, FILE *file) { + set_file_name(file_name); + init(std::unique_ptr( + new detail::OwningStdIOByteSourceBase(file))); + } + + LineReader(const std::string &file_name, FILE *file) { + set_file_name(file_name.c_str()); + init(std::unique_ptr( + new detail::OwningStdIOByteSourceBase(file))); + } + + LineReader(const char *file_name, std::istream &in) { + set_file_name(file_name); + init(std::unique_ptr( + new detail::NonOwningIStreamByteSource(in))); + } + + LineReader(const std::string &file_name, std::istream &in) { + set_file_name(file_name.c_str()); + init(std::unique_ptr( + new detail::NonOwningIStreamByteSource(in))); + } + + void set_file_name(const std::string &file_name) { + set_file_name(file_name.c_str()); + } + + void set_file_name(const char *file_name) { + if (file_name != nullptr) { + strncpy(this->file_name, file_name, sizeof(this->file_name) - 1); + this->file_name[sizeof(this->file_name) - 1] = '\0'; + } else { + this->file_name[0] = '\0'; + } + } + + const char *get_truncated_file_name() const { return file_name; } + + void set_file_line(unsigned file_line) { this->file_line = file_line; } + + unsigned get_file_line() const { return file_line; } + + char *next_line() { + if (data_begin == data_end) + return nullptr; + + ++file_line; + + assert(data_begin < data_end); + assert(data_end <= block_len * 2); + + if (data_begin >= block_len) { + std::memcpy(buffer.get(), buffer.get() + block_len, block_len); + data_begin -= block_len; + data_end -= block_len; + if (reader.is_valid()) { + data_end += reader.finish_read(); + std::memcpy(buffer.get() + 
block_len, buffer.get() + 2 * block_len, + block_len); + reader.start_read(buffer.get() + 2 * block_len, block_len); + } + } + + int line_end = data_begin; + while (line_end != data_end && buffer[line_end] != '\n') { + ++line_end; + } + + if (line_end - data_begin + 1 > block_len) { + error::line_length_limit_exceeded err; + err.set_file_name(file_name); + err.set_file_line(file_line); + throw err; + } + + if (line_end != data_end && buffer[line_end] == '\n') { + buffer[line_end] = '\0'; + } else { + // some files are missing the newline at the end of the + // last line + ++data_end; + buffer[line_end] = '\0'; + } + + // handle windows \r\n-line breaks + if (line_end != data_begin && buffer[line_end - 1] == '\r') + buffer[line_end - 1] = '\0'; + + char *ret = buffer.get() + data_begin; + data_begin = line_end + 1; + return ret; + } +}; + +//////////////////////////////////////////////////////////////////////////// +// CSV // +//////////////////////////////////////////////////////////////////////////// + +namespace error { +const int max_column_name_length = 63; +struct with_column_name { + with_column_name() { + std::memset(column_name, 0, max_column_name_length + 1); + } + + void set_column_name(const char *column_name) { + if (column_name != nullptr) { + std::strncpy(this->column_name, column_name, max_column_name_length); + this->column_name[max_column_name_length] = '\0'; + } else { + this->column_name[0] = '\0'; + } + } + + char column_name[max_column_name_length + 1]; +}; + +const int max_column_content_length = 63; + +struct with_column_content { + with_column_content() { + std::memset(column_content, 0, max_column_content_length + 1); + } + + void set_column_content(const char *column_content) { + if (column_content != nullptr) { + std::strncpy(this->column_content, column_content, + max_column_content_length); + this->column_content[max_column_content_length] = '\0'; + } else { + this->column_content[0] = '\0'; + } + } + + char 
column_content[max_column_content_length + 1]; +}; + +struct extra_column_in_header : base, with_file_name, with_column_name { + void format_error_message() const override { + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + R"(Extra column "%s" in header of file "%s".)", column_name, + file_name); + } +}; + +struct missing_column_in_header : base, with_file_name, with_column_name { + void format_error_message() const override { + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + R"(Missing column "%s" in header of file "%s".)", column_name, + file_name); + } +}; + +struct duplicated_column_in_header : base, with_file_name, with_column_name { + void format_error_message() const override { + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + R"(Duplicated column "%s" in header of file "%s".)", + column_name, file_name); + } +}; + +struct header_missing : base, with_file_name { + void format_error_message() const override { + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Header missing in file \"%s\".", file_name); + } +}; + +struct too_few_columns : base, with_file_name, with_file_line { + void format_error_message() const override { + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Too few columns in line %d in file \"%s\".", file_line, + file_name); + } +}; + +struct too_many_columns : base, with_file_name, with_file_line { + void format_error_message() const override { + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Too many columns in line %d in file \"%s\".", file_line, + file_name); + } +}; + +struct escaped_string_not_closed : base, with_file_name, with_file_line { + void format_error_message() const override { + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Escaped string was not closed in line %d in file \"%s\".", + file_line, file_name); + } +}; + +struct integer_must_be_positive : base, + with_file_name, + 
with_file_line, + with_column_name, + with_column_content { + void format_error_message() const override { + std::snprintf( + error_message_buffer, sizeof(error_message_buffer), + R"(The integer "%s" must be positive or 0 in column "%s" in file "%s" in line "%d".)", + column_content, column_name, file_name, file_line); + } +}; + +struct no_digit : base, + with_file_name, + with_file_line, + with_column_name, + with_column_content { + void format_error_message() const override { + std::snprintf( + error_message_buffer, sizeof(error_message_buffer), + R"(The integer "%s" contains an invalid digit in column "%s" in file "%s" in line "%d".)", + column_content, column_name, file_name, file_line); + } +}; + +struct integer_overflow : base, + with_file_name, + with_file_line, + with_column_name, + with_column_content { + void format_error_message() const override { + std::snprintf( + error_message_buffer, sizeof(error_message_buffer), + R"(The integer "%s" overflows in column "%s" in file "%s" in line "%d".)", + column_content, column_name, file_name, file_line); + } +}; + +struct integer_underflow : base, + with_file_name, + with_file_line, + with_column_name, + with_column_content { + void format_error_message() const override { + std::snprintf( + error_message_buffer, sizeof(error_message_buffer), + R"(The integer "%s" underflows in column "%s" in file "%s" in line "%d".)", + column_content, column_name, file_name, file_line); + } +}; + +struct invalid_single_character : base, + with_file_name, + with_file_line, + with_column_name, + with_column_content { + void format_error_message() const override { + std::snprintf( + error_message_buffer, sizeof(error_message_buffer), + R"(The content "%s" of column "%s" in file "%s" in line "%d" is not a single character.)", + column_content, column_name, file_name, file_line); + } +}; +} // namespace error + +using ignore_column = unsigned int; +static const ignore_column ignore_no_column = 0; +static const ignore_column 
ignore_extra_column = 1; +static const ignore_column ignore_missing_column = 2; + +template struct trim_chars { +private: + constexpr static bool is_trim_char(char) { return false; } + + template + constexpr static bool is_trim_char(char c, char trim_char, + OtherTrimChars... other_trim_chars) { + return c == trim_char || is_trim_char(c, other_trim_chars...); + } + +public: + static void trim(char *&str_begin, char *&str_end) { + while (str_begin != str_end && is_trim_char(*str_begin, trim_char_list...)) + ++str_begin; + while (str_begin != str_end && + is_trim_char(*(str_end - 1), trim_char_list...)) + --str_end; + *str_end = '\0'; + } +}; + +struct no_comment { + static bool is_comment(const char *) { return false; } +}; + +template struct single_line_comment { +private: + constexpr static bool is_comment_start_char(char) { return false; } + + template + constexpr static bool + is_comment_start_char(char c, char comment_start_char, + OtherCommentStartChars... other_comment_start_chars) { + return c == comment_start_char || + is_comment_start_char(c, other_comment_start_chars...); + } + +public: + static bool is_comment(const char *line) { + return is_comment_start_char(*line, comment_start_char_list...); + } +}; + +struct empty_line_comment { + static bool is_comment(const char *line) { + if (*line == '\0') + return true; + while (*line == ' ' || *line == '\t') { + ++line; + if (*line == 0) + return true; + } + return false; + } +}; + +template +struct single_and_empty_line_comment { + static bool is_comment(const char *line) { + return single_line_comment::is_comment(line) || + empty_line_comment::is_comment(line); + } +}; + +template struct no_quote_escape { + static const char *find_next_column_end(const char *col_begin) { + while (*col_begin != sep && *col_begin != '\0') + ++col_begin; + return col_begin; + } + + static void unescape(char *&, char *&) {} +}; + +template struct double_quote_escape { + static const char *find_next_column_end(const char 
*col_begin) { + while (*col_begin != sep && *col_begin != '\0') + if (*col_begin != quote) + ++col_begin; + else { + do { + ++col_begin; + while (*col_begin != quote) { + if (*col_begin == '\0') + throw error::escaped_string_not_closed(); + ++col_begin; + } + ++col_begin; + } while (*col_begin == quote); + } + return col_begin; + } + + static void unescape(char *&col_begin, char *&col_end) { + if (col_end - col_begin >= 2) { + if (*col_begin == quote && *(col_end - 1) == quote) { + ++col_begin; + --col_end; + char *out = col_begin; + for (char *in = col_begin; in != col_end; ++in) { + if (*in == quote && (in + 1) != col_end && *(in + 1) == quote) { + ++in; + } + *out = *in; + ++out; } + col_end = out; + *col_end = '\0'; + } + } + } +}; + +struct throw_on_overflow { + template static void on_overflow(T &) { + throw error::integer_overflow(); + } + + template static void on_underflow(T &) { + throw error::integer_underflow(); + } +}; + +struct ignore_overflow { + template static void on_overflow(T &) {} + + template static void on_underflow(T &) {} +}; + +struct set_to_max_on_overflow { + template static void on_overflow(T &x) { + // using (std::numeric_limits::max) instead of + // std::numeric_limits::max to make code including windows.h with its max + // macro happy + x = (std::numeric_limits::max)(); + } + + template static void on_underflow(T &x) { + x = (std::numeric_limits::min)(); + } +}; + +namespace detail { +template +void chop_next_column(char *&line, char *&col_begin, char *&col_end) { + assert(line != nullptr); + + col_begin = line; + // the col_begin + (... 
- col_begin) removes the constness + col_end = + col_begin + (quote_policy::find_next_column_end(col_begin) - col_begin); + + if (*col_end == '\0') { + line = nullptr; + } else { + *col_end = '\0'; + line = col_end + 1; + } +} - class ByteSourceBase{ - public: - virtual int read(char*buffer, int size)=0; - virtual ~ByteSourceBase(){} - }; - - namespace detail{ - - class OwningStdIOByteSourceBase : public ByteSourceBase{ - public: - explicit OwningStdIOByteSourceBase(FILE*file):file(file){ - // Tell the std library that we want to do the buffering ourself. - std::setvbuf(file, 0, _IONBF, 0); - } - - int read(char*buffer, int size){ - return std::fread(buffer, 1, size, file); - } - - ~OwningStdIOByteSourceBase(){ - std::fclose(file); - } - - private: - FILE*file; - }; - - class NonOwningIStreamByteSource : public ByteSourceBase{ - public: - explicit NonOwningIStreamByteSource(std::istream&in):in(in){} - - int read(char*buffer, int size){ - in.read(buffer, size); - return in.gcount(); - } - - ~NonOwningIStreamByteSource(){} - - private: - std::istream∈ - }; - - class NonOwningStringByteSource : public ByteSourceBase{ - public: - NonOwningStringByteSource(const char*str, long long size):str(str), remaining_byte_count(size){} - - int read(char*buffer, int desired_byte_count){ - int to_copy_byte_count = desired_byte_count; - if(remaining_byte_count < to_copy_byte_count) - to_copy_byte_count = remaining_byte_count; - std::memcpy(buffer, str, to_copy_byte_count); - remaining_byte_count -= to_copy_byte_count; - str += to_copy_byte_count; - return to_copy_byte_count; - } - - ~NonOwningStringByteSource(){} - - private: - const char*str; - long long remaining_byte_count; - }; - - #ifndef CSV_IO_NO_THREAD - class AsynchronousReader{ - public: - void init(std::unique_ptrarg_byte_source){ - std::unique_lockguard(lock); - byte_source = std::move(arg_byte_source); - desired_byte_count = -1; - termination_requested = false; - worker = std::thread( - [&]{ - 
std::unique_lockguard(lock); - try{ - for(;;){ - read_requested_condition.wait( - guard, - [&]{ - return desired_byte_count != -1 || termination_requested; - } - ); - if(termination_requested) - return; - - read_byte_count = byte_source->read(buffer, desired_byte_count); - desired_byte_count = -1; - if(read_byte_count == 0) - break; - read_finished_condition.notify_one(); - } - }catch(...){ - read_error = std::current_exception(); - } - read_finished_condition.notify_one(); - } - ); - } - - bool is_valid()const{ - return byte_source != nullptr; - } - - void start_read(char*arg_buffer, int arg_desired_byte_count){ - std::unique_lockguard(lock); - buffer = arg_buffer; - desired_byte_count = arg_desired_byte_count; - read_byte_count = -1; - read_requested_condition.notify_one(); - } - - int finish_read(){ - std::unique_lockguard(lock); - read_finished_condition.wait( - guard, - [&]{ - return read_byte_count != -1 || read_error; - } - ); - if(read_error) - std::rethrow_exception(read_error); - else - return read_byte_count; - } - - ~AsynchronousReader(){ - if(byte_source != nullptr){ - { - std::unique_lockguard(lock); - termination_requested = true; - } - read_requested_condition.notify_one(); - worker.join(); - } - } - - private: - std::unique_ptrbyte_source; - - std::thread worker; - - bool termination_requested; - std::exception_ptr read_error; - char*buffer; - int desired_byte_count; - int read_byte_count; - - std::mutex lock; - std::condition_variable read_finished_condition; - std::condition_variable read_requested_condition; - }; - #endif - - class SynchronousReader{ - public: - void init(std::unique_ptrarg_byte_source){ - byte_source = std::move(arg_byte_source); - } - - bool is_valid()const{ - return byte_source != nullptr; - } - - void start_read(char*arg_buffer, int arg_desired_byte_count){ - buffer = arg_buffer; - desired_byte_count = arg_desired_byte_count; - } - - int finish_read(){ - return byte_source->read(buffer, desired_byte_count); - } - private: - 
std::unique_ptrbyte_source; - char*buffer; - int desired_byte_count; - }; +template +void parse_line(char *line, char **sorted_col, + const std::vector &col_order) { + for (int i : col_order) { + if (line == nullptr) + throw ::io::error::too_few_columns(); + char *col_begin, *col_end; + chop_next_column(line, col_begin, col_end); + + if (i != -1) { + trim_policy::trim(col_begin, col_end); + quote_policy::unescape(col_begin, col_end); + + sorted_col[i] = col_begin; + } + } + if (line != nullptr) + throw ::io::error::too_many_columns(); +} + +template +void parse_header_line(char *line, std::vector &col_order, + const std::string *col_name, + ignore_column ignore_policy) { + col_order.clear(); + + bool found[column_count]; + std::fill(found, found + column_count, false); + while (line) { + char *col_begin, *col_end; + chop_next_column(line, col_begin, col_end); + + trim_policy::trim(col_begin, col_end); + quote_policy::unescape(col_begin, col_end); + + for (unsigned i = 0; i < column_count; ++i) + if (col_begin == col_name[i]) { + if (found[i]) { + error::duplicated_column_in_header err; + err.set_column_name(col_begin); + throw err; } + found[i] = true; + col_order.push_back(i); + col_begin = 0; + break; + } + if (col_begin) { + if (ignore_policy & ::io::ignore_extra_column) + col_order.push_back(-1); + else { + error::extra_column_in_header err; + err.set_column_name(col_begin); + throw err; + } + } + } + if (!(ignore_policy & ::io::ignore_missing_column)) { + for (unsigned i = 0; i < column_count; ++i) { + if (!found[i]) { + error::missing_column_in_header err; + err.set_column_name(col_name[i].c_str()); + throw err; + } + } + } +} + +template void parse(char *col, char &x) { + if (!*col) + throw error::invalid_single_character(); + x = *col; + ++col; + if (*col) + throw error::invalid_single_character(); +} + +template void parse(char *col, std::string &x) { + x = col; +} + +template void parse(char *col, const char *&x) { + x = col; +} - class LineReader{ - 
private: - static const int block_len = 1<<20; - std::unique_ptrbuffer; // must be constructed before (and thus destructed after) the reader! - #ifdef CSV_IO_NO_THREAD - detail::SynchronousReader reader; - #else - detail::AsynchronousReader reader; - #endif - int data_begin; - int data_end; - - char file_name[error::max_file_name_length+1]; - unsigned file_line; - - static std::unique_ptr open_file(const char*file_name){ - // We open the file in binary mode as it makes no difference under *nix - // and under Windows we handle \r\n newlines ourself. - FILE*file = std::fopen(file_name, "rb"); - if(file == 0){ - int x = errno; // store errno as soon as possible, doing it after constructor call can fail. - error::can_not_open_file err; - err.set_errno(x); - err.set_file_name(file_name); - throw err; - } - return std::unique_ptr(new detail::OwningStdIOByteSourceBase(file)); - } - - void init(std::unique_ptrbyte_source){ - file_line = 0; - - buffer = std::unique_ptr(new char[3*block_len]); - data_begin = 0; - data_end = byte_source->read(buffer.get(), 2*block_len); - - // Ignore UTF-8 BOM - if(data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') - data_begin = 3; - - if(data_end == 2*block_len){ - reader.init(std::move(byte_source)); - reader.start_read(buffer.get() + 2*block_len, block_len); - } - } - - public: - LineReader() = delete; - LineReader(const LineReader&) = delete; - LineReader&operator=(const LineReader&) = delete; - - explicit LineReader(const char*file_name){ - set_file_name(file_name); - init(open_file(file_name)); - } - - explicit LineReader(const std::string&file_name){ - set_file_name(file_name.c_str()); - init(open_file(file_name.c_str())); - } - - LineReader(const char*file_name, std::unique_ptrbyte_source){ - set_file_name(file_name); - init(std::move(byte_source)); - } - - LineReader(const std::string&file_name, std::unique_ptrbyte_source){ - set_file_name(file_name.c_str()); - init(std::move(byte_source)); - } - - 
LineReader(const char*file_name, const char*data_begin, const char*data_end){ - set_file_name(file_name); - init(std::unique_ptr(new detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); - } - - LineReader(const std::string&file_name, const char*data_begin, const char*data_end){ - set_file_name(file_name.c_str()); - init(std::unique_ptr(new detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); - } - - LineReader(const char*file_name, FILE*file){ - set_file_name(file_name); - init(std::unique_ptr(new detail::OwningStdIOByteSourceBase(file))); - } - - LineReader(const std::string&file_name, FILE*file){ - set_file_name(file_name.c_str()); - init(std::unique_ptr(new detail::OwningStdIOByteSourceBase(file))); - } - - LineReader(const char*file_name, std::istream&in){ - set_file_name(file_name); - init(std::unique_ptr(new detail::NonOwningIStreamByteSource(in))); - } - - LineReader(const std::string&file_name, std::istream&in){ - set_file_name(file_name.c_str()); - init(std::unique_ptr(new detail::NonOwningIStreamByteSource(in))); - } - - void set_file_name(const std::string&file_name){ - set_file_name(file_name.c_str()); - } - - void set_file_name(const char*file_name){ - if(file_name != nullptr){ - strncpy(this->file_name, file_name, sizeof(this->file_name)); - this->file_name[sizeof(this->file_name)-1] = '\0'; - }else{ - this->file_name[0] = '\0'; - } - } - - const char*get_truncated_file_name()const{ - return file_name; - } - - void set_file_line(unsigned file_line){ - this->file_line = file_line; - } - - unsigned get_file_line()const{ - return file_line; - } - - char*next_line(){ - if(data_begin == data_end) - return nullptr; - - ++file_line; - - assert(data_begin < data_end); - assert(data_end <= block_len*2); - - if(data_begin >= block_len){ - std::memcpy(buffer.get(), buffer.get()+block_len, block_len); - data_begin -= block_len; - data_end -= block_len; - if(reader.is_valid()) - { - data_end += reader.finish_read(); - 
std::memcpy(buffer.get()+block_len, buffer.get()+2*block_len, block_len); - reader.start_read(buffer.get() + 2*block_len, block_len); - } - } - - int line_end = data_begin; - while(line_end != data_end && buffer[line_end] != '\n'){ - ++line_end; - } - - if(line_end - data_begin + 1 > block_len){ - error::line_length_limit_exceeded err; - err.set_file_name(file_name); - err.set_file_line(file_line); - throw err; - } - - if(line_end != data_end && buffer[line_end] == '\n'){ - buffer[line_end] = '\0'; - }else{ - // some files are missing the newline at the end of the - // last line - ++data_end; - buffer[line_end] = '\0'; - } - - // handle windows \r\n-line breaks - if(line_end != data_begin && buffer[line_end-1] == '\r') - buffer[line_end-1] = '\0'; - - char*ret = buffer.get() + data_begin; - data_begin = line_end+1; - return ret; - } - }; - - - //////////////////////////////////////////////////////////////////////////// - // CSV // - //////////////////////////////////////////////////////////////////////////// - - namespace error{ - const int max_column_name_length = 63; - struct with_column_name{ - with_column_name(){ - std::memset(column_name, 0, max_column_name_length+1); - } - - void set_column_name(const char*column_name){ - if(column_name != nullptr){ - std::strncpy(this->column_name, column_name, max_column_name_length); - this->column_name[max_column_name_length] = '\0'; - }else{ - this->column_name[0] = '\0'; - } - } - - char column_name[max_column_name_length+1]; - }; - - - const int max_column_content_length = 63; - - struct with_column_content{ - with_column_content(){ - std::memset(column_content, 0, max_column_content_length+1); - } - - void set_column_content(const char*column_content){ - if(column_content != nullptr){ - std::strncpy(this->column_content, column_content, max_column_content_length); - this->column_content[max_column_content_length] = '\0'; - }else{ - this->column_content[0] = '\0'; - } - } - - char 
column_content[max_column_content_length+1]; - }; - - - struct extra_column_in_header : - base, - with_file_name, - with_column_name{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - R"(Extra column "%s" in header of file "%s".)" - , column_name, file_name); - } - }; - - struct missing_column_in_header : - base, - with_file_name, - with_column_name{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - R"(Missing column "%s" in header of file "%s".)" - , column_name, file_name); - } - }; - - struct duplicated_column_in_header : - base, - with_file_name, - with_column_name{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - R"(Duplicated column "%s" in header of file "%s".)" - , column_name, file_name); - } - }; - - struct header_missing : - base, - with_file_name{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - "Header missing in file \"%s\"." - , file_name); - } - }; - - struct too_few_columns : - base, - with_file_name, - with_file_line{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - "Too few columns in line %d in file \"%s\"." - , file_line, file_name); - } - }; - - struct too_many_columns : - base, - with_file_name, - with_file_line{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - "Too many columns in line %d in file \"%s\"." - , file_line, file_name); - } - }; - - struct escaped_string_not_closed : - base, - with_file_name, - with_file_line{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - "Escaped string was not closed in line %d in file \"%s\"." 
- , file_line, file_name); - } - }; - - struct integer_must_be_positive : - base, - with_file_name, - with_file_line, - with_column_name, - with_column_content{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - R"(The integer "%s" must be positive or 0 in column "%s" in file "%s" in line "%d".)" - , column_content, column_name, file_name, file_line); - } - }; - - struct no_digit : - base, - with_file_name, - with_file_line, - with_column_name, - with_column_content{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - R"(The integer "%s" contains an invalid digit in column "%s" in file "%s" in line "%d".)" - , column_content, column_name, file_name, file_line); - } - }; - - struct integer_overflow : - base, - with_file_name, - with_file_line, - with_column_name, - with_column_content{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - R"(The integer "%s" overflows in column "%s" in file "%s" in line "%d".)" - , column_content, column_name, file_name, file_line); - } - }; - - struct integer_underflow : - base, - with_file_name, - with_file_line, - with_column_name, - with_column_content{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - R"(The integer "%s" underflows in column "%s" in file "%s" in line "%d".)" - , column_content, column_name, file_name, file_line); - } - }; - - struct invalid_single_character : - base, - with_file_name, - with_file_line, - with_column_name, - with_column_content{ - void format_error_message()const override{ - std::snprintf(error_message_buffer, sizeof(error_message_buffer), - R"(The content "%s" of column "%s" in file "%s" in line "%d" is not a single character.)" - , column_content, column_name, file_name, file_line); - } - }; +template void parse(char *col, char *&x) { x = col; } 
+ +template +void parse_unsigned_integer(const char *col, T &x) { + x = 0; + while (*col != '\0') { + if ('0' <= *col && *col <= '9') { + T y = *col - '0'; + if (x > ((std::numeric_limits::max)() - y) / 10) { + overflow_policy::on_overflow(x); + return; + } + x = 10 * x + y; + } else + throw error::no_digit(); + ++col; + } +} + +template void parse(char *col, unsigned char &x) { + parse_unsigned_integer(col, x); +} +template void parse(char *col, unsigned short &x) { + parse_unsigned_integer(col, x); +} +template void parse(char *col, unsigned int &x) { + parse_unsigned_integer(col, x); +} +template void parse(char *col, unsigned long &x) { + parse_unsigned_integer(col, x); +} +template void parse(char *col, unsigned long long &x) { + parse_unsigned_integer(col, x); +} + +template +void parse_signed_integer(const char *col, T &x) { + if (*col == '-') { + ++col; + + x = 0; + while (*col != '\0') { + if ('0' <= *col && *col <= '9') { + T y = *col - '0'; + if (x < ((std::numeric_limits::min)() + y) / 10) { + overflow_policy::on_underflow(x); + return; } + x = 10 * x - y; + } else + throw error::no_digit(); + ++col; + } + return; + } else if (*col == '+') + ++col; + parse_unsigned_integer(col, x); +} - using ignore_column = unsigned int; - static const ignore_column ignore_no_column = 0; - static const ignore_column ignore_extra_column = 1; - static const ignore_column ignore_missing_column = 2; - - template - struct trim_chars{ - private: - constexpr static bool is_trim_char(char){ - return false; - } - - template - constexpr static bool is_trim_char(char c, char trim_char, OtherTrimChars...other_trim_chars){ - return c == trim_char || is_trim_char(c, other_trim_chars...); - } - - public: - static void trim(char*&str_begin, char*&str_end){ - while(str_begin != str_end && is_trim_char(*str_begin, trim_char_list...)) - ++str_begin; - while(str_begin != str_end && is_trim_char(*(str_end-1), trim_char_list...)) - --str_end; - *str_end = '\0'; - } - }; - - - struct 
no_comment{ - static bool is_comment(const char*){ - return false; - } - }; - - template - struct single_line_comment{ - private: - constexpr static bool is_comment_start_char(char){ - return false; - } - - template - constexpr static bool is_comment_start_char(char c, char comment_start_char, OtherCommentStartChars...other_comment_start_chars){ - return c == comment_start_char || is_comment_start_char(c, other_comment_start_chars...); - } - - public: - - static bool is_comment(const char*line){ - return is_comment_start_char(*line, comment_start_char_list...); - } - }; - - struct empty_line_comment{ - static bool is_comment(const char*line){ - if(*line == '\0') - return true; - while(*line == ' ' || *line == '\t'){ - ++line; - if(*line == 0) - return true; - } - return false; - } - }; - - template - struct single_and_empty_line_comment{ - static bool is_comment(const char*line){ - return single_line_comment::is_comment(line) || empty_line_comment::is_comment(line); - } - }; - - template - struct no_quote_escape{ - static const char*find_next_column_end(const char*col_begin){ - while(*col_begin != sep && *col_begin != '\0') - ++col_begin; - return col_begin; - } - - static void unescape(char*&, char*&){ - - } - }; - - template - struct double_quote_escape{ - static const char*find_next_column_end(const char*col_begin){ - while(*col_begin != sep && *col_begin != '\0') - if(*col_begin != quote) - ++col_begin; - else{ - do{ - ++col_begin; - while(*col_begin != quote){ - if(*col_begin == '\0') - throw error::escaped_string_not_closed(); - ++col_begin; - } - ++col_begin; - }while(*col_begin == quote); - } - return col_begin; - } - - static void unescape(char*&col_begin, char*&col_end){ - if(col_end - col_begin >= 2){ - if(*col_begin == quote && *(col_end-1) == quote){ - ++col_begin; - --col_end; - char*out = col_begin; - for(char*in = col_begin; in!=col_end; ++in){ - if(*in == quote && (in+1) != col_end && *(in+1) == quote){ - ++in; - } - *out = *in; - ++out; - } - 
col_end = out; - *col_end = '\0'; - } - } - - } - }; - - struct throw_on_overflow{ - template - static void on_overflow(T&){ - throw error::integer_overflow(); - } - - template - static void on_underflow(T&){ - throw error::integer_underflow(); - } - }; - - struct ignore_overflow{ - template - static void on_overflow(T&){} - - template - static void on_underflow(T&){} - }; - - struct set_to_max_on_overflow{ - template - static void on_overflow(T&x){ - // using (std::numeric_limits::max) instead of std::numeric_limits::max - // to make code including windows.h with its max macro happy - x = (std::numeric_limits::max)(); - } - - template - static void on_underflow(T&x){ - x = (std::numeric_limits::min)(); - } - }; - - - namespace detail{ - template - void chop_next_column( - char*&line, char*&col_begin, char*&col_end - ){ - assert(line != nullptr); - - col_begin = line; - // the col_begin + (... - col_begin) removes the constness - col_end = col_begin + (quote_policy::find_next_column_end(col_begin) - col_begin); - - if(*col_end == '\0'){ - line = nullptr; - }else{ - *col_end = '\0'; - line = col_end + 1; - } - } - - template - void parse_line( - char*line, - char**sorted_col, - const std::vector&col_order - ){ - for (int i : col_order) { - if(line == nullptr) - throw ::io::error::too_few_columns(); - char*col_begin, *col_end; - chop_next_column(line, col_begin, col_end); - - if (i != -1) { - trim_policy::trim(col_begin, col_end); - quote_policy::unescape(col_begin, col_end); - - sorted_col[i] = col_begin; - } - } - if(line != nullptr) - throw ::io::error::too_many_columns(); - } - - template - void parse_header_line( - char*line, - std::vector&col_order, - const std::string*col_name, - ignore_column ignore_policy - ){ - col_order.clear(); - - bool found[column_count]; - std::fill(found, found + column_count, false); - while(line){ - char*col_begin,*col_end; - chop_next_column(line, col_begin, col_end); - - trim_policy::trim(col_begin, col_end); - 
quote_policy::unescape(col_begin, col_end); - - for(unsigned i=0; i - void parse(char*col, char &x){ - if(!*col) - throw error::invalid_single_character(); - x = *col; - ++col; - if(*col) - throw error::invalid_single_character(); - } - - template - void parse(char*col, std::string&x){ - x = col; - } - - template - void parse(char*col, const char*&x){ - x = col; - } - - template - void parse(char*col, char*&x){ - x = col; - } - - template - void parse_unsigned_integer(const char*col, T&x){ - x = 0; - while(*col != '\0'){ - if('0' <= *col && *col <= '9'){ - T y = *col - '0'; - if(x > ((std::numeric_limits::max)()-y)/10){ - overflow_policy::on_overflow(x); - return; - } - x = 10*x+y; - }else - throw error::no_digit(); - ++col; - } - } - - templatevoid parse(char*col, unsigned char &x) - {parse_unsigned_integer(col, x);} - templatevoid parse(char*col, unsigned short &x) - {parse_unsigned_integer(col, x);} - templatevoid parse(char*col, unsigned int &x) - {parse_unsigned_integer(col, x);} - templatevoid parse(char*col, unsigned long &x) - {parse_unsigned_integer(col, x);} - templatevoid parse(char*col, unsigned long long &x) - {parse_unsigned_integer(col, x);} - - template - void parse_signed_integer(const char*col, T&x){ - if(*col == '-'){ - ++col; - - x = 0; - while(*col != '\0'){ - if('0' <= *col && *col <= '9'){ - T y = *col - '0'; - if(x < ((std::numeric_limits::min)()+y)/10){ - overflow_policy::on_underflow(x); - return; - } - x = 10*x-y; - }else - throw error::no_digit(); - ++col; - } - return; - }else if(*col == '+') - ++col; - parse_unsigned_integer(col, x); - } - - templatevoid parse(char*col, signed char &x) - {parse_signed_integer(col, x);} - templatevoid parse(char*col, signed short &x) - {parse_signed_integer(col, x);} - templatevoid parse(char*col, signed int &x) - {parse_signed_integer(col, x);} - templatevoid parse(char*col, signed long &x) - {parse_signed_integer(col, x);} - templatevoid parse(char*col, signed long long &x) - 
{parse_signed_integer(col, x);} - - template - void parse_float(const char*col, T&x){ - bool is_neg = false; - if(*col == '-'){ - is_neg = true; - ++col; - }else if(*col == '+') - ++col; - - x = 0; - while('0' <= *col && *col <= '9'){ - int y = *col - '0'; - x *= 10; - x += y; - ++col; - } - - if(*col == '.'|| *col == ','){ - ++col; - T pos = 1; - while('0' <= *col && *col <= '9'){ - pos /= 10; - int y = *col - '0'; - ++col; - x += y*pos; - } - } - - if(*col == 'e' || *col == 'E'){ - ++col; - int e; - - parse_signed_integer(col, e); - - if(e != 0){ - T base; - if(e < 0){ - base = T(0.1); - e = -e; - }else{ - base = T(10); - } - - while(e != 1){ - if((e & 1) == 0){ - base = base*base; - e >>= 1; - }else{ - x *= base; - --e; - } - } - x *= base; - } - }else{ - if(*col != '\0') - throw error::no_digit(); - } - - if(is_neg) - x = -x; - } - - template void parse(char*col, float&x) { parse_float(col, x); } - template void parse(char*col, double&x) { parse_float(col, x); } - template void parse(char*col, long double&x) { parse_float(col, x); } - - template - void parse(char*col, T&x){ - // Mute unused variable compiler warning - (void)col; - (void)x; - // GCC evalutes "false" when reading the template and - // "sizeof(T)!=sizeof(T)" only when instantiating it. This is why - // this strange construct is used. - static_assert(sizeof(T)!=sizeof(T), - "Can not parse this type. 
Only buildin integrals, floats, char, char*, const char* and std::string are supported"); - } +template void parse(char *col, signed char &x) { + parse_signed_integer(col, x); +} +template void parse(char *col, signed short &x) { + parse_signed_integer(col, x); +} +template void parse(char *col, signed int &x) { + parse_signed_integer(col, x); +} +template void parse(char *col, signed long &x) { + parse_signed_integer(col, x); +} +template void parse(char *col, signed long long &x) { + parse_signed_integer(col, x); +} +template void parse_float(const char *col, T &x) { + bool is_neg = false; + if (*col == '-') { + is_neg = true; + ++col; + } else if (*col == '+') + ++col; + + x = 0; + while ('0' <= *col && *col <= '9') { + int y = *col - '0'; + x *= 10; + x += y; + ++col; + } + + if (*col == '.' || *col == ',') { + ++col; + T pos = 1; + while ('0' <= *col && *col <= '9') { + pos /= 10; + int y = *col - '0'; + ++col; + x += y * pos; + } + } + + if (*col == 'e' || *col == 'E') { + ++col; + int e; + + parse_signed_integer(col, e); + + if (e != 0) { + T base; + if (e < 0) { + base = T(0.1); + e = -e; + } else { + base = T(10); + } + + while (e != 1) { + if ((e & 1) == 0) { + base = base * base; + e >>= 1; + } else { + x *= base; + --e; } + } + x *= base; + } + } else { + if (*col != '\0') + throw error::no_digit(); + } + + if (is_neg) + x = -x; +} - template, - class quote_policy = no_quote_escape<','>, - class overflow_policy = throw_on_overflow, - class comment_policy = no_comment - > - class CSVReader{ - private: - LineReader in; - - char*row[column_count]; - std::string column_names[column_count]; - - std::vectorcol_order; - - template - void set_column_names(std::string s, ColNames...cols){ - column_names[column_count-sizeof...(ColNames)-1] = std::move(s); - set_column_names(std::forward(cols)...); - } - - void set_column_names(){} - - - public: - CSVReader() = delete; - CSVReader(const CSVReader&) = delete; - CSVReader&operator=(const CSVReader&); - - template - 
explicit CSVReader(Args&&...args):in(std::forward(args)...){ - std::fill(row, row+column_count, nullptr); - col_order.resize(column_count); - for(unsigned i=0; i - void read_header(ignore_column ignore_policy, ColNames...cols){ - static_assert(sizeof...(ColNames)>=column_count, "not enough column names specified"); - static_assert(sizeof...(ColNames)<=column_count, "too many column names specified"); - try{ - set_column_names(std::forward(cols)...); - - char*line; - do{ - line = in.next_line(); - if(!line) - throw error::header_missing(); - }while(comment_policy::is_comment(line)); - - detail::parse_header_line - - (line, col_order, column_names, ignore_policy); - }catch(error::with_file_name&err){ - err.set_file_name(in.get_truncated_file_name()); - throw; - } - } - - template - void set_header(ColNames...cols){ - static_assert(sizeof...(ColNames)>=column_count, - "not enough column names specified"); - static_assert(sizeof...(ColNames)<=column_count, - "too many column names specified"); - set_column_names(std::forward(cols)...); - std::fill(row, row+column_count, nullptr); - col_order.resize(column_count); - for(unsigned i=0; i - void parse_helper(std::size_t r, T&t, ColType&...cols){ - if(row[r]){ - try{ - try{ - ::io::detail::parse(row[r], t); - }catch(error::with_column_content&err){ - err.set_column_content(row[r]); - throw; - } - }catch(error::with_column_name&err){ - err.set_column_name(column_names[r].c_str()); - throw; - } - } - parse_helper(r+1, cols...); - } - - - public: - template - bool read_row(ColType& ...cols){ - static_assert(sizeof...(ColType)>=column_count, - "not enough columns specified"); - static_assert(sizeof...(ColType)<=column_count, - "too many columns specified"); - try{ - try{ - - char*line; - do{ - line = in.next_line(); - if(!line) - return false; - }while(comment_policy::is_comment(line)); - - detail::parse_line - (line, row, col_order); - - parse_helper(0, cols...); - }catch(error::with_file_name&err){ - 
err.set_file_name(in.get_truncated_file_name()); - throw; - } - }catch(error::with_file_line&err){ - err.set_file_line(in.get_file_line()); - throw; - } - - return true; - } - }; +template void parse(char *col, float &x) { + parse_float(col, x); +} +template void parse(char *col, double &x) { + parse_float(col, x); +} +template void parse(char *col, long double &x) { + parse_float(col, x); +} + +template void parse(char *col, T &x) { + // Mute unused variable compiler warning + (void)col; + (void)x; + // GCC evaluates "false" when reading the template and + // "sizeof(T)!=sizeof(T)" only when instantiating it. This is why + // this strange construct is used. + static_assert(sizeof(T) != sizeof(T), + "Can not parse this type. Only builtin integrals, floats, " + "char, char*, const char* and std::string are supported"); } -#endif +} // namespace detail + +template , + class quote_policy = no_quote_escape<','>, + class overflow_policy = throw_on_overflow, + class comment_policy = no_comment> +class CSVReader { +private: + LineReader in; + + char *row[column_count]; + std::string column_names[column_count]; + + std::vector col_order; + + template + void set_column_names(std::string s, ColNames... cols) { + column_names[column_count - sizeof...(ColNames) - 1] = std::move(s); + set_column_names(std::forward(cols)...); + } + + void set_column_names() {} + +public: + CSVReader() = delete; + CSVReader(const CSVReader &) = delete; + CSVReader &operator=(const CSVReader &); + + template + explicit CSVReader(Args &&... args) : in(std::forward(args)...) { + std::fill(row, row + column_count, nullptr); + col_order.resize(column_count); + for (unsigned i = 0; i < column_count; ++i) + col_order[i] = i; + for (unsigned i = 1; i <= column_count; ++i) + column_names[i - 1] = "col" + std::to_string(i); + } + + char *next_line() { return in.next_line(); } + + template + void read_header(ignore_column ignore_policy, ColNames... 
cols) { + static_assert(sizeof...(ColNames) >= column_count, + "not enough column names specified"); + static_assert(sizeof...(ColNames) <= column_count, + "too many column names specified"); + try { + set_column_names(std::forward(cols)...); + + char *line; + do { + line = in.next_line(); + if (!line) + throw error::header_missing(); + } while (comment_policy::is_comment(line)); + + detail::parse_header_line( + line, col_order, column_names, ignore_policy); + } catch (error::with_file_name &err) { + err.set_file_name(in.get_truncated_file_name()); + throw; + } + } + + template void set_header(ColNames... cols) { + static_assert(sizeof...(ColNames) >= column_count, + "not enough column names specified"); + static_assert(sizeof...(ColNames) <= column_count, + "too many column names specified"); + set_column_names(std::forward(cols)...); + std::fill(row, row + column_count, nullptr); + col_order.resize(column_count); + for (unsigned i = 0; i < column_count; ++i) + col_order[i] = i; + } + + bool has_column(const std::string &name) const { + return col_order.end() != + std::find(col_order.begin(), col_order.end(), + std::find(std::begin(column_names), std::end(column_names), + name) - + std::begin(column_names)); + } + + void set_file_name(const std::string &file_name) { + in.set_file_name(file_name); + } + + void set_file_name(const char *file_name) { in.set_file_name(file_name); } + + const char *get_truncated_file_name() const { + return in.get_truncated_file_name(); + } + + void set_file_line(unsigned file_line) { in.set_file_line(file_line); } + + unsigned get_file_line() const { return in.get_file_line(); } + +private: + void parse_helper(std::size_t) {} + + template + void parse_helper(std::size_t r, T &t, ColType &... 
cols) { + if (row[r]) { + try { + try { + ::io::detail::parse(row[r], t); + } catch (error::with_column_content &err) { + err.set_column_content(row[r]); + throw; + } + } catch (error::with_column_name &err) { + err.set_column_name(column_names[r].c_str()); + throw; + } + } + parse_helper(r + 1, cols...); + } + +public: + template bool read_row(ColType &... cols) { + static_assert(sizeof...(ColType) >= column_count, + "not enough columns specified"); + static_assert(sizeof...(ColType) <= column_count, + "too many columns specified"); + try { + try { + + char *line; + do { + line = in.next_line(); + if (!line) + return false; + } while (comment_policy::is_comment(line)); + + detail::parse_line(line, row, col_order); + + parse_helper(0, cols...); + } catch (error::with_file_name &err) { + err.set_file_name(in.get_truncated_file_name()); + throw; + } + } catch (error::with_file_line &err) { + err.set_file_line(in.get_file_line()); + throw; + } + + return true; + } +}; +} // namespace io +#endif