diff --git a/.gitignore b/.gitignore index f6f60e6..e469492 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,21 @@ +# CMake's installation directory +install/ + # Custom Settings CMakeLists2.txt # Build bin/ build/ +Testing/ + +# CMake # +!*cmake.in +*.cmake +/cmake-build-debug +/cmake-build-release +/cmake-superbuild-debug +/cmake-superbuild-release # Build: Python *.pyc @@ -13,6 +25,11 @@ build/ docs/html *.tmp +# CLion # +################# +/.idea/* +!/.idea/runConfigurations + # Visual Studio .vs/ *.pdb diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b920b1..5e5ad52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -122,3 +122,48 @@ if (CSV_DEVELOPER) # ) #endif() endif() + +add_library(csv_parser INTERFACE) +target_include_directories(csv_parser INTERFACE + $ +) + +option(CSV_SINGLE_INCLUDE "Use single include" ON) +if (CSV_SINGLE_INCLUDE) + set(CSV_INCLUDES_DIR "single_include/") +else () + set(CSV_INCLUDES_DIR "include/") +endif() + +message("Using single include: ${CSV_SINGLE_INCLUDE}") + +install(DIRECTORY ${CSV_INCLUDES_DIR} DESTINATION include/csv_parser) + +install(TARGETS csv_parser + EXPORT csv_parserTargets +) + +include(CMakePackageConfigHelpers) + +# write_basic_package_version_file( +# "${CMAKE_CURRENT_SOURCE_DIR}/cmake/csv_parserConfigVersion.cmake" +# VERSION ${PROJECT_VERSION} +# COMPATIBILITY SameMajorVersion +# ) + +configure_package_config_file( + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/csv_parserConfig.cmake.in" + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/csv_parserConfig.cmake" + INSTALL_DESTINATION lib/cmake/csv_parser +) + +install(FILES + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/csv_parserConfig.cmake" + # "${CMAKE_CURRENT_SOURCE_DIR}/cmake/csv_parserConfigVersion.cmake" + DESTINATION lib/cmake/csv_parser +) + +install(EXPORT csv_parserTargets + NAMESPACE csv:: + DESTINATION lib/cmake/csv_parser +) \ No newline at end of file diff --git a/cmake/csv_parserConfig.cmake.in b/cmake/csv_parserConfig.cmake.in new file mode 100644 index 
0000000..fae7dd0 --- /dev/null +++ b/cmake/csv_parserConfig.cmake.in @@ -0,0 +1,27 @@ + +####### Expanded from @PACKAGE_INIT@ by configure_package_config_file() ####### +####### Any changes to this file will be overwritten by the next CMake run #### +####### The input file was csv_parserConfig.cmake.in ######## + +get_filename_component(PACKAGE_PREFIX_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE) + +macro(set_and_check _var _file) + set(${_var} "${_file}") + if(NOT EXISTS "${_file}") + message(FATAL_ERROR "File or directory ${_file} referenced by variable ${_var} does not exist !") + endif() +endmacro() + +macro(check_required_components _NAME) + foreach(comp ${${_NAME}_FIND_COMPONENTS}) + if(NOT ${_NAME}_${comp}_FOUND) + if(${_NAME}_FIND_REQUIRED_${comp}) + set(${_NAME}_FOUND FALSE) + endif() + endif() + endforeach() +endmacro() + +#################################################################################### + +include("${CMAKE_CURRENT_LIST_DIR}/csv_parserTargets.cmake") diff --git a/include/internal/csv_reader.hpp b/include/internal/csv_reader.hpp index 300e7da..35dbbf8 100644 --- a/include/internal/csv_reader.hpp +++ b/include/internal/csv_reader.hpp @@ -73,7 +73,7 @@ namespace csv { using difference_type = std::ptrdiff_t; using pointer = CSVRow * ; using reference = CSVRow & ; - using iterator_category = std::input_iterator_tag; + using iterator_category = std::forward_iterator_tag; #endif iterator() = default; diff --git a/single_include/csv.hpp b/single_include/csv.hpp index 3571bbb..d7dc1da 100644 --- a/single_include/csv.hpp +++ b/single_include/csv.hpp @@ -1988,22 +1988,22 @@ inline namespace literals { inline namespace string_view_literals { -constexpr std::string_view operator "" _sv( const char* str, size_t len ) noexcept // (1) +constexpr std::string_view operator ""_sv( const char* str, size_t len ) noexcept // (1) { return std::string_view{ str, len }; } -constexpr std::u16string_view operator "" _sv( const char16_t* str, size_t len ) 
noexcept // (2) +constexpr std::u16string_view operator ""_sv( const char16_t* str, size_t len ) noexcept // (2) { return std::u16string_view{ str, len }; } -constexpr std::u32string_view operator "" _sv( const char32_t* str, size_t len ) noexcept // (3) +constexpr std::u32string_view operator ""_sv( const char32_t* str, size_t len ) noexcept // (3) { return std::u32string_view{ str, len }; } -constexpr std::wstring_view operator "" _sv( const wchar_t* str, size_t len ) noexcept // (4) +constexpr std::wstring_view operator ""_sv( const wchar_t* str, size_t len ) noexcept // (4) { return std::wstring_view{ str, len }; } @@ -6283,7 +6283,7 @@ namespace csv { using difference_type = std::ptrdiff_t; using pointer = CSVRow * ; using reference = CSVRow & ; - using iterator_category = std::input_iterator_tag; + using iterator_category = std::forward_iterator_tag; #endif iterator() = default; @@ -6940,6 +6940,175 @@ namespace csv { ///@} } +#include +#include + + +namespace csv { + /** Shorthand function for parsing an in-memory CSV string + * + * @return A collection of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Parse Example + */ + CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { + std::stringstream stream(std::string(in.data(), in.length())); + return CSVReader(stream, format); + } + + /** Parses a CSV string with no headers + * + * @return A collection of CSVRow objects + */ + CSV_INLINE CSVReader parse_no_header(csv::string_view in) { + CSVFormat format; + format.header_row(-1); + + return parse(in, format); + } + + /** Parse a RFC 4180 CSV string, returning a collection + * of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Escaped Comma + * + */ + CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { + return parse(csv::string_view(in, n)); + } + + /** A shorthand for csv::parse_no_header() */ + CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { + return 
parse_no_header(csv::string_view(in, n)); + } + + /** + * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise + * + * @param[in] filename Path to CSV file + * @param[in] col_name Column whose position we should resolve + * @param[in] format Format of the CSV file + */ + CSV_INLINE int get_col_pos( + csv::string_view filename, + csv::string_view col_name, + const CSVFormat& format) { + CSVReader reader(filename, format); + return reader.index_of(col_name); + } + + /** Get basic information about a CSV file + * @include programs/csv_info.cpp + */ + CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { + CSVReader reader(filename); + CSVFormat format = reader.get_format(); + for (auto it = reader.begin(); it != reader.end(); ++it); + + CSVFileInfo info = { + filename, + reader.get_col_names(), + format.get_delim(), + reader.n_rows(), + reader.get_col_names().size() + }; + + return info; + } +} +/** @file + * Defines an object used to store CSV format settings + */ + +#include +#include + + +namespace csv { + CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { + this->possible_delimiters = { delim }; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { + this->possible_delimiters = delim; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { + this->no_quote = false; + this->quote_char = quote; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { + this->trim_chars = chars; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { + this->col_names = names; + this->header = -1; + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { + if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; + + this->header = row; + 
this->col_names = {}; + return *this; + } + + CSV_INLINE void CSVFormat::assert_no_char_overlap() + { + auto delims = std::set( + this->possible_delimiters.begin(), this->possible_delimiters.end()), + trims = std::set( + this->trim_chars.begin(), this->trim_chars.end()); + + // Stores intersection of possible delimiters and trim characters + std::vector intersection = {}; + + // Find which characters overlap, if any + std::set_intersection( + delims.begin(), delims.end(), + trims.begin(), trims.end(), + std::back_inserter(intersection)); + + // Make sure quote character is not contained in possible delimiters + // or whitespace characters + if (delims.find(this->quote_char) != delims.end() || + trims.find(this->quote_char) != trims.end()) { + intersection.push_back(this->quote_char); + } + + if (!intersection.empty()) { + std::string err_msg = "There should be no overlap between the quote character, " + "the set of possible delimiters " + "and the set of whitespace characters. Offending characters: "; + + // Create a pretty error message with the list of overlapping + // characters + for (size_t i = 0; i < intersection.size(); i++) { + err_msg += "'"; + err_msg += intersection[i]; + err_msg += "'"; + + if (i + 1 < intersection.size()) + err_msg += ", "; + } + + throw std::runtime_error(err_msg + '.'); + } + } +} namespace csv { namespace internals { @@ -7236,96 +7405,344 @@ namespace csv { } } /** @file - * Defines an object used to store CSV format settings + * Defines the data type used for storing information about a CSV row */ -#include -#include - +#include +#include namespace csv { - CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { - this->possible_delimiters = { delim }; - this->assert_no_char_overlap(); - return *this; - } + namespace internals { + CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { + const size_t page_no = n / _single_buffer_capacity; + const size_t buffer_idx = (page_no < 1) ? 
n : n % _single_buffer_capacity; + return this->buffers[page_no][buffer_idx]; + } - CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { - this->possible_delimiters = delim; - this->assert_no_char_overlap(); - return *this; - } + CSV_INLINE void CSVFieldList::allocate() { + buffers.push_back(std::unique_ptr(new RawCSVField[_single_buffer_capacity])); - CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { - this->no_quote = false; - this->quote_char = quote; - this->assert_no_char_overlap(); - return *this; + _current_buffer_size = 0; + _back = buffers.back().get(); + } } - CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { - this->trim_chars = chars; - this->assert_no_char_overlap(); - return *this; + /** Return a CSVField object corresponding to the nth value in the row. + * + * @note This method performs bounds checking, and will throw an + * `std::runtime_error` if n is invalid. + * + * @complexity + * Constant, by calling csv::CSVRow::get_field() + * + */ + CSV_INLINE CSVField CSVRow::operator[](size_t n) const { + return CSVField(this->get_field(n)); } - CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { - this->col_names = names; - this->header = -1; - return *this; + /** Retrieve a value by its associated column name. If the column + * specified can't be found, a runtime error is thrown. + * + * @complexity + * Constant. This calls the other CSVRow::operator[]() after + * converting column names into indices using a hash table. 
+ * + * @param[in] col_name The column to look for + */ + CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const { + auto & col_names = this->data->col_names; + auto col_pos = col_names->index_of(col_name); + if (col_pos > -1) { + return this->operator[](col_pos); + } + + throw std::runtime_error("Can't find a column named " + col_name); } - CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { - if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; + CSV_INLINE CSVRow::operator std::vector() const { + std::vector ret; + for (size_t i = 0; i < size(); i++) + ret.push_back(std::string(this->get_field(i))); - this->header = row; - this->col_names = {}; - return *this; + return ret; } - CSV_INLINE void CSVFormat::assert_no_char_overlap() + CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const { - auto delims = std::set( - this->possible_delimiters.begin(), this->possible_delimiters.end()), - trims = std::set( - this->trim_chars.begin(), this->trim_chars.end()); + using internals::ParseFlags; - // Stores intersection of possible delimiters and trim characters - std::vector intersection = {}; + if (index >= this->size()) + throw std::runtime_error("Index out of bounds."); - // Find which characters overlap, if any - std::set_intersection( - delims.begin(), delims.end(), - trims.begin(), trims.end(), - std::back_inserter(intersection)); + const size_t field_index = this->fields_start + index; + auto& field = this->data->fields[field_index]; + auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); - // Make sure quote character is not contained in possible delimiters - // or whitespace characters - if (delims.find(this->quote_char) != delims.end() || - trims.find(this->quote_char) != trims.end()) { - intersection.push_back(this->quote_char); + if (field.has_double_quote) { + auto& value = this->data->double_quote_fields[field_index]; + if (value.empty()) { + bool prev_ch_quote = false; 
+ for (size_t i = 0; i < field.length; i++) { + if (this->data->parse_flags[field_str[i] + 128] == ParseFlags::QUOTE) { + if (prev_ch_quote) { + prev_ch_quote = false; + continue; + } + else { + prev_ch_quote = true; + } + } + + value += field_str[i]; + } + } + + return csv::string_view(value); } - if (!intersection.empty()) { - std::string err_msg = "There should be no overlap between the quote character, " - "the set of possible delimiters " - "and the set of whitespace characters. Offending characters: "; + return field_str.substr(0, field.length); + } - // Create a pretty error message with the list of overlapping - // characters - for (size_t i = 0; i < intersection.size(); i++) { - err_msg += "'"; - err_msg += intersection[i]; - err_msg += "'"; + CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { + size_t start = 0, end = 0; + + // Trim out whitespace chars + for (; start < this->sv.size() && this->sv[start] == ' '; start++); + for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); + + int value_ = 0; + + size_t digits = (end - start); + size_t base16_exponent = digits - 1; + + if (digits == 0) return false; + + for (const auto& ch : this->sv.substr(start, digits)) { + int digit = 0; + + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + digit = static_cast(ch - '0'); + break; + case 'a': + case 'A': + digit = 10; + break; + case 'b': + case 'B': + digit = 11; + break; + case 'c': + case 'C': + digit = 12; + break; + case 'd': + case 'D': + digit = 13; + break; + case 'e': + case 'E': + digit = 14; + break; + case 'f': + case 'F': + digit = 15; + break; + default: + return false; + } + + value_ += digit * (int)pow(16, (double)base16_exponent); + base16_exponent--; + } + + parsedValue = value_; + return true; + } + + CSV_INLINE bool CSVField::try_parse_decimal(long double& dVal, const char decimalSymbol) { + // If field has already been parsed to 
empty, no need to do it aagin: + if (this->_type == DataType::CSV_NULL) + return false; + + // Not yet parsed or possibly parsed with other decimalSymbol + if (this->_type == DataType::UNKNOWN || this->_type == DataType::CSV_STRING || this->_type == DataType::CSV_DOUBLE) + this->_type = internals::data_type(this->sv, &this->value, decimalSymbol); // parse again + + // Integral types are not affected by decimalSymbol and need not be parsed again + + // Either we already had an integral type before, or we we just got any numeric type now. + if (this->_type >= DataType::CSV_INT8 && this->_type <= DataType::CSV_DOUBLE) { + dVal = this->value; + return true; + } + + // CSV_NULL or CSV_STRING, not numeric + return false; + } + +#ifdef _MSC_VER +#pragma region CSVRow Iterator +#endif + /** Return an iterator pointing to the first field. */ + CSV_INLINE CSVRow::iterator CSVRow::begin() const { + return CSVRow::iterator(this, 0); + } + + /** Return an iterator pointing to just after the end of the CSVRow. + * + * @warning Attempting to dereference the end iterator results + * in dereferencing a null pointer. 
+ */ + CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { + return CSVRow::iterator(this, (int)this->size()); + } + + CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { + return std::reverse_iterator(this->end()); + } + + CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const { + return std::reverse_iterator(this->begin()); + } + + CSV_INLINE HEDLEY_NON_NULL(2) + CSVRow::iterator::iterator(const CSVRow* _reader, int _i) + : daddy(_reader), i(_i) { + if (_i < (int)this->daddy->size()) + this->field = std::make_shared( + this->daddy->operator[](_i)); + else + this->field = nullptr; + } + + CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const { + return *(this->field.get()); + } + + CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { + return this->field; + } + + CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { + // Pre-increment operator + this->i++; + if (this->i < (int)this->daddy->size()) + this->field = std::make_shared( + this->daddy->operator[](i)); + else // Reached the end of row + this->field = nullptr; + return *this; + } + + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator++(int) { + // Post-increment operator + auto temp = *this; + this->operator++(); + return temp; + } + + CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() { + // Pre-decrement operator + this->i--; + this->field = std::make_shared( + this->daddy->operator[](this->i)); + return *this; + } + + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) { + // Post-decrement operator + auto temp = *this; + this->operator--(); + return temp; + } + + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { + // Allows for iterator arithmetic + return CSVRow::iterator(this->daddy, i + (int)n); + } + + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const { + // Allows for iterator arithmetic + return CSVRow::iterator::operator+(-n); + } 
+#ifdef _MSC_VER +#pragma endregion CSVRow Iterator +#endif +} + +/** @file + * Defines an input iterator for csv::CSVReader + */ + + +namespace csv { + /** Return an iterator to the first row in the reader */ + CSV_INLINE CSVReader::iterator CSVReader::begin() { + if (this->records->empty()) { + this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + this->read_csv_worker.join(); + + // Still empty => return end iterator + if (this->records->empty()) return this->end(); + } + + this->_n_rows++; + CSVReader::iterator ret(this, this->records->pop_front()); + return ret; + } + + /** A placeholder for the imaginary past the end row in a CSV. + * Attempting to dereference this will lead to bad things. + */ + CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { + return CSVReader::iterator(); + } + + ///////////////////////// + // CSVReader::iterator // + ///////////////////////// + + CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : + daddy(_daddy) { + row = std::move(_row); + } + + /** Advance the iterator by one row. If this CSVReader has an + * associated file, then the iterator will lazily pull more data from + * that file until the end of file is reached. + * + * @note This iterator does **not** block the thread responsible for parsing CSV. 
+ * + */ + CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { + if (!daddy->read_row(this->row)) { + this->daddy = nullptr; // this == end() + } - if (i + 1 < intersection.size()) - err_msg += ", "; - } + return *this; + } - throw std::runtime_error(err_msg + '.'); + /** Post-increment iterator */ + CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) { + auto temp = *this; + if (!daddy->read_row(this->row)) { + this->daddy = nullptr; // this == end() } + + return temp; } } + /** @file * @brief Defines functionality needed for basic CSV parsing */ @@ -7410,568 +7827,229 @@ namespace csv { } } - return { - final_score, - header_row - }; - } - - /** Guess the delimiter used by a delimiter-separated values file */ - CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { - /** For each delimiter, find out which row length was most common. - * The delimiter with the longest mode row length wins. - * Then, the line number of the header row is the first row with - * the mode row length. 
- */ - - CSVFormat format; - size_t max_score = 0, - header = 0; - char current_delim = delims[0]; - - for (char cand_delim : delims) { - auto result = calculate_score(head, format.delimiter(cand_delim)); - - if ((size_t)result.score > max_score) { - max_score = (size_t)result.score; - current_delim = cand_delim; - header = result.header; - } - } - - return { current_delim, (int)header }; - } - } - - /** Return a CSV's column names - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - */ - CSV_INLINE std::vector get_col_names(csv::string_view filename, CSVFormat format) { - auto head = internals::get_csv_head(filename); - - /** Guess delimiter and header row */ - if (format.guess_delim()) { - auto guess_result = guess_format(filename, format.get_possible_delims()); - format.delimiter(guess_result.delim).header_row(guess_result.header_row); - } - - return internals::_get_col_names(head, format); - } - - /** Guess the delimiter used by a delimiter-separated values file */ - CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector& delims) { - auto head = internals::get_csv_head(filename); - return internals::_guess_format(head, delims); - } - - /** Reads an arbitrarily large CSV file using memory-mapped IO. - * - * **Details:** Reads the first block of a CSV file synchronously to get information - * such as column names and delimiting character. 
- * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - * \snippet tests/test_read_csv.cpp CSVField Example - * - */ - CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { - auto head = internals::get_csv_head(filename); - using Parser = internals::MmapParser; - - /** Guess delimiter and header row */ - if (format.guess_delim()) { - auto guess_result = internals::_guess_format(head, format.possible_delimiters); - format.delimiter(guess_result.delim); - format.header = guess_result.header_row; - this->_format = format; - } - - if (!format.col_names.empty()) - this->set_col_names(format.col_names); - - this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 - this->initial_read(); - } - - /** Return the format of the original raw CSV */ - CSV_INLINE CSVFormat CSVReader::get_format() const { - CSVFormat new_format = this->_format; - - // Since users are normally not allowed to set - // column names and header row simulatenously, - // we will set the backing variables directly here - new_format.col_names = this->col_names->get_col_names(); - new_format.header = this->_format.header; - - return new_format; - } - - /** Return the CSV's column names as a vector of strings. */ - CSV_INLINE std::vector CSVReader::get_col_names() const { - if (this->col_names) { - return this->col_names->get_col_names(); - } - - return std::vector(); - } - - /** Return the index of the column name if found or - * csv::CSV_NOT_FOUND otherwise. 
- */ - CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const { - auto _col_names = this->get_col_names(); - for (size_t i = 0; i < _col_names.size(); i++) - if (_col_names[i] == col_name) return (int)i; - - return CSV_NOT_FOUND; - } - - CSV_INLINE void CSVReader::trim_header() { - if (!this->header_trimmed) { - for (int i = 0; i <= this->_format.header && !this->records->empty(); i++) { - if (i == this->_format.header && this->col_names->empty()) { - this->set_col_names(this->records->pop_front()); - } - else { - this->records->pop_front(); - } - } - - this->header_trimmed = true; - } - } - - /** - * @param[in] names Column names - */ - CSV_INLINE void CSVReader::set_col_names(const std::vector& names) - { - this->col_names->set_col_names(names); - this->n_cols = names.size(); - } - - /** - * Read a chunk of CSV data. - * - * @note This method is meant to be run on its own thread. Only one `read_csv()` thread - * should be active at a time. - * - * @param[in] bytes Number of bytes to read. - * - * @see CSVReader::read_csv_worker - * @see CSVReader::read_row() - */ - CSV_INLINE bool CSVReader::read_csv(size_t bytes) { - // Tell read_row() to listen for CSV rows - this->records->notify_all(); - - this->parser->set_output(*this->records); - this->parser->next(bytes); - - if (!this->header_trimmed) { - this->trim_header(); - } - - // Tell read_row() to stop waiting - this->records->kill_all(); - - return true; - } - - /** - * Retrieve rows as CSVRow objects, returning true if more rows are available. - * - * @par Performance Notes - * - Reads chunks of data that are csv::internals::ITERATION_CHUNK_SIZE bytes large at a time - * - For performance details, read the documentation for CSVRow and CSVField. 
- * - * @param[out] row The variable where the parsed row will be stored - * @see CSVRow, CSVField - * - * **Example:** - * \snippet tests/test_read_csv.cpp CSVField Example - * - */ - CSV_INLINE bool CSVReader::read_row(CSVRow &row) { - while (true) { - if (this->records->empty()) { - if (this->records->is_waitable()) - // Reading thread is currently active => wait for it to populate records - this->records->wait(); - else if (this->parser->eof()) - // End of file and no more records - return false; - else { - // Reading thread is not active => start another one - if (this->read_csv_worker.joinable()) - this->read_csv_worker.join(); - - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - } - } - else if (this->records->front().size() != this->n_cols && - this->_format.variable_column_policy != VariableColumnPolicy::KEEP) { - auto errored_row = this->records->pop_front(); - - if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) { - if (errored_row.size() < this->n_cols) - throw std::runtime_error("Line too short " + internals::format_row(errored_row)); - - throw std::runtime_error("Line too long " + internals::format_row(errored_row)); - } - } - else { - row = this->records->pop_front(); - this->_n_rows++; - return true; - } - } - - return false; - } -} - -/** @file - * Defines an input iterator for csv::CSVReader - */ - - -namespace csv { - /** Return an iterator to the first row in the reader */ - CSV_INLINE CSVReader::iterator CSVReader::begin() { - if (this->records->empty()) { - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker.join(); - - // Still empty => return end iterator - if (this->records->empty()) return this->end(); + return { + final_score, + header_row + }; } - this->_n_rows++; - CSVReader::iterator ret(this, this->records->pop_front()); - return ret; - } + /** Guess the delimiter used by a delimiter-separated 
values file */ + CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { + /** For each delimiter, find out which row length was most common. + * The delimiter with the longest mode row length wins. + * Then, the line number of the header row is the first row with + * the mode row length. + */ - /** A placeholder for the imaginary past the end row in a CSV. - * Attempting to deference this will lead to bad things. - */ - CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { - return CSVReader::iterator(); - } + CSVFormat format; + size_t max_score = 0, + header = 0; + char current_delim = delims[0]; - ///////////////////////// - // CSVReader::iterator // - ///////////////////////// + for (char cand_delim : delims) { + auto result = calculate_score(head, format.delimiter(cand_delim)); - CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : - daddy(_daddy) { - row = std::move(_row); + if ((size_t)result.score > max_score) { + max_score = (size_t)result.score; + current_delim = cand_delim; + header = result.header; + } + } + + return { current_delim, (int)header }; + } } - /** Advance the iterator by one row. If this CSVReader has an - * associated file, then the iterator will lazily pull more data from - * that file until the end of file is reached. + /** Return a CSV's column names * - * @note This iterator does **not** block the thread responsible for parsing CSV. 
+ * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file * */ - CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { - if (!daddy->read_row(this->row)) { - this->daddy = nullptr; // this == end() - } - - return *this; - } + CSV_INLINE std::vector get_col_names(csv::string_view filename, CSVFormat format) { + auto head = internals::get_csv_head(filename); - /** Post-increment iterator */ - CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) { - auto temp = *this; - if (!daddy->read_row(this->row)) { - this->daddy = nullptr; // this == end() + /** Guess delimiter and header row */ + if (format.guess_delim()) { + auto guess_result = guess_format(filename, format.get_possible_delims()); + format.delimiter(guess_result.delim).header_row(guess_result.header_row); } - return temp; + return internals::_get_col_names(head, format); } -} - -/** @file - * Defines the data type used for storing information about a CSV row - */ - -#include -#include - -namespace csv { - namespace internals { - CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { - const size_t page_no = n / _single_buffer_capacity; - const size_t buffer_idx = (page_no < 1) ? n : n % _single_buffer_capacity; - return this->buffers[page_no][buffer_idx]; - } - - CSV_INLINE void CSVFieldList::allocate() { - buffers.push_back(std::unique_ptr(new RawCSVField[_single_buffer_capacity])); - _current_buffer_size = 0; - _back = buffers.back().get(); - } + /** Guess the delimiter used by a delimiter-separated values file */ + CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector& delims) { + auto head = internals::get_csv_head(filename); + return internals::_guess_format(head, delims); } - /** Return a CSVField object corrsponding to the nth value in the row. - * - * @note This method performs bounds checking, and will throw an - * `std::runtime_error` if n is invalid. 
+ /** Reads an arbitrarily large CSV file using memory-mapped IO. * - * @complexity - * Constant, by calling csv::CSVRow::get_csv::string_view() + * **Details:** Reads the first block of a CSV file synchronously to get information + * such as column names and delimiting character. * - */ - CSV_INLINE CSVField CSVRow::operator[](size_t n) const { - return CSVField(this->get_field(n)); - } - - /** Retrieve a value by its associated column name. If the column - * specified can't be round, a runtime error is thrown. + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file * - * @complexity - * Constant. This calls the other CSVRow::operator[]() after - * converting column names into indices using a hash table. + * \snippet tests/test_read_csv.cpp CSVField Example * - * @param[in] col_name The column to look for */ - CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const { - auto & col_names = this->data->col_names; - auto col_pos = col_names->index_of(col_name); - if (col_pos > -1) { - return this->operator[](col_pos); - } + CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { + auto head = internals::get_csv_head(filename); + using Parser = internals::MmapParser; - throw std::runtime_error("Can't find a column named " + col_name); - } + /** Guess delimiter and header row */ + if (format.guess_delim()) { + auto guess_result = internals::_guess_format(head, format.possible_delimiters); + format.delimiter(guess_result.delim); + format.header = guess_result.header_row; + this->_format = format; + } - CSV_INLINE CSVRow::operator std::vector() const { - std::vector ret; - for (size_t i = 0; i < size(); i++) - ret.push_back(std::string(this->get_field(i))); + if (!format.col_names.empty()) + this->set_col_names(format.col_names); - return ret; + this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 + this->initial_read(); } - CSV_INLINE 
csv::string_view CSVRow::get_field(size_t index) const - { - using internals::ParseFlags; - - if (index >= this->size()) - throw std::runtime_error("Index out of bounds."); - - const size_t field_index = this->fields_start + index; - auto& field = this->data->fields[field_index]; - auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); + /** Return the format of the original raw CSV */ + CSV_INLINE CSVFormat CSVReader::get_format() const { + CSVFormat new_format = this->_format; - if (field.has_double_quote) { - auto& value = this->data->double_quote_fields[field_index]; - if (value.empty()) { - bool prev_ch_quote = false; - for (size_t i = 0; i < field.length; i++) { - if (this->data->parse_flags[field_str[i] + 128] == ParseFlags::QUOTE) { - if (prev_ch_quote) { - prev_ch_quote = false; - continue; - } - else { - prev_ch_quote = true; - } - } + // Since users are normally not allowed to set + // column names and header row simulatenously, + // we will set the backing variables directly here + new_format.col_names = this->col_names->get_col_names(); + new_format.header = this->_format.header; - value += field_str[i]; - } - } + return new_format; + } - return csv::string_view(value); + /** Return the CSV's column names as a vector of strings. 
*/ + CSV_INLINE std::vector CSVReader::get_col_names() const { + if (this->col_names) { + return this->col_names->get_col_names(); } - return field_str.substr(0, field.length); + return std::vector(); } - CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { - size_t start = 0, end = 0; - - // Trim out whitespace chars - for (; start < this->sv.size() && this->sv[start] == ' '; start++); - for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); - - int value_ = 0; - - size_t digits = (end - start); - size_t base16_exponent = digits - 1; - - if (digits == 0) return false; - - for (const auto& ch : this->sv.substr(start, digits)) { - int digit = 0; - - switch (ch) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - digit = static_cast(ch - '0'); - break; - case 'a': - case 'A': - digit = 10; - break; - case 'b': - case 'B': - digit = 11; - break; - case 'c': - case 'C': - digit = 12; - break; - case 'd': - case 'D': - digit = 13; - break; - case 'e': - case 'E': - digit = 14; - break; - case 'f': - case 'F': - digit = 15; - break; - default: - return false; - } - - value_ += digit * (int)pow(16, (double)base16_exponent); - base16_exponent--; - } + /** Return the index of the column name if found or + * csv::CSV_NOT_FOUND otherwise. 
+ */ + CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const { + auto _col_names = this->get_col_names(); + for (size_t i = 0; i < _col_names.size(); i++) + if (_col_names[i] == col_name) return (int)i; - parsedValue = value_; - return true; + return CSV_NOT_FOUND; } - CSV_INLINE bool CSVField::try_parse_decimal(long double& dVal, const char decimalSymbol) { - // If field has already been parsed to empty, no need to do it aagin: - if (this->_type == DataType::CSV_NULL) - return false; - - // Not yet parsed or possibly parsed with other decimalSymbol - if (this->_type == DataType::UNKNOWN || this->_type == DataType::CSV_STRING || this->_type == DataType::CSV_DOUBLE) - this->_type = internals::data_type(this->sv, &this->value, decimalSymbol); // parse again - - // Integral types are not affected by decimalSymbol and need not be parsed again + CSV_INLINE void CSVReader::trim_header() { + if (!this->header_trimmed) { + for (int i = 0; i <= this->_format.header && !this->records->empty(); i++) { + if (i == this->_format.header && this->col_names->empty()) { + this->set_col_names(this->records->pop_front()); + } + else { + this->records->pop_front(); + } + } - // Either we already had an integral type before, or we we just got any numeric type now. - if (this->_type >= DataType::CSV_INT8 && this->_type <= DataType::CSV_DOUBLE) { - dVal = this->value; - return true; + this->header_trimmed = true; } - - // CSV_NULL or CSV_STRING, not numeric - return false; } -#ifdef _MSC_VER -#pragma region CSVRow Iterator -#endif - /** Return an iterator pointing to the first field. */ - CSV_INLINE CSVRow::iterator CSVRow::begin() const { - return CSVRow::iterator(this, 0); + /** + * @param[in] names Column names + */ + CSV_INLINE void CSVReader::set_col_names(const std::vector& names) + { + this->col_names->set_col_names(names); + this->n_cols = names.size(); } - /** Return an iterator pointing to just after the end of the CSVRow. + /** + * Read a chunk of CSV data. 
* - * @warning Attempting to dereference the end iterator results - * in dereferencing a null pointer. + * @note This method is meant to be run on its own thread. Only one `read_csv()` thread + * should be active at a time. + * + * @param[in] bytes Number of bytes to read. + * + * @see CSVReader::read_csv_worker + * @see CSVReader::read_row() */ - CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { - return CSVRow::iterator(this, (int)this->size()); - } - - CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { - return std::reverse_iterator(this->end()); - } + CSV_INLINE bool CSVReader::read_csv(size_t bytes) { + // Tell read_row() to listen for CSV rows + this->records->notify_all(); - CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const { - return std::reverse_iterator(this->begin()); - } + this->parser->set_output(*this->records); + this->parser->next(bytes); - CSV_INLINE HEDLEY_NON_NULL(2) - CSVRow::iterator::iterator(const CSVRow* _reader, int _i) - : daddy(_reader), i(_i) { - if (_i < (int)this->daddy->size()) - this->field = std::make_shared( - this->daddy->operator[](_i)); - else - this->field = nullptr; - } + if (!this->header_trimmed) { + this->trim_header(); + } - CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const { - return *(this->field.get()); - } + // Tell read_row() to stop waiting + this->records->kill_all(); - CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { - return this->field; + return true; } - CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { - // Pre-increment operator - this->i++; - if (this->i < (int)this->daddy->size()) - this->field = std::make_shared( - this->daddy->operator[](i)); - else // Reached the end of row - this->field = nullptr; - return *this; - } + /** + * Retrieve rows as CSVRow objects, returning true if more rows are available. 
+ * + * @par Performance Notes + * - Reads chunks of data that are csv::internals::ITERATION_CHUNK_SIZE bytes large at a time + * - For performance details, read the documentation for CSVRow and CSVField. + * + * @param[out] row The variable where the parsed row will be stored + * @see CSVRow, CSVField + * + * **Example:** + * \snippet tests/test_read_csv.cpp CSVField Example + * + */ + CSV_INLINE bool CSVReader::read_row(CSVRow &row) { + while (true) { + if (this->records->empty()) { + if (this->records->is_waitable()) + // Reading thread is currently active => wait for it to populate records + this->records->wait(); + else if (this->parser->eof()) + // End of file and no more records + return false; + else { + // Reading thread is not active => start another one + if (this->read_csv_worker.joinable()) + this->read_csv_worker.join(); - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator++(int) { - // Post-increment operator - auto temp = *this; - this->operator++(); - return temp; - } + this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + } + } + else if (this->records->front().size() != this->n_cols && + this->_format.variable_column_policy != VariableColumnPolicy::KEEP) { + auto errored_row = this->records->pop_front(); - CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() { - // Pre-decrement operator - this->i--; - this->field = std::make_shared( - this->daddy->operator[](this->i)); - return *this; - } + if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) { + if (errored_row.size() < this->n_cols) + throw std::runtime_error("Line too short " + internals::format_row(errored_row)); - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) { - // Post-decrement operator - auto temp = *this; - this->operator--(); - return temp; - } - - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { - // Allows for iterator arithmetic - return 
CSVRow::iterator(this->daddy, i + (int)n); - } + throw std::runtime_error("Line too long " + internals::format_row(errored_row)); + } + } + else { + row = this->records->pop_front(); + this->_n_rows++; + return true; + } + } - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const { - // Allows for iterator arithmetic - return CSVRow::iterator::operator+(-n); + return false; } -#ifdef _MSC_VER -#pragma endregion CSVRow Iterator -#endif } /** @file @@ -8321,8 +8399,8 @@ namespace csv { counts.push_back({}); rolling_means.push_back(0); rolling_vars.push_back(0); - mins.push_back(std::numeric_limits::quiet_NaN()); - maxes.push_back(std::numeric_limits::quiet_NaN()); + mins.push_back(NAN); + maxes.push_back(NAN); n.push_back(0); } } @@ -8502,84 +8580,6 @@ namespace csv { return csv_dtypes; } } -#include -#include - - -namespace csv { - /** Shorthand function for parsing an in-memory CSV string - * - * @return A collection of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Parse Example - */ - CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { - std::stringstream stream(std::string(in.data(), in.length())); - return CSVReader(stream, format); - } - - /** Parses a CSV string with no headers - * - * @return A collection of CSVRow objects - */ - CSV_INLINE CSVReader parse_no_header(csv::string_view in) { - CSVFormat format; - format.header_row(-1); - - return parse(in, format); - } - - /** Parse a RFC 4180 CSV string, returning a collection - * of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Escaped Comma - * - */ - CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { - return parse(csv::string_view(in, n)); - } - - /** A shorthand for csv::parse_no_header() */ - CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { - return parse_no_header(csv::string_view(in, n)); - } - - /** - * Find the position of a column in a CSV file or CSV_NOT_FOUND 
otherwise - * - * @param[in] filename Path to CSV file - * @param[in] col_name Column whose position we should resolve - * @param[in] format Format of the CSV file - */ - CSV_INLINE int get_col_pos( - csv::string_view filename, - csv::string_view col_name, - const CSVFormat& format) { - CSVReader reader(filename, format); - return reader.index_of(col_name); - } - - /** Get basic information about a CSV file - * @include programs/csv_info.cpp - */ - CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { - CSVReader reader(filename); - CSVFormat format = reader.get_format(); - for (auto it = reader.begin(); it != reader.end(); ++it); - - CSVFileInfo info = { - filename, - reader.get_col_names(), - format.get_delim(), - reader.n_rows(), - reader.get_col_names().size() - }; - - return info; - } -} #endif diff --git a/tests/test_csv_iterator.cpp b/tests/test_csv_iterator.cpp index a750875..7ec5340 100644 --- a/tests/test_csv_iterator.cpp +++ b/tests/test_csv_iterator.cpp @@ -34,8 +34,8 @@ TEST_CASE("Test CSVRow Interator", "[test_csv_row_iter]") { } // Backwards - REQUIRE(row.rbegin()->get() == 345); - REQUIRE((row.rend() - 1)->get<>() == "123"); + REQUIRE(row.rbegin().base()->get() == 345); + REQUIRE((row.rend() - 1).base()->get<>() == "123"); } SECTION("Iterator Arithmetic") {