From b5913e0e45ffeb4a8453b7bc2931a0ab3273a45e Mon Sep 17 00:00:00 2001 From: Glyn Matthews Date: Sat, 21 Sep 2019 12:52:46 +0800 Subject: [PATCH 01/10] Added first attempt at using ranges to process unicode strings --- .clang-format | 3 +- CMakeLists.txt | 1 + include/skyr/unicode/range/octet_range.hpp | 530 +++++++++++++++++++++ include/skyr/unicode/range/u16_range.hpp | 289 +++++++++++ include/skyr/unicode/range/u32_range.hpp | 206 ++++++++ include/skyr/unicode/unicode.hpp | 13 +- src/CMakeLists.txt | 5 +- tests/CMakeLists.txt | 1 + tests/unicode_range_tests.cpp | 168 +++++++ tests/unicode_tests.cpp | 2 +- 10 files changed, 1211 insertions(+), 7 deletions(-) create mode 100644 include/skyr/unicode/range/octet_range.hpp create mode 100644 include/skyr/unicode/range/u16_range.hpp create mode 100644 include/skyr/unicode/range/u32_range.hpp create mode 100644 tests/unicode_range_tests.cpp diff --git a/.clang-format b/.clang-format index ea8ba3c7..c49d0de3 100644 --- a/.clang-format +++ b/.clang-format @@ -3,7 +3,7 @@ AccessModifierOffset: -1 ConstructorInitializerIndentWidth: 4 AlignTrailingComments: true AllowAllParametersOfDeclarationOnNextLine: true -AllowShortIfStatementsOnASingleLine: WithoutElse +#AllowShortIfStatementsOnASingleLine: WithoutElse AllowShortLoopsOnASingleLine: true AlwaysBreakTemplateDeclarations: Yes AlwaysBreakBeforeMultilineStrings: true @@ -31,4 +31,3 @@ BreakBeforeBraces: Attach SpacesInParentheses: false SpaceInEmptyParentheses: false SpacesInCStyleCastParentheses: false - diff --git a/CMakeLists.txt b/CMakeLists.txt index 4bf8508d..dc60a281 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,6 +21,7 @@ set(CMAKE_CXX_STANDARD 17) find_package(Threads REQUIRED) find_package(tl-expected CONFIG REQUIRED) +find_package(range-v3 CONFIG REQUIRED) if (${CMAKE_CXX_COMPILER_ID} MATCHES GNU) if (Skyr_FULL_WARNINGS) diff --git a/include/skyr/unicode/range/octet_range.hpp b/include/skyr/unicode/range/octet_range.hpp new file mode 100644 index 
00000000..e8eccf19 --- /dev/null +++ b/include/skyr/unicode/range/octet_range.hpp @@ -0,0 +1,530 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_OCTET_RANGE_HPP +#define SKYR_OCTET_RANGE_HPP + +#include +#include +#include +#include +#include +#include + +namespace skyr::unicode { +/// +/// \tparam OctetIterator +template +class code_point_octet_t { + public: + + /// + using const_iterator = OctetIterator; + /// + using iterator = const_iterator; + /// + using value_type = char; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using size_type = std::size_t; + + /// + /// \param first + constexpr code_point_octet_t( + OctetIterator first, + OctetIterator last) + : first(first) + , last(last) {} + + /// + /// \param first + explicit constexpr code_point_octet_t(OctetIterator first) + : code_point_octet_t(first, first + sequence_length(*first)) {} + + /// + constexpr code_point_octet_t(const code_point_octet_t &) = default; + /// + constexpr code_point_octet_t(code_point_octet_t &&) noexcept = default; + /// + constexpr code_point_octet_t &operator=(const code_point_octet_t &) = default; + /// + constexpr code_point_octet_t &operator=(code_point_octet_t &&) noexcept = default; + /// + ~code_point_octet_t() = default; + + /// + /// \return + [[nodiscard]] constexpr const_iterator begin() const noexcept { + return first; + } + + /// + /// \return + [[nodiscard]] constexpr const_iterator end() const noexcept { + return last; + } + + /// + /// \return + [[nodiscard]] constexpr auto cbegin() const noexcept { + return begin(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cend() const noexcept { + return end(); + } + + /// + /// \return + [[nodiscard]] constexpr auto size() const noexcept -> size_type { + return sequence_length(*first); + } + + private: + + 
OctetIterator first, last; + +}; + +/// +/// \tparam OctetRange +/// \param range +/// \return +template +inline tl::expected, unicode_errc> code_point_octet( + const OctetRange &range) { + auto first = std::begin(range), last = std::end(range); + if (std::distance(first, last) > sequence_length(*first)) { + return tl::make_unexpected(unicode_errc::overflow); + } + return code_point_octet_t( + first, + first + sequence_length(*first)); +} + +template +inline bool is_valid(const code_point_octet_t &code_point) { + return static_cast(find_code_point(std::begin(code_point))); +} + +/// +/// \tparam OctetRange +/// \param range +/// \return +template +inline tl::expected, unicode_errc> valid_code_point( + const OctetRange &range) { + using result_type = tl::expected, unicode_errc>; + + auto check_code_point = [] (auto &&code_point) -> result_type { + return find_code_point(std::begin(code_point)) + .and_then([=] (auto) -> result_type { + return code_point; + }) + .or_else([] (auto &&error) -> result_type { + return tl::make_unexpected(error); + }); + }; + + return + code_point_octet(range) + .and_then(check_code_point); +} + +/// +template +class unchecked_octet_range_iterator { + public: + + /// + using iterator_category = std::forward_iterator_tag; + /// + using value_type = code_point_octet_t; + /// + using reference = value_type; + /// + using pointer = typename std::add_pointer::type; + /// + using difference_type = std::ptrdiff_t; + + /// + constexpr unchecked_octet_range_iterator() = default; + /// + /// \param it + explicit constexpr unchecked_octet_range_iterator(OctetIterator it) + : it_(it) {} + /// + constexpr unchecked_octet_range_iterator(const unchecked_octet_range_iterator&) = default; + /// + constexpr unchecked_octet_range_iterator(unchecked_octet_range_iterator&&) noexcept = default; + /// + constexpr unchecked_octet_range_iterator &operator=(const unchecked_octet_range_iterator&) = default; + /// + constexpr unchecked_octet_range_iterator 
&operator=(unchecked_octet_range_iterator&&) noexcept = default; + /// + ~unchecked_octet_range_iterator() = default; + + /// + /// \return + unchecked_octet_range_iterator operator ++ (int) { + assert(it_); + auto result = *this; + std::advance(it_.value(), sequence_length(*it_.value())); + return result; + } + + /// + /// \return + unchecked_octet_range_iterator &operator ++ () { + assert(it_); + std::advance(it_.value(), sequence_length(*it_.value())); + return *this; + } + + /// + /// \return + constexpr reference operator * () const noexcept { + assert(it_); + return code_point_octet_t( + it_.value(), + it_.value() + sequence_length(*it_.value())); + } + + /// + /// \param other + /// \return + constexpr bool operator == (const unchecked_octet_range_iterator &other) const noexcept { + return it_ == other.it_; + } + + /// + /// \param other + /// \return + constexpr bool operator != (const unchecked_octet_range_iterator &other) const noexcept { + return !(*this == other); + } + + private: + + std::optional it_; + +}; + +/// +template +class octet_range_iterator { + + using iterator_type = unchecked_octet_range_iterator; + + public: + + /// + using iterator_category = typename iterator_type::iterator_category; + /// + using value_type = typename iterator_type::value_type; + /// + using reference = typename iterator_type::reference; + /// + using pointer = typename iterator_type::pointer; + /// + using difference_type = typename iterator_type::difference_type; + + /// + constexpr octet_range_iterator() = default; + /// + /// \param it + explicit constexpr octet_range_iterator(OctetIterator it) + : it_(it) {} + /// + constexpr octet_range_iterator(const octet_range_iterator&) = default; + /// + constexpr octet_range_iterator(octet_range_iterator&&) noexcept = default; + /// + constexpr octet_range_iterator &operator=(const octet_range_iterator&) = default; + /// + constexpr octet_range_iterator &operator=(octet_range_iterator&&) noexcept = default; + /// + 
~octet_range_iterator() = default; + + /// + /// \return + octet_range_iterator operator ++ (int) { + auto result = *this; + increment(); + return result; + } + + /// + /// \return + octet_range_iterator &operator ++ () { + increment(); + return *this; + } + + /// + /// \return + constexpr reference operator * () const noexcept { + using result_type = tl::expected, unicode_errc>; + + return + valid_code_point(*it_) + .or_else([=] (auto) -> result_type { + auto first = std::begin(*it_); + return code_point_octet(ranges::iterator_range(first, first)); + }) + .value(); + } + + /// + /// \param other + /// \return + constexpr bool operator == (const octet_range_iterator &other) const noexcept { + return it_ == other.it_; + } + + /// + /// \param other + /// \return + constexpr bool operator != (const octet_range_iterator &other) const noexcept { + return !(*this == other); + } + + private: + + void increment() { + if (valid_code_point(*it_)) { + ++it_; + } else { + it_ = iterator_type(); + } + } + + iterator_type it_; + +}; + +/// +template +class view_unchecked_octet_range + : public ranges::view_base { + + using octet_iterator_type = typename OctetRange::const_iterator; + using iterator_type = unchecked_octet_range_iterator; + + public: + + /// + using value_type = code_point_octet_t; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using const_iterator = iterator_type; + /// + using iterator = const_iterator; + /// + using size_type = std::size_t; + + /// + constexpr view_unchecked_octet_range() = default; + + /// + /// \param range + explicit constexpr view_unchecked_octet_range(const OctetRange &range) + : impl_( + impl(std::begin(range), + std::end(range))) {} + + /// + /// \return + [[nodiscard]] constexpr const_iterator begin() const noexcept { + return impl_? impl_.value().first : iterator_type(); + } + + /// + /// \return + [[nodiscard]] constexpr const_iterator end() const noexcept { + return impl_? 
impl_.value().last : iterator_type(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cbegin() const noexcept { + return begin(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cend() const noexcept { + return end(); + } + + /// + /// \return + [[nodiscard]] constexpr bool empty() const noexcept { + return begin() == end(); + } + + /// + /// \return + [[nodiscard]] size_type size() const noexcept { + return static_cast(std::distance(begin(), end())); + } + + private: + + struct impl { + constexpr impl( + octet_iterator_type first, + octet_iterator_type last) + : first(first) + , last(last) {} + iterator_type first, last; + }; + + std::optional impl_; + +}; + +/// +template +class view_octet_range + : public ranges::view_base { + + using octet_iterator_type = typename OctetRange::const_iterator; + using iterator_type = octet_range_iterator; + + public: + + /// + using value_type = code_point_octet_t; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using const_iterator = iterator_type; + /// + using iterator = const_iterator; + /// + using size_type = std::size_t; + + /// + constexpr view_octet_range() = default; + + /// + /// \param range + explicit constexpr view_octet_range(const OctetRange &range) + : impl_( + impl(std::begin(range), + std::end(range))) {} + + /// + /// \return + [[nodiscard]] constexpr const_iterator begin() const noexcept { + return impl_? impl_.value().first : iterator_type(); + } + + /// + /// \return + [[nodiscard]] constexpr const_iterator end() const noexcept { + return impl_? 
impl_.value().last : iterator_type(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cbegin() const noexcept { + return begin(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cend() const noexcept { + return end(); + } + + /// + /// \return + [[nodiscard]] constexpr bool empty() const noexcept { + return begin() == end(); + } + + /// + /// \return + [[nodiscard]] size_type size() const noexcept { + return static_cast(std::distance(begin(), end())); + } + + private: + + struct impl { + constexpr impl( + octet_iterator_type first, + octet_iterator_type last) + : first(first) + , last(last) {} + iterator_type first, last; + }; + + std::optional impl_; + +}; + +/// +struct unchecked_octet_range_fn { + /// + /// \tparam OctetRange + /// \param range + /// \return + template + constexpr auto operator()(OctetRange &&range) const { + return view_unchecked_octet_range{std::forward(range)}; + } + + /// + /// \tparam OctetRange + /// \param range + /// \return + template + friend constexpr auto operator|(OctetRange &&range, const unchecked_octet_range_fn&) { + return view_unchecked_octet_range{std::forward(range)}; + } +}; + +/// +struct octet_range_fn { + /// + /// \tparam OctetRange + /// \param range + /// \return + template + constexpr auto operator()(OctetRange &&range) const { + return view_octet_range{std::forward(range)}; + } + + /// + /// \tparam OctetRange + /// \param range + /// \return + template + friend constexpr auto operator|(OctetRange &&range, const octet_range_fn&) { + return view_octet_range{std::forward(range)}; + } +}; + +namespace view { +/// +//static constexpr unchecked_octet_range_fn u8; +static constexpr octet_range_fn u8; +} // namespace view +} // namespace skyr::unicode + +#endif //SKYR_OCTET_RANGE_HPP diff --git a/include/skyr/unicode/range/u16_range.hpp b/include/skyr/unicode/range/u16_range.hpp new file mode 100644 index 00000000..9d6be5bc --- /dev/null +++ b/include/skyr/unicode/range/u16_range.hpp @@ -0,0 +1,289 @@ +// 
Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_U16_RANGE_HPP +#define SKYR_U16_RANGE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace skyr::unicode { +template +class u16_code_point_t { + + public: + + /// + using const_iterator = U16Iterator; + /// + using iterator = const_iterator; + /// + using value_type = char16_t; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using size_type = std::size_t; + + /// + /// \param first + explicit constexpr u16_code_point_t(U16Iterator first) + : first(first) + , last(first + (*first > 0xffff? 2 : 1)) {} + + /// + constexpr u16_code_point_t(const u16_code_point_t &) = default; + /// + constexpr u16_code_point_t(u16_code_point_t &&) noexcept = default; + /// + constexpr u16_code_point_t &operator=(const u16_code_point_t &) = default; + /// + constexpr u16_code_point_t &operator=(u16_code_point_t &&) noexcept = default; + /// + ~u16_code_point_t() = default; + + /// + /// \return + [[nodiscard]] constexpr const_iterator begin() const noexcept { + return first; + } + + /// + /// \return + [[nodiscard]] constexpr const_iterator end() const noexcept { + return last; + } + + /// + /// \return + [[nodiscard]] constexpr auto cbegin() const noexcept { + return begin(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cend() const noexcept { + return end(); + } + + [[nodiscard]] constexpr auto size() const noexcept -> size_type { + return *first > 0xffff? 
2 : 1; + } + + private: + + U16Iterator first, last; + +}; + +//template +//inline std::tuple u16(u16_code_point_t code_point) { +// auto state = find_code_point(begin(code_point)); +// if (state.value().value > 0xffff) { // make a surrogate pair +// return { +// static_cast((state.value().value >> 10) + +// constants::surrogates::lead_offset), +// static_cast((state.value().value & 0x3ff) + +// constants::surrogates::trail_min) +// }; +// } else { +// return { +// static_cast(state.value().value), +// 0 +// }; +// } +//} + +/// +template +class u16_range_iterator { + public: + + /// + using iterator_category = std::forward_iterator_tag; + /// + using value_type = char16_t; + /// + using reference = value_type; + /// + using pointer = typename std::add_pointer::type; + /// + using difference_type = std::ptrdiff_t; + + /// + u16_range_iterator() = default; + /// + /// \param it + explicit constexpr u16_range_iterator(unchecked_octet_range_iterator it) + : it_(it) {} + /// + constexpr u16_range_iterator(const u16_range_iterator&) = default; + /// + constexpr u16_range_iterator(u16_range_iterator&&) noexcept = default; + /// + constexpr u16_range_iterator &operator=(const u16_range_iterator&) = default; + /// + constexpr u16_range_iterator &operator=(u16_range_iterator&&) noexcept = default; + /// + ~u16_range_iterator() = default; + + /// + /// \return + u16_range_iterator operator ++ (int) { + auto result = *this; + ++it_; +// if (u32(*it_) > 0xffff) { +// ++it_; +// } + return result; + } + + /// + /// \return + u16_range_iterator &operator ++ () { + ++it_; +// if (u32(*it_) > 0xffff) { +// ++it_; +// } + return *this; + } + + /// + /// \return + reference operator * () const noexcept { +// return u16(*it_); + return 0; + } + + /// + /// \param other + /// \return + bool operator == (const u16_range_iterator &other) const noexcept { + return it_ == other.it_; + } + + /// + /// \param other + /// \return + bool operator != (const u16_range_iterator &other) const 
noexcept { + return !(*this == other); + } + + private: + + unchecked_octet_range_iterator it_; + +}; + +/// +/// \tparam OctetRange +template +class view_u16_range + : public ranges::view_base { + + using octet_iterator_type = typename OctetRange::const_iterator; + using iterator_type = u16_range_iterator; + + public: + + /// + using value_type = char16_t; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using const_iterator = iterator_type; + /// + using iterator = const_iterator; + /// + using size_type = std::size_t; + + /// + constexpr view_u16_range() = default; + + /// + /// \param range + explicit constexpr view_u16_range(const OctetRange &range) + : range_{range} {} + + /// + /// \return + [[nodiscard]] constexpr const_iterator begin() const noexcept { + return u16_range_iterator(range_.begin()); + } + + /// + /// \return + [[nodiscard]] constexpr const_iterator end() const noexcept { + return u16_range_iterator(range_.end()); + } + + /// + /// \return + [[nodiscard]] constexpr auto cbegin() const noexcept { + return begin(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cend() const noexcept { + return end(); + } + + /// + /// \return + [[nodiscard]] constexpr bool empty() const noexcept { + return range_.empty(); + } + + /// + /// \return + [[nodiscard]] constexpr size_type size() const noexcept { + return range_.size(); + } + + private: + + view_unchecked_octet_range range_; + +}; + +/// +struct u16_range_fn { + /// + /// \tparam OctetRange + /// \param range + /// \return + template + constexpr auto operator()(OctetRange &&range) const { + return view_u16_range{std::forward(range)}; + } + + /// + /// \tparam OctetRange + /// \param range + /// \return + template + friend constexpr auto operator|(OctetRange &&range, const u16_range_fn&) { + return view_u16_range{std::forward(range)}; + } + +}; + +namespace view { +/// +static constexpr u16_range_fn u16; +} // namespace view +} // namespace 
skyr::unicode + +#endif //SKYR_U16_RANGE_HPP diff --git a/include/skyr/unicode/range/u32_range.hpp b/include/skyr/unicode/range/u32_range.hpp new file mode 100644 index 00000000..d772f8e3 --- /dev/null +++ b/include/skyr/unicode/range/u32_range.hpp @@ -0,0 +1,206 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_U32_RANGE_HPP +#define SKYR_U32_RANGE_HPP + +#include +#include +#include +#include +#include +#include +#include + +namespace skyr::unicode { +/// +/// \tparam OctetIterator +/// \param code_point +/// \return +template +inline char32_t u32(code_point_octet_t code_point) { + using std::begin; + auto state = find_code_point(begin(code_point)); + return state.value().value; +} + +/// +/// \tparam OctetIterator +template +class u32_range_iterator { + public: + + /// + using iterator_category = std::forward_iterator_tag; + /// + using value_type = char32_t; + /// + using reference = value_type; + /// + using pointer = typename std::add_pointer::type; + /// + using difference_type = std::ptrdiff_t; + + /// + constexpr u32_range_iterator() = default; + /// + /// \param it + explicit constexpr u32_range_iterator(unchecked_octet_range_iterator it) + : it_(it) {} + /// + constexpr u32_range_iterator(const u32_range_iterator&) = default; + /// + constexpr u32_range_iterator(u32_range_iterator&&) noexcept = default; + /// + constexpr u32_range_iterator &operator=(const u32_range_iterator&) = default; + /// + constexpr u32_range_iterator &operator=(u32_range_iterator&&) noexcept = default; + /// + ~u32_range_iterator() = default; + + /// + /// \return + u32_range_iterator operator ++ (int) { + auto result = *this; + ++it_; + return result; + } + + /// + /// \return + u32_range_iterator &operator ++ () { + ++it_; + return *this; + } + + /// + /// \return + reference operator * () const noexcept { + return 
u32(*it_); + } + + /// + /// \param other + /// \return + constexpr bool operator == (const u32_range_iterator &other) const noexcept { + return it_ == other.it_; + } + + /// + /// \param other + /// \return + constexpr bool operator != (const u32_range_iterator &other) const noexcept { + return !(*this == other); + } + + private: + + unchecked_octet_range_iterator it_; + +}; + +/// +/// \tparam OctetRange +template +class view_u32_range + : public ranges::view_base { + + using octet_iterator_type = typename OctetRange::const_iterator; + using iterator_type = u32_range_iterator; + + public: + + /// + using value_type = char32_t; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using const_iterator = iterator_type; + /// + using iterator = const_iterator; + /// + using size_type = std::size_t; + + /// + constexpr view_u32_range() = default; + + /// + /// \param range + explicit constexpr view_u32_range(const OctetRange &range) + : range_{range} {} + + /// + /// \return + [[nodiscard]] constexpr const_iterator begin() const noexcept { + return u32_range_iterator(range_.begin()); + } + + /// + /// \return + [[nodiscard]] constexpr const_iterator end() const noexcept { + return u32_range_iterator(range_.end()); + } + + /// + /// \return + [[nodiscard]] constexpr auto cbegin() const noexcept { + return begin(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cend() const noexcept { + return end(); + } + + /// + /// \return + [[nodiscard]] constexpr bool empty() const noexcept { + return range_.empty(); + } + + /// + /// \return + [[nodiscard]] constexpr size_type size() const noexcept { + return range_.size(); + } + + private: + + view_unchecked_octet_range range_; + +}; + +/// +struct u32_range_fn { + /// + /// \tparam OctetRange + /// \param range + /// \return + template + constexpr auto operator()(OctetRange &&range) const { + return view_u32_range{std::forward(range)}; + } + + /// + /// \tparam OctetRange + /// 
\param range + /// \return + template + friend constexpr auto operator|(OctetRange &&range, const u32_range_fn&) { + return view_u32_range{std::forward(range)}; + } + +}; + +namespace view { +/// +static constexpr u32_range_fn u32; +} // namespace view +} // namespace skyr::unicode + +#endif //SKYR_U32_RANGE_HPP diff --git a/include/skyr/unicode/unicode.hpp b/include/skyr/unicode/unicode.hpp index d7966d0f..bfee9e32 100644 --- a/include/skyr/unicode/unicode.hpp +++ b/include/skyr/unicode/unicode.hpp @@ -11,6 +11,9 @@ #include #include #include +#include +#include +//#include #include namespace skyr { @@ -116,6 +119,10 @@ constexpr long sequence_length(char lead_value) { return 0; } +//constexpr long sequence_length(char16_t lead_value) { +// return lead_value > 0xffff? 2 : 1; +//} + /// /// \param code_point /// \param length @@ -392,12 +399,12 @@ tl::expected append_bytes( /// \return The updated iterator or an error if the sequence is /// invalid template -tl::expected advance( +tl::expected advance( OctetIterator& it, std::size_t n, OctetIterator last) { while (n != 0) { - if (std::distance(it, last) < sequence_length(*it)) { + if (ranges::distance(it, last) < sequence_length(*it)) { return tl::make_unexpected(unicode_errc::overflow); } @@ -409,7 +416,7 @@ tl::expected advance( --n; } - return {}; + return it; } /// Counts the number of code points in the octet sequence. 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5487992b..537ec0a0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,6 +30,9 @@ set(Skyr_SRCS ${Skyr_SOURCE_DIR}/include/skyr/config.hpp ${Skyr_SOURCE_DIR}/include/skyr/traits/string_traits.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/unicode.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/octet_range.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/u16_range.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/u32_range.hpp ${Skyr_SOURCE_DIR}/include/skyr/url/percent_encode.hpp ${Skyr_SOURCE_DIR}/include/skyr/url/domain.hpp ${Skyr_SOURCE_DIR}/include/skyr/url/url_record.hpp @@ -45,7 +48,7 @@ if (Skyr_BUILD_FILESYSTEM_PATH_FUNCTIONS) endif() add_library(skyr-url ${Skyr_SRCS}) -target_link_libraries(skyr-url tl::expected) +target_link_libraries(skyr-url tl::expected meta range-v3) if(${CMAKE_CXX_COMPILER_ID} MATCHES Clang) if (NOT Skyr_DISABLE_LIBCXX) target_link_libraries(skyr-url "c++") diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1756f7e5..fc36ba81 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -23,6 +23,7 @@ set( punycode_tests domain_tests unicode_tests + unicode_range_tests ) if (Skyr_BUILD_FILESYSTEM_PATH_FUNCTIONS) diff --git a/tests/unicode_range_tests.cpp b/tests/unicode_range_tests.cpp new file mode 100644 index 00000000..5ef60742 --- /dev/null +++ b/tests/unicode_range_tests.cpp @@ -0,0 +1,168 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#define CATCH_CONFIG_MAIN +#include +#include +#include +#include +#include + + +TEST_CASE("code point tests") { + using std::begin; + using std::end; + + SECTION("u8 code point 01") { + auto bytes = std::string("\xf0\x9f\x92\xa9"); + auto cp = skyr::unicode::code_point_octet(bytes); + REQUIRE(cp); + CHECK(std::string("\xf0\x9f\x92\xa9") == std::string(begin(cp.value()), end(cp.value()))); + CHECK(U'\x1f4a9' == u32(cp.value())); +// CHECK(u'\xd83d' == std::get<0>(u16(cp.value()))); +// CHECK(u'\xdca9' == std::get<1>(u16(cp.value()))); + } + + SECTION("u8 code point 02") { + auto bytes = std::string("\x9f\x92\xa9"); + auto cp = skyr::unicode::code_point_octet(bytes); + REQUIRE(!cp); + } +} + +TEST_CASE("octet range iterator") { + using iterator_type = skyr::unicode::octet_range_iterator; + + SECTION("construction") { + auto bytes = std::string("\xf0\x9f\x92\xa9"); + auto it = iterator_type(std::begin(bytes)); + CHECK(U'\x1F4A9' == u32(*it)); + } + + SECTION("increment") { + auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto it = iterator_type(std::begin(bytes)); + CHECK(U'\x1F3F3' == u32(*it)); + ++it; + CHECK(U'\xFE0F' == u32(*it)); + } + + SECTION("increment invalid") { + auto bytes = std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto it = iterator_type(std::begin(bytes)); + CHECK(!is_valid(*it)); + ++it; + CHECK(iterator_type() == it); + } + + SECTION("equality") { + auto bytes = std::string("\xf0\x9f\x92\xa9"); + auto it = iterator_type(std::begin(bytes)); + auto last = iterator_type(std::end(bytes)); + ++it; + CHECK(last == it); + } + + SECTION("inequality") { + auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto it = iterator_type(std::begin(bytes)); + auto last = iterator_type(std::end(bytes)); + CHECK(last != it); + } + + SECTION("end 
of sequence") { + auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto it = iterator_type(std::begin(bytes)); + auto last = iterator_type(std::end(bytes)); + std::advance(it, 4); + CHECK(last == it); + } + + SECTION("two characters") + { + auto bytes = std::string("\xe6\x97\xa5\xd1\x88"); + auto it = iterator_type(std::begin(bytes)); + CHECK(U'\x65e5' == u32(*it++)); + CHECK(U'\x448' == u32(*it++)); + } + + SECTION("three characters") + { + auto bytes = std::string("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"); + auto it = iterator_type(std::begin(bytes)); + CHECK(U'\x10346' == u32(*it++)); + CHECK(U'\x65e5' == u32(*it++)); + CHECK(U'\x448' == u32(*it++)); + } +} + +TEST_CASE("u8 range") { + using std::begin; + using std::end; + + SECTION("construction") { + auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto view = skyr::unicode::view_octet_range(bytes); + CHECK(begin(view) != end(view)); + } + + SECTION("empty") { + auto view = skyr::unicode::view_octet_range(); + CHECK(begin(view) == end(view)); + } + + SECTION("count") { + auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto view = skyr::unicode::view_octet_range(bytes); + CHECK(4 == ranges::size(view)); + CHECK(!ranges::empty(view)); + } + + SECTION("empty count") { + auto view = skyr::unicode::view_octet_range(); + CHECK(0 == ranges::size(view)); + CHECK(ranges::empty(view)); + } + + SECTION("pipe syntax") { + auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto view = bytes | skyr::unicode::view::u8; + CHECK(4 == ranges::size(view)); + CHECK(!ranges::empty(view)); + } + + SECTION("pipe syntax with string_view") { + auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto bytes_view = std::string_view(bytes); + auto view = bytes_view | skyr::unicode::view::u8; + CHECK(4 == ranges::size(view)); + 
CHECK(!ranges::empty(view)); + } + +// 0xD83C 0xDFF3, 0xFE0F, 0x200D, 0xD83C 0xDF08 + +// SECTION("pipe syntax with u16 string") { +// auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); +// auto view = bytes | skyr::unicode::view::u16; +// auto u16 = std::u16string(begin(view), end(view)); +// CHECK(u"\xD83C\xDFF3\xFE0F\x200D\xD83C\xDF08" == u16); +// } + + SECTION("pipe syntax with u32 string") { + auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto view = bytes | skyr::unicode::view::u32; + auto u32 = std::u32string(begin(view), end(view)); + CHECK(U"\x1F3F3\xFE0F\x200D\x1F308" == u32); + } +// +// SECTION("pipe syntax with u32 string invalid") { +// auto bytes = std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); +// auto view = bytes | skyr::unicode::view::u32; +// auto u32 = std::u32string(begin(view), end(view)); +// CHECK(U"\x1F3F3\xFE0F\x200D\x1F308" == u32); +// } +} diff --git a/tests/unicode_tests.cpp b/tests/unicode_tests.cpp index 6bcb60c0..c3a1237e 100644 --- a/tests/unicode_tests.cpp +++ b/tests/unicode_tests.cpp @@ -4,7 +4,7 @@ // http://www.boost.org/LICENSE_1_0.txt) #define CATCH_CONFIG_MAIN -#include +#include #include #include "skyr/unicode/unicode.hpp" From 84937bb5340bbbfa8cde08cb04e9ffa5af881b90 Mon Sep 17 00:00:00 2001 From: Glyn Matthews Date: Sun, 22 Sep 2019 10:22:18 +0800 Subject: [PATCH 02/10] Updated unicode encoding API to be more idiomatic --- include/skyr/unicode/range/octet_range.hpp | 47 +++++---- include/skyr/unicode/range/u16_range.hpp | 112 +++++++-------------- include/skyr/unicode/range/u32_range.hpp | 34 +++++-- include/skyr/unicode/unicode.hpp | 5 - tests/unicode_range_tests.cpp | 103 ++++++++++++------- 5 files changed, 155 insertions(+), 146 deletions(-) diff --git a/include/skyr/unicode/range/octet_range.hpp b/include/skyr/unicode/range/octet_range.hpp index e8eccf19..de789cf9 100644 --- 
a/include/skyr/unicode/range/octet_range.hpp +++ b/include/skyr/unicode/range/octet_range.hpp @@ -98,7 +98,7 @@ class code_point_octet_t { /// \param range /// \return template -inline tl::expected, unicode_errc> code_point_octet( +inline tl::expected, unicode_errc> code_point_octets( const OctetRange &range) { auto first = std::begin(range), last = std::end(range); if (std::distance(first, last) > sequence_length(*first)) { @@ -134,7 +134,7 @@ inline tl::expected, uni }; return - code_point_octet(range) + code_point_octets(range) .and_then(check_code_point); } @@ -228,11 +228,11 @@ class octet_range_iterator { /// using iterator_category = typename iterator_type::iterator_category; /// - using value_type = typename iterator_type::value_type; + using value_type = tl::expected; /// - using reference = typename iterator_type::reference; + using reference = value_type; /// - using pointer = typename iterator_type::pointer; + using pointer = typename std::add_pointer::type; /// using difference_type = typename iterator_type::difference_type; @@ -240,8 +240,9 @@ class octet_range_iterator { constexpr octet_range_iterator() = default; /// /// \param it - explicit constexpr octet_range_iterator(OctetIterator it) - : it_(it) {} + explicit constexpr octet_range_iterator(OctetIterator it, OctetIterator last) + : it_(it) + , last_(last) {} /// constexpr octet_range_iterator(const octet_range_iterator&) = default; /// @@ -271,15 +272,16 @@ class octet_range_iterator { /// /// \return constexpr reference operator * () const noexcept { - using result_type = tl::expected, unicode_errc>; - - return - valid_code_point(*it_) - .or_else([=] (auto) -> result_type { - auto first = std::begin(*it_); - return code_point_octet(ranges::iterator_range(first, first)); - }) - .value(); + return valid_code_point(*it_); +// using result_type = tl::expected, unicode_errc>; +// +// return +// valid_code_point(*it_) +// .or_else([=] (auto) -> result_type { +// auto first = std::begin(*it_); +// 
return code_point_octet(ranges::iterator_range(first, first)); +// }) +// .value(); } /// @@ -299,14 +301,17 @@ class octet_range_iterator { private: void increment() { - if (valid_code_point(*it_)) { + if (**this) { ++it_; + if (it_ == last_) { + it_ = iterator_type(); + } } else { it_ = iterator_type(); } } - iterator_type it_; + iterator_type it_, last_; }; @@ -436,7 +441,7 @@ class view_octet_range /// /// \return [[nodiscard]] constexpr const_iterator end() const noexcept { - return impl_? impl_.value().last : iterator_type(); + return iterator_type(); } /// @@ -469,8 +474,8 @@ class view_octet_range constexpr impl( octet_iterator_type first, octet_iterator_type last) - : first(first) - , last(last) {} + : first(first, last) + , last(last, last) {} iterator_type first, last; }; diff --git a/include/skyr/unicode/range/u16_range.hpp b/include/skyr/unicode/range/u16_range.hpp index 9d6be5bc..379a356f 100644 --- a/include/skyr/unicode/range/u16_range.hpp +++ b/include/skyr/unicode/range/u16_range.hpp @@ -16,102 +16,64 @@ #include namespace skyr::unicode { -template class u16_code_point_t { public: - /// - using const_iterator = U16Iterator; - /// - using iterator = const_iterator; - /// - using value_type = char16_t; - /// - using const_reference = value_type; - /// - using reference = const_reference; - /// - using size_type = std::size_t; - /// /// \param first - explicit constexpr u16_code_point_t(U16Iterator first) - : first(first) - , last(first + (*first > 0xffff? 
2 : 1)) {} + explicit constexpr u16_code_point_t(char32_t code_point) + : code_point_(code_point) {} /// constexpr u16_code_point_t(const u16_code_point_t &) = default; /// constexpr u16_code_point_t(u16_code_point_t &&) noexcept = default; /// - constexpr u16_code_point_t &operator=(const u16_code_point_t &) = default; + u16_code_point_t &operator=(const u16_code_point_t &) = default; /// - constexpr u16_code_point_t &operator=(u16_code_point_t &&) noexcept = default; + u16_code_point_t &operator=(u16_code_point_t &&) noexcept = default; /// ~u16_code_point_t() = default; - /// - /// \return - [[nodiscard]] constexpr const_iterator begin() const noexcept { - return first; + [[nodiscard]] uint16_t lead_value() const { + return is_surrogate_pair()? + static_cast((code_point_ >> 10U) + constants::surrogates::lead_offset) : + static_cast(code_point_); } - /// - /// \return - [[nodiscard]] constexpr const_iterator end() const noexcept { - return last; - } - - /// - /// \return - [[nodiscard]] constexpr auto cbegin() const noexcept { - return begin(); - } - - /// - /// \return - [[nodiscard]] constexpr auto cend() const noexcept { - return end(); + [[nodiscard]] uint16_t trail_value() const { + return is_surrogate_pair()? + static_cast((code_point_ & 0x3ffU) + constants::surrogates::trail_min) : + 0; } - [[nodiscard]] constexpr auto size() const noexcept -> size_type { - return *first > 0xffff? 
2 : 1; + [[nodiscard]] constexpr bool is_surrogate_pair() const noexcept { + return code_point_ > 0xffffU; } private: - U16Iterator first, last; + char32_t code_point_; }; -//template -//inline std::tuple u16(u16_code_point_t code_point) { -// auto state = find_code_point(begin(code_point)); -// if (state.value().value > 0xffff) { // make a surrogate pair -// return { -// static_cast((state.value().value >> 10) + -// constants::surrogates::lead_offset), -// static_cast((state.value().value & 0x3ff) + -// constants::surrogates::trail_min) -// }; -// } else { -// return { -// static_cast(state.value().value), -// 0 -// }; -// } -//} +inline u16_code_point_t u16_code_point(char32_t code_point) { + return u16_code_point_t(code_point); +} /// template class u16_range_iterator { + + using iterator_type = u32_range_iterator; + public: /// using iterator_category = std::forward_iterator_tag; /// - using value_type = char16_t; + using value_type = u16_code_point_t; /// using reference = value_type; /// @@ -123,8 +85,11 @@ class u16_range_iterator { u16_range_iterator() = default; /// /// \param it - explicit constexpr u16_range_iterator(unchecked_octet_range_iterator it) - : it_(it) {} + explicit constexpr u16_range_iterator( + octet_range_iterator it, + octet_range_iterator last) + : it_(it) + , last_(last) {} /// constexpr u16_range_iterator(const u16_range_iterator&) = default; /// @@ -139,29 +104,24 @@ class u16_range_iterator { /// /// \return u16_range_iterator operator ++ (int) { + assert(it_); auto result = *this; - ++it_; -// if (u32(*it_) > 0xffff) { -// ++it_; -// } + ++it_.value(); return result; } /// /// \return u16_range_iterator &operator ++ () { - ++it_; -// if (u32(*it_) > 0xffff) { -// ++it_; -// } + assert(it_); + ++it_.value(); return *this; } /// /// \return reference operator * () const noexcept { -// return u16(*it_); - return 0; + return u16_code_point(*it_.value()); } /// @@ -180,7 +140,7 @@ class u16_range_iterator { private: - 
unchecked_octet_range_iterator it_; + std::optional> it_, last_; }; @@ -219,13 +179,13 @@ class view_u16_range /// /// \return [[nodiscard]] constexpr const_iterator begin() const noexcept { - return u16_range_iterator(range_.begin()); + return iterator_type(std::begin(range_), std::end(range_)); } /// /// \return [[nodiscard]] constexpr const_iterator end() const noexcept { - return u16_range_iterator(range_.end()); + return iterator_type(); } /// @@ -254,7 +214,7 @@ class view_u16_range private: - view_unchecked_octet_range range_; + view_octet_range range_; }; diff --git a/include/skyr/unicode/range/u32_range.hpp b/include/skyr/unicode/range/u32_range.hpp index d772f8e3..f895a63b 100644 --- a/include/skyr/unicode/range/u32_range.hpp +++ b/include/skyr/unicode/range/u32_range.hpp @@ -21,9 +21,8 @@ namespace skyr::unicode { /// \return template inline char32_t u32(code_point_octet_t code_point) { - using std::begin; - auto state = find_code_point(begin(code_point)); - return state.value().value; + auto state = find_code_point(std::begin(code_point)); + return state ? 
state.value().value : U'\x0000'; } /// @@ -35,7 +34,7 @@ class u32_range_iterator { /// using iterator_category = std::forward_iterator_tag; /// - using value_type = char32_t; + using value_type = tl::expected; /// using reference = value_type; /// @@ -47,7 +46,7 @@ class u32_range_iterator { constexpr u32_range_iterator() = default; /// /// \param it - explicit constexpr u32_range_iterator(unchecked_octet_range_iterator it) + explicit constexpr u32_range_iterator(octet_range_iterator it) : it_(it) {} /// constexpr u32_range_iterator(const u32_range_iterator&) = default; @@ -78,7 +77,13 @@ class u32_range_iterator { /// /// \return reference operator * () const noexcept { - return u32(*it_); + return (*it_) + .and_then([] (auto code_point) -> value_type { + return u32(code_point); + }) + .or_else([] (auto &&error) -> value_type { + return tl::make_unexpected(error); + }); } /// @@ -97,7 +102,7 @@ class u32_range_iterator { private: - unchecked_octet_range_iterator it_; + octet_range_iterator it_; }; @@ -171,7 +176,7 @@ class view_u32_range private: - view_unchecked_octet_range range_; + view_octet_range range_; }; @@ -201,6 +206,19 @@ namespace view { /// static constexpr u32_range_fn u32; } // namespace view + +template +tl::expected u32string(U32Range &&range) { + auto result = std::u32string(); + result.reserve(ranges::size(range)); + for (auto &&code_point : range) { + if (!code_point) { + return tl::make_unexpected(code_point.error()); + } + result.push_back(code_point.value()); + } + return result; +} } // namespace skyr::unicode #endif //SKYR_U32_RANGE_HPP diff --git a/include/skyr/unicode/unicode.hpp b/include/skyr/unicode/unicode.hpp index bfee9e32..16cdaa73 100644 --- a/include/skyr/unicode/unicode.hpp +++ b/include/skyr/unicode/unicode.hpp @@ -13,7 +13,6 @@ #include #include #include -//#include #include namespace skyr { @@ -119,10 +118,6 @@ constexpr long sequence_length(char lead_value) { return 0; } -//constexpr long sequence_length(char16_t 
lead_value) { -// return lead_value > 0xffff? 2 : 1; -//} - /// /// \param code_point /// \param length diff --git a/tests/unicode_range_tests.cpp b/tests/unicode_range_tests.cpp index 5ef60742..c01f6830 100644 --- a/tests/unicode_range_tests.cpp +++ b/tests/unicode_range_tests.cpp @@ -19,7 +19,7 @@ TEST_CASE("code point tests") { SECTION("u8 code point 01") { auto bytes = std::string("\xf0\x9f\x92\xa9"); - auto cp = skyr::unicode::code_point_octet(bytes); + auto cp = skyr::unicode::code_point_octets(bytes); REQUIRE(cp); CHECK(std::string("\xf0\x9f\x92\xa9") == std::string(begin(cp.value()), end(cp.value()))); CHECK(U'\x1f4a9' == u32(cp.value())); @@ -29,7 +29,7 @@ TEST_CASE("code point tests") { SECTION("u8 code point 02") { auto bytes = std::string("\x9f\x92\xa9"); - auto cp = skyr::unicode::code_point_octet(bytes); + auto cp = skyr::unicode::code_point_octets(bytes); REQUIRE(!cp); } } @@ -39,45 +39,49 @@ TEST_CASE("octet range iterator") { SECTION("construction") { auto bytes = std::string("\xf0\x9f\x92\xa9"); - auto it = iterator_type(std::begin(bytes)); - CHECK(U'\x1F4A9' == u32(*it)); + auto it = iterator_type(std::begin(bytes), std::end(bytes)); + auto code_point = *it; + REQUIRE(code_point); + CHECK(U'\x1F4A9' == u32(code_point.value())); } SECTION("increment") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto it = iterator_type(std::begin(bytes)); - CHECK(U'\x1F3F3' == u32(*it)); + auto it = iterator_type(std::begin(bytes), std::end(bytes)); + auto code_point = *it; + REQUIRE(code_point); + CHECK(U'\x1F3F3' == u32(code_point.value())); ++it; - CHECK(U'\xFE0F' == u32(*it)); + code_point = *it; + REQUIRE(code_point); + CHECK(U'\xFE0F' == u32(code_point.value())); } SECTION("increment invalid") { auto bytes = std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto it = iterator_type(std::begin(bytes)); - CHECK(!is_valid(*it)); - ++it; - CHECK(iterator_type() == it); + auto it = 
iterator_type(std::begin(bytes), std::end(bytes)); + CHECK(!*it); } SECTION("equality") { auto bytes = std::string("\xf0\x9f\x92\xa9"); - auto it = iterator_type(std::begin(bytes)); - auto last = iterator_type(std::end(bytes)); + auto it = iterator_type(std::begin(bytes), std::end(bytes)); + auto last = iterator_type(); ++it; CHECK(last == it); } SECTION("inequality") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto it = iterator_type(std::begin(bytes)); - auto last = iterator_type(std::end(bytes)); + auto it = iterator_type(std::begin(bytes), std::end(bytes)); + auto last = iterator_type(); CHECK(last != it); } SECTION("end of sequence") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto it = iterator_type(std::begin(bytes)); - auto last = iterator_type(std::end(bytes)); + auto it = iterator_type(std::begin(bytes), std::end(bytes)); + auto last = iterator_type(); std::advance(it, 4); CHECK(last == it); } @@ -85,18 +89,38 @@ TEST_CASE("octet range iterator") { SECTION("two characters") { auto bytes = std::string("\xe6\x97\xa5\xd1\x88"); - auto it = iterator_type(std::begin(bytes)); - CHECK(U'\x65e5' == u32(*it++)); - CHECK(U'\x448' == u32(*it++)); + auto it = iterator_type(std::begin(bytes), std::end(bytes)); + { + auto code_point = *it++; + REQUIRE(code_point); + CHECK(U'\x65e5' == u32(code_point.value())); + } + { + auto code_point = *it++; + REQUIRE(code_point); + CHECK(U'\x448' == u32(code_point.value())); + } } SECTION("three characters") { auto bytes = std::string("\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"); - auto it = iterator_type(std::begin(bytes)); - CHECK(U'\x10346' == u32(*it++)); - CHECK(U'\x65e5' == u32(*it++)); - CHECK(U'\x448' == u32(*it++)); + auto it = iterator_type(std::begin(bytes), std::end(bytes)); + { + auto code_point = *it++; + REQUIRE(code_point); + CHECK(U'\x10346' == u32(code_point.value())); + } + { + auto code_point = *it++; + REQUIRE(code_point); + 
CHECK(U'\x65e5' == u32(code_point.value())); + } + { + auto code_point = *it++; + REQUIRE(code_point); + CHECK(U'\x448' == u32(code_point.value())); + } } } @@ -143,7 +167,15 @@ TEST_CASE("u8 range") { CHECK(!ranges::empty(view)); } -// 0xD83C 0xDFF3, 0xFE0F, 0x200D, 0xD83C 0xDF08 + SECTION("pipe syntax invalid") { + auto bytes = std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto view = bytes | skyr::unicode::view::u8; + auto it = std::begin(view), last = std::end(view); + *it++; + CHECK(it == last); + CHECK(1 == ranges::size(view)); + CHECK(!ranges::empty(view)); + } // SECTION("pipe syntax with u16 string") { // auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); @@ -154,15 +186,14 @@ TEST_CASE("u8 range") { SECTION("pipe syntax with u32 string") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto view = bytes | skyr::unicode::view::u32; - auto u32 = std::u32string(begin(view), end(view)); - CHECK(U"\x1F3F3\xFE0F\x200D\x1F308" == u32); - } -// -// SECTION("pipe syntax with u32 string invalid") { -// auto bytes = std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); -// auto view = bytes | skyr::unicode::view::u32; -// auto u32 = std::u32string(begin(view), end(view)); -// CHECK(U"\x1F3F3\xFE0F\x200D\x1F308" == u32); -// } + auto u32 = skyr::unicode::u32string(bytes | skyr::unicode::view::u32); + REQUIRE(u32); + CHECK(U"\x1F3F3\xFE0F\x200D\x1F308" == u32.value()); + } + + SECTION("pipe syntax with u32 string invalid") { + auto bytes = std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto u32 = skyr::unicode::u32string(bytes | skyr::unicode::view::u32); + CHECK(!u32); + } } From 9705e5ff41189387c663cd00d63132c31ab8a55c Mon Sep 17 00:00:00 2001 From: Glyn Matthews Date: Sun, 22 Sep 2019 10:58:31 +0800 Subject: [PATCH 03/10] Added tests for octet -> u16 string adapter --- include/skyr/unicode/range/octet_range.hpp | 9 ---- 
include/skyr/unicode/range/u16_range.hpp | 53 ++++++++++++++++------ include/skyr/unicode/range/u32_range.hpp | 3 -- tests/unicode_range_tests.cpp | 17 +++---- 4 files changed, 49 insertions(+), 33 deletions(-) diff --git a/include/skyr/unicode/range/octet_range.hpp b/include/skyr/unicode/range/octet_range.hpp index de789cf9..cfa322ca 100644 --- a/include/skyr/unicode/range/octet_range.hpp +++ b/include/skyr/unicode/range/octet_range.hpp @@ -273,15 +273,6 @@ class octet_range_iterator { /// \return constexpr reference operator * () const noexcept { return valid_code_point(*it_); -// using result_type = tl::expected, unicode_errc>; -// -// return -// valid_code_point(*it_) -// .or_else([=] (auto) -> result_type { -// auto first = std::begin(*it_); -// return code_point_octet(ranges::iterator_range(first, first)); -// }) -// .value(); } /// diff --git a/include/skyr/unicode/range/u16_range.hpp b/include/skyr/unicode/range/u16_range.hpp index 379a356f..d3aa13e5 100644 --- a/include/skyr/unicode/range/u16_range.hpp +++ b/include/skyr/unicode/range/u16_range.hpp @@ -63,17 +63,24 @@ inline u16_code_point_t u16_code_point(char32_t code_point) { } /// +/// \tparam OctetIterator +/// \param code_point +/// \return template -class u16_range_iterator { +inline u16_code_point_t u16(code_point_octet_t code_point) { + return u16_code_point(u32(code_point)); +} - using iterator_type = u32_range_iterator; +/// +template +class u16_range_iterator { public: /// using iterator_category = std::forward_iterator_tag; /// - using value_type = u16_code_point_t; + using value_type = tl::expected; /// using reference = value_type; /// @@ -86,8 +93,8 @@ class u16_range_iterator { /// /// \param it explicit constexpr u16_range_iterator( - octet_range_iterator it, - octet_range_iterator last) + u32_range_iterator it, + u32_range_iterator last) : it_(it) , last_(last) {} /// @@ -104,24 +111,27 @@ class u16_range_iterator { /// /// \return u16_range_iterator operator ++ (int) { - assert(it_); 
auto result = *this; - ++it_.value(); + ++it_; return result; } /// /// \return u16_range_iterator &operator ++ () { - assert(it_); - ++it_.value(); + ++it_; return *this; } /// /// \return reference operator * () const noexcept { - return u16_code_point(*it_.value()); + auto code_point = *it_; + return + code_point + .and_then([] (auto value) -> value_type { + return u16_code_point(value); + }); } /// @@ -140,7 +150,7 @@ class u16_range_iterator { private: - std::optional> it_, last_; + u32_range_iterator it_, last_; }; @@ -156,7 +166,7 @@ class view_u16_range public: /// - using value_type = char16_t; + using value_type = tl::expected; /// using const_reference = value_type; /// @@ -214,7 +224,7 @@ class view_u16_range private: - view_octet_range range_; + view_u32_range range_; }; @@ -244,6 +254,23 @@ namespace view { /// static constexpr u16_range_fn u16; } // namespace view + + +template +tl::expected u16string(U16Range &&range) { + auto result = std::u16string(); + result.reserve(ranges::size(range)); + for (auto &&code_point : range) { + if (!code_point) { + return tl::make_unexpected(code_point.error()); + } + result.push_back(code_point.value().lead_value()); + if (code_point.value().is_surrogate_pair()) { + result.push_back(code_point.value().trail_value()); + } + } + return result; +} } // namespace skyr::unicode #endif //SKYR_U16_RANGE_HPP diff --git a/include/skyr/unicode/range/u32_range.hpp b/include/skyr/unicode/range/u32_range.hpp index f895a63b..e6f6996e 100644 --- a/include/skyr/unicode/range/u32_range.hpp +++ b/include/skyr/unicode/range/u32_range.hpp @@ -80,9 +80,6 @@ class u32_range_iterator { return (*it_) .and_then([] (auto code_point) -> value_type { return u32(code_point); - }) - .or_else([] (auto &&error) -> value_type { - return tl::make_unexpected(error); }); } diff --git a/tests/unicode_range_tests.cpp b/tests/unicode_range_tests.cpp index c01f6830..bbdaf1fa 100644 --- a/tests/unicode_range_tests.cpp +++ b/tests/unicode_range_tests.cpp 
@@ -23,8 +23,9 @@ TEST_CASE("code point tests") { REQUIRE(cp); CHECK(std::string("\xf0\x9f\x92\xa9") == std::string(begin(cp.value()), end(cp.value()))); CHECK(U'\x1f4a9' == u32(cp.value())); -// CHECK(u'\xd83d' == std::get<0>(u16(cp.value()))); -// CHECK(u'\xdca9' == std::get<1>(u16(cp.value()))); + CHECK(u16(cp.value()).is_surrogate_pair()); + CHECK(u'\xd83d' == u16(cp.value()).lead_value()); + CHECK(u'\xdca9' == u16(cp.value()).trail_value()); } SECTION("u8 code point 02") { @@ -177,12 +178,12 @@ TEST_CASE("u8 range") { CHECK(!ranges::empty(view)); } -// SECTION("pipe syntax with u16 string") { -// auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); -// auto view = bytes | skyr::unicode::view::u16; -// auto u16 = std::u16string(begin(view), end(view)); -// CHECK(u"\xD83C\xDFF3\xFE0F\x200D\xD83C\xDF08" == u16); -// } + SECTION("pipe syntax with u16 string") { + auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto u16 = skyr::unicode::u16string(bytes | skyr::unicode::view::u16); + REQUIRE(u16); + CHECK(u"\xD83C\xDFF3\xFE0F\x200D\xD83C\xDF08" == u16.value()); + } SECTION("pipe syntax with u32 string") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); From c488bea2b42d3c2fbdc8719b7657c99d1cb79659 Mon Sep 17 00:00:00 2001 From: Glyn Matthews Date: Sun, 22 Sep 2019 14:07:18 +0800 Subject: [PATCH 04/10] Moved some files around --- include/skyr/unicode/core.hpp | 420 ++++++++++++++++ include/skyr/unicode/errors.hpp | 35 ++ include/skyr/unicode/range/u16_range.hpp | 34 +- include/skyr/unicode/range/u32_range.hpp | 17 +- .../range/{octet_range.hpp => u8_range.hpp} | 180 +++---- include/skyr/unicode/unicode.hpp | 451 +----------------- src/CMakeLists.txt | 5 +- src/unicode/errors.cpp | 36 ++ src/unicode/unicode.cpp | 57 +-- tests/unicode_range_tests.cpp | 24 +- 10 files changed, 658 insertions(+), 601 deletions(-) create mode 100644 include/skyr/unicode/core.hpp 
create mode 100644 include/skyr/unicode/errors.hpp rename include/skyr/unicode/range/{octet_range.hpp => u8_range.hpp} (60%) create mode 100644 src/unicode/errors.cpp diff --git a/include/skyr/unicode/core.hpp b/include/skyr/unicode/core.hpp new file mode 100644 index 00000000..61c385e8 --- /dev/null +++ b/include/skyr/unicode/core.hpp @@ -0,0 +1,420 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_UNICODE_CORE_HPP +#define SKYR_UNICODE_CORE_HPP + +#include +#include +#include +#include + +namespace skyr::unicode { +/// +/// \param octet +/// \return +constexpr uint8_t mask8(char octet) { + return 0xffu & static_cast(octet); +} + +/// +/// \param value +/// \return +constexpr char16_t mask16(char16_t value) { + return 0xffffu & value; +} + +/// +/// \param octet +/// \return +constexpr bool is_trail(char octet) { + return ((mask8(octet) >> 6u) == 0x2u); +} + +/// +/// \param code_point +/// \return +constexpr bool is_lead_surrogate(char16_t code_point) { + return + (code_point >= constants::surrogates::lead_min) && + (code_point <= constants::surrogates::lead_max); +} + +/// +/// \param value +/// \return +constexpr bool is_trail_surrogate(char16_t value) { + return + (value >= constants::surrogates::trail_min) && + (value <= constants::surrogates::trail_max); +} + +/// +/// \param value +/// \return +constexpr bool is_surrogate(char16_t value) { + return + (value >= constants::surrogates::lead_min) && + (value <= constants::surrogates::trail_max); +} + +/// Tests if the code point is a valid value. 
+/// \param code_point +/// \return \c true if it has a valid value, \c false otherwise +constexpr bool is_valid_code_point(char32_t code_point) { + return + (code_point <= constants::code_points::max) && + !is_surrogate(static_cast(code_point)); +} + +/// Returns the size of the sequence given the lead octet value. +/// \param lead_value +/// \return 1, 2, 3 or 4, or 0 if the value is not a valid lead octet +constexpr long sequence_length(char lead_value) { + auto lead = mask8(lead_value); + if (lead < 0x80) { + return 1; + } else if ((lead >> 5) == 0x6) { + return 2; + } else if ((lead >> 4) == 0xe) { + return 3; + } else if ((lead >> 3) == 0x1e) { + return 4; + } + return 0; +} + +/// +/// \param code_point +/// \param length +/// \return +constexpr bool is_overlong_sequence( + char32_t code_point, + long length) { + bool result = false; + result &= (code_point < 0x80) && (length != 1); + result &= (code_point < 0x800) && (length != 2); + result &= (code_point < 0x10000) && (length != 3); + return result; +} + +/// A type used to extract a code point value from an octet sequence +/// \tparam OctetIterator +template +struct sequence_state { + sequence_state( + OctetIterator it, + char32_t value) + : it(it), value(value) {} + + /// The current iterator + OctetIterator it; + /// The (intermediate) value of the code point + char32_t value; +}; + +/// Creates an expected state, so that it can be chained +/// functional-style.
+/// +/// \tparam OctetIterator +/// \param it The lead value of the next code point in the octet +/// sequence +/// \return A sequence_state with a value of 0, and the iterator +/// pointing to the lead value +template +tl::expected, std::error_code> +make_state(OctetIterator it) { + return sequence_state(it, 0); +} + +/// Updates the value in the sequence state +/// +/// \tparam OctetIterator +/// \param state The input state +/// \param value The updated value +/// \return A new state with an updated value +template +inline sequence_state update_value( + sequence_state state, + char32_t value) { + return {state.it, value}; +} + +/// Moves the octet iterator one octet ahead +/// \tparam OctetIterator +/// \param state The input state +/// \return The new state with the updated iterator, or an error if +/// the sequence isn't valid +template +inline tl::expected, std::error_code> +increment(sequence_state state) { + ++state.it; + if (!is_trail(*state.it)) { + return tl::make_unexpected( + make_error_code(unicode_errc::illegal_byte_sequence)); + } + return state; +} + +/// Checks if the code point value is valid +/// +/// \tparam OctetIterator +/// \param state The input state +/// \return The new state +template +tl::expected, std::error_code> +check_code_point(sequence_state state) { + if (!is_valid_code_point(state.value)) { + return tl::make_unexpected( + make_error_code(unicode_errc::invalid_code_point)); + } else if (is_overlong_sequence(state.value, sequence_length(*state.it))) { + return tl::make_unexpected( + make_error_code(unicode_errc::illegal_byte_sequence)); + } + + return state; +} + +namespace details { +/// +/// \tparam OctetIterator +/// \param state +/// \return +template +tl::expected, std::error_code> +inline mask_byte(sequence_state state) { + return update_value(state, mask8(*state.it)); +} + +/// Converts a two byte code octet sequence to a code point value.
+/// +/// \tparam OctetIterator +/// \param first +/// \return +template +tl::expected, std::error_code> +from_two_byte_sequence(OctetIterator first) { + using result_type = tl::expected, std::error_code>; + + auto set_code_point = [](auto state) -> result_type { + return update_value( + state, + ((state.value << 6) & 0x7ff) + (*state.it & 0x3f)); + }; + + return + make_state(first) + .and_then(mask_byte) + .and_then(increment) + .and_then(set_code_point); +} + +/// Converts a three byte code octet sequence to a code point value. +/// +/// \tparam OctetIterator +/// \param first +/// \return +template +tl::expected, std::error_code> +from_three_byte_sequence(OctetIterator first) { + using result_type = tl::expected, std::error_code>; + + auto update_code_point_from_second_byte = [](auto state) -> result_type { + return update_value( + state, + ((state.value << 12) & 0xffff) + + ((mask8(*state.it) << 6) & 0xfff)); + }; + + auto set_code_point = [](auto state) -> result_type { + return update_value( + state, + state.value + (*state.it & 0x3f)); + }; + + return make_state(first) + .and_then(mask_byte) + .and_then(increment) + .and_then(update_code_point_from_second_byte) + .and_then(increment) + .and_then(set_code_point); +} + +/// Converts a four byte code octet sequence to a code point value. 
+/// +/// \tparam OctetIterator +/// \param first +/// \return +template +tl::expected, std::error_code> +from_four_byte_sequence(OctetIterator first) { + using result_type = tl::expected, std::error_code>; + + auto update_code_point_from_second_byte = [](auto state) -> result_type { + return update_value( + state, + ((state.value << 18) & 0x1fffff) + + ((mask8(*state.it) << 12) & 0x3ffff)); + }; + + auto update_code_point_from_third_byte = [](auto state) -> result_type { + return update_value( + state, + state.value + ((mask8(*state.it) << 6) & 0xfff)); + }; + + auto set_code_point = [](auto state) -> result_type { + return update_value( + state, + state.value + (*state.it & 0x3f)); + }; + + return + make_state(first) + .and_then(mask_byte) + .and_then(increment) + .and_then(update_code_point_from_second_byte) + .and_then(increment) + .and_then(update_code_point_from_third_byte) + .and_then(increment) + .and_then(set_code_point); +} +} // namespace details + +/// Finds and computes the next code point value in the octet +/// sequence. +/// +/// \tparam OctetIterator +/// \param first +/// \return +template +tl::expected, std::error_code> find_code_point( + OctetIterator first) { + const auto length = sequence_length(*first); + return + (length == 1) ? make_state(first).and_then(details::mask_byte) : + (length == 2) ? details::from_two_byte_sequence(first) : + (length == 3) ? details::from_three_byte_sequence(first) : + (length == 4) ? 
+ details::from_four_byte_sequence(first) : + tl::make_unexpected(make_error_code(unicode_errc::overflow)) + ; +} + +/// Updates the state to the next code point +/// +/// \tparam OctetIterator +/// \param it An octet iterator +/// \return A sequence state with the computed code point value +template +tl::expected, std::error_code> next( + OctetIterator it) { + using result_type = tl::expected, std::error_code>; + + auto increment = [] (auto state) -> result_type { + ++state.it; + return state; + }; + + return + find_code_point(it) + .and_then(check_code_point) + .and_then(increment); +} + +/// Appends values to an octet sequence given a code point value +/// +/// \tparam OctetIterator +/// \param code_point +/// \param octet_it +/// \return +template +tl::expected append_bytes( + char32_t code_point, + OctetIterator octet_it) { + if (!is_valid_code_point(code_point)) { + return tl::make_unexpected(make_error_code(unicode_errc::invalid_code_point)); + } + + auto value = static_cast(code_point); + + if (value < 0x80u) { // one octet + *(octet_it++) = static_cast(value); + } else if (value < 0x800u) { // two octets + *(octet_it++) = static_cast((value >> 6u) | 0xc0u); + *(octet_it++) = static_cast((value & 0x3fu) | 0x80u); + } else if (value < 0x10000u) { // three octets + *(octet_it++) = static_cast((value >> 12u) | 0xe0u); + *(octet_it++) = static_cast(((value >> 6u) & 0x3fu) | 0x80u); + *(octet_it++) = static_cast((value & 0x3fu) | 0x80u); + } else { // four octets + *(octet_it++) = static_cast((value >> 18u) | 0xf0u); + *(octet_it++) = static_cast(((value >> 12u) & 0x3fu) | 0x80u); + *(octet_it++) = static_cast(((value >> 6u) & 0x3fu) | 0x80u); + *(octet_it++) = static_cast((value & 0x3fu) | 0x80u); + } + return octet_it; +} + +/// Advances `n` code points through the octet sequence +/// \tparam OctetIterator +/// \param it An iterator to a lead octet in the octet sequence +/// \param n The number of code points to advance +/// \param last The last iterator in the octet
sequence +/// \return The updated iterator or an error if the sequence is +/// invalid +template +tl::expected advance( + OctetIterator& it, + std::size_t n, + OctetIterator last) { + while (n != 0) { + if (ranges::distance(it, last) < sequence_length(*it)) { + return tl::make_unexpected(make_error_code(unicode_errc::overflow)); + } + + auto state = unicode::next(it); + if (!state) { + return tl::make_unexpected(std::move(state.error())); + } + it = state.value().it; + --n; + } + + return it; +} + +/// Counts the number of code points in the octet sequence. +/// +/// \tparam OctetIterator +/// \param first The first element in the octet sequence +/// \param last The last element in the sequence +/// \return The number of code points or an error if it's not a +/// valid sequence. +template +tl::expected count( + OctetIterator first, + OctetIterator last) { + std::size_t count = 0; + auto it = first; + + while (it != last) { + if (std::distance(it, last) < sequence_length(*it)) { + return tl::make_unexpected( + make_error_code(unicode_errc::overflow)); + } + + auto state = unicode::next(it); + if (!state) { + return tl::make_unexpected(std::move(state.error())); + } + it = state.value().it; + ++count; + } + return count; +} + +} // namespace skyr::unicode + +#endif //SKYR_UNICODE_CORE_HPP diff --git a/include/skyr/unicode/errors.hpp b/include/skyr/unicode/errors.hpp new file mode 100644 index 00000000..c7500496 --- /dev/null +++ b/include/skyr/unicode/errors.hpp @@ -0,0 +1,35 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_UNICODE_ERROR_HPP +#define SKYR_UNICODE_ERROR_HPP + +#include + +namespace skyr::unicode { +/// Enumerates Unicode errors +enum class unicode_errc { + /// Overflow + overflow, + /// Invalid lead code point + invalid_lead, + /// Illegal byte sequence + illegal_byte_sequence, + /// Invalid code point + invalid_code_point, +}; + +/// Creates a `std::error_code` given a `skyr::unicode_errc` value +/// \param error A Unicode error +/// \returns A `std::error_code` object +std::error_code make_error_code(unicode_errc error); +} // namespace skyr::unicode + +namespace std { +template <> +struct is_error_code_enum : true_type {}; +} // namespace std + +#endif //SKYR_UNICODE_ERROR_HPP diff --git a/include/skyr/unicode/range/u16_range.hpp b/include/skyr/unicode/range/u16_range.hpp index d3aa13e5..a7fe96b4 100644 --- a/include/skyr/unicode/range/u16_range.hpp +++ b/include/skyr/unicode/range/u16_range.hpp @@ -11,8 +11,9 @@ #include #include #include -#include -#include +#include +#include +#include #include namespace skyr::unicode { @@ -58,6 +59,9 @@ class u16_code_point_t { }; +/// +/// \param code_point +/// \return inline u16_code_point_t u16_code_point(char32_t code_point) { return u16_code_point_t(code_point); } @@ -67,11 +71,12 @@ inline u16_code_point_t u16_code_point(char32_t code_point) { /// \param code_point /// \return template -inline u16_code_point_t u16(code_point_octet_t code_point) { +inline u16_code_point_t u16(u8_code_point_t code_point) { return u16_code_point(u32(code_point)); } /// +/// \tparam OctetIterator template class u16_range_iterator { @@ -80,7 +85,7 @@ class u16_range_iterator { /// using iterator_category = std::forward_iterator_tag; /// - using value_type = tl::expected; + using value_type = tl::expected; /// using reference = value_type; /// @@ -166,7 +171,7 @@ class view_u16_range public: /// - using value_type = tl::expected; + 
using value_type = tl::expected; /// using const_reference = value_type; /// @@ -255,9 +260,8 @@ namespace view { static constexpr u16_range_fn u16; } // namespace view - template -tl::expected u16string(U16Range &&range) { +tl::expected u16string(U16Range &&range) { auto result = std::u16string(); result.reserve(ranges::size(range)); for (auto &&code_point : range) { @@ -271,6 +275,22 @@ tl::expected u16string(U16Range &&range) { } return result; } + +template +tl::expected wstring(U16Range &&range) { + auto result = std::wstring(); + result.reserve(ranges::size(range)); + for (auto &&code_point : range) { + if (!code_point) { + return tl::make_unexpected(code_point.error()); + } + result.push_back(code_point.value().lead_value()); + if (code_point.value().is_surrogate_pair()) { + result.push_back(code_point.value().trail_value()); + } + } + return result; +} } // namespace skyr::unicode #endif //SKYR_U16_RANGE_HPP diff --git a/include/skyr/unicode/range/u32_range.hpp b/include/skyr/unicode/range/u32_range.hpp index e6f6996e..3c66f915 100644 --- a/include/skyr/unicode/range/u32_range.hpp +++ b/include/skyr/unicode/range/u32_range.hpp @@ -11,8 +11,9 @@ #include #include #include -#include -#include +#include +#include +#include namespace skyr::unicode { /// @@ -20,7 +21,7 @@ namespace skyr::unicode { /// \param code_point /// \return template -inline char32_t u32(code_point_octet_t code_point) { +inline char32_t u32(u8_code_point_t code_point) { auto state = find_code_point(std::begin(code_point)); return state ? 
state.value().value : U'\x0000'; } @@ -34,7 +35,7 @@ class u32_range_iterator { /// using iterator_category = std::forward_iterator_tag; /// - using value_type = tl::expected; + using value_type = tl::expected; /// using reference = value_type; /// @@ -46,7 +47,7 @@ class u32_range_iterator { constexpr u32_range_iterator() = default; /// /// \param it - explicit constexpr u32_range_iterator(octet_range_iterator it) + explicit constexpr u32_range_iterator(u8_range_iterator it) : it_(it) {} /// constexpr u32_range_iterator(const u32_range_iterator&) = default; @@ -99,7 +100,7 @@ class u32_range_iterator { private: - octet_range_iterator it_; + u8_range_iterator it_; }; @@ -173,7 +174,7 @@ class view_u32_range private: - view_octet_range range_; + view_u8_range range_; }; @@ -205,7 +206,7 @@ static constexpr u32_range_fn u32; } // namespace view template -tl::expected u32string(U32Range &&range) { +tl::expected u32string(U32Range &&range) { auto result = std::u32string(); result.reserve(ranges::size(range)); for (auto &&code_point : range) { diff --git a/include/skyr/unicode/range/octet_range.hpp b/include/skyr/unicode/range/u8_range.hpp similarity index 60% rename from include/skyr/unicode/range/octet_range.hpp rename to include/skyr/unicode/range/u8_range.hpp index cfa322ca..f20ec7ed 100644 --- a/include/skyr/unicode/range/octet_range.hpp +++ b/include/skyr/unicode/range/u8_range.hpp @@ -3,21 +3,23 @@ // (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) -#ifndef SKYR_OCTET_RANGE_HPP -#define SKYR_OCTET_RANGE_HPP +#ifndef SKYR_U8_RANGE_HPP +#define SKYR_U8_RANGE_HPP #include #include #include #include #include -#include +#include +#include namespace skyr::unicode { -/// -/// \tparam OctetIterator +/// This class defines a range over a code point in raw bytes, +/// according to UTF-8. 
+/// \tparam OctetIterator An iterator type over the raw bytes template -class code_point_octet_t { +class u8_code_point_t { public: /// @@ -33,29 +35,31 @@ class code_point_octet_t { /// using size_type = std::size_t; - /// - /// \param first - constexpr code_point_octet_t( + /// \brief Constructor + /// \param first An iterator at the beginning of the code point + /// \param last An iterator at the end of the code point + constexpr u8_code_point_t( OctetIterator first, OctetIterator last) : first(first) , last(last) {} - /// - /// \param first - explicit constexpr code_point_octet_t(OctetIterator first) - : code_point_octet_t(first, first + sequence_length(*first)) {} + /// \brief Constructor. The length of the code point sequence is + /// inferred from the first code point value. + /// \param first An iterator at the beginning of the code point + explicit constexpr u8_code_point_t(OctetIterator first) + : u8_code_point_t(first, first + sequence_length(*first)) {} - /// - constexpr code_point_octet_t(const code_point_octet_t &) = default; - /// - constexpr code_point_octet_t(code_point_octet_t &&) noexcept = default; - /// - constexpr code_point_octet_t &operator=(const code_point_octet_t &) = default; - /// - constexpr code_point_octet_t &operator=(code_point_octet_t &&) noexcept = default; - /// - ~code_point_octet_t() = default; + /// \brief Copy constructor. + constexpr u8_code_point_t(const u8_code_point_t &) = default; + /// \brief Move constructor. + constexpr u8_code_point_t(u8_code_point_t &&) noexcept = default; + /// \brief Copy assignment operator. + constexpr u8_code_point_t &operator=(const u8_code_point_t &) = default; + /// \brief Move assignment operator. + constexpr u8_code_point_t &operator=(u8_code_point_t &&) noexcept = default; + /// \brief Destructor. + ~u8_code_point_t() = default; /// /// \return @@ -81,7 +85,7 @@ class code_point_octet_t { return end(); } - /// + /// \brief Returns the length in bytes of this code point. 
/// \return [[nodiscard]] constexpr auto size() const noexcept -> size_type { return sequence_length(*first); @@ -98,19 +102,21 @@ class code_point_octet_t { /// \param range /// \return template -inline tl::expected, unicode_errc> code_point_octets( +inline tl::expected, std::error_code> u8_code_point( const OctetRange &range) { auto first = std::begin(range), last = std::end(range); if (std::distance(first, last) > sequence_length(*first)) { - return tl::make_unexpected(unicode_errc::overflow); + return tl::make_unexpected(make_error_code(unicode_errc::overflow)); } - return code_point_octet_t( + return u8_code_point_t( first, first + sequence_length(*first)); } +/// Tests if the code point value is valid. +/// \returns \c true if the value is a valid code point, \c false otherwise template -inline bool is_valid(const code_point_octet_t &code_point) { +inline bool is_valid(const u8_code_point_t &code_point) { return static_cast(find_code_point(std::begin(code_point))); } @@ -119,34 +125,32 @@ inline bool is_valid(const code_point_octet_t &code_point) { /// \param range /// \return template -inline tl::expected, unicode_errc> valid_code_point( +inline tl::expected, std::error_code> valid_u8_code_point( const OctetRange &range) { - using result_type = tl::expected, unicode_errc>; + using result_type = tl::expected, std::error_code>; auto check_code_point = [] (auto &&code_point) -> result_type { return find_code_point(std::begin(code_point)) .and_then([=] (auto) -> result_type { return code_point; - }) - .or_else([] (auto &&error) -> result_type { - return tl::make_unexpected(error); }); }; return - code_point_octets(range) - .and_then(check_code_point); + u8_code_point(range) + .and_then(check_code_point); } /// +/// \tparam OctetIterator template -class unchecked_octet_range_iterator { +class unchecked_u8_range_iterator { public: /// using iterator_category = std::forward_iterator_tag; /// - using value_type = code_point_octet_t; + using value_type = 
u8_code_point_t; /// using reference = value_type; /// @@ -155,25 +159,25 @@ class unchecked_octet_range_iterator { using difference_type = std::ptrdiff_t; /// - constexpr unchecked_octet_range_iterator() = default; + constexpr unchecked_u8_range_iterator() = default; /// /// \param it - explicit constexpr unchecked_octet_range_iterator(OctetIterator it) + explicit constexpr unchecked_u8_range_iterator(OctetIterator it) : it_(it) {} /// - constexpr unchecked_octet_range_iterator(const unchecked_octet_range_iterator&) = default; + constexpr unchecked_u8_range_iterator(const unchecked_u8_range_iterator&) = default; /// - constexpr unchecked_octet_range_iterator(unchecked_octet_range_iterator&&) noexcept = default; + constexpr unchecked_u8_range_iterator(unchecked_u8_range_iterator&&) noexcept = default; /// - constexpr unchecked_octet_range_iterator &operator=(const unchecked_octet_range_iterator&) = default; + constexpr unchecked_u8_range_iterator &operator=(const unchecked_u8_range_iterator&) = default; /// - constexpr unchecked_octet_range_iterator &operator=(unchecked_octet_range_iterator&&) noexcept = default; + constexpr unchecked_u8_range_iterator &operator=(unchecked_u8_range_iterator&&) noexcept = default; /// - ~unchecked_octet_range_iterator() = default; + ~unchecked_u8_range_iterator() = default; /// /// \return - unchecked_octet_range_iterator operator ++ (int) { + unchecked_u8_range_iterator operator ++ (int) { assert(it_); auto result = *this; std::advance(it_.value(), sequence_length(*it_.value())); @@ -182,7 +186,7 @@ class unchecked_octet_range_iterator { /// /// \return - unchecked_octet_range_iterator &operator ++ () { + unchecked_u8_range_iterator &operator ++ () { assert(it_); std::advance(it_.value(), sequence_length(*it_.value())); return *this; @@ -192,7 +196,7 @@ class unchecked_octet_range_iterator { /// \return constexpr reference operator * () const noexcept { assert(it_); - return code_point_octet_t( + return u8_code_point_t( 
it_.value(), it_.value() + sequence_length(*it_.value())); } @@ -200,14 +204,14 @@ class unchecked_octet_range_iterator { /// /// \param other /// \return - constexpr bool operator == (const unchecked_octet_range_iterator &other) const noexcept { + constexpr bool operator == (const unchecked_u8_range_iterator &other) const noexcept { return it_ == other.it_; } /// /// \param other /// \return - constexpr bool operator != (const unchecked_octet_range_iterator &other) const noexcept { + constexpr bool operator != (const unchecked_u8_range_iterator &other) const noexcept { return !(*this == other); } @@ -218,17 +222,18 @@ class unchecked_octet_range_iterator { }; /// +/// \tparam OctetIterator template -class octet_range_iterator { +class u8_range_iterator { - using iterator_type = unchecked_octet_range_iterator; + using iterator_type = unchecked_u8_range_iterator; public: /// using iterator_category = typename iterator_type::iterator_category; /// - using value_type = tl::expected; + using value_type = tl::expected; /// using reference = value_type; /// @@ -237,26 +242,26 @@ class octet_range_iterator { using difference_type = typename iterator_type::difference_type; /// - constexpr octet_range_iterator() = default; + constexpr u8_range_iterator() = default; /// /// \param it - explicit constexpr octet_range_iterator(OctetIterator it, OctetIterator last) + explicit constexpr u8_range_iterator(OctetIterator it, OctetIterator last) : it_(it) , last_(last) {} /// - constexpr octet_range_iterator(const octet_range_iterator&) = default; + constexpr u8_range_iterator(const u8_range_iterator&) = default; /// - constexpr octet_range_iterator(octet_range_iterator&&) noexcept = default; + constexpr u8_range_iterator(u8_range_iterator&&) noexcept = default; /// - constexpr octet_range_iterator &operator=(const octet_range_iterator&) = default; + constexpr u8_range_iterator &operator=(const u8_range_iterator&) = default; /// - constexpr octet_range_iterator 
&operator=(octet_range_iterator&&) noexcept = default; + constexpr u8_range_iterator &operator=(u8_range_iterator&&) noexcept = default; /// - ~octet_range_iterator() = default; + ~u8_range_iterator() = default; /// /// \return - octet_range_iterator operator ++ (int) { + u8_range_iterator operator ++ (int) { auto result = *this; increment(); return result; @@ -264,7 +269,7 @@ class octet_range_iterator { /// /// \return - octet_range_iterator &operator ++ () { + u8_range_iterator &operator ++ () { increment(); return *this; } @@ -272,20 +277,20 @@ class octet_range_iterator { /// /// \return constexpr reference operator * () const noexcept { - return valid_code_point(*it_); + return valid_u8_code_point(*it_); } /// /// \param other /// \return - constexpr bool operator == (const octet_range_iterator &other) const noexcept { + constexpr bool operator == (const u8_range_iterator &other) const noexcept { return it_ == other.it_; } /// /// \param other /// \return - constexpr bool operator != (const octet_range_iterator &other) const noexcept { + constexpr bool operator != (const u8_range_iterator &other) const noexcept { return !(*this == other); } @@ -307,17 +312,18 @@ class octet_range_iterator { }; /// +/// \tparam OctetRange template -class view_unchecked_octet_range +class view_unchecked_u8_range : public ranges::view_base { using octet_iterator_type = typename OctetRange::const_iterator; - using iterator_type = unchecked_octet_range_iterator; + using iterator_type = unchecked_u8_range_iterator; public: /// - using value_type = code_point_octet_t; + using value_type = u8_code_point_t; /// using const_reference = value_type; /// @@ -329,12 +335,12 @@ class view_unchecked_octet_range /// using size_type = std::size_t; - /// - constexpr view_unchecked_octet_range() = default; + /// Default constructor + constexpr view_unchecked_u8_range() = default; /// /// \param range - explicit constexpr view_unchecked_octet_range(const OctetRange &range) + explicit constexpr 
view_unchecked_u8_range(const OctetRange &range) : impl_( impl(std::begin(range), std::end(range))) {} @@ -391,17 +397,18 @@ class view_unchecked_octet_range }; /// +/// \tparam OctetRange template -class view_octet_range +class view_u8_range : public ranges::view_base { using octet_iterator_type = typename OctetRange::const_iterator; - using iterator_type = octet_range_iterator; + using iterator_type = u8_range_iterator; public: /// - using value_type = code_point_octet_t; + using value_type = u8_code_point_t; /// using const_reference = value_type; /// @@ -414,11 +421,11 @@ class view_octet_range using size_type = std::size_t; /// - constexpr view_octet_range() = default; + constexpr view_u8_range() = default; /// /// \param range - explicit constexpr view_octet_range(const OctetRange &range) + explicit constexpr view_u8_range(const OctetRange &range) : impl_( impl(std::begin(range), std::end(range))) {} @@ -475,14 +482,14 @@ class view_octet_range }; /// -struct unchecked_octet_range_fn { +struct unchecked_u8_range_fn { /// /// \tparam OctetRange /// \param range /// \return template constexpr auto operator()(OctetRange &&range) const { - return view_unchecked_octet_range{std::forward(range)}; + return view_unchecked_u8_range{std::forward(range)}; } /// @@ -490,20 +497,20 @@ struct unchecked_octet_range_fn { /// \param range /// \return template - friend constexpr auto operator|(OctetRange &&range, const unchecked_octet_range_fn&) { - return view_unchecked_octet_range{std::forward(range)}; + friend constexpr auto operator|(OctetRange &&range, const unchecked_u8_range_fn&) { + return view_unchecked_u8_range{std::forward(range)}; } }; /// -struct octet_range_fn { +struct u8_range_fn { /// /// \tparam OctetRange /// \param range /// \return template constexpr auto operator()(OctetRange &&range) const { - return view_octet_range{std::forward(range)}; + return view_u8_range{std::forward(range)}; } /// @@ -511,16 +518,17 @@ struct octet_range_fn { /// \param range /// 
\return template - friend constexpr auto operator|(OctetRange &&range, const octet_range_fn&) { - return view_octet_range{std::forward(range)}; + friend constexpr auto operator|(OctetRange &&range, const u8_range_fn&) { + return view_u8_range{std::forward(range)}; } }; namespace view { /// -//static constexpr unchecked_octet_range_fn u8; -static constexpr octet_range_fn u8; +static constexpr unchecked_u8_range_fn unchecked_u8; +/// +static constexpr u8_range_fn u8; } // namespace view } // namespace skyr::unicode -#endif //SKYR_OCTET_RANGE_HPP +#endif //SKYR_U8_RANGE_HPP diff --git a/include/skyr/unicode/unicode.hpp b/include/skyr/unicode/unicode.hpp index 16cdaa73..bf050044 100644 --- a/include/skyr/unicode/unicode.hpp +++ b/include/skyr/unicode/unicode.hpp @@ -13,436 +13,10 @@ #include #include #include -#include - -namespace skyr { -/// \namespace unicode -/// Unicode encoding functions -namespace unicode { -/// Enumerates Unicode errors -enum class unicode_errc { - /// Overflow - overflow, - /// Invalid lead code point - invalid_lead, - /// Illegal byte sequence - illegal_byte_sequence, - /// Invalid code point - invalid_code_point, -}; -} // namespace unicode -} // namespace skyr - -namespace std { -template <> -struct is_error_code_enum : true_type {}; -} // namespace std +#include +#include namespace skyr::unicode { -/// Creates a `std::error_code` given a `skyr::unicode_errc` value -/// \param error A Unicode error -/// \returns A `std::error_code` object -std::error_code make_error_code(unicode_errc error); - -/// -/// \param octet -/// \return -constexpr uint8_t mask8(char octet) { - return 0xffu & static_cast(octet); -} - -/// -/// \param octet -/// \return -constexpr char16_t mask16(char16_t octet) { - return 0xffffu & octet; -} - -/// -/// \param octet -/// \return -constexpr bool is_trail(char octet) { - return ((mask8(octet) >> 6u) == 0x2u); -} - -/// -/// \param code_point -/// \return -constexpr bool is_lead_surrogate(char16_t code_point) { - 
return - (code_point >= constants::surrogates::lead_min) && - (code_point <= constants::surrogates::lead_max); -} - -/// -/// \param code_point -/// \return -constexpr bool is_trail_surrogate(char16_t code_point) { - return - (code_point >= constants::surrogates::trail_min) && - (code_point <= constants::surrogates::trail_max); -} - -/// -/// \param code_point -/// \return -constexpr bool is_surrogate(char16_t code_point) { - return - (code_point >= constants::surrogates::lead_min) && - (code_point <= constants::surrogates::trail_max); -} - -/// Tests if the code point is a valid value. -/// \param code_point -/// \return \c true if it has a valid value, \c false otherwise -constexpr bool is_valid_code_point(char32_t code_point) { - return - (code_point <= constants::code_points::max) && - !is_surrogate(static_cast(code_point)); -} - -/// Returns the size of the sequnce given the lead octet value. -/// \param lead_value -/// \return 1, 2, 3 or 4 -constexpr long sequence_length(char lead_value) { - auto lead = mask8(lead_value); - if (lead < 0x80) { - return 1; - } else if ((lead >> 5) == 0x6) { - return 2; - } else if ((lead >> 4) == 0xe) { - return 3; - } else if ((lead >> 3) == 0x1e) { - return 4; - } - return 0; -} - -/// -/// \param code_point -/// \param length -/// \return -constexpr bool is_overlong_sequence( - char32_t code_point, - long length) { - bool result = false; - result &= (code_point < 0x80) && (length != 1); - result &= (code_point < 0x800) && (length != 2); - result &= (code_point < 0x10000) && (length != 3); - return result; -} - -/// A type used to extract a code point value from an octet sequence -/// \tparam OctetIterator -template -struct sequence_state { - sequence_state( - OctetIterator it, - char32_t value) - : it(it), value(value) {} - - /// The current iterator - OctetIterator it; - /// The (intermediate) value of the code point - char32_t value; -}; - -/// Creates an expected state, so that can be chained -/// functional-style. 
-/// -/// \tparam OctetIterator -/// \param it The lead value of the next code point in the octet -/// sequence -/// \return A sequence_state with a value of 0, and the iterator -/// pointing to the lead value -template -tl::expected, unicode_errc> -make_state(OctetIterator it) { - return sequence_state(it, 0); -} - -/// Updates the value in the sequence state -/// -/// \tparam OctetIterator -/// \param state The input state -/// \param value The updated value -/// \return A new state with an updateds value -template -inline sequence_state update_value( - sequence_state state, - char32_t value) { - return {state.it, value}; -} - -/// Moves the octet iterator one character ahead -/// \tparam OctetIterator -/// \param state The input state -/// \return The new state with the updated iterator, on an error if -/// the sequence isn't valid -template -inline tl::expected, unicode_errc> -increment(sequence_state state) { - ++state.it; - if (!is_trail(*state.it)) { - return tl::make_unexpected(unicode_errc::illegal_byte_sequence); - } - return state; -} - -/// Checks if the code point value is valid -/// -/// \tparam OctetIterator -/// \param state The input state -/// \return The new state -template -tl::expected, unicode_errc> -check_code_point(sequence_state state) { - if (!is_valid_code_point(state.value)) { - return tl::make_unexpected(unicode_errc::invalid_code_point); - } else if (is_overlong_sequence(state.value, sequence_length(*state.it))) { - return tl::make_unexpected(unicode_errc::illegal_byte_sequence); - } - - return state; -} - -namespace details { -/// -/// \tparam OctetIterator -/// \param state -/// \return -template -tl::expected, unicode_errc> -inline mask_byte(sequence_state state) { - return update_value(state, mask8(*state.it)); -} - -/// Converts a two byte code octet sequence to a code point value. 
-/// -/// \tparam OctetIterator -/// \param first -/// \return -template -tl::expected, unicode_errc> -from_two_byte_sequence(OctetIterator first) { - using result_type = tl::expected, unicode_errc>; - - auto set_code_point = [](auto state) -> result_type { - return update_value( - state, - ((state.value << 6) & 0x7ff) + (*state.it & 0x3f)); - }; - - return - make_state(first) - .and_then(mask_byte) - .and_then(increment) - .and_then(set_code_point); -} - -/// Converts a three byte code octet sequence to a code point value. -/// -/// \tparam OctetIterator -/// \param first -/// \return -template -tl::expected, unicode_errc> -from_three_byte_sequence(OctetIterator first) { - using result_type = tl::expected, unicode_errc>; - - auto update_code_point_from_second_byte = [](auto state) -> result_type { - return update_value( - state, - ((state.value << 12) & 0xffff) + - ((mask8(*state.it) << 6) & 0xfff)); - }; - - auto set_code_point = [](auto state) -> result_type { - return update_value( - state, - state.value + (*state.it & 0x3f)); - }; - - return make_state(first) - .and_then(mask_byte) - .and_then(increment) - .and_then(update_code_point_from_second_byte) - .and_then(increment) - .and_then(set_code_point); -} - -/// Converts a four byte code octet sequence to a code point value. 
-/// -/// \tparam OctetIterator -/// \param first -/// \return -template -tl::expected, unicode_errc> -from_four_byte_sequence(OctetIterator first) { - using result_type = tl::expected, unicode_errc>; - - auto update_code_point_from_second_byte = [](auto state) -> result_type { - return update_value( - state, - ((state.value << 18) & 0x1fffff) + - ((mask8(*state.it) << 12) & 0x3ffff)); - }; - - auto update_code_point_from_third_byte = [](auto state) -> result_type { - return update_value( - state, - state.value + ((mask8(*state.it) << 6) & 0xfff)); - }; - - auto set_code_point = [](auto state) -> result_type { - return update_value( - state, - state.value + (*state.it & 0x3f)); - }; - - return - make_state(first) - .and_then(mask_byte) - .and_then(increment) - .and_then(update_code_point_from_second_byte) - .and_then(increment) - .and_then(update_code_point_from_third_byte) - .and_then(increment) - .and_then(set_code_point); -} -} // namespace details - -/// Finds and computes the next code point value in the octet -/// sequence. -/// -/// \tparam OctetIterator -/// \param first -/// \return -template -tl::expected, unicode_errc> find_code_point( - OctetIterator first) { - const auto length = sequence_length(*first); - return - (length == 1) ? make_state(first).and_then(details::mask_byte) : - (length == 2) ? details::from_two_byte_sequence(first) : - (length == 3) ? details::from_three_byte_sequence(first) : - (length == 4) ? 
details::from_four_byte_sequence(first) : - tl::make_unexpected(unicode_errc::overflow) - ; -} - -/// Updates the state to next code point -/// -/// \tparam OctetIterator -/// \param it An octer iterator -/// \return A sequence state with the computed code point value -template -tl::expected, unicode_errc> next( - OctetIterator it) { - using result_type = tl::expected, unicode_errc>; - - auto increment = [] (auto state) -> result_type { - ++state.it; - return state; - }; - - return - find_code_point(it) - .and_then(check_code_point) - .and_then(increment); -} - -/// Appends values to a octet sequence given a code point value -/// -/// \tparam OctetIterator -/// \param code_point -/// \param octet_it -/// \return -template -tl::expected append_bytes( - char32_t code_point, - OctetIterator octet_it) { - if (!is_valid_code_point(code_point)) { - return tl::make_unexpected(unicode_errc::invalid_code_point); - } - - auto value = static_cast(code_point); - - if (value < 0x80u) { // one octet - *(octet_it++) = static_cast(value); - } else if (value < 0x800u) { // two octets - *(octet_it++) = static_cast((value >> 6u) | 0xc0u); - *(octet_it++) = static_cast((value & 0x3fu) | 0x80u); - } else if (value < 0x10000u) { // three octets - *(octet_it++) = static_cast((value >> 12u) | 0xe0u); - *(octet_it++) = static_cast(((value >> 6u) & 0x3fu) | 0x80u); - *(octet_it++) = static_cast((value & 0x3fu) | 0x80u); - } else { // four octets - *(octet_it++) = static_cast((value >> 18u) | 0xf0u); - *(octet_it++) = static_cast(((value >> 12u) & 0x3fu) | 0x80u); - *(octet_it++) = static_cast(((value >> 6u) & 0x3fu) | 0x80u); - *(octet_it++) = static_cast((value & 0x3fu) | 0x80u); - } - return octet_it; -} - -/// Advances `n` code oints through the octet sequence -/// \tparam OctetIterator -/// \param it An iterator to a lead octet in the octet sequence -/// \param n The number of code points to advance -/// \param last The last iterator in the octet sequence -/// \return The updated 
iterator or an error if the sequence is -/// invalid -template -tl::expected advance( - OctetIterator& it, - std::size_t n, - OctetIterator last) { - while (n != 0) { - if (ranges::distance(it, last) < sequence_length(*it)) { - return tl::make_unexpected(unicode_errc::overflow); - } - - auto state = unicode::next(it); - if (!state) { - return tl::make_unexpected(std::move(state.error())); - } - it = state.value().it; - --n; - } - - return it; -} - -/// Counts the number of code points in the octet sequence. -/// -/// \tparam OctetIterator -/// \param first The first element in the octet sequence -/// \param last The last element in the sequence -/// \return The number of code points or an error if it's not a -/// valid sequence. -template -tl::expected count( - OctetIterator first, - OctetIterator last) { - std::size_t count = 0; - auto it = first; - - while (it != last) { - if (std::distance(it, last) < sequence_length(*it)) { - return tl::make_unexpected(unicode_errc::overflow); - } - - auto state = unicode::next(it); - if (!state) { - return tl::make_unexpected(std::move(state.error())); - } - it = state.value().it; - ++count; - } - return count; -} - /// Copies characters from a UTF-16 encoded string to a UTF-8 /// encoded string. 
/// @@ -453,7 +27,7 @@ tl::expected count( /// \param u8_it The output iterator /// \return The last output iterator or an error if the sequence was invalid template -tl::expected copy_u16u8( +tl::expected copy_u16u8( U16BitIterator first, U16BitIterator last, OctetIterator result) { @@ -465,18 +39,21 @@ tl::expected copy_u16u8( // Take care of surrogate pairs first if (is_lead_surrogate(code_point)) { if (it == last) { - return tl::make_unexpected(unicode_errc::invalid_code_point); + return tl::make_unexpected( + make_error_code(unicode_errc::invalid_code_point)); } auto trail_surrogate = mask16(*it); ++it; if (!is_trail_surrogate(trail_surrogate)) { - return tl::make_unexpected(unicode_errc::invalid_code_point); + return tl::make_unexpected( + make_error_code(unicode_errc::invalid_code_point)); } code_point = (code_point << 10) + trail_surrogate + constants::surrogates::offset; } else if (is_trail_surrogate(code_point)) { - return tl::make_unexpected(unicode_errc::invalid_code_point); + return tl::make_unexpected( + make_error_code(unicode_errc::invalid_code_point)); } auto result_it = append_bytes(code_point, result); @@ -499,7 +76,7 @@ tl::expected copy_u16u8( /// \return An expected iterator to the last eleent in the new /// UTF-16 sequence, or an error. 
template -tl::expected copy_u8u16( +tl::expected copy_u8u16( OctetIterator first, OctetIterator last, U16BitIterator u16_first) { @@ -507,7 +84,7 @@ tl::expected copy_u8u16( auto u16_it = u16_first; while (it != last) { if (std::distance(it, last) < sequence_length(*it)) { - return tl::make_unexpected(unicode_errc::overflow); + return tl::make_unexpected(make_error_code(unicode_errc::overflow)); } auto state = unicode::next(it); @@ -540,7 +117,7 @@ tl::expected copy_u8u16( /// \param u8_it The output iterator /// \return The last output iterator or an error if the sequence was invalid template -tl::expected copy_u32u8( +tl::expected copy_u32u8( U32BitIterator first, U32BitIterator last, OctetIterator u8_it) { @@ -568,7 +145,7 @@ tl::expected copy_u32u8( /// \return An expected iterator to the last eleent in the new /// UTF-32 sequence, or an error. template -tl::expected copy_u8u32( +tl::expected copy_u8u32( OctetIterator first, OctetIterator last, U32BitIterator u32_first) { @@ -576,7 +153,7 @@ tl::expected copy_u8u32( auto u32_it = u32_first; while (it != last) { if (std::distance(it, last) < sequence_length(*it)) { - return tl::make_unexpected(unicode_errc::overflow); + return tl::make_unexpected(make_error_code(unicode_errc::overflow)); } auto state = unicode::next(it); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 537ec0a0..8f89ef79 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,6 +4,7 @@ # http://www.boost.org/LICENSE_1_0.txt) set(Skyr_SRCS + unicode/errors.cpp unicode/unicode.cpp url/url_parser_context.hpp url/url_parser_context.cpp @@ -29,8 +30,10 @@ set(Skyr_SRCS ${Skyr_SOURCE_DIR}/include/skyr/url/details/to_bytes.hpp ${Skyr_SOURCE_DIR}/include/skyr/config.hpp ${Skyr_SOURCE_DIR}/include/skyr/traits/string_traits.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/errors.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/core.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/unicode.hpp - 
${Skyr_SOURCE_DIR}/include/skyr/unicode/range/octet_range.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/u8_range.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/u16_range.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/u32_range.hpp ${Skyr_SOURCE_DIR}/include/skyr/url/percent_encode.hpp diff --git a/src/unicode/errors.cpp b/src/unicode/errors.cpp new file mode 100644 index 00000000..4739d58e --- /dev/null +++ b/src/unicode/errors.cpp @@ -0,0 +1,36 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#include + +namespace skyr::unicode { +namespace { +class unicode_error_category : public std::error_category { + public: + [[nodiscard]] const char *name() const noexcept override; + [[nodiscard]] std::string message(int error) const noexcept override; +}; + +const char *unicode_error_category::name() const noexcept { + return "unicode"; +} + +std::string unicode_error_category::message(int error) const noexcept { + switch (static_cast(error)) { + case unicode_errc::overflow:return "Overflow"; + case unicode_errc::invalid_lead:return "Invalid lead"; + case unicode_errc::illegal_byte_sequence:return "Illegal byte sequence"; + case unicode_errc::invalid_code_point:return "Invalid code point"; + default:return "(Unknown error)"; + } +} + +const unicode_error_category category{}; +} // namespace + +std::error_code make_error_code(unicode_errc error) { + return std::error_code(static_cast(error), category); +} +} // namespace skyr::unicode diff --git a/src/unicode/unicode.cpp b/src/unicode/unicode.cpp index eea72fbb..0fe4b6e4 100644 --- a/src/unicode/unicode.cpp +++ b/src/unicode/unicode.cpp @@ -5,47 +5,10 @@ #include +#include -namespace skyr::unicode { -namespace { -class unicode_error_category : public std::error_category { - public: - [[nodiscard]] const char *name() const noexcept override; - [[nodiscard]] 
std::string message(int error) const noexcept override; -}; - -const char *unicode_error_category::name() const noexcept { - return "unicode"; -} - -std::string unicode_error_category::message(int error) const noexcept { - switch (static_cast(error)) { - case unicode_errc::overflow:return "Overflow"; - case unicode_errc::invalid_lead:return "Invalid lead"; - case unicode_errc::illegal_byte_sequence:return "Illegal byte sequence"; - case unicode_errc::invalid_code_point:return "Invalid code point"; - default:return "(Unknown error)"; - } -} - -const unicode_error_category category{}; -} // namespace - -std::error_code make_error_code(unicode_errc error) { - return std::error_code(static_cast(error), category); -} - -tl::expected wstring_from_bytes( - std::string_view input) { - std::wstring result; - auto expected = copy_u8u16( - begin(input), end(input), std::back_inserter(result)); - if (!expected) { - return tl::make_unexpected(std::error_code(expected.error())); - } - return result; -} +namespace skyr::unicode { tl::expected wstring_to_bytes( std::wstring_view input) { std::string result; @@ -59,13 +22,7 @@ tl::expected wstring_to_bytes( tl::expected utf16_from_bytes( std::string_view bytes) { - std::u16string result; - auto expected = copy_u8u16( - begin(bytes), end(bytes), std::back_inserter(result)); - if (!expected) { - return tl::make_unexpected(std::error_code(expected.error())); - } - return result; + return u16string(bytes | view::u16); } tl::expected utf16_to_bytes( @@ -81,13 +38,7 @@ tl::expected utf16_to_bytes( tl::expected utf32_from_bytes( std::string_view bytes) { - std::u32string result; - auto expected = copy_u8u32( - begin(bytes), end(bytes), std::back_inserter(result)); - if (!expected) { - return tl::make_unexpected(std::error_code(expected.error())); - } - return result; + return u32string(bytes | view::u32); } tl::expected utf32_to_bytes( diff --git a/tests/unicode_range_tests.cpp b/tests/unicode_range_tests.cpp index bbdaf1fa..b7216468 
100644 --- a/tests/unicode_range_tests.cpp +++ b/tests/unicode_range_tests.cpp @@ -8,7 +8,7 @@ #define CATCH_CONFIG_MAIN #include #include -#include +#include #include #include @@ -19,7 +19,7 @@ TEST_CASE("code point tests") { SECTION("u8 code point 01") { auto bytes = std::string("\xf0\x9f\x92\xa9"); - auto cp = skyr::unicode::code_point_octets(bytes); + auto cp = skyr::unicode::u8_code_point(bytes); REQUIRE(cp); CHECK(std::string("\xf0\x9f\x92\xa9") == std::string(begin(cp.value()), end(cp.value()))); CHECK(U'\x1f4a9' == u32(cp.value())); @@ -30,13 +30,13 @@ TEST_CASE("code point tests") { SECTION("u8 code point 02") { auto bytes = std::string("\x9f\x92\xa9"); - auto cp = skyr::unicode::code_point_octets(bytes); + auto cp = skyr::unicode::u8_code_point(bytes); REQUIRE(!cp); } } TEST_CASE("octet range iterator") { - using iterator_type = skyr::unicode::octet_range_iterator; + using iterator_type = skyr::unicode::u8_range_iterator; SECTION("construction") { auto bytes = std::string("\xf0\x9f\x92\xa9"); @@ -131,24 +131,24 @@ TEST_CASE("u8 range") { SECTION("construction") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto view = skyr::unicode::view_octet_range(bytes); + auto view = skyr::unicode::view_u8_range(bytes); CHECK(begin(view) != end(view)); } SECTION("empty") { - auto view = skyr::unicode::view_octet_range(); + auto view = skyr::unicode::view_u8_range(); CHECK(begin(view) == end(view)); } SECTION("count") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto view = skyr::unicode::view_octet_range(bytes); + auto view = skyr::unicode::view_u8_range(bytes); CHECK(4 == ranges::size(view)); CHECK(!ranges::empty(view)); } SECTION("empty count") { - auto view = skyr::unicode::view_octet_range(); + auto view = skyr::unicode::view_u8_range(); CHECK(0 == ranges::size(view)); CHECK(ranges::empty(view)); } @@ -172,7 +172,7 @@ TEST_CASE("u8 range") { auto bytes = 
std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); auto view = bytes | skyr::unicode::view::u8; auto it = std::begin(view), last = std::end(view); - *it++; + CHECK(!*it++); CHECK(it == last); CHECK(1 == ranges::size(view)); CHECK(!ranges::empty(view)); @@ -192,6 +192,12 @@ TEST_CASE("u8 range") { CHECK(U"\x1F3F3\xFE0F\x200D\x1F308" == u32.value()); } + SECTION("pipe syntax with u16 string invalid") { + auto bytes = std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); + auto u16 = skyr::unicode::u16string(bytes | skyr::unicode::view::u16); + CHECK(!u16); + } + SECTION("pipe syntax with u32 string invalid") { auto bytes = std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); auto u32 = skyr::unicode::u32string(bytes | skyr::unicode::view::u32); From e0db27a230d109731583751f346af9245f00456b Mon Sep 17 00:00:00 2001 From: Glyn Matthews Date: Fri, 27 Sep 2019 08:16:06 +0800 Subject: [PATCH 05/10] Added conversion from u32 to bytes implementation --- include/skyr/unicode/core.hpp | 28 ++-- include/skyr/unicode/range/u16_range.hpp | 2 +- include/skyr/unicode/range/u32_range.hpp | 196 ++++++++++++++++++++++- src/unicode/unicode.cpp | 5 + tests/unicode_range_tests.cpp | 27 ++-- 5 files changed, 231 insertions(+), 27 deletions(-) diff --git a/include/skyr/unicode/core.hpp b/include/skyr/unicode/core.hpp index 61c385e8..8a4f8fe8 100644 --- a/include/skyr/unicode/core.hpp +++ b/include/skyr/unicode/core.hpp @@ -337,22 +337,20 @@ tl::expected append_bytes( return tl::make_unexpected(make_error_code(unicode_errc::invalid_code_point)); } - auto value = static_cast(code_point); - - if (value < 0x80u) { // one octet - *(octet_it++) = static_cast(value); - } else if (value < 0x800u) { // two octets - *(octet_it++) = static_cast((value >> 6u) | 0xc0u); - *(octet_it++) = static_cast((value & 0x3fu) | 0x80u); - } else if (value < 0x10000u) { // three octets - *(octet_it++) = static_cast((value >> 12u) | 0xe0u); - *(octet_it++) = 
static_cast(((value >> 6u) & 0x3fu) | 0x80u); - *(octet_it++) = static_cast((value & 0x3fu) | 0x80u); + if (code_point < 0x80u) { // one octet + *(octet_it++) = static_cast(code_point); + } else if (code_point < 0x800u) { // two octets + *(octet_it++) = static_cast((code_point >> 6u) | 0xc0u); + *(octet_it++) = static_cast((code_point & 0x3fu) | 0x80u); + } else if (code_point < 0x10000u) { // three octets + *(octet_it++) = static_cast((code_point >> 12u) | 0xe0u); + *(octet_it++) = static_cast(((code_point >> 6u) & 0x3fu) | 0x80u); + *(octet_it++) = static_cast((code_point & 0x3fu) | 0x80u); } else { // four octets - *(octet_it++) = static_cast((value >> 18u) | 0xf0u); - *(octet_it++) = static_cast(((value >> 12u) & 0x3fu) | 0x80u); - *(octet_it++) = static_cast(((value >> 6u) & 0x3fu) | 0x80u); - *(octet_it++) = static_cast((value & 0x3fu) | 0x80u); + *(octet_it++) = static_cast((code_point >> 18u) | 0xf0u); + *(octet_it++) = static_cast(((code_point >> 12u) & 0x3fu) | 0x80u); + *(octet_it++) = static_cast(((code_point >> 6u) & 0x3fu) | 0x80u); + *(octet_it++) = static_cast((code_point & 0x3fu) | 0x80u); } return octet_it; } diff --git a/include/skyr/unicode/range/u16_range.hpp b/include/skyr/unicode/range/u16_range.hpp index a7fe96b4..24808cdd 100644 --- a/include/skyr/unicode/range/u16_range.hpp +++ b/include/skyr/unicode/range/u16_range.hpp @@ -72,7 +72,7 @@ inline u16_code_point_t u16_code_point(char32_t code_point) { /// \return template inline u16_code_point_t u16(u8_code_point_t code_point) { - return u16_code_point(u32(code_point)); + return u16_code_point(details::u32(code_point)); } /// diff --git a/include/skyr/unicode/range/u32_range.hpp b/include/skyr/unicode/range/u32_range.hpp index 3c66f915..9fc918b8 100644 --- a/include/skyr/unicode/range/u32_range.hpp +++ b/include/skyr/unicode/range/u32_range.hpp @@ -16,15 +16,17 @@ #include namespace skyr::unicode { +namespace details { /// /// \tparam OctetIterator /// \param code_point /// \return -template 
+template inline char32_t u32(u8_code_point_t code_point) { auto state = find_code_point(std::begin(code_point)); return state ? state.value().value : U'\x0000'; } +} // namespace details /// /// \tparam OctetIterator @@ -80,7 +82,7 @@ class u32_range_iterator { reference operator * () const noexcept { return (*it_) .and_then([] (auto code_point) -> value_type { - return u32(code_point); + return details::u32(code_point); }); } @@ -200,9 +202,187 @@ struct u32_range_fn { }; +namespace u32 { +template +class byte_iterator { + + public: + + using value_type = tl::expected; + using reference = value_type; + using difference_type = std::ptrdiff_t; + + byte_iterator() = default; + + byte_iterator(U32Iterator first, U32Iterator last) + : it_(first), last_(last) {} + + byte_iterator(const byte_iterator &) = default; + byte_iterator(byte_iterator &&) noexcept = default; + byte_iterator &operator=(const byte_iterator &) = default; + byte_iterator &operator=(byte_iterator &&) noexcept = default; + ~byte_iterator() = default; + + byte_iterator &operator++() { + increment(); + return *this; + } + + byte_iterator operator++(int) { + auto result = *this; + increment(); + return result; + } + + reference operator*() { + auto code_point = *it_; + + if (!is_valid_code_point(code_point)) { + return tl::make_unexpected(make_error_code(unicode_errc::invalid_code_point)); + } + + if (code_point < 0x80u) { + return static_cast(code_point); + } else if (code_point < 0x800u) { + if (octet_index_ == 0) { + return static_cast((code_point >> 6u) | 0xc0u); + } else if (octet_index_ == 1) { + return static_cast((code_point & 0x3fu) | 0x80u); + } + } else if (code_point < 0x10000u) { + if (octet_index_ == 0) { + return static_cast((code_point >> 12u) | 0xe0u); + } else if (octet_index_ == 1) { + return static_cast(((code_point >> 6u) & 0x3fu) | 0x80u); + } else if (octet_index_ == 2) { + return static_cast((code_point & 0x3fu) | 0x80u); + } + } else { + if (octet_index_ == 0) { + return 
static_cast((code_point >> 18u) | 0xf0u); + } else if (octet_index_ == 1) { + return static_cast(((code_point >> 12u) & 0x3fu) | 0x80u); + } else if (octet_index_ == 2) { + return static_cast(((code_point >> 6u) & 0x3fu) | 0x80u); + } else if (octet_index_ == 3) { + return static_cast((code_point & 0x3fu) | 0x80u); + } + } + return tl::make_unexpected(make_error_code(unicode_errc::invalid_code_point)); + } + + constexpr bool operator == (const byte_iterator &other) const noexcept { + return (it_ == other.it_) && (octet_index_ == other.octet_index_); + } + + constexpr bool operator != (const byte_iterator &other) const noexcept { + return !(*this == other); + } + + private: + + constexpr auto octet_count(char32_t code_point) { + if (code_point < 0x80u) { + return 1; + } else if (code_point < 0x800u) { + return 2; + } else if (code_point < 0x10000u) { + return 3; + } else { + return 4; + }; + } + + void increment() { + if (**this) { + ++octet_index_; + if (octet_index_ == octet_count(*it_)) { + octet_index_ = 0; + ++it_; + } + } + else { + it_ = last_; + } + } + + U32Iterator it_, last_; + int octet_index_ = 0; + +}; + +template +class view_byte_range { + + using iterator_type = byte_iterator; + + public: + + /// + using value_type = tl::expected; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using const_iterator = iterator_type; + /// + using iterator = const_iterator; + /// + using size_type = std::size_t; + + view_byte_range() = default; + + explicit view_byte_range( + const U32Range &range) + : first(iterator_type{std::begin(range), std::end(range)}) + , last(iterator_type{std::end(range), std::end(range)}) {} + + const_iterator begin() const { + return first? first.value() : iterator_type(); + } + + const_iterator end() const { + return last? 
last.value() : iterator_type(); + } + + bool empty() const noexcept { + return begin() == end(); + } + + private: + + std::optional first, last; + +}; + +/// +struct byte_range_fn { + /// + /// \tparam U32Range + /// \param range + /// \return + template + constexpr auto operator()(U32Range &&range) const { + return view_byte_range{std::forward(range)}; + } + + /// + /// \tparam U32Range + /// \param range + /// \return + template + friend constexpr auto operator|(U32Range &&range, const byte_range_fn&) { + return view_byte_range{std::forward(range)}; + } + +}; +} // namespace u32 + namespace view { /// static constexpr u32_range_fn u32; +static constexpr u32::byte_range_fn bytes; } // namespace view template @@ -217,6 +397,18 @@ tl::expected u32string(U32Range &&range) { } return result; } + +template +tl::expected bytes(OctetRange &&range) { + auto result = std::string(); + for (auto &&octet : range) { + if (!octet) { + return tl::make_unexpected(octet.error()); + } + result.push_back(octet.value()); + } + return result; +} } // namespace skyr::unicode #endif //SKYR_U32_RANGE_HPP diff --git a/src/unicode/unicode.cpp b/src/unicode/unicode.cpp index 0fe4b6e4..7d760275 100644 --- a/src/unicode/unicode.cpp +++ b/src/unicode/unicode.cpp @@ -9,6 +9,11 @@ namespace skyr::unicode { +tl::expected wstring_from_bytes( + std::string_view bytes) { + return wstring(bytes | view::u16); +} + tl::expected wstring_to_bytes( std::wstring_view input) { std::string result; diff --git a/tests/unicode_range_tests.cpp b/tests/unicode_range_tests.cpp index b7216468..47c9a4ba 100644 --- a/tests/unicode_range_tests.cpp +++ b/tests/unicode_range_tests.cpp @@ -22,7 +22,7 @@ TEST_CASE("code point tests") { auto cp = skyr::unicode::u8_code_point(bytes); REQUIRE(cp); CHECK(std::string("\xf0\x9f\x92\xa9") == std::string(begin(cp.value()), end(cp.value()))); - CHECK(U'\x1f4a9' == u32(cp.value())); + CHECK(U'\x1f4a9' == skyr::unicode::details::u32(cp.value())); 
CHECK(u16(cp.value()).is_surrogate_pair()); CHECK(u'\xd83d' == u16(cp.value()).lead_value()); CHECK(u'\xdca9' == u16(cp.value()).trail_value()); @@ -43,7 +43,7 @@ TEST_CASE("octet range iterator") { auto it = iterator_type(std::begin(bytes), std::end(bytes)); auto code_point = *it; REQUIRE(code_point); - CHECK(U'\x1F4A9' == u32(code_point.value())); + CHECK(U'\x1F4A9' == skyr::unicode::details::u32(code_point.value())); } SECTION("increment") { @@ -51,11 +51,11 @@ TEST_CASE("octet range iterator") { auto it = iterator_type(std::begin(bytes), std::end(bytes)); auto code_point = *it; REQUIRE(code_point); - CHECK(U'\x1F3F3' == u32(code_point.value())); + CHECK(U'\x1F3F3' == skyr::unicode::details::u32(code_point.value())); ++it; code_point = *it; REQUIRE(code_point); - CHECK(U'\xFE0F' == u32(code_point.value())); + CHECK(U'\xFE0F' == skyr::unicode::details::u32(code_point.value())); } SECTION("increment invalid") { @@ -94,12 +94,12 @@ TEST_CASE("octet range iterator") { { auto code_point = *it++; REQUIRE(code_point); - CHECK(U'\x65e5' == u32(code_point.value())); + CHECK(U'\x65e5' == skyr::unicode::details::u32(code_point.value())); } { auto code_point = *it++; REQUIRE(code_point); - CHECK(U'\x448' == u32(code_point.value())); + CHECK(U'\x448' == skyr::unicode::details::u32(code_point.value())); } } @@ -110,17 +110,17 @@ TEST_CASE("octet range iterator") { { auto code_point = *it++; REQUIRE(code_point); - CHECK(U'\x10346' == u32(code_point.value())); + CHECK(U'\x10346' == skyr::unicode::details::u32(code_point.value())); } { auto code_point = *it++; REQUIRE(code_point); - CHECK(U'\x65e5' == u32(code_point.value())); + CHECK(U'\x65e5' == skyr::unicode::details::u32(code_point.value())); } { auto code_point = *it++; REQUIRE(code_point); - CHECK(U'\x448' == u32(code_point.value())); + CHECK(U'\x448' == skyr::unicode::details::u32(code_point.value())); } } } @@ -204,3 +204,12 @@ TEST_CASE("u8 range") { CHECK(!u32); } } + +TEST_CASE("write bytes") { + 
SECTION("append_bytes") { + auto input = std::u32string(U"\x1F3F3\xFE0F\x200D\x1F308"); + auto bytes = skyr::unicode::bytes(input | skyr::unicode::view::bytes); + REQUIRE(bytes); + CHECK("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88" == bytes.value()); + } +} From 3e00231861bc48b5f5804c904e17a5e508348513 Mon Sep 17 00:00:00 2001 From: Glyn Matthews Date: Sat, 28 Sep 2019 08:25:38 +0800 Subject: [PATCH 06/10] moved code around --- include/skyr/unicode/code_point.hpp | 216 ++++++++++++ include/skyr/unicode/range/u16_range.hpp | 64 +--- include/skyr/unicode/range/u32_range.hpp | 50 +-- include/skyr/unicode/range/u8_range.hpp | 325 +----------------- .../skyr/unicode/range/unchecked_u8_range.hpp | 209 +++++++++++ src/unicode/unicode.cpp | 2 +- tests/CMakeLists.txt | 1 + tests/unicode_code_point_tests.cpp | 34 ++ tests/unicode_range_tests.cpp | 31 +- 9 files changed, 494 insertions(+), 438 deletions(-) create mode 100644 include/skyr/unicode/code_point.hpp create mode 100644 include/skyr/unicode/range/unchecked_u8_range.hpp create mode 100644 tests/unicode_code_point_tests.cpp diff --git a/include/skyr/unicode/code_point.hpp b/include/skyr/unicode/code_point.hpp new file mode 100644 index 00000000..96660c99 --- /dev/null +++ b/include/skyr/unicode/code_point.hpp @@ -0,0 +1,216 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_UNICODE_CODE_POINT_HPP +#define SKYR_UNICODE_CODE_POINT_HPP + + +namespace skyr::unicode { +/// This class defines a range over a code point in raw bytes, +/// according to UTF-8. 
+/// \tparam OctetIterator An iterator type over the raw bytes +template +class u8_code_point_t { + public: + + /// + using const_iterator = OctetIterator; + /// + using iterator = const_iterator; + /// + using value_type = char; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using size_type = std::size_t; + + /// \brief Constructor + /// \param first An iterator at the beginning of the code point + /// \param last An iterator at the end of the code point + constexpr u8_code_point_t( + OctetIterator first, + OctetIterator last) + : first(first), last(last) {} + + /// \brief Constructor. The length of the code point sequence is + /// inferred from the first code point value. + /// \param first An iterator at the beginning of the code point + explicit constexpr u8_code_point_t(OctetIterator first) + : u8_code_point_t(first, first + sequence_length(*first)) {} + + /// \brief Copy constructor. + constexpr u8_code_point_t(const u8_code_point_t &) = default; + /// \brief Move constructor. + constexpr u8_code_point_t(u8_code_point_t &&) noexcept = default; + /// \brief Copy assignment operator. + constexpr u8_code_point_t &operator=(const u8_code_point_t &) = default; + /// \brief Move assignment operator. + constexpr u8_code_point_t &operator=(u8_code_point_t &&) noexcept = default; + /// \brief Destructor. + ~u8_code_point_t() = default; + + /// + /// \return + [[nodiscard]] constexpr const_iterator begin() const noexcept { + return first; + } + + /// + /// \return + [[nodiscard]] constexpr const_iterator end() const noexcept { + return last; + } + + /// + /// \return + [[nodiscard]] constexpr auto cbegin() const noexcept { + return begin(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cend() const noexcept { + return end(); + } + + /// \brief Returns the length in bytes of this code point. 
+ /// \return + [[nodiscard]] constexpr auto size() const noexcept -> size_type { + return sequence_length(*first); + } + + private: + + OctetIterator first, last; + +}; + +/// +/// \tparam OctetRange +/// \param range +/// \return +template +inline tl::expected, std::error_code> u8_code_point( + const OctetRange &range) { + auto first = std::begin(range); + if (ranges::distance(range) > sequence_length(*first)) { + return tl::make_unexpected(make_error_code(unicode_errc::overflow)); + } + return u8_code_point_t( + first, + first + sequence_length(*first)); +} + + +/// Tests if the code point value is valid. +/// \returns \c true if the value is a valid code point, \c false otherwise +template +inline bool is_valid(const u8_code_point_t &code_point) { + return static_cast(find_code_point(std::begin(code_point))); +} + +/// +/// \tparam OctetRange +/// \param range +/// \return +template +inline tl::expected, std::error_code> valid_u8_code_point( + const OctetRange &range) { + using result_type = tl::expected, std::error_code>; + + auto check_code_point = [] (auto &&code_point) -> result_type { + return find_code_point(std::begin(code_point)) + .and_then([=] (auto) -> result_type { + return code_point; + }); + }; + + return + u8_code_point(range) + .and_then(check_code_point); +} + +namespace details { +/// +/// \tparam OctetIterator +/// \param code_point +/// \return +template +inline char32_t u32(u8_code_point_t code_point) { + auto state = find_code_point(std::begin(code_point)); + return state ? 
state.value().value : U'\x0000'; +} +} // namespace details + +/// +class u16_code_point_t { + + public: + + /// + /// \param first + explicit constexpr u16_code_point_t(char32_t code_point) + : code_point_(code_point) {} + + /// + constexpr u16_code_point_t(const u16_code_point_t &) = default; + /// + constexpr u16_code_point_t(u16_code_point_t &&) noexcept = default; + /// + u16_code_point_t &operator=(const u16_code_point_t &) = default; + /// + u16_code_point_t &operator=(u16_code_point_t &&) noexcept = default; + /// + ~u16_code_point_t() = default; + + /// + /// \return + [[nodiscard]] uint16_t lead_value() const { + return is_surrogate_pair()? + static_cast((code_point_ >> 10U) + constants::surrogates::lead_offset) : + static_cast(code_point_); + } + + /// + /// \return + [[nodiscard]] uint16_t trail_value() const { + return is_surrogate_pair()? + static_cast((code_point_ & 0x3ffU) + constants::surrogates::trail_min) : + 0; + } + + /// + /// \return + [[nodiscard]] constexpr bool is_surrogate_pair() const noexcept { + return code_point_ > 0xffffU; + } + + private: + + char32_t code_point_; + +}; + +/// +/// \param code_point +/// \return +inline u16_code_point_t u16_code_point(char32_t code_point) { + return u16_code_point_t(code_point); +} + +/// +/// \tparam OctetIterator +/// \param code_point +/// \return +template +inline u16_code_point_t u16(u8_code_point_t code_point) { + return u16_code_point(details::u32(code_point)); +} +} // namespace skyr::unicode + + +#endif //SKYR_UNICODE_CODE_POINT_HPP diff --git a/include/skyr/unicode/range/u16_range.hpp b/include/skyr/unicode/range/u16_range.hpp index 24808cdd..e3f891dd 100644 --- a/include/skyr/unicode/range/u16_range.hpp +++ b/include/skyr/unicode/range/u16_range.hpp @@ -3,8 +3,8 @@ // (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) -#ifndef SKYR_U16_RANGE_HPP -#define SKYR_U16_RANGE_HPP +#ifndef SKYR_UNICODE_U16_RANGE_HPP +#define SKYR_UNICODE_U16_RANGE_HPP 
#include #include @@ -17,64 +17,6 @@ #include namespace skyr::unicode { -class u16_code_point_t { - - public: - - /// - /// \param first - explicit constexpr u16_code_point_t(char32_t code_point) - : code_point_(code_point) {} - - /// - constexpr u16_code_point_t(const u16_code_point_t &) = default; - /// - constexpr u16_code_point_t(u16_code_point_t &&) noexcept = default; - /// - u16_code_point_t &operator=(const u16_code_point_t &) = default; - /// - u16_code_point_t &operator=(u16_code_point_t &&) noexcept = default; - /// - ~u16_code_point_t() = default; - - [[nodiscard]] uint16_t lead_value() const { - return is_surrogate_pair()? - static_cast((code_point_ >> 10U) + constants::surrogates::lead_offset) : - static_cast(code_point_); - } - - [[nodiscard]] uint16_t trail_value() const { - return is_surrogate_pair()? - static_cast((code_point_ & 0x3ffU) + constants::surrogates::trail_min) : - 0; - } - - [[nodiscard]] constexpr bool is_surrogate_pair() const noexcept { - return code_point_ > 0xffffU; - } - - private: - - char32_t code_point_; - -}; - -/// -/// \param code_point -/// \return -inline u16_code_point_t u16_code_point(char32_t code_point) { - return u16_code_point_t(code_point); -} - -/// -/// \tparam OctetIterator -/// \param code_point -/// \return -template -inline u16_code_point_t u16(u8_code_point_t code_point) { - return u16_code_point(details::u32(code_point)); -} - /// /// \tparam OctetIterator template @@ -293,4 +235,4 @@ tl::expected wstring(U16Range &&range) { } } // namespace skyr::unicode -#endif //SKYR_U16_RANGE_HPP +#endif //SKYR_UNICODE_U16_RANGE_HPP diff --git a/include/skyr/unicode/range/u32_range.hpp b/include/skyr/unicode/range/u32_range.hpp index 9fc918b8..fef0501f 100644 --- a/include/skyr/unicode/range/u32_range.hpp +++ b/include/skyr/unicode/range/u32_range.hpp @@ -3,8 +3,8 @@ // (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) -#ifndef SKYR_U32_RANGE_HPP -#define SKYR_U32_RANGE_HPP 
+#ifndef SKYR_UNICODE_U32_RANGE_HPP +#define SKYR_UNICODE_U32_RANGE_HPP #include #include @@ -16,18 +16,6 @@ #include namespace skyr::unicode { -namespace details { -/// -/// \tparam OctetIterator -/// \param code_point -/// \return -template -inline char32_t u32(u8_code_point_t code_point) { - auto state = find_code_point(std::begin(code_point)); - return state ? state.value().value : U'\x0000'; -} -} // namespace details - /// /// \tparam OctetIterator template @@ -281,7 +269,7 @@ class byte_iterator { private: - constexpr auto octet_count(char32_t code_point) { + static constexpr auto octet_count(char32_t code_point) { if (code_point < 0x80u) { return 1; } else if (code_point < 0x800u) { @@ -311,6 +299,7 @@ class byte_iterator { }; +/// template class view_byte_range { @@ -331,21 +320,30 @@ class view_byte_range { /// using size_type = std::size_t; + /// view_byte_range() = default; + /// + /// \param range explicit view_byte_range( const U32Range &range) : first(iterator_type{std::begin(range), std::end(range)}) , last(iterator_type{std::end(range), std::end(range)}) {} + /// + /// \return const_iterator begin() const { return first? first.value() : iterator_type(); } + /// + /// \return const_iterator end() const { return last? 
last.value() : iterator_type(); } + /// + /// \return bool empty() const noexcept { return begin() == end(); } @@ -385,10 +383,10 @@ static constexpr u32_range_fn u32; static constexpr u32::byte_range_fn bytes; } // namespace view -template -tl::expected u32string(U32Range &&range) { - auto result = std::u32string(); - result.reserve(ranges::size(range)); +template +tl::expected as(InputRange &&range) { + auto result = Output{}; +// result.reserve(ranges::size(range)); for (auto &&code_point : range) { if (!code_point) { return tl::make_unexpected(code_point.error()); @@ -397,18 +395,6 @@ tl::expected u32string(U32Range &&range) { } return result; } - -template -tl::expected bytes(OctetRange &&range) { - auto result = std::string(); - for (auto &&octet : range) { - if (!octet) { - return tl::make_unexpected(octet.error()); - } - result.push_back(octet.value()); - } - return result; -} } // namespace skyr::unicode -#endif //SKYR_U32_RANGE_HPP +#endif //SKYR_UNICODE_U32_RANGE_HPP diff --git a/include/skyr/unicode/range/u8_range.hpp b/include/skyr/unicode/range/u8_range.hpp index f20ec7ed..813a6116 100644 --- a/include/skyr/unicode/range/u8_range.hpp +++ b/include/skyr/unicode/range/u8_range.hpp @@ -3,224 +3,21 @@ // (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) -#ifndef SKYR_U8_RANGE_HPP -#define SKYR_U8_RANGE_HPP +#ifndef SKYR_UNICODE_U8_RANGE_HPP +#define SKYR_UNICODE_U8_RANGE_HPP #include #include #include #include #include +#include #include #include +#include +#include namespace skyr::unicode { -/// This class defines a range over a code point in raw bytes, -/// according to UTF-8. 
-/// \tparam OctetIterator An iterator type over the raw bytes -template -class u8_code_point_t { - public: - - /// - using const_iterator = OctetIterator; - /// - using iterator = const_iterator; - /// - using value_type = char; - /// - using const_reference = value_type; - /// - using reference = const_reference; - /// - using size_type = std::size_t; - - /// \brief Constructor - /// \param first An iterator at the beginning of the code point - /// \param last An iterator at the end of the code point - constexpr u8_code_point_t( - OctetIterator first, - OctetIterator last) - : first(first) - , last(last) {} - - /// \brief Constructor. The length of the code point sequence is - /// inferred from the first code point value. - /// \param first An iterator at the beginning of the code point - explicit constexpr u8_code_point_t(OctetIterator first) - : u8_code_point_t(first, first + sequence_length(*first)) {} - - /// \brief Copy constructor. - constexpr u8_code_point_t(const u8_code_point_t &) = default; - /// \brief Move constructor. - constexpr u8_code_point_t(u8_code_point_t &&) noexcept = default; - /// \brief Copy assignment operator. - constexpr u8_code_point_t &operator=(const u8_code_point_t &) = default; - /// \brief Move assignment operator. - constexpr u8_code_point_t &operator=(u8_code_point_t &&) noexcept = default; - /// \brief Destructor. - ~u8_code_point_t() = default; - - /// - /// \return - [[nodiscard]] constexpr const_iterator begin() const noexcept { - return first; - } - - /// - /// \return - [[nodiscard]] constexpr const_iterator end() const noexcept { - return last; - } - - /// - /// \return - [[nodiscard]] constexpr auto cbegin() const noexcept { - return begin(); - } - - /// - /// \return - [[nodiscard]] constexpr auto cend() const noexcept { - return end(); - } - - /// \brief Returns the length in bytes of this code point. 
- /// \return - [[nodiscard]] constexpr auto size() const noexcept -> size_type { - return sequence_length(*first); - } - - private: - - OctetIterator first, last; - -}; - -/// -/// \tparam OctetRange -/// \param range -/// \return -template -inline tl::expected, std::error_code> u8_code_point( - const OctetRange &range) { - auto first = std::begin(range), last = std::end(range); - if (std::distance(first, last) > sequence_length(*first)) { - return tl::make_unexpected(make_error_code(unicode_errc::overflow)); - } - return u8_code_point_t( - first, - first + sequence_length(*first)); -} - -/// Tests if the code point value is valid. -/// \returns \c true if the value is a valid code point, \c false otherwise -template -inline bool is_valid(const u8_code_point_t &code_point) { - return static_cast(find_code_point(std::begin(code_point))); -} - -/// -/// \tparam OctetRange -/// \param range -/// \return -template -inline tl::expected, std::error_code> valid_u8_code_point( - const OctetRange &range) { - using result_type = tl::expected, std::error_code>; - - auto check_code_point = [] (auto &&code_point) -> result_type { - return find_code_point(std::begin(code_point)) - .and_then([=] (auto) -> result_type { - return code_point; - }); - }; - - return - u8_code_point(range) - .and_then(check_code_point); -} - -/// -/// \tparam OctetIterator -template -class unchecked_u8_range_iterator { - public: - - /// - using iterator_category = std::forward_iterator_tag; - /// - using value_type = u8_code_point_t; - /// - using reference = value_type; - /// - using pointer = typename std::add_pointer::type; - /// - using difference_type = std::ptrdiff_t; - - /// - constexpr unchecked_u8_range_iterator() = default; - /// - /// \param it - explicit constexpr unchecked_u8_range_iterator(OctetIterator it) - : it_(it) {} - /// - constexpr unchecked_u8_range_iterator(const unchecked_u8_range_iterator&) = default; - /// - constexpr 
unchecked_u8_range_iterator(unchecked_u8_range_iterator&&) noexcept = default; - /// - constexpr unchecked_u8_range_iterator &operator=(const unchecked_u8_range_iterator&) = default; - /// - constexpr unchecked_u8_range_iterator &operator=(unchecked_u8_range_iterator&&) noexcept = default; - /// - ~unchecked_u8_range_iterator() = default; - - /// - /// \return - unchecked_u8_range_iterator operator ++ (int) { - assert(it_); - auto result = *this; - std::advance(it_.value(), sequence_length(*it_.value())); - return result; - } - - /// - /// \return - unchecked_u8_range_iterator &operator ++ () { - assert(it_); - std::advance(it_.value(), sequence_length(*it_.value())); - return *this; - } - - /// - /// \return - constexpr reference operator * () const noexcept { - assert(it_); - return u8_code_point_t( - it_.value(), - it_.value() + sequence_length(*it_.value())); - } - - /// - /// \param other - /// \return - constexpr bool operator == (const unchecked_u8_range_iterator &other) const noexcept { - return it_ == other.it_; - } - - /// - /// \param other - /// \return - constexpr bool operator != (const unchecked_u8_range_iterator &other) const noexcept { - return !(*this == other); - } - - private: - - std::optional it_; - -}; - /// /// \tparam OctetIterator template @@ -311,91 +108,6 @@ class u8_range_iterator { }; -/// -/// \tparam OctetRange -template -class view_unchecked_u8_range - : public ranges::view_base { - - using octet_iterator_type = typename OctetRange::const_iterator; - using iterator_type = unchecked_u8_range_iterator; - - public: - - /// - using value_type = u8_code_point_t; - /// - using const_reference = value_type; - /// - using reference = const_reference; - /// - using const_iterator = iterator_type; - /// - using iterator = const_iterator; - /// - using size_type = std::size_t; - - /// Default constructor - constexpr view_unchecked_u8_range() = default; - - /// - /// \param range - explicit constexpr view_unchecked_u8_range(const OctetRange 
&range) - : impl_( - impl(std::begin(range), - std::end(range))) {} - - /// - /// \return - [[nodiscard]] constexpr const_iterator begin() const noexcept { - return impl_? impl_.value().first : iterator_type(); - } - - /// - /// \return - [[nodiscard]] constexpr const_iterator end() const noexcept { - return impl_? impl_.value().last : iterator_type(); - } - - /// - /// \return - [[nodiscard]] constexpr auto cbegin() const noexcept { - return begin(); - } - - /// - /// \return - [[nodiscard]] constexpr auto cend() const noexcept { - return end(); - } - - /// - /// \return - [[nodiscard]] constexpr bool empty() const noexcept { - return begin() == end(); - } - - /// - /// \return - [[nodiscard]] size_type size() const noexcept { - return static_cast(std::distance(begin(), end())); - } - - private: - - struct impl { - constexpr impl( - octet_iterator_type first, - octet_iterator_type last) - : first(first) - , last(last) {} - iterator_type first, last; - }; - - std::optional impl_; - -}; - /// /// \tparam OctetRange template @@ -407,6 +119,8 @@ class view_u8_range public: +// using iterator_tag = std::forward_iterator_tag; + /// using value_type = u8_code_point_t; /// @@ -481,27 +195,6 @@ class view_u8_range }; -/// -struct unchecked_u8_range_fn { - /// - /// \tparam OctetRange - /// \param range - /// \return - template - constexpr auto operator()(OctetRange &&range) const { - return view_unchecked_u8_range{std::forward(range)}; - } - - /// - /// \tparam OctetRange - /// \param range - /// \return - template - friend constexpr auto operator|(OctetRange &&range, const unchecked_u8_range_fn&) { - return view_unchecked_u8_range{std::forward(range)}; - } -}; - /// struct u8_range_fn { /// @@ -525,10 +218,8 @@ struct u8_range_fn { namespace view { /// -static constexpr unchecked_u8_range_fn unchecked_u8; -/// static constexpr u8_range_fn u8; } // namespace view } // namespace skyr::unicode -#endif //SKYR_U8_RANGE_HPP +#endif // SKYR_UNICODE_U8_RANGE_HPP diff --git 
a/include/skyr/unicode/range/unchecked_u8_range.hpp b/include/skyr/unicode/range/unchecked_u8_range.hpp new file mode 100644 index 00000000..bb20e66f --- /dev/null +++ b/include/skyr/unicode/range/unchecked_u8_range.hpp @@ -0,0 +1,209 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_UNICODE_UNCHECKED_U8_RANGE_HPP +#define SKYR_UNICODE_UNCHECKED_U8_RANGE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace skyr::unicode { +/// +/// \tparam OctetIterator +template +class unchecked_u8_range_iterator { + public: + + /// + using iterator_category = std::forward_iterator_tag; + /// + using value_type = u8_code_point_t; + /// + using reference = value_type; + /// + using pointer = typename std::add_pointer::type; + /// + using difference_type = std::ptrdiff_t; + + /// + constexpr unchecked_u8_range_iterator() = default; + /// + /// \param it + explicit constexpr unchecked_u8_range_iterator(OctetIterator it) + : it_(it) {} + /// + constexpr unchecked_u8_range_iterator(const unchecked_u8_range_iterator&) = default; + /// + constexpr unchecked_u8_range_iterator(unchecked_u8_range_iterator&&) noexcept = default; + /// + constexpr unchecked_u8_range_iterator &operator=(const unchecked_u8_range_iterator&) = default; + /// + constexpr unchecked_u8_range_iterator &operator=(unchecked_u8_range_iterator&&) noexcept = default; + /// + ~unchecked_u8_range_iterator() = default; + + /// + /// \return + unchecked_u8_range_iterator operator ++ (int) { + auto result = *this; + std::advance(it_, sequence_length(*it_)); + return result; + } + + /// + /// \return + unchecked_u8_range_iterator &operator ++ () { + std::advance(it_, sequence_length(*it_)); + return *this; + } + + /// + /// \return + constexpr reference operator * () const noexcept { + return u8_code_point_t( + 
it_, + it_ + sequence_length(*it_)); + } + + /// + /// \param other + /// \return + constexpr bool operator == (const unchecked_u8_range_iterator &other) const noexcept { + return it_ == other.it_; + } + + /// + /// \param other + /// \return + constexpr bool operator != (const unchecked_u8_range_iterator &other) const noexcept { + return !(*this == other); + } + + private: + + OctetIterator it_; + +}; + +/// +/// \tparam OctetRange +template +class view_unchecked_u8_range + : public ranges::view_base { + + using octet_iterator_type = typename OctetRange::const_iterator; + using iterator_type = unchecked_u8_range_iterator; + + public: + + /// + using value_type = u8_code_point_t; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using const_iterator = iterator_type; + /// + using iterator = const_iterator; + /// + using size_type = std::size_t; + + /// Default constructor + constexpr view_unchecked_u8_range() = default; + + /// + /// \param range + explicit constexpr view_unchecked_u8_range(const OctetRange &range) + : impl_( + impl(std::begin(range), + std::end(range))) {} + + /// + /// \return + [[nodiscard]] constexpr const_iterator begin() const noexcept { + return impl_? impl_.value().first : iterator_type(); + } + + /// + /// \return + [[nodiscard]] constexpr const_iterator end() const noexcept { + return impl_? 
impl_.value().last : iterator_type(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cbegin() const noexcept { + return begin(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cend() const noexcept { + return end(); + } + + /// + /// \return + [[nodiscard]] constexpr bool empty() const noexcept { + return begin() == end(); + } + + /// + /// \return + [[nodiscard]] size_type size() const noexcept { + return static_cast(std::distance(begin(), end())); + } + + private: + + struct impl { + constexpr impl( + octet_iterator_type first, + octet_iterator_type last) + : first(first) + , last(last) {} + iterator_type first, last; + }; + + std::optional impl_; + +}; + +/// +struct unchecked_u8_range_fn { + /// + /// \tparam OctetRange + /// \param range + /// \return + template + constexpr auto operator()(OctetRange &&range) const { + return view_unchecked_u8_range{std::forward(range)}; + } + + /// + /// \tparam OctetRange + /// \param range + /// \return + template + friend constexpr auto operator|(OctetRange &&range, const unchecked_u8_range_fn&) { + return view_unchecked_u8_range{std::forward(range)}; + } +}; + +namespace view { +/// +static constexpr unchecked_u8_range_fn unchecked_u8; +} // namespace view +} // namespace skyr::unicode + +#endif //SKYR_UNICODE_UNCHECKED_U8_RANGE_HPP diff --git a/src/unicode/unicode.cpp b/src/unicode/unicode.cpp index 7d760275..1cdb27b9 100644 --- a/src/unicode/unicode.cpp +++ b/src/unicode/unicode.cpp @@ -43,7 +43,7 @@ tl::expected utf16_to_bytes( tl::expected utf32_from_bytes( std::string_view bytes) { - return u32string(bytes | view::u32); + return as(bytes | view::u32); } tl::expected utf32_to_bytes( diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index fc36ba81..f42d2a80 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -23,6 +23,7 @@ set( punycode_tests domain_tests unicode_tests + unicode_code_point_tests unicode_range_tests ) diff --git a/tests/unicode_code_point_tests.cpp 
b/tests/unicode_code_point_tests.cpp new file mode 100644 index 00000000..74125605 --- /dev/null +++ b/tests/unicode_code_point_tests.cpp @@ -0,0 +1,34 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#define CATCH_CONFIG_MAIN +#include +#include +#include + + +TEST_CASE("code point tests") { + using std::begin; + using std::end; + + SECTION("u8 code point 01") { + auto bytes = std::string("\xf0\x9f\x92\xa9"); + auto cp = skyr::unicode::u8_code_point(bytes); + REQUIRE(cp); + CHECK(std::string("\xf0\x9f\x92\xa9") == std::string(begin(cp.value()), end(cp.value()))); + CHECK(U'\x1f4a9' == skyr::unicode::details::u32(cp.value())); + CHECK(u16(cp.value()).is_surrogate_pair()); + CHECK(u'\xd83d' == u16(cp.value()).lead_value()); + CHECK(u'\xdca9' == u16(cp.value()).trail_value()); + } + + SECTION("u8 code point 02") { + auto bytes = std::string("\x9f\x92\xa9"); + auto cp = skyr::unicode::u8_code_point(bytes); + REQUIRE(!cp); + } +} diff --git a/tests/unicode_range_tests.cpp b/tests/unicode_range_tests.cpp index 47c9a4ba..556db638 100644 --- a/tests/unicode_range_tests.cpp +++ b/tests/unicode_range_tests.cpp @@ -13,28 +13,6 @@ #include -TEST_CASE("code point tests") { - using std::begin; - using std::end; - - SECTION("u8 code point 01") { - auto bytes = std::string("\xf0\x9f\x92\xa9"); - auto cp = skyr::unicode::u8_code_point(bytes); - REQUIRE(cp); - CHECK(std::string("\xf0\x9f\x92\xa9") == std::string(begin(cp.value()), end(cp.value()))); - CHECK(U'\x1f4a9' == skyr::unicode::details::u32(cp.value())); - CHECK(u16(cp.value()).is_surrogate_pair()); - CHECK(u'\xd83d' == u16(cp.value()).lead_value()); - CHECK(u'\xdca9' == u16(cp.value()).trail_value()); - } - - SECTION("u8 code point 02") { - auto bytes = std::string("\x9f\x92\xa9"); - auto cp = skyr::unicode::u8_code_point(bytes); - REQUIRE(!cp); - } -} - 
TEST_CASE("octet range iterator") { using iterator_type = skyr::unicode::u8_range_iterator; @@ -162,8 +140,7 @@ TEST_CASE("u8 range") { SECTION("pipe syntax with string_view") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto bytes_view = std::string_view(bytes); - auto view = bytes_view | skyr::unicode::view::u8; + auto view = std::string_view(bytes) | skyr::unicode::view::u8; CHECK(4 == ranges::size(view)); CHECK(!ranges::empty(view)); } @@ -187,7 +164,7 @@ TEST_CASE("u8 range") { SECTION("pipe syntax with u32 string") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto u32 = skyr::unicode::u32string(bytes | skyr::unicode::view::u32); + auto u32 = skyr::unicode::as(bytes | skyr::unicode::view::u32); REQUIRE(u32); CHECK(U"\x1F3F3\xFE0F\x200D\x1F308" == u32.value()); } @@ -200,7 +177,7 @@ TEST_CASE("u8 range") { SECTION("pipe syntax with u32 string invalid") { auto bytes = std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto u32 = skyr::unicode::u32string(bytes | skyr::unicode::view::u32); + auto u32 = skyr::unicode::as(bytes | skyr::unicode::view::u32); CHECK(!u32); } } @@ -208,7 +185,7 @@ TEST_CASE("u8 range") { TEST_CASE("write bytes") { SECTION("append_bytes") { auto input = std::u32string(U"\x1F3F3\xFE0F\x200D\x1F308"); - auto bytes = skyr::unicode::bytes(input | skyr::unicode::view::bytes); + auto bytes = skyr::unicode::as(input | skyr::unicode::view::bytes); REQUIRE(bytes); CHECK("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88" == bytes.value()); } From 3305837c8dc85a786f4ecb3a833d8e0a341c9dbc Mon Sep 17 00:00:00 2001 From: Glyn Matthews Date: Tue, 1 Oct 2019 14:53:28 +0800 Subject: [PATCH 07/10] Unicode conversions work now using the new syntax --- include/skyr/unicode/code_point.hpp | 141 +++++- include/skyr/unicode/constants.hpp | 8 +- include/skyr/unicode/core.hpp | 61 +-- include/skyr/unicode/range/traits.hpp | 29 ++ 
.../range/transforms/byte_transform.hpp | 220 ++++++++++ .../range/transforms/u16_transform.hpp | 226 ++++++++++ .../range/transforms/u32_transform.hpp | 217 ++++++++++ include/skyr/unicode/range/u32_range.hpp | 400 ------------------ .../{u16_range.hpp => views/u16_view.hpp} | 117 +++-- .../range/{u8_range.hpp => views/u8_view.hpp} | 10 +- .../unchecked_u8_view.hpp} | 3 +- include/skyr/unicode/unicode.hpp | 59 --- src/CMakeLists.txt | 11 +- src/unicode/unicode.cpp | 18 +- tests/unicode_code_point_tests.cpp | 28 +- tests/unicode_range_tests.cpp | 70 ++- tests/unicode_tests.cpp | 101 +++-- 17 files changed, 1024 insertions(+), 695 deletions(-) create mode 100644 include/skyr/unicode/range/traits.hpp create mode 100644 include/skyr/unicode/range/transforms/byte_transform.hpp create mode 100644 include/skyr/unicode/range/transforms/u16_transform.hpp create mode 100644 include/skyr/unicode/range/transforms/u32_transform.hpp delete mode 100644 include/skyr/unicode/range/u32_range.hpp rename include/skyr/unicode/range/{u16_range.hpp => views/u16_view.hpp} (59%) rename include/skyr/unicode/range/{u8_range.hpp => views/u8_view.hpp} (95%) rename include/skyr/unicode/range/{unchecked_u8_range.hpp => views/unchecked_u8_view.hpp} (97%) diff --git a/include/skyr/unicode/code_point.hpp b/include/skyr/unicode/code_point.hpp index 96660c99..c4b3a902 100644 --- a/include/skyr/unicode/code_point.hpp +++ b/include/skyr/unicode/code_point.hpp @@ -6,6 +6,11 @@ #ifndef SKYR_UNICODE_CODE_POINT_HPP #define SKYR_UNICODE_CODE_POINT_HPP +#include +#include +#include +#include +#include namespace skyr::unicode { /// This class defines a range over a code point in raw bytes, @@ -83,6 +88,13 @@ class u8_code_point_t { return sequence_length(*first); } + [[nodiscard]] tl::expected u32_value() const noexcept { + return find_code_point(first) + .and_then([] (auto state) -> tl::expected { + return state.value; + }); + } + private: OctetIterator first, last; @@ -134,17 +146,17 @@ inline 
tl::expected, std::e .and_then(check_code_point); } -namespace details { -/// -/// \tparam OctetIterator -/// \param code_point -/// \return -template -inline char32_t u32(u8_code_point_t code_point) { - auto state = find_code_point(std::begin(code_point)); - return state ? state.value().value : U'\x0000'; -} -} // namespace details +//namespace details { +///// +///// \tparam OctetIterator +///// \param code_point +///// \return +//template +//inline char32_t u32(u8_code_point_t code_point) { +// auto state = find_code_point(std::begin(code_point)); +// return state ? state.value().value : U'\x0000'; +//} +//} // namespace details /// class u16_code_point_t { @@ -152,9 +164,17 @@ class u16_code_point_t { public: /// - /// \param first + /// \param code_point explicit constexpr u16_code_point_t(char32_t code_point) - : code_point_(code_point) {} + : code_point_(code_point) {} + + /// + /// \param code_point + explicit constexpr u16_code_point_t(char16_t code_point) + : code_point_(code_point) {} + + constexpr u16_code_point_t(char16_t lead_value, char16_t trail_value) + : code_point_((lead_value << 10) + trail_value + constants::surrogates::offset) {} /// constexpr u16_code_point_t(const u16_code_point_t &) = default; @@ -189,6 +209,10 @@ class u16_code_point_t { return code_point_ > 0xffffU; } + tl::expected u32_value() const noexcept { + return code_point_; + } + private: char32_t code_point_; @@ -202,13 +226,100 @@ inline u16_code_point_t u16_code_point(char32_t code_point) { return u16_code_point_t(code_point); } +/// +/// \param code_point +/// \return +inline u16_code_point_t u16_code_point(char16_t code_point) { + return u16_code_point_t(code_point); +} + +/// +/// \param lead +/// \param value +/// \return +inline u16_code_point_t u16_code_point(char16_t lead, char16_t value) { + return u16_code_point_t(lead, value); +} + +/// +/// \tparam OctetIterator +/// \param code_point +/// \return +template +inline tl::expected u32_value( + u8_code_point_t 
code_point) noexcept { + return code_point.u32_value(); +} + +/// +/// \tparam OctetIterator +/// \param code_point +/// \return +template +inline tl::expected u32_value( + tl::expected, std::error_code> code_point) noexcept { + return code_point + .and_then([] (auto code_point) -> tl::expected { + return code_point.u32_value(); + }); +} + +/// +/// \param code_point +/// \return +inline tl::expected u32_value( + u16_code_point_t code_point) noexcept { + return code_point.u32_value(); +} + +/// +/// \param code_point +/// \return +inline tl::expected u32_value( + tl::expected code_point) noexcept { + return code_point + .and_then([] (auto code_point) -> tl::expected { + return code_point.u32_value(); + }); +} + +/// +/// \param code_point +/// \return +inline tl::expected u32_value( + char32_t code_point) noexcept { + return code_point; +} + +/// +/// \param code_point +/// \return +inline tl::expected u32_value( + tl::expected code_point) noexcept { + return code_point; +} + +/// +/// \tparam OctetIterator +/// \param code_point +/// \return +template +inline tl::expected u16_value( + u8_code_point_t code_point) { + return u16_code_point(u32_value(code_point)); +} + /// /// \tparam OctetIterator /// \param code_point /// \return template -inline u16_code_point_t u16(u8_code_point_t code_point) { - return u16_code_point(details::u32(code_point)); +inline tl::expected u16_value( + tl::expected, std::error_code> code_point) { + return u32_value(code_point) + .and_then([] (auto code_point) -> tl::expected { + return u16_code_point(code_point); + }); } } // namespace skyr::unicode diff --git a/include/skyr/unicode/constants.hpp b/include/skyr/unicode/constants.hpp index db5ae7e5..5fc58119 100644 --- a/include/skyr/unicode/constants.hpp +++ b/include/skyr/unicode/constants.hpp @@ -3,8 +3,8 @@ // (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) -#ifndef SKYR_CONSTANTS_HPP -#define SKYR_CONSTANTS_HPP +#ifndef 
SKYR_UNICODE_CONSTANTS_HPP +#define SKYR_UNICODE_CONSTANTS_HPP namespace skyr::unicode::constants { namespace surrogates { @@ -22,6 +22,8 @@ namespace code_points { // Maximum valid value for a Unicode code point constexpr char32_t max = 0x0010ffffu; } // namespace code_points + +constexpr char bom[] = {'\xef', '\xbb', '\xbf'}; } // namespace skyr::unicode::constants -#endif //SKYR_CONSTANTS_HPP +#endif //SKYR_UNICODE_CONSTANTS_HPP diff --git a/include/skyr/unicode/core.hpp b/include/skyr/unicode/core.hpp index 8a4f8fe8..7f9eda66 100644 --- a/include/skyr/unicode/core.hpp +++ b/include/skyr/unicode/core.hpp @@ -323,7 +323,7 @@ tl::expected, std::error_code> next( .and_then(increment); } -/// Appends values to a octet sequence given a code point value +/// Appends values to an octet sequence given a code point value /// /// \tparam OctetIterator /// \param code_point @@ -354,65 +354,6 @@ tl::expected append_bytes( } return octet_it; } - -/// Advances `n` code oints through the octet sequence -/// \tparam OctetIterator -/// \param it An iterator to a lead octet in the octet sequence -/// \param n The number of code points to advance -/// \param last The last iterator in the octet sequence -/// \return The updated iterator or an error if the sequence is -/// invalid -template -tl::expected advance( - OctetIterator& it, - std::size_t n, - OctetIterator last) { - while (n != 0) { - if (ranges::distance(it, last) < sequence_length(*it)) { - return tl::make_unexpected(make_error_code(unicode_errc::overflow)); - } - - auto state = unicode::next(it); - if (!state) { - return tl::make_unexpected(std::move(state.error())); - } - it = state.value().it; - --n; - } - - return it; -} - -/// Counts the number of code points in the octet sequence. -/// -/// \tparam OctetIterator -/// \param first The first element in the octet sequence -/// \param last The last element in the sequence -/// \return The number of code points or an error if it's not a -/// valid sequence. 
-template -tl::expected count( - OctetIterator first, - OctetIterator last) { - std::size_t count = 0; - auto it = first; - - while (it != last) { - if (std::distance(it, last) < sequence_length(*it)) { - return tl::make_unexpected( - make_error_code(unicode_errc::overflow)); - } - - auto state = unicode::next(it); - if (!state) { - return tl::make_unexpected(std::move(state.error())); - } - it = state.value().it; - ++count; - } - return count; -} - } // namespace skyr::unicode #endif //SKYR_UNICODE_CORE_HPP diff --git a/include/skyr/unicode/range/traits.hpp b/include/skyr/unicode/range/traits.hpp new file mode 100644 index 00000000..5f55ac70 --- /dev/null +++ b/include/skyr/unicode/range/traits.hpp @@ -0,0 +1,29 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_UNICODE_RANGE_TRAITS_HPP +#define SKYR_UNICODE_RANGE_TRAITS_HPP + +namespace skyr::unicode::traits { +/// +/// \tparam Range +template +class iterator { + public: + using type = typename Range::const_iterator; +}; + +/// +/// \tparam T +/// \tparam N +template +class iterator { + public: + using type = const T *; +}; +} // namespace skyr::unicode::traits + + +#endif //SKYR_UNICODE_RANGE_TRAITS_HPP diff --git a/include/skyr/unicode/range/transforms/byte_transform.hpp b/include/skyr/unicode/range/transforms/byte_transform.hpp new file mode 100644 index 00000000..85cddb52 --- /dev/null +++ b/include/skyr/unicode/range/transforms/byte_transform.hpp @@ -0,0 +1,220 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_UNICODE_BYTE_RANGE_HPP +#define SKYR_UNICODE_BYTE_RANGE_HPP + +#include +#include +#include +#include +#include +#include +#include + +namespace skyr::unicode { +template +class transform_byte_iterator { + + public: + + using value_type = tl::expected; + using reference = value_type; + using difference_type = std::ptrdiff_t; + + transform_byte_iterator() = default; + + transform_byte_iterator(CodePointIterator first, CodePointIterator last) + : it_(first), last_(last) {} + + transform_byte_iterator(const transform_byte_iterator &) = default; + transform_byte_iterator(transform_byte_iterator &&) noexcept = default; + transform_byte_iterator &operator=(const transform_byte_iterator &) = default; + transform_byte_iterator &operator=(transform_byte_iterator &&) noexcept = default; + ~transform_byte_iterator() = default; + + transform_byte_iterator &operator++() { + increment(); + return *this; + } + + transform_byte_iterator operator++(int) { + auto result = *this; + increment(); + return result; + } + + reference operator*() { + auto code_point = u32_value(*it_).value(); + + if (!is_valid_code_point(code_point)) { + return tl::make_unexpected(make_error_code(unicode_errc::invalid_code_point)); + } + + if (code_point < 0x80u) { + return static_cast(code_point); + } else if (code_point < 0x800u) { + if (octet_index_ == 0) { + return static_cast((code_point >> 6u) | 0xc0u); + } else if (octet_index_ == 1) { + return static_cast((code_point & 0x3fu) | 0x80u); + } + } else if (code_point < 0x10000u) { + if (octet_index_ == 0) { + return static_cast((code_point >> 12u) | 0xe0u); + } else if (octet_index_ == 1) { + return static_cast(((code_point >> 6u) & 0x3fu) | 0x80u); + } else if (octet_index_ == 2) { + return static_cast((code_point & 0x3fu) | 0x80u); + } + } else { + if (octet_index_ == 0) { + return static_cast((code_point >> 18u) | 0xf0u); + } else if 
(octet_index_ == 1) { + return static_cast(((code_point >> 12u) & 0x3fu) | 0x80u); + } else if (octet_index_ == 2) { + return static_cast(((code_point >> 6u) & 0x3fu) | 0x80u); + } else if (octet_index_ == 3) { + return static_cast((code_point & 0x3fu) | 0x80u); + } + } + return tl::make_unexpected(make_error_code(unicode_errc::invalid_code_point)); + } + + constexpr bool operator==(const transform_byte_iterator &other) const noexcept { + return (it_ == other.it_) && (octet_index_ == other.octet_index_); + } + + constexpr bool operator!=(const transform_byte_iterator &other) const noexcept { + return !(*this == other); + } + + private: + + static constexpr auto octet_count(char32_t code_point) { + if (code_point < 0x80u) { + return 1; + } else if (code_point < 0x800u) { + return 2; + } else if (code_point < 0x10000u) { + return 3; + } else { + return 4; + }; + } + + void increment() { + if (**this) { + ++octet_index_; + if (octet_index_ == octet_count(u32_value(*it_).value())) { + octet_index_ = 0; + ++it_; + } + } else { + it_ = last_; + } + } + + CodePointIterator it_, last_; + int octet_index_ = 0; + +}; + + +/// +template +class transform_byte_range { + + using iterator_type = transform_byte_iterator::type>; + + public: + + /// + using value_type = tl::expected; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using const_iterator = iterator_type; + /// + using iterator = const_iterator; + /// + using size_type = std::size_t; + + /// + transform_byte_range() = default; + + /// + /// \param range + explicit transform_byte_range( + const CodePointRange &range) + : first(iterator_type{std::begin(range), std::end(range)}), + last(iterator_type{std::end(range), std::end(range)}) {} + + /// + /// \return + const_iterator begin() const { + return first ? first.value() : iterator_type(); + } + + /// + /// \return + const_iterator end() const { + return last ? 
last.value() : iterator_type(); + } + + /// + /// \return + bool empty() const noexcept { + return begin() == end(); + } + + private: + + std::optional first, last; + +}; + +/// +struct byte_range_fn { + + /// + /// \tparam CodePointRange + /// \param range + /// \return + template + constexpr auto operator()(CodePointRange &&range) const { + return transform_byte_range{std::forward(range)}; + } + + /// + /// \tparam CodePointRange + /// \param range + /// \return + template + friend constexpr auto operator|(CodePointRange &&range, const byte_range_fn &) { + return transform_byte_range{std::forward(range)}; + } +}; + +namespace transform { +static constexpr byte_range_fn to_bytes; +} // namespace transform + +template +tl::expected as(transform_byte_range &&range) { + auto result = Output{}; + for (auto &&byte : range) { + if (!byte) { + return tl::make_unexpected(byte.error()); + } + result.push_back(byte.value()); + } + return result; +} +} // namespace skyr::unicode + +#endif //SKYR_UNICODE_BYTE_RANGE_HPP diff --git a/include/skyr/unicode/range/transforms/u16_transform.hpp b/include/skyr/unicode/range/transforms/u16_transform.hpp new file mode 100644 index 00000000..ea6117ac --- /dev/null +++ b/include/skyr/unicode/range/transforms/u16_transform.hpp @@ -0,0 +1,226 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_UNICODE_U16_RANGE_HPP +#define SKYR_UNICODE_U16_RANGE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace skyr::unicode { +/// +/// \tparam OctetIterator +template +class transform_u16_iterator { + + public: + + /// + using iterator_category = std::forward_iterator_tag; + /// + using value_type = tl::expected; + /// + using reference = value_type; + /// + using pointer = typename std::add_pointer::type; + /// + using difference_type = std::ptrdiff_t; + + /// + transform_u16_iterator() = default; + /// + /// \param it + explicit constexpr transform_u16_iterator( + CodePointIterator it, + CodePointIterator last) + : it_(it) + , last_(last) {} + /// + constexpr transform_u16_iterator(const transform_u16_iterator&) = default; + /// + constexpr transform_u16_iterator(transform_u16_iterator&&) noexcept = default; + /// + constexpr transform_u16_iterator &operator=(const transform_u16_iterator&) = default; + /// + constexpr transform_u16_iterator &operator=(transform_u16_iterator&&) noexcept = default; + /// + ~transform_u16_iterator() = default; + + /// + /// \return + transform_u16_iterator operator ++ (int) { + auto result = *this; + ++it_; + return result; + } + + /// + /// \return + transform_u16_iterator &operator ++ () { + ++it_; + return *this; + } + + /// + /// \return + reference operator * () const noexcept { + auto code_point = *it_; + return + code_point + .and_then([] (auto value) -> value_type { + return u16_code_point(value); + }); + } + + /// + /// \param other + /// \return + bool operator == (const transform_u16_iterator &other) const noexcept { + return it_ == other.it_; + } + + /// + /// \param other + /// \return + bool operator != (const transform_u16_iterator &other) const noexcept { + return !(*this == other); + } + + private: + + transform_u32_iterator it_, last_; + +}; 
+ +/// +/// \tparam OctetRange +template +class transform_u16_range + : public ranges::view_base { + + using iterator_type = transform_u16_iterator::type>; + + public: + + /// + using value_type = tl::expected; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using const_iterator = iterator_type; + /// + using iterator = const_iterator; + /// + using size_type = std::size_t; + + /// + constexpr transform_u16_range() = default; + + /// + /// \param range + explicit constexpr transform_u16_range(CodePointRange &&range) + : range_{std::forward(range)} {} + + /// + /// \return + [[nodiscard]] constexpr const_iterator begin() const noexcept { + return iterator_type(std::begin(range_), std::end(range_)); + } + + /// + /// \return + [[nodiscard]] constexpr const_iterator end() const noexcept { + return iterator_type(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cbegin() const noexcept { + return begin(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cend() const noexcept { + return end(); + } + + /// + /// \return + [[nodiscard]] constexpr bool empty() const noexcept { + return range_.empty(); + } + + /// + /// \return + [[nodiscard]] constexpr size_type size() const noexcept { + return range_.size(); + } + + private: + + transform_u32_range range_; + +}; + +/// +struct transform_u16_range_fn { + /// + /// \tparam CodePointRange + /// \param range + /// \return + template + constexpr auto operator()(CodePointRange &&range) const { + return transform_u16_range{std::forward(range)}; + } + + /// + /// \tparam OctetRange + /// \param range + /// \return + template + friend constexpr auto operator|(CodePointRange &&range, const transform_u16_range_fn&) { + return transform_u16_range{std::forward(range)}; + } + +}; + +namespace transform { +/// +static constexpr transform_u16_range_fn to_u16; +} // namespace transform + +/// +/// \tparam Output +/// \tparam OctetRange +/// \param range +/// \return +template 
+tl::expected as(transform_u16_range &&range) { + auto result = Output{}; + for (auto &&code_point : range) { + if (!code_point) { + return tl::make_unexpected(code_point.error()); + } + result.push_back(code_point.value().lead_value()); + if (code_point.value().is_surrogate_pair()) { + result.push_back(code_point.value().trail_value()); + } + } + return result; +} +} // namespace skyr::unicode + +#endif //SKYR_UNICODE_U16_RANGE_HPP diff --git a/include/skyr/unicode/range/transforms/u32_transform.hpp b/include/skyr/unicode/range/transforms/u32_transform.hpp new file mode 100644 index 00000000..0fb2f33c --- /dev/null +++ b/include/skyr/unicode/range/transforms/u32_transform.hpp @@ -0,0 +1,217 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_UNICODE_U32_RANGE_HPP +#define SKYR_UNICODE_U32_RANGE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace skyr::unicode { +/// +/// \tparam OctetIterator +template +class transform_u32_iterator { + public: + + /// + using iterator_category = std::forward_iterator_tag; + /// + using value_type = tl::expected; + /// + using reference = value_type; + /// + using pointer = typename std::add_pointer::type; + /// + using difference_type = std::ptrdiff_t; + + /// + constexpr transform_u32_iterator() = default; + /// + /// \param it + explicit constexpr transform_u32_iterator(CodePointIterator it) + : it_(it) {} + /// + constexpr transform_u32_iterator(const transform_u32_iterator&) = default; + /// + constexpr transform_u32_iterator(transform_u32_iterator&&) noexcept = default; + /// + constexpr transform_u32_iterator &operator=(const transform_u32_iterator&) = default; + /// + constexpr transform_u32_iterator &operator=(transform_u32_iterator&&) noexcept = default; + /// + ~transform_u32_iterator() = default; + + /// + 
/// \return + transform_u32_iterator operator ++ (int) { + auto result = *this; + ++it_; + return result; + } + + /// + /// \return + transform_u32_iterator &operator ++ () { + ++it_; + return *this; + } + + /// + /// \return + reference operator * () const noexcept { + return (*it_) + .and_then([] (auto code_point) -> value_type { + return u32_value(code_point); + }); + } + + /// + /// \param other + /// \return + constexpr bool operator == (const transform_u32_iterator &other) const noexcept { + return it_ == other.it_; + } + + /// + /// \param other + /// \return + constexpr bool operator != (const transform_u32_iterator &other) const noexcept { + return !(*this == other); + } + + private: + + CodePointIterator it_; + +}; + +/// +/// \tparam OctetRange +template +class transform_u32_range + : public ranges::view_base { + + using iterator_type = typename traits::iterator::type; + + public: + + /// + using value_type = char32_t; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using const_iterator = iterator_type; + /// + using iterator = const_iterator; + /// + using size_type = std::size_t; + + /// + constexpr transform_u32_range() = default; + + /// + /// \param range + explicit constexpr transform_u32_range(CodePointRange &&range) noexcept + : range_(std::forward(range)) {} + + /// + /// \return + [[nodiscard]] constexpr const_iterator begin() const noexcept { + return const_iterator(range_.begin()); + } + + /// + /// \return + [[nodiscard]] constexpr const_iterator end() const noexcept { + return const_iterator(range_.end()); + } + + /// + /// \return + [[nodiscard]] constexpr auto cbegin() const noexcept { + return begin(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cend() const noexcept { + return end(); + } + + /// + /// \return + [[nodiscard]] constexpr bool empty() const noexcept { + return range_.empty(); + } + + /// + /// \return + [[nodiscard]] constexpr size_type size() const noexcept { + 
return range_.size(); + } + + private: + + CodePointRange range_; + +}; + +/// +struct u32_range_fn { + /// + /// \tparam OctetRange + /// \param range + /// \return + template + constexpr auto operator()(CodePointRange &&range) const { + return transform_u32_range{std::forward(range)}; + } + + /// + /// \tparam OctetRange + /// \param range + /// \return + template + friend constexpr auto operator|(CodePointRange &&range, const u32_range_fn&) { + return transform_u32_range{std::forward(range)}; + } + +}; + +namespace transform { +/// +static constexpr u32_range_fn to_u32; +} // namespace transform + +/// +/// \tparam Output +/// \tparam CodePointRange +/// \param range +/// \return +template +tl::expected as(transform_u32_range &&range) { + auto result = Output{}; + for (auto &&code_point : range) { + auto u32_code_point = u32_value(code_point); + if (!u32_code_point) { + return tl::make_unexpected(u32_code_point.error()); + } + result.push_back(u32_code_point.value()); + } + return result; +} +} // namespace skyr::unicode + +#endif //SKYR_UNICODE_U32_RANGE_HPP diff --git a/include/skyr/unicode/range/u32_range.hpp b/include/skyr/unicode/range/u32_range.hpp deleted file mode 100644 index fef0501f..00000000 --- a/include/skyr/unicode/range/u32_range.hpp +++ /dev/null @@ -1,400 +0,0 @@ -// Copyright 2019 Glyn Matthews. -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#ifndef SKYR_UNICODE_U32_RANGE_HPP -#define SKYR_UNICODE_U32_RANGE_HPP - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace skyr::unicode { -/// -/// \tparam OctetIterator -template -class u32_range_iterator { - public: - - /// - using iterator_category = std::forward_iterator_tag; - /// - using value_type = tl::expected; - /// - using reference = value_type; - /// - using pointer = typename std::add_pointer::type; - /// - using difference_type = std::ptrdiff_t; - - /// - constexpr u32_range_iterator() = default; - /// - /// \param it - explicit constexpr u32_range_iterator(u8_range_iterator it) - : it_(it) {} - /// - constexpr u32_range_iterator(const u32_range_iterator&) = default; - /// - constexpr u32_range_iterator(u32_range_iterator&&) noexcept = default; - /// - constexpr u32_range_iterator &operator=(const u32_range_iterator&) = default; - /// - constexpr u32_range_iterator &operator=(u32_range_iterator&&) noexcept = default; - /// - ~u32_range_iterator() = default; - - /// - /// \return - u32_range_iterator operator ++ (int) { - auto result = *this; - ++it_; - return result; - } - - /// - /// \return - u32_range_iterator &operator ++ () { - ++it_; - return *this; - } - - /// - /// \return - reference operator * () const noexcept { - return (*it_) - .and_then([] (auto code_point) -> value_type { - return details::u32(code_point); - }); - } - - /// - /// \param other - /// \return - constexpr bool operator == (const u32_range_iterator &other) const noexcept { - return it_ == other.it_; - } - - /// - /// \param other - /// \return - constexpr bool operator != (const u32_range_iterator &other) const noexcept { - return !(*this == other); - } - - private: - - u8_range_iterator it_; - -}; - -/// -/// \tparam OctetRange -template -class view_u32_range - : public ranges::view_base { - - using octet_iterator_type = typename 
OctetRange::const_iterator; - using iterator_type = u32_range_iterator; - - public: - - /// - using value_type = char32_t; - /// - using const_reference = value_type; - /// - using reference = const_reference; - /// - using const_iterator = iterator_type; - /// - using iterator = const_iterator; - /// - using size_type = std::size_t; - - /// - constexpr view_u32_range() = default; - - /// - /// \param range - explicit constexpr view_u32_range(const OctetRange &range) - : range_{range} {} - - /// - /// \return - [[nodiscard]] constexpr const_iterator begin() const noexcept { - return u32_range_iterator(range_.begin()); - } - - /// - /// \return - [[nodiscard]] constexpr const_iterator end() const noexcept { - return u32_range_iterator(range_.end()); - } - - /// - /// \return - [[nodiscard]] constexpr auto cbegin() const noexcept { - return begin(); - } - - /// - /// \return - [[nodiscard]] constexpr auto cend() const noexcept { - return end(); - } - - /// - /// \return - [[nodiscard]] constexpr bool empty() const noexcept { - return range_.empty(); - } - - /// - /// \return - [[nodiscard]] constexpr size_type size() const noexcept { - return range_.size(); - } - - private: - - view_u8_range range_; - -}; - -/// -struct u32_range_fn { - /// - /// \tparam OctetRange - /// \param range - /// \return - template - constexpr auto operator()(OctetRange &&range) const { - return view_u32_range{std::forward(range)}; - } - - /// - /// \tparam OctetRange - /// \param range - /// \return - template - friend constexpr auto operator|(OctetRange &&range, const u32_range_fn&) { - return view_u32_range{std::forward(range)}; - } - -}; - -namespace u32 { -template -class byte_iterator { - - public: - - using value_type = tl::expected; - using reference = value_type; - using difference_type = std::ptrdiff_t; - - byte_iterator() = default; - - byte_iterator(U32Iterator first, U32Iterator last) - : it_(first), last_(last) {} - - byte_iterator(const byte_iterator &) = default; - 
byte_iterator(byte_iterator &&) noexcept = default; - byte_iterator &operator=(const byte_iterator &) = default; - byte_iterator &operator=(byte_iterator &&) noexcept = default; - ~byte_iterator() = default; - - byte_iterator &operator++() { - increment(); - return *this; - } - - byte_iterator operator++(int) { - auto result = *this; - increment(); - return result; - } - - reference operator*() { - auto code_point = *it_; - - if (!is_valid_code_point(code_point)) { - return tl::make_unexpected(make_error_code(unicode_errc::invalid_code_point)); - } - - if (code_point < 0x80u) { - return static_cast(code_point); - } else if (code_point < 0x800u) { - if (octet_index_ == 0) { - return static_cast((code_point >> 6u) | 0xc0u); - } else if (octet_index_ == 1) { - return static_cast((code_point & 0x3fu) | 0x80u); - } - } else if (code_point < 0x10000u) { - if (octet_index_ == 0) { - return static_cast((code_point >> 12u) | 0xe0u); - } else if (octet_index_ == 1) { - return static_cast(((code_point >> 6u) & 0x3fu) | 0x80u); - } else if (octet_index_ == 2) { - return static_cast((code_point & 0x3fu) | 0x80u); - } - } else { - if (octet_index_ == 0) { - return static_cast((code_point >> 18u) | 0xf0u); - } else if (octet_index_ == 1) { - return static_cast(((code_point >> 12u) & 0x3fu) | 0x80u); - } else if (octet_index_ == 2) { - return static_cast(((code_point >> 6u) & 0x3fu) | 0x80u); - } else if (octet_index_ == 3) { - return static_cast((code_point & 0x3fu) | 0x80u); - } - } - return tl::make_unexpected(make_error_code(unicode_errc::invalid_code_point)); - } - - constexpr bool operator == (const byte_iterator &other) const noexcept { - return (it_ == other.it_) && (octet_index_ == other.octet_index_); - } - - constexpr bool operator != (const byte_iterator &other) const noexcept { - return !(*this == other); - } - - private: - - static constexpr auto octet_count(char32_t code_point) { - if (code_point < 0x80u) { - return 1; - } else if (code_point < 0x800u) { - return 2; 
- } else if (code_point < 0x10000u) { - return 3; - } else { - return 4; - }; - } - - void increment() { - if (**this) { - ++octet_index_; - if (octet_index_ == octet_count(*it_)) { - octet_index_ = 0; - ++it_; - } - } - else { - it_ = last_; - } - } - - U32Iterator it_, last_; - int octet_index_ = 0; - -}; - -/// -template -class view_byte_range { - - using iterator_type = byte_iterator; - - public: - - /// - using value_type = tl::expected; - /// - using const_reference = value_type; - /// - using reference = const_reference; - /// - using const_iterator = iterator_type; - /// - using iterator = const_iterator; - /// - using size_type = std::size_t; - - /// - view_byte_range() = default; - - /// - /// \param range - explicit view_byte_range( - const U32Range &range) - : first(iterator_type{std::begin(range), std::end(range)}) - , last(iterator_type{std::end(range), std::end(range)}) {} - - /// - /// \return - const_iterator begin() const { - return first? first.value() : iterator_type(); - } - - /// - /// \return - const_iterator end() const { - return last? 
last.value() : iterator_type(); - } - - /// - /// \return - bool empty() const noexcept { - return begin() == end(); - } - - private: - - std::optional first, last; - -}; - -/// -struct byte_range_fn { - /// - /// \tparam U32Range - /// \param range - /// \return - template - constexpr auto operator()(U32Range &&range) const { - return view_byte_range{std::forward(range)}; - } - - /// - /// \tparam U32Range - /// \param range - /// \return - template - friend constexpr auto operator|(U32Range &&range, const byte_range_fn&) { - return view_byte_range{std::forward(range)}; - } - -}; -} // namespace u32 - -namespace view { -/// -static constexpr u32_range_fn u32; -static constexpr u32::byte_range_fn bytes; -} // namespace view - -template -tl::expected as(InputRange &&range) { - auto result = Output{}; -// result.reserve(ranges::size(range)); - for (auto &&code_point : range) { - if (!code_point) { - return tl::make_unexpected(code_point.error()); - } - result.push_back(code_point.value()); - } - return result; -} -} // namespace skyr::unicode - -#endif //SKYR_UNICODE_U32_RANGE_HPP diff --git a/include/skyr/unicode/range/u16_range.hpp b/include/skyr/unicode/range/views/u16_view.hpp similarity index 59% rename from include/skyr/unicode/range/u16_range.hpp rename to include/skyr/unicode/range/views/u16_view.hpp index e3f891dd..68e28311 100644 --- a/include/skyr/unicode/range/u16_range.hpp +++ b/include/skyr/unicode/range/views/u16_view.hpp @@ -3,8 +3,8 @@ // (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) -#ifndef SKYR_UNICODE_U16_RANGE_HPP -#define SKYR_UNICODE_U16_RANGE_HPP +#ifndef SKYR_UNICODE_U16_VIEW_HPP +#define SKYR_UNICODE_U16_VIEW_HPP #include #include @@ -13,13 +13,14 @@ #include #include #include -#include -#include +#include +#include +#include namespace skyr::unicode { /// -/// \tparam OctetIterator -template +/// \tparam U16Iterator +template class u16_range_iterator { public: @@ -40,8 +41,8 @@ class 
u16_range_iterator { /// /// \param it explicit constexpr u16_range_iterator( - u32_range_iterator it, - u32_range_iterator last) + U16Iterator it, + U16Iterator last) : it_(it) , last_(last) {} /// @@ -59,26 +60,38 @@ class u16_range_iterator { /// \return u16_range_iterator operator ++ (int) { auto result = *this; - ++it_; + increment(); return result; } /// /// \return u16_range_iterator &operator ++ () { - ++it_; + increment(); return *this; } /// /// \return reference operator * () const noexcept { - auto code_point = *it_; - return - code_point - .and_then([] (auto value) -> value_type { + assert(it_); + auto value = mask16(*it_.value()); + if (is_lead_surrogate(value)) { + auto next_it = it_.value(); + ++next_it; + auto trail_value = mask16(*next_it); + if (!is_trail_surrogate(trail_value)) { + return tl::make_unexpected( + make_error_code(unicode_errc::invalid_code_point)); + } + + return u16_code_point(value, trail_value); + } else if (is_trail_surrogate(value)) { + return tl::make_unexpected( + make_error_code(unicode_errc::invalid_code_point)); + } else { return u16_code_point(value); - }); + } } /// @@ -97,18 +110,26 @@ class u16_range_iterator { private: - u32_range_iterator it_, last_; + void increment() { + assert(it_); + auto value = mask16(*it_.value()); + std::advance(it_.value(), is_lead_surrogate(value)? 
2 : 1); + if (it_ == last_) { + it_ = std::nullopt; + } + } + + std::optional it_, last_; }; /// -/// \tparam OctetRange -template +/// \tparam U16Range +template class view_u16_range : public ranges::view_base { - using octet_iterator_type = typename OctetRange::const_iterator; - using iterator_type = u16_range_iterator; + using iterator_type = u16_range_iterator::type>; public: @@ -130,7 +151,7 @@ class view_u16_range /// /// \param range - explicit constexpr view_u16_range(const OctetRange &range) + explicit constexpr view_u16_range(const U16Range &range) : range_{range} {} /// @@ -160,18 +181,18 @@ class view_u16_range /// /// \return [[nodiscard]] constexpr bool empty() const noexcept { - return range_.empty(); + return begin() == end(); } /// /// \return [[nodiscard]] constexpr size_type size() const noexcept { - return range_.size(); + return std::distance(begin(), end()); } private: - view_u32_range range_; + U16Range range_; }; @@ -181,58 +202,26 @@ struct u16_range_fn { /// \tparam OctetRange /// \param range /// \return - template - constexpr auto operator()(OctetRange &&range) const { - return view_u16_range{std::forward(range)}; + template + constexpr auto operator()(U16Range &&range) const { + return view_u16_range{std::forward(range)}; } /// /// \tparam OctetRange /// \param range /// \return - template - friend constexpr auto operator|(OctetRange &&range, const u16_range_fn&) { - return view_u16_range{std::forward(range)}; + template + friend constexpr auto operator|(U16Range &&range, const u16_range_fn&) { + return view_u16_range{std::forward(range)}; } }; namespace view { /// -static constexpr u16_range_fn u16; +static constexpr u16_range_fn as_u16; } // namespace view - -template -tl::expected u16string(U16Range &&range) { - auto result = std::u16string(); - result.reserve(ranges::size(range)); - for (auto &&code_point : range) { - if (!code_point) { - return tl::make_unexpected(code_point.error()); - } - 
result.push_back(code_point.value().lead_value()); - if (code_point.value().is_surrogate_pair()) { - result.push_back(code_point.value().trail_value()); - } - } - return result; -} - -template -tl::expected wstring(U16Range &&range) { - auto result = std::wstring(); - result.reserve(ranges::size(range)); - for (auto &&code_point : range) { - if (!code_point) { - return tl::make_unexpected(code_point.error()); - } - result.push_back(code_point.value().lead_value()); - if (code_point.value().is_surrogate_pair()) { - result.push_back(code_point.value().trail_value()); - } - } - return result; -} } // namespace skyr::unicode -#endif //SKYR_UNICODE_U16_RANGE_HPP +#endif //SKYR_UNICODE_U16_VIEW_HPP diff --git a/include/skyr/unicode/range/u8_range.hpp b/include/skyr/unicode/range/views/u8_view.hpp similarity index 95% rename from include/skyr/unicode/range/u8_range.hpp rename to include/skyr/unicode/range/views/u8_view.hpp index 813a6116..c416c2b9 100644 --- a/include/skyr/unicode/range/u8_range.hpp +++ b/include/skyr/unicode/range/views/u8_view.hpp @@ -11,11 +11,11 @@ #include #include #include -#include #include #include #include -#include +#include +#include namespace skyr::unicode { /// @@ -114,13 +114,11 @@ template class view_u8_range : public ranges::view_base { - using octet_iterator_type = typename OctetRange::const_iterator; + using octet_iterator_type = typename traits::iterator::type; using iterator_type = u8_range_iterator; public: -// using iterator_tag = std::forward_iterator_tag; - /// using value_type = u8_code_point_t; /// @@ -218,7 +216,7 @@ struct u8_range_fn { namespace view { /// -static constexpr u8_range_fn u8; +static constexpr u8_range_fn as_u8; } // namespace view } // namespace skyr::unicode diff --git a/include/skyr/unicode/range/unchecked_u8_range.hpp b/include/skyr/unicode/range/views/unchecked_u8_view.hpp similarity index 97% rename from include/skyr/unicode/range/unchecked_u8_range.hpp rename to 
include/skyr/unicode/range/views/unchecked_u8_view.hpp index bb20e66f..e6309a7d 100644 --- a/include/skyr/unicode/range/unchecked_u8_range.hpp +++ b/include/skyr/unicode/range/views/unchecked_u8_view.hpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace skyr::unicode { /// @@ -100,7 +101,7 @@ template class view_unchecked_u8_range : public ranges::view_base { - using octet_iterator_type = typename OctetRange::const_iterator; + using octet_iterator_type = typename traits::iterator::type ; using iterator_type = unchecked_u8_range_iterator; public: diff --git a/include/skyr/unicode/unicode.hpp b/include/skyr/unicode/unicode.hpp index bf050044..30cb7877 100644 --- a/include/skyr/unicode/unicode.hpp +++ b/include/skyr/unicode/unicode.hpp @@ -107,65 +107,6 @@ tl::expected copy_u8u16( return u16_it; } -/// Copies characters from a UTF-32 encoded string to a UTF-8 -/// encoded string. -/// -/// \tparam OctetIterator -/// \tparam U32BitIterator -/// \param first The first iterator in the UTF-32 encoded sequence -/// \param last The last iterator in the UTF-32 encoded sequence -/// \param u8_it The output iterator -/// \return The last output iterator or an error if the sequence was invalid -template -tl::expected copy_u32u8( - U32BitIterator first, - U32BitIterator last, - OctetIterator u8_it) { - auto it = first; - while (it != last) { - auto result_it = append_bytes(*it, u8_it); - if (!result_it) { - return tl::make_unexpected(std::move(result_it.error())); - } - u8_it = result_it.value(); - ++it; - } - return u8_it; -} - -/// Copies characters from a UTF-8 encoded string to a UTF-32 -/// encoded string. -/// -/// \tparam OctetIterator -/// \tparam U32BitIterator -/// \param first The first iterator in the octet sequence -/// \param last The last iterator in the octet sequence -/// \param u32_first The first iterator in the UTf-32 encoded -/// sequence -/// \return An expected iterator to the last eleent in the new -/// UTF-32 sequence, or an error. 
-template -tl::expected copy_u8u32( - OctetIterator first, - OctetIterator last, - U32BitIterator u32_first) { - auto it = first; - auto u32_it = u32_first; - while (it != last) { - if (std::distance(it, last) < sequence_length(*it)) { - return tl::make_unexpected(make_error_code(unicode_errc::overflow)); - } - - auto state = unicode::next(it); - if (!state) { - return tl::make_unexpected(std::move(state.error())); - } - it = state.value().it; - (*u32_it)++ = state.value().value; - } - return u32_it; -} - /// Converts a `std::string` (assuming UTF-8) string to UTF-16 /// \param input A UTF-8 string /// \returns A UTF-16 `std::wstring` or an error on failure diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8f89ef79..6f0c00af 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) Glyn Matthews 2012-2018. +# Copyright (c) Glyn Matthews 2012-2019. # Distributed under the Boost Software License, Version 1.0. # (See accompanying file LICENSE_1_0.txt or copy at # http://www.boost.org/LICENSE_1_0.txt) @@ -33,9 +33,12 @@ set(Skyr_SRCS ${Skyr_SOURCE_DIR}/include/skyr/unicode/errors.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/core.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/unicode.hpp - ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/u8_range.hpp - ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/u16_range.hpp - ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/u32_range.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/views/u8_view.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/views/unchecked_u8_view.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/views/u16_view.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/transforms/byte_transform.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/transforms/u16_transform.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/transforms/u32_transform.hpp ${Skyr_SOURCE_DIR}/include/skyr/url/percent_encode.hpp ${Skyr_SOURCE_DIR}/include/skyr/url/domain.hpp 
${Skyr_SOURCE_DIR}/include/skyr/url/url_record.hpp diff --git a/src/unicode/unicode.cpp b/src/unicode/unicode.cpp index 1cdb27b9..419882b4 100644 --- a/src/unicode/unicode.cpp +++ b/src/unicode/unicode.cpp @@ -5,13 +5,15 @@ #include -#include +#include +#include +#include namespace skyr::unicode { tl::expected wstring_from_bytes( std::string_view bytes) { - return wstring(bytes | view::u16); + return as(bytes | view::as_u8 | transform::to_u16); } tl::expected wstring_to_bytes( @@ -27,7 +29,7 @@ tl::expected wstring_to_bytes( tl::expected utf16_from_bytes( std::string_view bytes) { - return u16string(bytes | view::u16); + return as(bytes | view::as_u8 | transform::to_u16); } tl::expected utf16_to_bytes( @@ -43,17 +45,11 @@ tl::expected utf16_to_bytes( tl::expected utf32_from_bytes( std::string_view bytes) { - return as(bytes | view::u32); + return as(bytes | view::as_u8 | transform::to_u32); } tl::expected utf32_to_bytes( std::u32string_view input) { - std::string result; - auto expected = copy_u32u8( - begin(input), end(input), std::back_inserter(result)); - if (!expected) { - return tl::make_unexpected(std::error_code(expected.error())); - } - return result; + return as(input | transform::to_bytes); } } // namespace skyr::unicode diff --git a/tests/unicode_code_point_tests.cpp b/tests/unicode_code_point_tests.cpp index 74125605..1b3b700d 100644 --- a/tests/unicode_code_point_tests.cpp +++ b/tests/unicode_code_point_tests.cpp @@ -11,24 +11,38 @@ #include -TEST_CASE("code point tests") { +TEST_CASE("u8 code point tests") { using std::begin; using std::end; - SECTION("u8 code point 01") { + SECTION("code point 01") { auto bytes = std::string("\xf0\x9f\x92\xa9"); auto cp = skyr::unicode::u8_code_point(bytes); REQUIRE(cp); CHECK(std::string("\xf0\x9f\x92\xa9") == std::string(begin(cp.value()), end(cp.value()))); - CHECK(U'\x1f4a9' == skyr::unicode::details::u32(cp.value())); - CHECK(u16(cp.value()).is_surrogate_pair()); - CHECK(u'\xd83d' == 
u16(cp.value()).lead_value()); - CHECK(u'\xdca9' == u16(cp.value()).trail_value()); + CHECK(U'\x1f4a9' == u32_value(cp)); + CHECK(u16_value(cp).value().is_surrogate_pair()); + CHECK(u'\xd83d' == u16_value(cp).value().lead_value()); + CHECK(u'\xdca9' == u16_value(cp).value().trail_value()); } - SECTION("u8 code point 02") { + SECTION("code point fail") { auto bytes = std::string("\x9f\x92\xa9"); auto cp = skyr::unicode::u8_code_point(bytes); REQUIRE(!cp); } } + +TEST_CASE("u16 code point tests") { + using std::begin; + using std::end; + + SECTION("code point 01") { + auto lead = u'\xD83C', trail = u'\xDFF3'; + auto cp = skyr::unicode::u16_code_point(lead, trail); + CHECK(U'\x1F3F3' == u32_value(cp)); + CHECK(cp.is_surrogate_pair()); + CHECK(lead == cp.lead_value()); + CHECK(trail == cp.trail_value()); + } +} diff --git a/tests/unicode_range_tests.cpp b/tests/unicode_range_tests.cpp index 556db638..137c9fec 100644 --- a/tests/unicode_range_tests.cpp +++ b/tests/unicode_range_tests.cpp @@ -8,9 +8,11 @@ #define CATCH_CONFIG_MAIN #include #include -#include -#include -#include +#include +#include +#include +#include +#include TEST_CASE("octet range iterator") { @@ -21,7 +23,18 @@ TEST_CASE("octet range iterator") { auto it = iterator_type(std::begin(bytes), std::end(bytes)); auto code_point = *it; REQUIRE(code_point); - CHECK(U'\x1F4A9' == skyr::unicode::details::u32(code_point.value())); + CHECK(U'\x1F4A9' == u32_value(code_point.value())); + } + + SECTION("construction from array") { + using iterator_type = skyr::unicode::u8_range_iterator; + + char bytes[] = "\xf0\x9f\x92\xa9"; + auto first = std::begin(bytes), last = std::end(bytes); + auto it = iterator_type(first, last); + auto code_point = *it; + REQUIRE(code_point); + CHECK(U'\x1F4A9' == u32_value(code_point.value())); } SECTION("increment") { @@ -29,11 +42,11 @@ TEST_CASE("octet range iterator") { auto it = iterator_type(std::begin(bytes), std::end(bytes)); auto code_point = *it; REQUIRE(code_point); - 
CHECK(U'\x1F3F3' == skyr::unicode::details::u32(code_point.value())); + CHECK(U'\x1F3F3' == u32_value(code_point.value())); ++it; code_point = *it; REQUIRE(code_point); - CHECK(U'\xFE0F' == skyr::unicode::details::u32(code_point.value())); + CHECK(U'\xFE0F' == u32_value(code_point.value())); } SECTION("increment invalid") { @@ -72,12 +85,12 @@ TEST_CASE("octet range iterator") { { auto code_point = *it++; REQUIRE(code_point); - CHECK(U'\x65e5' == skyr::unicode::details::u32(code_point.value())); + CHECK(U'\x65e5' == u32_value(code_point.value())); } { auto code_point = *it++; REQUIRE(code_point); - CHECK(U'\x448' == skyr::unicode::details::u32(code_point.value())); + CHECK(U'\x448' == u32_value(code_point.value())); } } @@ -88,17 +101,17 @@ TEST_CASE("octet range iterator") { { auto code_point = *it++; REQUIRE(code_point); - CHECK(U'\x10346' == skyr::unicode::details::u32(code_point.value())); + CHECK(U'\x10346' == u32_value(code_point.value())); } { auto code_point = *it++; REQUIRE(code_point); - CHECK(U'\x65e5' == skyr::unicode::details::u32(code_point.value())); + CHECK(U'\x65e5' == u32_value(code_point.value())); } { auto code_point = *it++; REQUIRE(code_point); - CHECK(U'\x448' == skyr::unicode::details::u32(code_point.value())); + CHECK(U'\x448' == u32_value(code_point.value())); } } } @@ -133,21 +146,21 @@ TEST_CASE("u8 range") { SECTION("pipe syntax") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto view = bytes | skyr::unicode::view::u8; + auto view = bytes | skyr::unicode::view::as_u8; CHECK(4 == ranges::size(view)); CHECK(!ranges::empty(view)); } SECTION("pipe syntax with string_view") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto view = std::string_view(bytes) | skyr::unicode::view::u8; + auto view = std::string_view(bytes) | skyr::unicode::view::as_u8; CHECK(4 == ranges::size(view)); CHECK(!ranges::empty(view)); } SECTION("pipe syntax invalid") { auto bytes = 
std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto view = bytes | skyr::unicode::view::u8; + auto view = bytes | skyr::unicode::view::as_u8; auto it = std::begin(view), last = std::end(view); CHECK(!*it++); CHECK(it == last); @@ -157,35 +170,52 @@ TEST_CASE("u8 range") { SECTION("pipe syntax with u16 string") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto u16 = skyr::unicode::u16string(bytes | skyr::unicode::view::u16); + auto u16 = skyr::unicode::as(bytes | skyr::unicode::view::as_u8 | skyr::unicode::transform::to_u16); REQUIRE(u16); CHECK(u"\xD83C\xDFF3\xFE0F\x200D\xD83C\xDF08" == u16.value()); } SECTION("pipe syntax with u32 string") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto u32 = skyr::unicode::as(bytes | skyr::unicode::view::u32); + auto u32 = skyr::unicode::as(bytes | skyr::unicode::view::as_u8 | skyr::unicode::transform::to_u32); REQUIRE(u32); CHECK(U"\x1F3F3\xFE0F\x200D\x1F308" == u32.value()); } SECTION("pipe syntax with u16 string invalid") { auto bytes = std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto u16 = skyr::unicode::u16string(bytes | skyr::unicode::view::u16); + auto u16 = skyr::unicode::as(bytes | skyr::unicode::view::as_u8 | skyr::unicode::transform::to_u16); CHECK(!u16); } SECTION("pipe syntax with u32 string invalid") { auto bytes = std::string("\xf0\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto u32 = skyr::unicode::as(bytes | skyr::unicode::view::u32); + auto u32 = skyr::unicode::as(bytes | skyr::unicode::view::as_u8 | skyr::unicode::transform::to_u32); CHECK(!u32); } } TEST_CASE("write bytes") { - SECTION("append_bytes") { + SECTION("bytes from u32") { auto input = std::u32string(U"\x1F3F3\xFE0F\x200D\x1F308"); - auto bytes = skyr::unicode::as(input | skyr::unicode::view::bytes); + auto bytes = skyr::unicode::as( + input | skyr::unicode::transform::to_bytes); + REQUIRE(bytes); + 
CHECK("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88" == bytes.value()); + } + + SECTION("bytes from u16") { + auto input = std::u16string(u"\xD83C\xDFF3\xFE0F\x200D\xD83C\xDF08"); + auto bytes = skyr::unicode::as( + input | skyr::unicode::view::as_u16 | skyr::unicode::transform::to_bytes); + REQUIRE(bytes); + CHECK("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88" == bytes.value()); + } + + SECTION("bytes from u16 (2)") { + auto input = std::u16string(u"\xD83C\xDFF3\xFE0F\x200D\xD83C\xDF08"); + auto bytes = skyr::unicode::as( + input | skyr::unicode::view::as_u16 | skyr::unicode::transform::to_u32 | skyr::unicode::transform::to_bytes); REQUIRE(bytes); CHECK("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88" == bytes.value()); } diff --git a/tests/unicode_tests.cpp b/tests/unicode_tests.cpp index c3a1237e..ff76c915 100644 --- a/tests/unicode_tests.cpp +++ b/tests/unicode_tests.cpp @@ -6,98 +6,109 @@ #define CATCH_CONFIG_MAIN #include #include -#include "skyr/unicode/unicode.hpp" +#include +#include +#include +#include +#include TEST_CASE("unicode_tests", "[unicode]") { SECTION("utf32_to_bytes_poo_emoji_test") { auto input = std::u32string(U"\x1F4A9"); - auto bytes = skyr::unicode::utf32_to_bytes(input); + auto bytes = skyr::unicode::as( + input | skyr::unicode::transform::to_bytes); REQUIRE(bytes); CHECK("\xf0\x9f\x92\xa9" == bytes.value()); } SECTION("bytes_to_utf32_poo_emoji_test") { auto input = std::string("\xf0\x9f\x92\xa9"); - auto utf32 = skyr::unicode::utf32_from_bytes(input); + auto utf32 = skyr::unicode::as( + input | skyr::unicode::view::as_u8 | skyr::unicode::transform::to_u32); REQUIRE(utf32); CHECK(U"\x1F4A9" == utf32.value()); } SECTION("utf16_to_bytes_poo_emoji_test") { auto input = std::u16string(u"\xd83d\xdca9"); - auto bytes = skyr::unicode::utf16_to_bytes(input); + auto bytes = skyr::unicode::as( + input | skyr::unicode::view::as_u16 | skyr::unicode::transform::to_bytes); REQUIRE(bytes); CHECK("\xf0\x9f\x92\xa9" == 
bytes.value()); } SECTION("bytes_to_utf16_poo_emoji_test") { auto input = std::string("\xf0\x9f\x92\xa9"); - auto utf16 = skyr::unicode::utf16_from_bytes(input); + auto utf16 = skyr::unicode::as( + input | skyr::unicode::view::as_u8 | skyr::unicode::transform::to_u16); REQUIRE(utf16); CHECK(u"\xd83d\xdca9" == utf16.value()); } SECTION("wstring_to_bytes_poo_emoji_test") { auto input = std::wstring(L"\xd83d\xdca9"); - auto bytes = skyr::unicode::wstring_to_bytes(input); + auto bytes = skyr::unicode::as( + input | skyr::unicode::view::as_u16 | skyr::unicode::transform::to_bytes); REQUIRE(bytes); CHECK("\xf0\x9f\x92\xa9" == bytes.value()); } SECTION("bytes_to_wstring_poo_emoji_test") { auto input = std::string("\xf0\x9f\x92\xa9"); - auto utf16 = skyr::unicode::wstring_from_bytes(input); + auto utf16 = skyr::unicode::as( + input | skyr::unicode::view::as_u8 | skyr::unicode::transform::to_u16); REQUIRE(utf16); CHECK(L"\xd83d\xdca9" == utf16.value()); } SECTION("utf32_rainbow_flag_test") { auto input = std::u32string(U"\x1F3F3\xFE0F\x200D\x1F308"); - auto bytes = skyr::unicode::utf32_to_bytes(input); + auto bytes = skyr::unicode::as( + input | skyr::unicode::transform::to_bytes); REQUIRE(bytes); CHECK("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88" == bytes.value()); } - SECTION("count_utf8_chars_01") { - auto bytes = std::string("\xf0\x9f\x92\xa9"); - auto first = begin(bytes), last = end(bytes); - auto count = skyr::unicode::count(first, last); - REQUIRE(count); - CHECK(1 == count.value()); - } +// SECTION("count_utf8_chars_01") { +// auto bytes = std::string("\xf0\x9f\x92\xa9"); +// auto first = begin(bytes), last = end(bytes); +// auto count = skyr::unicode::count(first, last); +// REQUIRE(count); +// CHECK(1 == count.value()); +// } - SECTION("count_utf8_chars_02") { - auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto first = begin(bytes), last = end(bytes); - auto count = skyr::unicode::count(first, last); - 
REQUIRE(count); - CHECK(4 == count.value()); - } +// SECTION("count_utf8_chars_02") { +// auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); +// auto first = begin(bytes), last = end(bytes); +// auto count = skyr::unicode::count(first, last); +// REQUIRE(count); +// CHECK(4 == count.value()); +// } - SECTION("advance_utf8_chars") { - auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto first = begin(bytes), last = end(bytes); - skyr::unicode::advance(first, 2, last); - CHECK("\xe2\x80\x8d\xf0\x9f\x8c\x88" == std::string(first, last)); - } +// SECTION("advance_utf8_chars") { +// auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); +// auto first = begin(bytes), last = end(bytes); +// skyr::unicode::advance(first, 2, last); +// CHECK("\xe2\x80\x8d\xf0\x9f\x8c\x88" == std::string(first, last)); +// } - SECTION("advance_and_count_utf8_chars") { - auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); - auto first = begin(bytes), last = end(bytes); - skyr::unicode::advance(first, 2, last); - auto count = skyr::unicode::count(first, last); - REQUIRE(count); - CHECK(2 == count.value()); - } +// SECTION("advance_and_count_utf8_chars") { +// auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); +// auto first = begin(bytes), last = end(bytes); +// skyr::unicode::advance(first, 2, last); +// auto count = skyr::unicode::count(first, last); +// REQUIRE(count); +// CHECK(2 == count.value()); +// } - SECTION("append_bytes") { - auto input = std::u32string(U"\x1F3F3\xFE0F\x200D\x1F308"); - auto bytes = std::string(); - for (auto value : input) { - auto result = skyr::unicode::append_bytes(value, std::back_inserter(bytes)); - REQUIRE(result); - } - CHECK("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88" == bytes); - } +// SECTION("append_bytes") { +// auto input = 
std::u32string(U"\x1F3F3\xFE0F\x200D\x1F308"); +// auto bytes = std::string(); +// for (auto value : input) { +// auto result = skyr::unicode::append_bytes(value, std::back_inserter(bytes)); +// REQUIRE(result); +// } +// CHECK("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88" == bytes); +// } } From 65943a76171bbb693cf4dcda6841de90314c55c8 Mon Sep 17 00:00:00 2001 From: Glyn Matthews Date: Tue, 1 Oct 2019 18:47:02 +0800 Subject: [PATCH 08/10] Renaming and updating documentation --- include/skyr/unicode/code_point.hpp | 42 ++--- include/skyr/unicode/core.hpp | 53 ------ .../skyr/unicode/{range => ranges}/traits.hpp | 7 + .../transforms/byte_transform.hpp | 94 ++++++++--- .../transforms/u16_transform.hpp | 90 +++++------ .../transforms/u32_transform.hpp | 8 +- .../{range => ranges}/views/u16_view.hpp | 6 +- .../{range => ranges}/views/u8_view.hpp | 18 ++- .../views/unchecked_u8_view.hpp | 4 +- include/skyr/unicode/unicode.hpp | 152 ------------------ include/skyr/url/details/to_bytes.hpp | 8 +- src/CMakeLists.txt | 2 - src/unicode/unicode.cpp | 55 ------- src/url/domain.cpp | 14 +- src/url/percent_encode.cpp | 10 +- tests/unicode_range_tests.cpp | 10 +- tests/unicode_tests.cpp | 10 +- 17 files changed, 180 insertions(+), 403 deletions(-) rename include/skyr/unicode/{range => ranges}/traits.hpp (84%) rename include/skyr/unicode/{range => ranges}/transforms/byte_transform.hpp (70%) rename include/skyr/unicode/{range => ranges}/transforms/u16_transform.hpp (75%) rename include/skyr/unicode/{range => ranges}/transforms/u32_transform.hpp (97%) rename include/skyr/unicode/{range => ranges}/views/u16_view.hpp (96%) rename include/skyr/unicode/{range => ranges}/views/u8_view.hpp (92%) rename include/skyr/unicode/{range => ranges}/views/unchecked_u8_view.hpp (97%) delete mode 100644 include/skyr/unicode/unicode.hpp delete mode 100644 src/unicode/unicode.cpp diff --git a/include/skyr/unicode/code_point.hpp b/include/skyr/unicode/code_point.hpp index 
c4b3a902..1aa079a1 100644 --- a/include/skyr/unicode/code_point.hpp +++ b/include/skyr/unicode/code_point.hpp @@ -58,26 +58,26 @@ class u8_code_point_t { /// \brief Destructor. ~u8_code_point_t() = default; - /// - /// \return + /// Returns an iterator to the beginning + /// \return \c const_iterator [[nodiscard]] constexpr const_iterator begin() const noexcept { return first; } - /// - /// \return + /// Returns an iterator to the end + /// \return \c const_iterator [[nodiscard]] constexpr const_iterator end() const noexcept { return last; } - /// - /// \return + /// Returns an iterator to the beginning + /// \return \c const_iterator [[nodiscard]] constexpr auto cbegin() const noexcept { return begin(); } - /// - /// \return + /// Returns an iterator to the end + /// \return \c const_iterator [[nodiscard]] constexpr auto cend() const noexcept { return end(); } @@ -88,13 +88,6 @@ class u8_code_point_t { return sequence_length(*first); } - [[nodiscard]] tl::expected u32_value() const noexcept { - return find_code_point(first) - .and_then([] (auto state) -> tl::expected { - return state.value; - }); - } - private: OctetIterator first, last; @@ -146,18 +139,6 @@ inline tl::expected, std::e .and_then(check_code_point); } -//namespace details { -///// -///// \tparam OctetIterator -///// \param code_point -///// \return -//template -//inline char32_t u32(u8_code_point_t code_point) { -// auto state = find_code_point(std::begin(code_point)); -// return state ? 
state.value().value : U'\x0000'; -//} -//} // namespace details - /// class u16_code_point_t { @@ -248,7 +229,10 @@ inline u16_code_point_t u16_code_point(char16_t lead, char16_t value) { template inline tl::expected u32_value( u8_code_point_t code_point) noexcept { - return code_point.u32_value(); + return find_code_point(code_point.begin()) + .and_then([] (auto state) -> tl::expected { + return state.value; + }); } /// @@ -260,7 +244,7 @@ inline tl::expected u32_value( tl::expected, std::error_code> code_point) noexcept { return code_point .and_then([] (auto code_point) -> tl::expected { - return code_point.u32_value(); + return u32_value(code_point); }); } diff --git a/include/skyr/unicode/core.hpp b/include/skyr/unicode/core.hpp index 7f9eda66..13669fed 100644 --- a/include/skyr/unicode/core.hpp +++ b/include/skyr/unicode/core.hpp @@ -301,59 +301,6 @@ tl::expected, std::error_code> find_code_point( tl::make_unexpected(make_error_code(unicode_errc::overflow)) ; } - -/// Updates the state to next code point -/// -/// \tparam OctetIterator -/// \param it An octer iterator -/// \return A sequence state with the computed code point value -template -tl::expected, std::error_code> next( - OctetIterator it) { - using result_type = tl::expected, std::error_code>; - - auto increment = [] (auto state) -> result_type { - ++state.it; - return state; - }; - - return - find_code_point(it) - .and_then(check_code_point) - .and_then(increment); -} - -/// Appends values to an octet sequence given a code point value -/// -/// \tparam OctetIterator -/// \param code_point -/// \param octet_it -/// \return -template -tl::expected append_bytes( - char32_t code_point, - OctetIterator octet_it) { - if (!is_valid_code_point(code_point)) { - return tl::make_unexpected(make_error_code(unicode_errc::invalid_code_point)); - } - - if (code_point < 0x80u) { // one octet - *(octet_it++) = static_cast(code_point); - } else if (code_point < 0x800u) { // two octets - *(octet_it++) = 
static_cast((code_point >> 6u) | 0xc0u); - *(octet_it++) = static_cast((code_point & 0x3fu) | 0x80u); - } else if (code_point < 0x10000u) { // three octets - *(octet_it++) = static_cast((code_point >> 12u) | 0xe0u); - *(octet_it++) = static_cast(((code_point >> 6u) & 0x3fu) | 0x80u); - *(octet_it++) = static_cast((code_point & 0x3fu) | 0x80u); - } else { // four octets - *(octet_it++) = static_cast((code_point >> 18u) | 0xf0u); - *(octet_it++) = static_cast(((code_point >> 12u) & 0x3fu) | 0x80u); - *(octet_it++) = static_cast(((code_point >> 6u) & 0x3fu) | 0x80u); - *(octet_it++) = static_cast((code_point & 0x3fu) | 0x80u); - } - return octet_it; -} } // namespace skyr::unicode #endif //SKYR_UNICODE_CORE_HPP diff --git a/include/skyr/unicode/range/traits.hpp b/include/skyr/unicode/ranges/traits.hpp similarity index 84% rename from include/skyr/unicode/range/traits.hpp rename to include/skyr/unicode/ranges/traits.hpp index 5f55ac70..325e2a8f 100644 --- a/include/skyr/unicode/range/traits.hpp +++ b/include/skyr/unicode/ranges/traits.hpp @@ -23,6 +23,13 @@ class iterator { public: using type = const T *; }; + +//template +//class category { +// public: +// using type = typename Iterator::iterator_category; +//}; + } // namespace skyr::unicode::traits diff --git a/include/skyr/unicode/range/transforms/byte_transform.hpp b/include/skyr/unicode/ranges/transforms/byte_transform.hpp similarity index 70% rename from include/skyr/unicode/range/transforms/byte_transform.hpp rename to include/skyr/unicode/ranges/transforms/byte_transform.hpp index 85cddb52..52d696a0 100644 --- a/include/skyr/unicode/range/transforms/byte_transform.hpp +++ b/include/skyr/unicode/ranges/transforms/byte_transform.hpp @@ -11,41 +11,62 @@ #include #include #include -#include -#include +#include +#include namespace skyr::unicode { +/// An iterator that converts a code point to bytes when dereferenced +/// \tparam CodePointIterator template class transform_byte_iterator { public: + /// + using 
iterator_category = std::forward_iterator_tag; + /// using value_type = tl::expected; + /// using reference = value_type; + /// \c std::ptrdiff_t using difference_type = std::ptrdiff_t; + /// Constructor transform_byte_iterator() = default; - - transform_byte_iterator(CodePointIterator first, CodePointIterator last) - : it_(first), last_(last) {} - + /// Constructs an iterator from an iterator that iterates over + /// code points + /// \param it + /// \param last + transform_byte_iterator(CodePointIterator it, CodePointIterator last) + : it_(it), last_(last) {} + /// Copy constructor transform_byte_iterator(const transform_byte_iterator &) = default; + /// Move constructor transform_byte_iterator(transform_byte_iterator &&) noexcept = default; + /// Copy assignment operator transform_byte_iterator &operator=(const transform_byte_iterator &) = default; + /// Move assignment operator transform_byte_iterator &operator=(transform_byte_iterator &&) noexcept = default; + /// Destructor ~transform_byte_iterator() = default; + /// Pre-increment operator + /// \return A reference to this iterator transform_byte_iterator &operator++() { increment(); return *this; } + /// Post-increment operator + /// \return A copy of the previous iterator transform_byte_iterator operator++(int) { auto result = *this; increment(); return result; } + /// Dereference operator + /// \return An expected value reference operator*() { auto code_point = u32_value(*it_).value(); @@ -83,10 +104,16 @@ class transform_byte_iterator { return tl::make_unexpected(make_error_code(unicode_errc::invalid_code_point)); } + /// Equality operator + /// \param other The other iterator + /// \return \c true if the iterators are the same, \c false otherwise constexpr bool operator==(const transform_byte_iterator &other) const noexcept { return (it_ == other.it_) && (octet_index_ == other.octet_index_); } + /// Inequality operator + /// \param other The other iterator + /// \return \c true if the iterators are not 
the same, \c false otherwise constexpr bool operator!=(const transform_byte_iterator &other) const noexcept { return !(*this == other); } @@ -123,7 +150,8 @@ class transform_byte_iterator { }; -/// +/// A range that transforms code point values to bytes. +/// \tparam CodePointRange template class transform_byte_range { @@ -131,43 +159,58 @@ class transform_byte_range { public: - /// + /// An expected byte value using value_type = tl::expected; - /// + /// \c value_type using const_reference = value_type; - /// + /// \c const_reference using reference = const_reference; - /// + /// \c value_type* + using pointer = typename std::add_pointer::type; + /// \c transform_byte_iterator using const_iterator = iterator_type; - /// + /// \c const_iterator using iterator = const_iterator; - /// + /// \c std::size_t using size_type = std::size_t; - /// + /// Default constructor + /// \post empty() transform_byte_range() = default; /// /// \param range explicit transform_byte_range( - const CodePointRange &range) + const CodePointRange &range) noexcept : first(iterator_type{std::begin(range), std::end(range)}), last(iterator_type{std::end(range), std::end(range)}) {} - /// - /// \return - const_iterator begin() const { + /// Returns an iterator to the beginning + /// \return \c const_iterator + const_iterator begin() const noexcept { return first ? first.value() : iterator_type(); } - /// - /// \return - const_iterator end() const { + /// Returns an iterator to the end + /// \return \c const_iterator + const_iterator end() const noexcept { return last ? 
last.value() : iterator_type(); } - /// - /// \return + /// Returns an iterator to the beginning + /// \return \c const_iterator + const_iterator cbegin() const noexcept { + return begin(); + } + + /// Returns an iterator to the end + /// \return \c const_iterator + const_iterator cend() const noexcept { + return end(); + } + + /// Tests if the byte range is empty + /// \return \c true if the range is empty, \c false otherwise bool empty() const noexcept { return begin() == end(); } @@ -204,6 +247,11 @@ namespace transform { static constexpr byte_range_fn to_bytes; } // namespace transform +/// +/// \tparam Output +/// \tparam CodePointRange +/// \param range +/// \return template tl::expected as(transform_byte_range &&range) { auto result = Output{}; diff --git a/include/skyr/unicode/range/transforms/u16_transform.hpp b/include/skyr/unicode/ranges/transforms/u16_transform.hpp similarity index 75% rename from include/skyr/unicode/range/transforms/u16_transform.hpp rename to include/skyr/unicode/ranges/transforms/u16_transform.hpp index ea6117ac..bcd5a575 100644 --- a/include/skyr/unicode/range/transforms/u16_transform.hpp +++ b/include/skyr/unicode/ranges/transforms/u16_transform.hpp @@ -13,13 +13,13 @@ #include #include #include -#include -#include -#include +#include +#include +#include namespace skyr::unicode { /// -/// \tparam OctetIterator +/// \tparam CodePointIterator template class transform_u16_iterator { @@ -33,10 +33,10 @@ class transform_u16_iterator { using reference = value_type; /// using pointer = typename std::add_pointer::type; - /// + /// \c std::ptrdiff_t using difference_type = std::ptrdiff_t; - /// + /// Default constructor transform_u16_iterator() = default; /// /// \param it @@ -53,26 +53,26 @@ class transform_u16_iterator { constexpr transform_u16_iterator &operator=(const transform_u16_iterator&) = default; /// constexpr transform_u16_iterator &operator=(transform_u16_iterator&&) noexcept = default; - /// + /// Destructor 
~transform_u16_iterator() = default; - /// - /// \return - transform_u16_iterator operator ++ (int) { - auto result = *this; + /// Pre-increment operator + /// \return A reference to this iterator + transform_u16_iterator &operator ++ () { ++it_; - return result; + return *this; } - /// - /// \return - transform_u16_iterator &operator ++ () { + /// Post-increment operator + /// \return A copy of the previous iterator + transform_u16_iterator operator ++ (int) { + auto result = *this; ++it_; - return *this; + return result; } - /// - /// \return + /// Dereference operator + /// \return An expected value reference operator * () const noexcept { auto code_point = *it_; return @@ -82,16 +82,16 @@ class transform_u16_iterator { }); } - /// - /// \param other - /// \return + /// Equality operator + /// \param other The other iterator + /// \return \c true if the iterators are the same, \c false otherwise bool operator == (const transform_u16_iterator &other) const noexcept { return it_ == other.it_; } - /// - /// \param other - /// \return + /// Inequality operator + /// \param other The other iterator + /// \return \c true if the iterators are not the same, \c false otherwise bool operator != (const transform_u16_iterator &other) const noexcept { return !(*this == other); } @@ -103,7 +103,7 @@ class transform_u16_iterator { }; /// -/// \tparam OctetRange +/// \tparam CodePointRange template class transform_u16_range : public ranges::view_base { @@ -125,7 +125,7 @@ class transform_u16_range /// using size_type = std::size_t; - /// + /// Default constructor constexpr transform_u16_range() = default; /// @@ -133,41 +133,41 @@ class transform_u16_range explicit constexpr transform_u16_range(CodePointRange &&range) : range_{std::forward(range)} {} - /// - /// \return - [[nodiscard]] constexpr const_iterator begin() const noexcept { + /// Returns an iterator to the beginning + /// \return \c const_iterator + [[nodiscard]] const_iterator begin() const noexcept { return 
iterator_type(std::begin(range_), std::end(range_)); } - /// - /// \return - [[nodiscard]] constexpr const_iterator end() const noexcept { + /// Returns an iterator to the end + /// \return \c const_iterator + [[nodiscard]] const_iterator end() const noexcept { return iterator_type(); } - /// - /// \return + /// Returns an iterator to the beginning + /// \return \c const_iterator [[nodiscard]] constexpr auto cbegin() const noexcept { return begin(); } - /// - /// \return + /// Returns an iterator to the end + /// \return \c const_iterator [[nodiscard]] constexpr auto cend() const noexcept { return end(); } - /// - /// \return + /// Tests if the byte range is empty + /// \return \c true if the range is empty, \c false otherwise [[nodiscard]] constexpr bool empty() const noexcept { return range_.empty(); } - - /// - /// \return - [[nodiscard]] constexpr size_type size() const noexcept { - return range_.size(); - } +// +// /// +// /// \return +// [[nodiscard]] constexpr size_type size() const noexcept { +// return range_.size(); +// } private: @@ -202,7 +202,7 @@ namespace transform { static constexpr transform_u16_range_fn to_u16; } // namespace transform -/// +/// A sink that converts a U16 range to string. 
/// \tparam Output /// \tparam OctetRange /// \param range diff --git a/include/skyr/unicode/range/transforms/u32_transform.hpp b/include/skyr/unicode/ranges/transforms/u32_transform.hpp similarity index 97% rename from include/skyr/unicode/range/transforms/u32_transform.hpp rename to include/skyr/unicode/ranges/transforms/u32_transform.hpp index 0fb2f33c..2efb5240 100644 --- a/include/skyr/unicode/range/transforms/u32_transform.hpp +++ b/include/skyr/unicode/ranges/transforms/u32_transform.hpp @@ -13,12 +13,12 @@ #include #include #include -#include -#include +#include +#include namespace skyr::unicode { /// -/// \tparam OctetIterator +/// \tparam CodePointIterator template class transform_u32_iterator { public: @@ -48,7 +48,7 @@ class transform_u32_iterator { constexpr transform_u32_iterator &operator=(const transform_u32_iterator&) = default; /// constexpr transform_u32_iterator &operator=(transform_u32_iterator&&) noexcept = default; - /// + /// Destructor ~transform_u32_iterator() = default; /// diff --git a/include/skyr/unicode/range/views/u16_view.hpp b/include/skyr/unicode/ranges/views/u16_view.hpp similarity index 96% rename from include/skyr/unicode/range/views/u16_view.hpp rename to include/skyr/unicode/ranges/views/u16_view.hpp index 68e28311..881d8844 100644 --- a/include/skyr/unicode/range/views/u16_view.hpp +++ b/include/skyr/unicode/ranges/views/u16_view.hpp @@ -13,9 +13,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include namespace skyr::unicode { /// diff --git a/include/skyr/unicode/range/views/u8_view.hpp b/include/skyr/unicode/ranges/views/u8_view.hpp similarity index 92% rename from include/skyr/unicode/range/views/u8_view.hpp rename to include/skyr/unicode/ranges/views/u8_view.hpp index c416c2b9..d8286601 100644 --- a/include/skyr/unicode/range/views/u8_view.hpp +++ b/include/skyr/unicode/ranges/views/u8_view.hpp @@ -11,11 +11,12 @@ #include #include #include +#include #include #include #include 
-#include -#include +#include +#include namespace skyr::unicode { /// @@ -59,6 +60,7 @@ class u8_range_iterator { /// /// \return u8_range_iterator operator ++ (int) { + assert(it_); auto result = *this; increment(); return result; @@ -67,6 +69,7 @@ class u8_range_iterator { /// /// \return u8_range_iterator &operator ++ () { + assert(it_); increment(); return *this; } @@ -74,7 +77,8 @@ class u8_range_iterator { /// /// \return constexpr reference operator * () const noexcept { - return valid_u8_code_point(*it_); + assert(it_); + return valid_u8_code_point(*it_.value()); } /// @@ -95,16 +99,16 @@ class u8_range_iterator { void increment() { if (**this) { - ++it_; + ++it_.value(); if (it_ == last_) { - it_ = iterator_type(); + it_ = std::nullopt; } } else { - it_ = iterator_type(); + it_ = std::nullopt; } } - iterator_type it_, last_; + std::optional it_, last_; }; diff --git a/include/skyr/unicode/range/views/unchecked_u8_view.hpp b/include/skyr/unicode/ranges/views/unchecked_u8_view.hpp similarity index 97% rename from include/skyr/unicode/range/views/unchecked_u8_view.hpp rename to include/skyr/unicode/ranges/views/unchecked_u8_view.hpp index e6309a7d..05012456 100644 --- a/include/skyr/unicode/range/views/unchecked_u8_view.hpp +++ b/include/skyr/unicode/ranges/views/unchecked_u8_view.hpp @@ -15,7 +15,7 @@ #include #include #include -#include +#include namespace skyr::unicode { /// @@ -31,8 +31,6 @@ class unchecked_u8_range_iterator { /// using reference = value_type; /// - using pointer = typename std::add_pointer::type; - /// using difference_type = std::ptrdiff_t; /// diff --git a/include/skyr/unicode/unicode.hpp b/include/skyr/unicode/unicode.hpp deleted file mode 100644 index 30cb7877..00000000 --- a/include/skyr/unicode/unicode.hpp +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright 2018-19 Glyn Matthews. -// Copyright 2006-2016 Nemanja Trifunovic -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - -#ifndef SKYR_UNICODE_HPP -#define SKYR_UNICODE_HPP - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace skyr::unicode { -/// Copies characters from a UTF-16 encoded string to a UTF-8 -/// encoded string. -/// -/// \tparam U16BitIterator -/// \tparam OctetIterator -/// \param first The first iterator in the UTF-16 encoded sequence -/// \param last The last iterator in the UTF-16 encoded sequence -/// \param u8_it The output iterator -/// \return The last output iterator or an error if the sequence was invalid -template -tl::expected copy_u16u8( - U16BitIterator first, - U16BitIterator last, - OctetIterator result) { - auto it = first; - while (it != last) { - auto code_point = static_cast(mask16(*it)); - ++it; - - // Take care of surrogate pairs first - if (is_lead_surrogate(code_point)) { - if (it == last) { - return tl::make_unexpected( - make_error_code(unicode_errc::invalid_code_point)); - } - - auto trail_surrogate = mask16(*it); - ++it; - if (!is_trail_surrogate(trail_surrogate)) { - return tl::make_unexpected( - make_error_code(unicode_errc::invalid_code_point)); - } - code_point = (code_point << 10) + trail_surrogate + constants::surrogates::offset; - } - else if (is_trail_surrogate(code_point)) { - return tl::make_unexpected( - make_error_code(unicode_errc::invalid_code_point)); - } - - auto result_it = append_bytes(code_point, result); - if (!result_it) { - return tl::make_unexpected(std::move(result_it.error())); - } - } - return result; -} - -/// Copies characters from a UTF-8 encoded string to a UTF-16 -/// encoded string. 
-/// -/// \tparam U16BitIterator -/// \tparam OctetIterator -/// \param first The first iterator in the octet sequence -/// \param last The last iterator in the octet sequence -/// \param u16_first The first iterator in the UTf-16 encoded -/// sequence -/// \return An expected iterator to the last eleent in the new -/// UTF-16 sequence, or an error. -template -tl::expected copy_u8u16( - OctetIterator first, - OctetIterator last, - U16BitIterator u16_first) { - auto it = first; - auto u16_it = u16_first; - while (it != last) { - if (std::distance(it, last) < sequence_length(*it)) { - return tl::make_unexpected(make_error_code(unicode_errc::overflow)); - } - - auto state = unicode::next(it); - if (!state) { - return tl::make_unexpected(std::move(state.error())); - } - - it = state.value().it; - if (state.value().value > 0xffff) { // make a surrogate pair - *u16_it++ = - static_cast((state.value().value >> 10) + - constants::surrogates::lead_offset); - *u16_it++ = - static_cast((state.value().value & 0x3ff) + - constants::surrogates::trail_min); - } else { - *u16_it++ = static_cast(state.value().value); - } - } - return u16_it; -} - -/// Converts a `std::string` (assuming UTF-8) string to UTF-16 -/// \param input A UTF-8 string -/// \returns A UTF-16 `std::wstring` or an error on failure -tl::expected wstring_from_bytes( - std::string_view input); - -/// Converts a `std::u16string` string to UTF-8 -/// -/// \param input A UTF-16 string -/// \returns A UTF-8 `std::string` or an error on failure -tl::expected wstring_to_bytes( - std::wstring_view input); - -/// Converts a `std::string` (assuming UTF-8) string to UTF-16 -/// -/// \param input A UTF-8 string -/// \returns A UTF-16 `std::u16string` or an error on failure -tl::expected utf16_from_bytes( - std::string_view input); - -/// Converts a `std::u16string` string to UTF-8 -/// -/// \param input A UTF-16 string -/// \returns A UTF-8 `std::string` or an error on failure -tl::expected utf16_to_bytes( - 
std::u16string_view input); - -/// Converts a `std::string` (assuming UTF-8) string to UTF-32 -/// -/// \param input A UTF-8 string -/// \returns A UTF-32 `std::u32string` or an error on failure -tl::expected utf32_from_bytes( - std::string_view input); - -/// Converts a `std::u32string` string to UTF-8 -/// -/// \param input A UTF-32 string -/// \returns A UTF-8 `std::string` or an error on failure -tl::expected utf32_to_bytes( - std::u32string_view input); -} // namespace skyr::unicode - -#endif //SKYR_UNICODE_HPP diff --git a/include/skyr/url/details/to_bytes.hpp b/include/skyr/url/details/to_bytes.hpp index 68696083..dfcd652c 100644 --- a/include/skyr/url/details/to_bytes.hpp +++ b/include/skyr/url/details/to_bytes.hpp @@ -8,7 +8,7 @@ #include #include -#include +#include namespace skyr::details { template @@ -26,7 +26,7 @@ template struct to_bytes_impl< Source, typename std::enable_if::value>::type> { tl::expected operator()(const Source &source) const { - return unicode::wstring_to_bytes(source); + return unicode::as(source | unicode::view::as_u16 | unicode::transform::to_bytes); } }; @@ -34,7 +34,7 @@ template struct to_bytes_impl< Source, typename std::enable_if::value>::type> { tl::expected operator()(const Source &source) const { - return unicode::utf16_to_bytes(source); + return unicode::as(source | unicode::view::as_u16 | unicode::transform::to_bytes); } }; @@ -42,7 +42,7 @@ template struct to_bytes_impl< Source, typename std::enable_if::value>::type> { tl::expected operator()(const Source &source) const { - return unicode::utf32_to_bytes(source); + return unicode::as(source | unicode::transform::to_bytes); } }; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6f0c00af..8739778a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,7 +5,6 @@ set(Skyr_SRCS unicode/errors.cpp - unicode/unicode.cpp url/url_parser_context.hpp url/url_parser_context.cpp url/url_record.cpp @@ -32,7 +31,6 @@ set(Skyr_SRCS 
${Skyr_SOURCE_DIR}/include/skyr/traits/string_traits.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/errors.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/core.hpp - ${Skyr_SOURCE_DIR}/include/skyr/unicode/unicode.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/views/u8_view.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/views/unchecked_u8_view.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/views/u16_view.hpp diff --git a/src/unicode/unicode.cpp b/src/unicode/unicode.cpp deleted file mode 100644 index 419882b4..00000000 --- a/src/unicode/unicode.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2018-19 Glyn Matthews. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - - -#include -#include -#include -#include - - -namespace skyr::unicode { -tl::expected wstring_from_bytes( - std::string_view bytes) { - return as(bytes | view::as_u8 | transform::to_u16); -} - -tl::expected wstring_to_bytes( - std::wstring_view input) { - std::string result; - auto expected = copy_u16u8( - begin(input), end(input), std::back_inserter(result)); - if (!expected) { - return tl::make_unexpected(std::error_code(expected.error())); - } - return result; -} - -tl::expected utf16_from_bytes( - std::string_view bytes) { - return as(bytes | view::as_u8 | transform::to_u16); -} - -tl::expected utf16_to_bytes( - std::u16string_view input) { - std::string result; - auto expected = copy_u16u8( - begin(input), end(input), std::back_inserter(result)); - if (!expected) { - return tl::make_unexpected(std::error_code(expected.error())); - } - return result; -} - -tl::expected utf32_from_bytes( - std::string_view bytes) { - return as(bytes | view::as_u8 | transform::to_u32); -} - -tl::expected utf32_to_bytes( - std::u32string_view input) { - return as(input | transform::to_bytes); -} -} // namespace skyr::unicode diff --git a/src/url/domain.cpp b/src/url/domain.cpp index af565125..1be49814 100644 
--- a/src/url/domain.cpp +++ b/src/url/domain.cpp @@ -3,10 +3,10 @@ // (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) -#include #include -#include "skyr/unicode/unicode.hpp" -#include "skyr/url/domain.hpp" +#include +#include +#include #include "algorithms.hpp" #include "idna_table.hpp" @@ -86,7 +86,7 @@ inline bool delim(char32_t c) { } // namespace tl::expected punycode_encode(std::string_view input) { - auto utf32 = unicode::utf32_from_bytes(input); + auto utf32 = unicode::as(input | unicode::view::as_u8 | unicode::transform::to_u32); if (!utf32) { return tl::make_unexpected(make_error_code(domain_errc::bad_input)); } @@ -229,7 +229,7 @@ tl::expected punycode_decode(std::string_view inpu result.insert(i++, 1, n); } - auto bytes = unicode::utf32_to_bytes(result); + auto bytes = unicode::as(result | unicode::transform::to_bytes); if (!bytes) { return tl::make_unexpected(make_error_code(domain_errc::bad_input)); } @@ -347,7 +347,7 @@ tl::expected unicode_to_ascii( } auto utf32_domain = join(labels, U'.'); - auto ascii_domain = unicode::utf32_to_bytes(utf32_domain); + auto ascii_domain = unicode::as(utf32_domain | unicode::transform::to_bytes); if (!ascii_domain) { return tl::make_unexpected( make_error_code(domain_errc::encoding_error)); @@ -359,7 +359,7 @@ tl::expected unicode_to_ascii( tl::expected domain_to_ascii( std::string_view domain, bool be_strict) { - auto utf32 = unicode::utf32_from_bytes(domain); + auto utf32 = unicode::as(domain | unicode::view::as_u8 | unicode::transform::to_u32); if (!utf32) { return tl::make_unexpected( make_error_code(domain_errc::encoding_error)); diff --git a/src/url/percent_encode.cpp b/src/url/percent_encode.cpp index 30b4ee4f..565c34ad 100644 --- a/src/url/percent_encode.cpp +++ b/src/url/percent_encode.cpp @@ -4,11 +4,9 @@ // http://www.boost.org/LICENSE_1_0.txt) #include -#include -#include -#include -#include "skyr/url/percent_encode.hpp" -#include "skyr/unicode/unicode.hpp" 
+#include +#include + namespace skyr { namespace { @@ -141,7 +139,7 @@ tl::expected percent_encode( tl::expected percent_encode( std::u32string_view input, encode_set excludes) { - auto bytes = unicode::utf32_to_bytes(input); + auto bytes = unicode::as(input | unicode::transform::to_bytes); if (!bytes) { return tl::make_unexpected(make_error_code( percent_encode_errc::overflow)); diff --git a/tests/unicode_range_tests.cpp b/tests/unicode_range_tests.cpp index 137c9fec..5e1762a6 100644 --- a/tests/unicode_range_tests.cpp +++ b/tests/unicode_range_tests.cpp @@ -8,11 +8,11 @@ #define CATCH_CONFIG_MAIN #include #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include TEST_CASE("octet range iterator") { diff --git a/tests/unicode_tests.cpp b/tests/unicode_tests.cpp index ff76c915..ffa4b953 100644 --- a/tests/unicode_tests.cpp +++ b/tests/unicode_tests.cpp @@ -6,11 +6,11 @@ #define CATCH_CONFIG_MAIN #include #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include TEST_CASE("unicode_tests", "[unicode]") { From 9ce4d9dc4ee646f4b2a39554392a52575eadcecb Mon Sep 17 00:00:00 2001 From: Glyn Matthews Date: Tue, 1 Oct 2019 18:58:23 +0800 Subject: [PATCH 09/10] Added u32_view --- .../ranges/transforms/u32_transform.hpp | 6 +- .../skyr/unicode/ranges/views/u32_view.hpp | 204 ++++++++++++++++++ src/CMakeLists.txt | 13 +- tests/unicode_range_tests.cpp | 3 +- tests/unicode_tests.cpp | 47 +--- 5 files changed, 219 insertions(+), 54 deletions(-) create mode 100644 include/skyr/unicode/ranges/views/u32_view.hpp diff --git a/include/skyr/unicode/ranges/transforms/u32_transform.hpp b/include/skyr/unicode/ranges/transforms/u32_transform.hpp index 2efb5240..c0356d17 100644 --- a/include/skyr/unicode/ranges/transforms/u32_transform.hpp +++ b/include/skyr/unicode/ranges/transforms/u32_transform.hpp @@ -169,7 +169,7 @@ class transform_u32_range }; /// -struct u32_range_fn { +struct 
transform_u32_range_fn { /// /// \tparam OctetRange /// \param range @@ -184,7 +184,7 @@ struct u32_range_fn { /// \param range /// \return template - friend constexpr auto operator|(CodePointRange &&range, const u32_range_fn&) { + friend constexpr auto operator|(CodePointRange &&range, const transform_u32_range_fn&) { return transform_u32_range{std::forward(range)}; } @@ -192,7 +192,7 @@ struct u32_range_fn { namespace transform { /// -static constexpr u32_range_fn to_u32; +static constexpr transform_u32_range_fn to_u32; } // namespace transform /// diff --git a/include/skyr/unicode/ranges/views/u32_view.hpp b/include/skyr/unicode/ranges/views/u32_view.hpp new file mode 100644 index 00000000..fa80e9fd --- /dev/null +++ b/include/skyr/unicode/ranges/views/u32_view.hpp @@ -0,0 +1,204 @@ +// Copyright 2019 Glyn Matthews. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef SKYR_UNICODE_U32_VIEW_HPP +#define SKYR_UNICODE_U32_VIEW_HPP + +#include +#include +#include +#include +#include +#include +#include + +namespace skyr::unicode { +/// +/// \tparam U32Iterator +template +class u32_range_iterator { + + public: + + /// + using iterator_category = std::forward_iterator_tag; + /// + using value_type = tl::expected; + /// + using reference = value_type; + /// + using pointer = typename std::add_pointer::type; + /// + using difference_type = std::ptrdiff_t; + + /// + u32_range_iterator() = default; + /// + /// \param it + explicit constexpr u32_range_iterator( + U32Iterator it, + U32Iterator last) + : it_(it) + , last_(last) {} + /// + constexpr u32_range_iterator(const u32_range_iterator&) = default; + /// + constexpr u32_range_iterator(u32_range_iterator&&) noexcept = default; + /// + constexpr u32_range_iterator &operator=(const u32_range_iterator&) = default; + /// + constexpr u32_range_iterator &operator=(u32_range_iterator&&) noexcept = default; + /// 
+ ~u32_range_iterator() = default; + + /// + /// \return + u32_range_iterator operator ++ (int) { + auto result = *this; + increment(); + return result; + } + + /// + /// \return + u32_range_iterator &operator ++ () { + increment(); + return *this; + } + + /// + /// \return + reference operator * () const noexcept { + assert(it_); + return *it_.value(); + } + + /// + /// \param other + /// \return + bool operator == (const u32_range_iterator &other) const noexcept { + return it_ == other.it_; + } + + /// + /// \param other + /// \return + bool operator != (const u32_range_iterator &other) const noexcept { + return !(*this == other); + } + + private: + + void increment() { + assert(it_); + ++it_.value(); + } + + std::optional it_, last_; + +}; + +/// +/// \tparam U32Range +template +class view_u32_range + : public ranges::view_base { + + using iterator_type = u32_range_iterator::type>; + + public: + + /// + using value_type = tl::expected; + /// + using const_reference = value_type; + /// + using reference = const_reference; + /// + using const_iterator = iterator_type; + /// + using iterator = const_iterator; + /// + using size_type = std::size_t; + + /// + constexpr view_u32_range() = default; + + /// + /// \param range + explicit constexpr view_u32_range(const U32Range &range) + : range_{range} {} + + /// + /// \return + [[nodiscard]] constexpr const_iterator begin() const noexcept { + return iterator_type(std::begin(range_), std::end(range_)); + } + + /// + /// \return + [[nodiscard]] constexpr const_iterator end() const noexcept { + return iterator_type(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cbegin() const noexcept { + return begin(); + } + + /// + /// \return + [[nodiscard]] constexpr auto cend() const noexcept { + return end(); + } + + /// + /// \return + [[nodiscard]] constexpr bool empty() const noexcept { + return begin() == end(); + } + + /// + /// \return + [[nodiscard]] constexpr size_type size() const noexcept { + return 
std::distance(begin(), end()); + } + + private: + + U32Range range_; + +}; + +/// +struct u32_range_fn { + /// + /// \tparam OctetRange + /// \param range + /// \return + template + constexpr auto operator()(U32Range &&range) const { + return view_u32_range{std::forward(range)}; + } + + /// + /// \tparam OctetRange + /// \param range + /// \return + template + friend constexpr auto operator|(U32Range &&range, const u32_range_fn&) { + return view_u32_range{std::forward(range)}; + } + +}; + +namespace view { +/// +static constexpr u32_range_fn as_u32; +} // namespace view +} // namespace skyr::unicode + +#endif //SKYR_UNICODE_U32_VIEW_HPP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8739778a..c01c7902 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,12 +31,13 @@ set(Skyr_SRCS ${Skyr_SOURCE_DIR}/include/skyr/traits/string_traits.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/errors.hpp ${Skyr_SOURCE_DIR}/include/skyr/unicode/core.hpp - ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/views/u8_view.hpp - ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/views/unchecked_u8_view.hpp - ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/views/u16_view.hpp - ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/transforms/byte_transform.hpp - ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/transforms/u16_transform.hpp - ${Skyr_SOURCE_DIR}/include/skyr/unicode/range/transforms/u32_transform.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/ranges/views/u8_view.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/ranges/views/unchecked_u8_view.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/ranges/views/u16_view.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/ranges/views/u32_view.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/ranges/transforms/byte_transform.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/ranges/transforms/u16_transform.hpp + ${Skyr_SOURCE_DIR}/include/skyr/unicode/ranges/transforms/u32_transform.hpp ${Skyr_SOURCE_DIR}/include/skyr/url/percent_encode.hpp 
${Skyr_SOURCE_DIR}/include/skyr/url/domain.hpp ${Skyr_SOURCE_DIR}/include/skyr/url/url_record.hpp diff --git a/tests/unicode_range_tests.cpp b/tests/unicode_range_tests.cpp index 5e1762a6..ac2673b7 100644 --- a/tests/unicode_range_tests.cpp +++ b/tests/unicode_range_tests.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -199,7 +200,7 @@ TEST_CASE("write bytes") { SECTION("bytes from u32") { auto input = std::u32string(U"\x1F3F3\xFE0F\x200D\x1F308"); auto bytes = skyr::unicode::as( - input | skyr::unicode::transform::to_bytes); + input | skyr::unicode::view::as_u32 | skyr::unicode::transform::to_bytes); REQUIRE(bytes); CHECK("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88" == bytes.value()); } diff --git a/tests/unicode_tests.cpp b/tests/unicode_tests.cpp index ffa4b953..bea33e1b 100644 --- a/tests/unicode_tests.cpp +++ b/tests/unicode_tests.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -17,7 +18,7 @@ TEST_CASE("unicode_tests", "[unicode]") { SECTION("utf32_to_bytes_poo_emoji_test") { auto input = std::u32string(U"\x1F4A9"); auto bytes = skyr::unicode::as( - input | skyr::unicode::transform::to_bytes); + input | skyr::unicode::view::as_u32 | skyr::unicode::transform::to_bytes); REQUIRE(bytes); CHECK("\xf0\x9f\x92\xa9" == bytes.value()); } @@ -65,50 +66,8 @@ TEST_CASE("unicode_tests", "[unicode]") { SECTION("utf32_rainbow_flag_test") { auto input = std::u32string(U"\x1F3F3\xFE0F\x200D\x1F308"); auto bytes = skyr::unicode::as( - input | skyr::unicode::transform::to_bytes); + input | skyr::unicode::view::as_u32 | skyr::unicode::transform::to_bytes); REQUIRE(bytes); CHECK("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88" == bytes.value()); } - -// SECTION("count_utf8_chars_01") { -// auto bytes = std::string("\xf0\x9f\x92\xa9"); -// auto first = begin(bytes), last = end(bytes); -// auto count = skyr::unicode::count(first, last); -// REQUIRE(count); -// CHECK(1 == 
count.value()); -// } - -// SECTION("count_utf8_chars_02") { -// auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); -// auto first = begin(bytes), last = end(bytes); -// auto count = skyr::unicode::count(first, last); -// REQUIRE(count); -// CHECK(4 == count.value()); -// } - -// SECTION("advance_utf8_chars") { -// auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); -// auto first = begin(bytes), last = end(bytes); -// skyr::unicode::advance(first, 2, last); -// CHECK("\xe2\x80\x8d\xf0\x9f\x8c\x88" == std::string(first, last)); -// } - -// SECTION("advance_and_count_utf8_chars") { -// auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); -// auto first = begin(bytes), last = end(bytes); -// skyr::unicode::advance(first, 2, last); -// auto count = skyr::unicode::count(first, last); -// REQUIRE(count); -// CHECK(2 == count.value()); -// } - -// SECTION("append_bytes") { -// auto input = std::u32string(U"\x1F3F3\xFE0F\x200D\x1F308"); -// auto bytes = std::string(); -// for (auto value : input) { -// auto result = skyr::unicode::append_bytes(value, std::back_inserter(bytes)); -// REQUIRE(result); -// } -// CHECK("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88" == bytes); -// } } From bc48e52ad0cdb4a0c6d63554ae48ac84176cad5b Mon Sep 17 00:00:00 2001 From: Glyn Matthews Date: Tue, 1 Oct 2019 19:28:37 +0800 Subject: [PATCH 10/10] Removed dependency on old package of ranges in vcpkg --- CMakeLists.txt | 1 - include/skyr/unicode/code_point.hpp | 5 ++--- .../ranges/transforms/byte_transform.hpp | 1 - .../ranges/transforms/u16_transform.hpp | 4 +--- .../ranges/transforms/u32_transform.hpp | 4 +--- .../skyr/unicode/ranges/views/u16_view.hpp | 5 ++--- .../skyr/unicode/ranges/views/u32_view.hpp | 10 ++++++---- include/skyr/unicode/ranges/views/u8_view.hpp | 6 ++---- .../ranges/views/unchecked_u8_view.hpp | 5 +---- src/CMakeLists.txt | 2 +- src/url/algorithms.hpp | 3 ++- 
src/url/domain.cpp | 1 + tests/unicode_code_point_tests.cpp | 1 - tests/unicode_range_tests.cpp | 20 +++++++++---------- 14 files changed, 28 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc60a281..4bf8508d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,6 @@ set(CMAKE_CXX_STANDARD 17) find_package(Threads REQUIRED) find_package(tl-expected CONFIG REQUIRED) -find_package(range-v3 CONFIG REQUIRED) if (${CMAKE_CXX_COMPILER_ID} MATCHES GNU) if (Skyr_FULL_WARNINGS) diff --git a/include/skyr/unicode/code_point.hpp b/include/skyr/unicode/code_point.hpp index 1aa079a1..a2c7ac2c 100644 --- a/include/skyr/unicode/code_point.hpp +++ b/include/skyr/unicode/code_point.hpp @@ -7,7 +7,6 @@ #define SKYR_UNICODE_CODE_POINT_HPP #include -#include #include #include #include @@ -101,8 +100,8 @@ class u8_code_point_t { template inline tl::expected, std::error_code> u8_code_point( const OctetRange &range) { - auto first = std::begin(range); - if (ranges::distance(range) > sequence_length(*first)) { + auto first = std::begin(range), last = std::end(range); + if (std::distance(first, last) > sequence_length(*first)) { return tl::make_unexpected(make_error_code(unicode_errc::overflow)); } return u8_code_point_t( diff --git a/include/skyr/unicode/ranges/transforms/byte_transform.hpp b/include/skyr/unicode/ranges/transforms/byte_transform.hpp index 52d696a0..fe0f8cc3 100644 --- a/include/skyr/unicode/ranges/transforms/byte_transform.hpp +++ b/include/skyr/unicode/ranges/transforms/byte_transform.hpp @@ -8,7 +8,6 @@ #include #include -#include #include #include #include diff --git a/include/skyr/unicode/ranges/transforms/u16_transform.hpp b/include/skyr/unicode/ranges/transforms/u16_transform.hpp index bcd5a575..d810f3c5 100644 --- a/include/skyr/unicode/ranges/transforms/u16_transform.hpp +++ b/include/skyr/unicode/ranges/transforms/u16_transform.hpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -105,8 
+104,7 @@ class transform_u16_iterator { /// /// \tparam CodePointRange template -class transform_u16_range - : public ranges::view_base { +class transform_u16_range { using iterator_type = transform_u16_iterator::type>; diff --git a/include/skyr/unicode/ranges/transforms/u32_transform.hpp b/include/skyr/unicode/ranges/transforms/u32_transform.hpp index c0356d17..208939cc 100644 --- a/include/skyr/unicode/ranges/transforms/u32_transform.hpp +++ b/include/skyr/unicode/ranges/transforms/u32_transform.hpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -98,8 +97,7 @@ class transform_u32_iterator { /// /// \tparam OctetRange template -class transform_u32_range - : public ranges::view_base { +class transform_u32_range { using iterator_type = typename traits::iterator::type; diff --git a/include/skyr/unicode/ranges/views/u16_view.hpp b/include/skyr/unicode/ranges/views/u16_view.hpp index 881d8844..e3356a4f 100644 --- a/include/skyr/unicode/ranges/views/u16_view.hpp +++ b/include/skyr/unicode/ranges/views/u16_view.hpp @@ -9,8 +9,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -126,8 +126,7 @@ class u16_range_iterator { /// /// \tparam U16Range template -class view_u16_range - : public ranges::view_base { +class view_u16_range { using iterator_type = u16_range_iterator::type>; diff --git a/include/skyr/unicode/ranges/views/u32_view.hpp b/include/skyr/unicode/ranges/views/u32_view.hpp index fa80e9fd..0aea82f9 100644 --- a/include/skyr/unicode/ranges/views/u32_view.hpp +++ b/include/skyr/unicode/ranges/views/u32_view.hpp @@ -9,8 +9,8 @@ #include #include #include +#include #include -#include #include #include @@ -72,7 +72,7 @@ class u32_range_iterator { /// \return reference operator * () const noexcept { assert(it_); - return *it_.value(); + return u32_value(*it_.value()); } /// @@ -94,6 +94,9 @@ class u32_range_iterator { void increment() { assert(it_); ++it_.value(); + if (it_ == last_) { + 
it_ = std::nullopt; + } } std::optional it_, last_; @@ -103,8 +106,7 @@ class u32_range_iterator { /// /// \tparam U32Range template -class view_u32_range - : public ranges::view_base { +class view_u32_range { using iterator_type = u32_range_iterator::type>; diff --git a/include/skyr/unicode/ranges/views/u8_view.hpp b/include/skyr/unicode/ranges/views/u8_view.hpp index d8286601..3db4f2a3 100644 --- a/include/skyr/unicode/ranges/views/u8_view.hpp +++ b/include/skyr/unicode/ranges/views/u8_view.hpp @@ -9,9 +9,8 @@ #include #include #include +#include #include -#include -#include #include #include #include @@ -115,8 +114,7 @@ class u8_range_iterator { /// /// \tparam OctetRange template -class view_u8_range - : public ranges::view_base { +class view_u8_range { using octet_iterator_type = typename traits::iterator::type; using iterator_type = u8_range_iterator; diff --git a/include/skyr/unicode/ranges/views/unchecked_u8_view.hpp b/include/skyr/unicode/ranges/views/unchecked_u8_view.hpp index 05012456..5b31d35f 100644 --- a/include/skyr/unicode/ranges/views/unchecked_u8_view.hpp +++ b/include/skyr/unicode/ranges/views/unchecked_u8_view.hpp @@ -10,8 +10,6 @@ #include #include #include -#include -#include #include #include #include @@ -96,8 +94,7 @@ class unchecked_u8_range_iterator { /// /// \tparam OctetRange template -class view_unchecked_u8_range - : public ranges::view_base { +class view_unchecked_u8_range { using octet_iterator_type = typename traits::iterator::type ; using iterator_type = unchecked_u8_range_iterator; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c01c7902..624d3403 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -53,7 +53,7 @@ if (Skyr_BUILD_FILESYSTEM_PATH_FUNCTIONS) endif() add_library(skyr-url ${Skyr_SRCS}) -target_link_libraries(skyr-url tl::expected meta range-v3) +target_link_libraries(skyr-url tl::expected) if(${CMAKE_CXX_COMPILER_ID} MATCHES Clang) if (NOT Skyr_DISABLE_LIBCXX) target_link_libraries(skyr-url "c++") 
diff --git a/src/url/algorithms.hpp b/src/url/algorithms.hpp index d15f30b7..902172e3 100644 --- a/src/url/algorithms.hpp +++ b/src/url/algorithms.hpp @@ -57,7 +57,8 @@ inline bool starts_with( std::string_view::const_iterator first, std::string_view::const_iterator last, const char *chars) noexcept { - auto chars_first = chars, chars_last = chars + std::strlen(chars); + auto chars_view = std::string_view(chars); + auto chars_first = std::begin(chars_view), chars_last = std::end(chars_view); auto chars_it = chars_first; auto it = first; if (it == last) { diff --git a/src/url/domain.cpp b/src/url/domain.cpp index 1be49814..4e4c485e 100644 --- a/src/url/domain.cpp +++ b/src/url/domain.cpp @@ -4,6 +4,7 @@ // http://www.boost.org/LICENSE_1_0.txt) #include +#include #include #include #include diff --git a/tests/unicode_code_point_tests.cpp b/tests/unicode_code_point_tests.cpp index 1b3b700d..8d1a0072 100644 --- a/tests/unicode_code_point_tests.cpp +++ b/tests/unicode_code_point_tests.cpp @@ -7,7 +7,6 @@ #include #define CATCH_CONFIG_MAIN #include -#include #include diff --git a/tests/unicode_range_tests.cpp b/tests/unicode_range_tests.cpp index ac2673b7..9cbab9e8 100644 --- a/tests/unicode_range_tests.cpp +++ b/tests/unicode_range_tests.cpp @@ -7,7 +7,6 @@ #include #define CATCH_CONFIG_MAIN #include -#include #include #include #include @@ -135,28 +134,27 @@ TEST_CASE("u8 range") { SECTION("count") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); auto view = skyr::unicode::view_u8_range(bytes); - CHECK(4 == ranges::size(view)); - CHECK(!ranges::empty(view)); + CHECK(4 == view.size()); + CHECK(!view.empty()); } SECTION("empty count") { auto view = skyr::unicode::view_u8_range(); - CHECK(0 == ranges::size(view)); - CHECK(ranges::empty(view)); + CHECK(view.empty()); } SECTION("pipe syntax") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); auto view = bytes | skyr::unicode::view::as_u8; - CHECK(4 
== ranges::size(view)); - CHECK(!ranges::empty(view)); + CHECK(4 == view.size()); + CHECK(!view.empty()); } SECTION("pipe syntax with string_view") { auto bytes = std::string("\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88"); auto view = std::string_view(bytes) | skyr::unicode::view::as_u8; - CHECK(4 == ranges::size(view)); - CHECK(!ranges::empty(view)); + CHECK(4 == view.size()); + CHECK(!view.empty()); } SECTION("pipe syntax invalid") { @@ -165,8 +163,8 @@ TEST_CASE("u8 range") { auto it = std::begin(view), last = std::end(view); CHECK(!*it++); CHECK(it == last); - CHECK(1 == ranges::size(view)); - CHECK(!ranges::empty(view)); + CHECK(1 == view.size()); + CHECK(!view.empty()); } SECTION("pipe syntax with u16 string") {