Skip to content

Commit 7c96694

Browse files
authored
Range experiment (#21)
* Added first attempt at using ranges to process unicode strings * Updated unicode encoding API to be more idiomatic * Added tests for octet -> u16 string adapter * Moved some files around * Added conversion from u32 to bytes implementation * moved code around * Unicode conversions work now using the new syntax * Renaming and updating documentation * Added u32_view * Removed dependency on old package of ranges in vcpkg
1 parent 91f710a commit 7c96694

25 files changed

+2617
-812
lines changed

.clang-format

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ AccessModifierOffset: -1
33
ConstructorInitializerIndentWidth: 4
44
AlignTrailingComments: true
55
AllowAllParametersOfDeclarationOnNextLine: true
6-
AllowShortIfStatementsOnASingleLine: WithoutElse
6+
#AllowShortIfStatementsOnASingleLine: WithoutElse
77
AllowShortLoopsOnASingleLine: true
88
AlwaysBreakTemplateDeclarations: Yes
99
AlwaysBreakBeforeMultilineStrings: true
@@ -31,4 +31,3 @@ BreakBeforeBraces: Attach
3131
SpacesInParentheses: false
3232
SpaceInEmptyParentheses: false
3333
SpacesInCStyleCastParentheses: false
34-
Lines changed: 310 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
// Copyright 2019 Glyn Matthews.
2+
// Distributed under the Boost Software License, Version 1.0.
3+
// (See accompanying file LICENSE_1_0.txt or copy at
4+
// http://www.boost.org/LICENSE_1_0.txt)
5+
6+
#ifndef SKYR_UNICODE_CODE_POINT_HPP
7+
#define SKYR_UNICODE_CODE_POINT_HPP
8+
9+
#include <tl/expected.hpp>
10+
#include <skyr/unicode/errors.hpp>
11+
#include <skyr/unicode/constants.hpp>
12+
#include <skyr/unicode/core.hpp>
13+
14+
namespace skyr::unicode {
15+
/// Holds a pair of iterators that delimit the octets of a single
/// UTF-8 encoded code point.
/// \tparam OctetIterator An iterator type over the raw bytes
template<typename OctetIterator>
class u8_code_point_t {
 public:

  /// The iterator type used to walk the octets
  using const_iterator = OctetIterator;
  /// Alias of \c const_iterator
  using iterator = const_iterator;
  /// The octet type
  using value_type = char;
  /// Octets are exposed by value
  using const_reference = value_type;
  /// Alias of \c const_reference
  using reference = const_reference;
  /// The type used to report the length in octets
  using size_type = std::size_t;

  /// \brief Constructs the code point from an explicit iterator pair.
  /// \param first An iterator at the beginning of the code point
  /// \param last An iterator at the end of the code point
  constexpr u8_code_point_t(OctetIterator first, OctetIterator last)
      : first_(first), last_(last) {}

  /// \brief Constructs the code point from its first octet only. The
  ///        end iterator is \c first advanced by
  ///        \c sequence_length(*first).
  /// \param first An iterator at the beginning of the code point
  explicit constexpr u8_code_point_t(OctetIterator first)
      : u8_code_point_t(first, first + sequence_length(*first)) {}

  /// \brief Copy constructor.
  constexpr u8_code_point_t(const u8_code_point_t &) = default;
  /// \brief Move constructor.
  constexpr u8_code_point_t(u8_code_point_t &&) noexcept = default;
  /// \brief Copy assignment operator.
  constexpr u8_code_point_t &operator=(const u8_code_point_t &) = default;
  /// \brief Move assignment operator.
  constexpr u8_code_point_t &operator=(u8_code_point_t &&) noexcept = default;
  /// \brief Destructor.
  ~u8_code_point_t() = default;

  /// \brief Returns the iterator at the first octet.
  /// \return \c const_iterator
  [[nodiscard]] constexpr auto begin() const noexcept -> const_iterator {
    return first_;
  }

  /// \brief Returns the iterator one past the last octet.
  /// \return \c const_iterator
  [[nodiscard]] constexpr auto end() const noexcept -> const_iterator {
    return last_;
  }

  /// \brief Same as \c begin().
  /// \return \c const_iterator
  [[nodiscard]] constexpr auto cbegin() const noexcept -> const_iterator {
    return first_;
  }

  /// \brief Same as \c end().
  /// \return \c const_iterator
  [[nodiscard]] constexpr auto cend() const noexcept -> const_iterator {
    return last_;
  }

  /// \brief Returns the length in bytes of this code point.
  ///
  /// The length is derived from the first octet through
  /// \c sequence_length, not from the distance between the stored
  /// iterators.
  /// \return The value of \c sequence_length for the first octet
  [[nodiscard]] constexpr auto size() const noexcept -> size_type {
    return sequence_length(*first_);
  }

 private:

  OctetIterator first_;
  OctetIterator last_;

};
95+
96+
///
97+
/// \tparam OctetRange
98+
/// \param range
99+
/// \return
100+
template<typename OctetRange>
101+
inline tl::expected<u8_code_point_t<typename OctetRange::const_iterator>, std::error_code> u8_code_point(
102+
const OctetRange &range) {
103+
auto first = std::begin(range), last = std::end(range);
104+
if (std::distance(first, last) > sequence_length(*first)) {
105+
return tl::make_unexpected(make_error_code(unicode_errc::overflow));
106+
}
107+
return u8_code_point_t<typename OctetRange::const_iterator>(
108+
first,
109+
first + sequence_length(*first));
110+
}
111+
112+
113+
/// Tests if the code point value is valid.
114+
/// \returns \c true if the value is a valid code point, \c false otherwise
115+
template <typename OctetIterator>
116+
inline bool is_valid(const u8_code_point_t<OctetIterator> &code_point) {
117+
return static_cast<bool>(find_code_point(std::begin(code_point)));
118+
}
119+
120+
///
121+
/// \tparam OctetRange
122+
/// \param range
123+
/// \return
124+
template <typename OctetRange>
125+
inline tl::expected<u8_code_point_t<typename OctetRange::const_iterator>, std::error_code> valid_u8_code_point(
126+
const OctetRange &range) {
127+
using result_type = tl::expected<u8_code_point_t<typename OctetRange::const_iterator>, std::error_code>;
128+
129+
auto check_code_point = [] (auto &&code_point) -> result_type {
130+
return find_code_point(std::begin(code_point))
131+
.and_then([=] (auto) -> result_type {
132+
return code_point;
133+
});
134+
};
135+
136+
return
137+
u8_code_point(range)
138+
.and_then(check_code_point);
139+
}
140+
141+
///
142+
class u16_code_point_t {
143+
144+
public:
145+
146+
///
147+
/// \param code_point
148+
explicit constexpr u16_code_point_t(char32_t code_point)
149+
: code_point_(code_point) {}
150+
151+
///
152+
/// \param code_point
153+
explicit constexpr u16_code_point_t(char16_t code_point)
154+
: code_point_(code_point) {}
155+
156+
constexpr u16_code_point_t(char16_t lead_value, char16_t trail_value)
157+
: code_point_((lead_value << 10) + trail_value + constants::surrogates::offset) {}
158+
159+
///
160+
constexpr u16_code_point_t(const u16_code_point_t &) = default;
161+
///
162+
constexpr u16_code_point_t(u16_code_point_t &&) noexcept = default;
163+
///
164+
u16_code_point_t &operator=(const u16_code_point_t &) = default;
165+
///
166+
u16_code_point_t &operator=(u16_code_point_t &&) noexcept = default;
167+
///
168+
~u16_code_point_t() = default;
169+
170+
///
171+
/// \return
172+
[[nodiscard]] uint16_t lead_value() const {
173+
return is_surrogate_pair()?
174+
static_cast<char16_t>((code_point_ >> 10U) + constants::surrogates::lead_offset) :
175+
static_cast<char16_t>(code_point_);
176+
}
177+
178+
///
179+
/// \return
180+
[[nodiscard]] uint16_t trail_value() const {
181+
return is_surrogate_pair()?
182+
static_cast<char16_t>((code_point_ & 0x3ffU) + constants::surrogates::trail_min) :
183+
0;
184+
}
185+
186+
///
187+
/// \return
188+
[[nodiscard]] constexpr bool is_surrogate_pair() const noexcept {
189+
return code_point_ > 0xffffU;
190+
}
191+
192+
tl::expected<char32_t, std::error_code> u32_value() const noexcept {
193+
return code_point_;
194+
}
195+
196+
private:
197+
198+
char32_t code_point_;
199+
200+
};
201+
202+
///
203+
/// \param code_point
204+
/// \return
205+
inline u16_code_point_t u16_code_point(char32_t code_point) {
206+
return u16_code_point_t(code_point);
207+
}
208+
209+
///
210+
/// \param code_point
211+
/// \return
212+
inline u16_code_point_t u16_code_point(char16_t code_point) {
213+
return u16_code_point_t(code_point);
214+
}
215+
216+
///
217+
/// \param lead
218+
/// \param value
219+
/// \return
220+
inline u16_code_point_t u16_code_point(char16_t lead, char16_t value) {
221+
return u16_code_point_t(lead, value);
222+
}
223+
224+
///
225+
/// \tparam OctetIterator
226+
/// \param code_point
227+
/// \return
228+
template <typename OctetIterator>
229+
inline tl::expected<char32_t, std::error_code> u32_value(
230+
u8_code_point_t<OctetIterator> code_point) noexcept {
231+
return find_code_point(code_point.begin())
232+
.and_then([] (auto state) -> tl::expected<char32_t, std::error_code> {
233+
return state.value;
234+
});
235+
}
236+
237+
///
238+
/// \tparam OctetIterator
239+
/// \param code_point
240+
/// \return
241+
template <typename OctetIterator>
242+
inline tl::expected<char32_t, std::error_code> u32_value(
243+
tl::expected<u8_code_point_t<OctetIterator>, std::error_code> code_point) noexcept {
244+
return code_point
245+
.and_then([] (auto code_point) -> tl::expected<char32_t , std::error_code> {
246+
return u32_value(code_point);
247+
});
248+
}
249+
250+
///
251+
/// \param code_point
252+
/// \return
253+
inline tl::expected<char32_t, std::error_code> u32_value(
254+
u16_code_point_t code_point) noexcept {
255+
return code_point.u32_value();
256+
}
257+
258+
///
259+
/// \param code_point
260+
/// \return
261+
inline tl::expected<char32_t, std::error_code> u32_value(
262+
tl::expected<u16_code_point_t, std::error_code> code_point) noexcept {
263+
return code_point
264+
.and_then([] (auto code_point) -> tl::expected<char32_t, std::error_code> {
265+
return code_point.u32_value();
266+
});
267+
}
268+
269+
///
270+
/// \param code_point
271+
/// \return
272+
inline tl::expected<char32_t, std::error_code> u32_value(
273+
char32_t code_point) noexcept {
274+
return code_point;
275+
}
276+
277+
///
278+
/// \param code_point
279+
/// \return
280+
inline tl::expected<char32_t, std::error_code> u32_value(
281+
tl::expected<char32_t, std::error_code> code_point) noexcept {
282+
return code_point;
283+
}
284+
285+
///
286+
/// \tparam OctetIterator
287+
/// \param code_point
288+
/// \return
289+
template <typename OctetIterator>
290+
inline tl::expected<u16_code_point_t, std::error_code> u16_value(
291+
u8_code_point_t<OctetIterator> code_point) {
292+
return u16_code_point(u32_value(code_point));
293+
}
294+
295+
///
296+
/// \tparam OctetIterator
297+
/// \param code_point
298+
/// \return
299+
template <typename OctetIterator>
300+
inline tl::expected<u16_code_point_t, std::error_code> u16_value(
301+
tl::expected<u8_code_point_t<OctetIterator>, std::error_code> code_point) {
302+
return u32_value(code_point)
303+
.and_then([] (auto code_point) -> tl::expected<u16_code_point_t, std::error_code> {
304+
return u16_code_point(code_point);
305+
});
306+
}
307+
} // namespace skyr::unicode
308+
309+
310+
#endif //SKYR_UNICODE_CODE_POINT_HPP

include/skyr/unicode/constants.hpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
// (See accompanying file LICENSE_1_0.txt or copy at
44
// http://www.boost.org/LICENSE_1_0.txt)
55

6-
#ifndef SKYR_CONSTANTS_HPP
7-
#define SKYR_CONSTANTS_HPP
6+
#ifndef SKYR_UNICODE_CONSTANTS_HPP
7+
#define SKYR_UNICODE_CONSTANTS_HPP
88

99
namespace skyr::unicode::constants {
1010
namespace surrogates {
@@ -22,6 +22,8 @@ namespace code_points {
2222
// Maximum valid value for a Unicode code point
2323
constexpr char32_t max = 0x0010ffffu;
2424
} // namespace code_points
25+
26+
constexpr char bom[] = {'\xef', '\xbb', '\xbf'};
2527
} // namespace skyr::unicode::constants
2628

27-
#endif //SKYR_CONSTANTS_HPP
29+
#endif //SKYR_UNICODE_CONSTANTS_HPP

0 commit comments

Comments
 (0)