Skip to content

Commit e53bc1c

Browse files
authored
Unify UTF16 conversion path for String (#670)
1 parent 142f409 commit e53bc1c

File tree

2 files changed

+85
-100
lines changed

2 files changed

+85
-100
lines changed

Sources/FoundationEssentials/String/String+EndianAdaptorSequence.swift

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -244,66 +244,6 @@ struct UTF32EndianAdaptor<S : Sequence> : Sequence where S.Element == UInt8 {
244244
}
245245
}
246246

247-
/// Converts a UTF16View to a sequence of bytes, emitting each UTF16 code unit as two UInt8 values in the requested endianness.
248-
struct UTF16ToDataAdaptor : Sequence {
249-
typealias Element = UInt8
250-
typealias S = String.UTF16View
251-
252-
let underlying: S
253-
let endianness: Endianness
254-
255-
init(_ sequence: S, endianness: Endianness) {
256-
underlying = sequence
257-
self.endianness = endianness
258-
}
259-
260-
func makeIterator() -> Iterator {
261-
Iterator(i: underlying.makeIterator(), endianness: endianness)
262-
}
263-
264-
struct Iterator : IteratorProtocol {
265-
var u16: UInt16?
266-
var i: S.Iterator
267-
var endianness: Endianness
268-
var done: Bool
269-
270-
init(i: S.Iterator, endianness: Endianness) {
271-
u16 = nil
272-
done = false
273-
self.i = i
274-
self.endianness = endianness
275-
}
276-
277-
mutating func next() -> Element? {
278-
guard !done else { return nil }
279-
280-
if var u16 {
281-
// We have a value already, return second byte
282-
self.u16 = nil
283-
return withUnsafeBytes(of: &u16) {
284-
$0[1]
285-
}
286-
} else {
287-
if let u16 = i.next() {
288-
var value = switch endianness {
289-
case .little:
290-
u16.littleEndian
291-
case .big:
292-
u16.bigEndian
293-
}
294-
self.u16 = value
295-
return withUnsafeBytes(of: &value) {
296-
$0[0]
297-
}
298-
} else {
299-
done = true
300-
return nil
301-
}
302-
}
303-
}
304-
}
305-
}
306-
307247
struct UnicodeScalarToDataAdaptor : Sequence {
308248
typealias Element = UInt8
309249
typealias S = String.UnicodeScalarView

Sources/FoundationEssentials/String/StringProtocol+Essentials.swift

Lines changed: 85 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ internal import _ForSwiftFoundation
1818
@available(FoundationPreview 0.4, *)
1919
extension String {
2020
public func data(using encoding: String.Encoding, allowLossyConversion: Bool = false) -> Data? {
21+
// allowLossyConversion is a no-op for UTF8 and UTF16. For `.utf32`, we fall back to NSString when lossy conversion is requested, which is only possible in FOUNDATION_FRAMEWORK builds.
2122
switch encoding {
2223
case .utf8:
2324
return Data(self.utf8)
@@ -53,57 +54,101 @@ extension String {
5354
}
5455
return allASCII ? data : nil
5556
}
56-
default:
57-
#if FOUNDATION_FRAMEWORK
58-
// TODO: Implement data(using:allowLossyConversion:) in Swift
59-
return _ns.data(
60-
using: encoding.rawValue,
61-
allowLossyConversion: allowLossyConversion)
57+
case .utf16BigEndian, .utf16LittleEndian, .utf16:
58+
let bom: UInt16?
59+
let swap: Bool
60+
61+
if encoding == .utf16 {
62+
swap = false
63+
bom = 0xFEFF
64+
} else if encoding == .utf16BigEndian {
65+
#if _endian(little)
66+
swap = true
6267
#else
63-
switch encoding {
64-
case .utf16BigEndian, .utf16LittleEndian:
65-
// This creates a contiguous storage for Data to simply memcpy, the most efficient way to give it bytes.
66-
return withUnsafeTemporaryAllocation(of: UInt8.self, capacity: self.utf16.count * 2) { utf16Buffer in
67-
_ = utf16Buffer.initialize(from: UTF16ToDataAdaptor(self.utf16, endianness: Endianness(encoding)!))
68-
defer { utf16Buffer.deinitialize() }
69-
return Data(utf16Buffer)
70-
}
71-
case .utf16:
68+
swap = false
69+
#endif
70+
bom = nil
71+
} else if encoding == .utf16LittleEndian {
7272
#if _endian(little)
73-
let data = Data([0xFF, 0xFE])
74-
let hostEncoding : String.Encoding = .utf16LittleEndian
73+
swap = false
7574
#else
76-
let data = Data([0xFE, 0xFF])
77-
let hostEncoding : String.Encoding = .utf16BigEndian
75+
swap = true
7876
#endif
79-
guard let swapped = self.data(using: hostEncoding, allowLossyConversion: allowLossyConversion) else {
80-
return nil
77+
bom = nil
78+
} else {
79+
fatalError("Unreachable")
80+
}
81+
82+
// Grab this value once, as it requires doing a calculation over String's UTF8 storage
83+
let inputCount = self.utf16.count
84+
85+
// The output has 1 additional UTF16 code unit if a BOM is emitted
86+
let outputCount = bom == nil ? inputCount : inputCount + 1
87+
88+
// Allocate enough memory to hold the UTF16 bytes after conversion. We will pass this off to Data.
89+
let utf16Pointer = calloc(outputCount, MemoryLayout<UInt16>.size)!.assumingMemoryBound(to: UInt16.self)
90+
let utf16Buffer = UnsafeMutableBufferPointer<UInt16>(start: utf16Pointer, count: outputCount)
91+
92+
if let bom {
93+
// Put the BOM in, then copy the UTF16 bytes to the buffer after it.
94+
utf16Buffer[0] = bom
95+
let afterBOMBuffer = UnsafeMutableBufferPointer(rebasing: utf16Buffer[1..<utf16Buffer.endIndex])
96+
self._copyUTF16CodeUnits(into: afterBOMBuffer, range: 0..<inputCount)
97+
} else {
98+
self._copyUTF16CodeUnits(into: utf16Buffer, range: 0..<inputCount)
99+
}
100+
101+
102+
// If we need to swap endianness, we do it as a second pass over the data
103+
if swap {
104+
#if _endian(little)
105+
// Swap, including the BOM if it is there
106+
for u in utf16Buffer.enumerated() {
107+
utf16Buffer[u.0] = u.1.bigEndian
81108
}
82-
83-
return data + swapped
84-
case .utf32BigEndian, .utf32LittleEndian:
85-
// This creates a contiguous storage for Data to simply memcpy, the most efficient way to give it bytes.
86-
return withUnsafeTemporaryAllocation(of: UInt8.self, capacity: self.unicodeScalars.count * 4) { utf32Buffer in
87-
_ = utf32Buffer.initialize(from: UnicodeScalarToDataAdaptor(self.unicodeScalars, endianness: Endianness(encoding)!))
88-
defer { utf32Buffer.deinitialize() }
89-
return Data(utf32Buffer)
109+
#else
110+
for u in utf16Buffer.enumerated() {
111+
utf16Buffer[u.0] = u.1.littleEndian
90112
}
91-
case .utf32:
113+
#endif
114+
}
115+
116+
return Data(bytesNoCopy: utf16Buffer.baseAddress!, count: utf16Buffer.count * 2, deallocator: .free)
117+
118+
case .utf32BigEndian, .utf32LittleEndian:
119+
// This creates a contiguous storage for Data to simply memcpy.
120+
return withUnsafeTemporaryAllocation(of: UInt8.self, capacity: self.unicodeScalars.count * 4) { utf32Buffer in
121+
_ = utf32Buffer.initialize(from: UnicodeScalarToDataAdaptor(self.unicodeScalars, endianness: Endianness(encoding)!))
122+
defer { utf32Buffer.deinitialize() }
123+
return Data(utf32Buffer)
124+
}
125+
case .utf32:
126+
#if FOUNDATION_FRAMEWORK
127+
// Only the CoreFoundation code currently handles the rare case of allowing lossy conversion for UTF32
128+
if allowLossyConversion {
129+
return _ns.data(
130+
using: encoding.rawValue,
131+
allowLossyConversion: allowLossyConversion)
132+
}
133+
#endif
92134
#if _endian(little)
93-
let data = Data([0xFF, 0xFE, 0x00, 0x00])
94-
let hostEncoding : String.Encoding = .utf32LittleEndian
135+
let data = Data([0xFF, 0xFE, 0x00, 0x00])
136+
let hostEncoding : String.Encoding = .utf32LittleEndian
95137
#else
96-
let data = Data([0x00, 0x00, 0xFE, 0xFF])
97-
let hostEncoding : String.Encoding = .utf32BigEndian
138+
let data = Data([0x00, 0x00, 0xFE, 0xFF])
139+
let hostEncoding : String.Encoding = .utf32BigEndian
98140
#endif
99-
guard let swapped = self.data(using: hostEncoding, allowLossyConversion: allowLossyConversion) else {
100-
return nil
101-
}
102-
103-
return data + swapped
104-
default:
141+
guard let swapped = self.data(using: hostEncoding, allowLossyConversion: allowLossyConversion) else {
105142
return nil
106143
}
144+
145+
return data + swapped
146+
default:
147+
#if FOUNDATION_FRAMEWORK
148+
// Other encodings, defer to the CoreFoundation implementation
149+
return _ns.data(using: encoding.rawValue, allowLossyConversion: allowLossyConversion)
150+
#else
151+
return nil
107152
#endif
108153
}
109154
}

0 commit comments

Comments
 (0)