Skip to content

Commit ea334c6

Browse files
authored
(118839391) Implement file system representation for String
* (118839391) Implement file system representation for String * (118839391) Fix some build issues * (118839391) Fix embedded nulls and precomposed characters
1 parent 4c74af4 commit ea334c6

File tree

3 files changed

+243
-1
lines changed

3 files changed

+243
-1
lines changed

Sources/FoundationEssentials/String/BuiltInUnicodeScalarSet.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ internal struct BuiltInUnicodeScalarSet {
2626
case canonicalDecomposable
2727

2828
// Below are internal
29+
case hfsPlusDecomposable
2930
case caseIgnorable
3031
case graphemeExtend
3132
}
@@ -44,6 +45,8 @@ internal struct BuiltInUnicodeScalarSet {
4445
return 3
4546
case .canonicalDecomposable:
4647
return 5
48+
case .hfsPlusDecomposable:
49+
return 12
4750
case .caseIgnorable:
4851
return 20
4952
case .graphemeExtend:
@@ -90,6 +93,7 @@ internal struct BuiltInUnicodeScalarSet {
9093
static let uppercaseLetters = Self.init(type: .uppercaseLetter)
9194
static let lowercaseLetters = Self.init(type: .lowercaseLetter)
9295
static let caseIgnorables = Self.init(type: .caseIgnorable)
96+
static let hfsPlusDecomposables = Self.init(type: .hfsPlusDecomposable)
9397
static let graphemeExtends = Self.init(type: .graphemeExtend)
9498
static let canonicalDecomposables = Self.init(type: .canonicalDecomposable)
9599
}

Sources/FoundationEssentials/String/String+Internals.swift

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,15 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
#if FOUNDATION_FRAMEWORK
14+
@_spi(_Unicode) import Swift
15+
@_implementationOnly import Foundation_Private.NSString
16+
#endif
17+
18+
#if canImport(Darwin)
19+
import Darwin
20+
#endif
21+
1322
extension String {
1423
package func _trimmingWhitespace() -> String {
1524
String(unicodeScalars._trimmingCharacters {
@@ -52,5 +61,199 @@ extension String {
5261
}
5362
self = str
5463
}
64+
65+
/// Identifies which decomposable-scalar set `_decomposed` consults.
enum _NormalizationType {
    /// Uses the canonical-decomposable scalar set.
    case canonical
    /// Uses the HFS+ decomposable scalar set.
    case hfsPlus

    /// The built-in scalar set type that backs this normalization.
    fileprivate var setType: BuiltInUnicodeScalarSet.SetType {
        switch self {
        case .canonical:
            return .canonicalDecomposable
        case .hfsPlus:
            return .hfsPlusDecomposable
        }
    }
}
76+
77+
/// Decomposes this string's UTF-8 contents into `buffer`.
///
/// - Parameters:
///   - type: The normalization to apply.
///   - buffer: Destination for the decomposed UTF-8 bytes.
///   - nullTerminated: Whether a trailing null byte is requested.
/// - Returns: The value produced by the buffer-level `_decomposed`, or `nil` if it throws.
private func _decomposed(_ type: String._NormalizationType, into buffer: UnsafeMutableBufferPointer<UInt8>, nullTerminated: Bool = false) -> Int? {
    // `withUTF8` is mutating, so operate on a local copy.
    var utf8Source = self
    return utf8Source.withUTF8 { utf8Bytes -> Int? in
        do {
            return try utf8Bytes._decomposed(type, as: Unicode.UTF8.self, into: buffer, nullTerminated: nullTerminated)
        } catch {
            return nil
        }
    }
}
83+
84+
#if canImport(Darwin) || FOUNDATION_FRAMEWORK
85+
/// Writes the null-terminated, HFS+-decomposed UTF-8 form of this string into `buffer`.
///
/// - Parameter buffer: Destination storage; the terminator is written inside it.
/// - Returns: `true` if the whole representation (including the terminator) fit,
///   `false` otherwise (including when `buffer` is empty).
fileprivate func _fileSystemRepresentation(into buffer: UnsafeMutableBufferPointer<CChar>) -> Bool {
    let result = buffer.withMemoryRebound(to: UInt8.self) { uintBuffer in
        // Hand over the full buffer: `_decomposed` bounds-checks every write and places
        // the terminator itself. Shrinking the buffer by one wasted a byte and produced
        // a negative count (a trap) for an empty buffer.
        _decomposed(.hfsPlus, into: uintBuffer, nullTerminated: true)
    }

    return result != nil
}
93+
#endif
94+
95+
/// Invokes `block` with a C-string pointer to this string's file system representation.
///
/// On Darwin / framework builds the representation is produced in a temporary
/// `PATH_MAX`-element allocation, and `block` receives `nil` if it cannot be produced.
/// On other platforms `block` receives the string's C-string contents.
///
/// - Parameter block: Closure receiving the pointer (or `nil`).
/// - Returns: Whatever `block` returns.
package func withFileSystemRepresentation<R>(_ block: (UnsafePointer<CChar>?) throws -> R) rethrows -> R {
#if canImport(Darwin) || FOUNDATION_FRAMEWORK
    return try withUnsafeTemporaryAllocation(of: CChar.self, capacity: Int(PATH_MAX)) { scratch in
        if _fileSystemRepresentation(into: scratch) {
            return try block(scratch.baseAddress!)
        } else {
            return try block(nil)
        }
    }
#else
    return try self.withCString { cString in
        try block(cString)
    }
#endif
}
109+
}
110+
111+
extension UnsafeBufferPointer {
    /// Reasons a decomposition into a fixed-size buffer can fail.
    private enum DecompositionError : Error {
        /// The output buffer cannot hold the decomposed contents (or the requested terminator).
        case insufficientSpace
        /// A scalar could not be encoded as UTF-8, or a non-null scalar followed a null.
        case illegalScalar
        /// The input could not be decoded with the requested codec.
        case decodingError
    }

    /// Rebinds `self` to the codec's code unit type and `buffer` to `UInt8`, then decomposes.
    ///
    /// - Parameters:
    ///   - type: The normalization to apply.
    ///   - codec: The encoding of the input code units.
    ///   - buffer: Destination storage, reinterpreted as UTF-8 bytes.
    ///   - nullTerminated: Whether to append a trailing null byte.
    /// - Returns: The number of bytes written to `buffer`.
    /// - Throws: Whatever `_decomposed(_:as:into:nullTerminated:)` throws.
    fileprivate func _decomposedRebinding<T: UnicodeCodec, InputElement>(_ type: String._NormalizationType, as codec: T.Type, into buffer: UnsafeMutableBufferPointer<InputElement>, nullTerminated: Bool = false) throws -> Int {
        try self.withMemoryRebound(to: T.CodeUnit.self) { reboundSelf in
            try buffer.withMemoryRebound(to: Unicode.UTF8.CodeUnit.self) { reboundBuffer in
                try reboundSelf._decomposed(type, as: codec, into: reboundBuffer, nullTerminated: nullTerminated)
            }
        }
    }

    /// Decodes `self` with `codec`, decomposes and canonically re-orders the scalars,
    /// and writes the result as UTF-8 into `buffer`.
    ///
    /// Null scalars are accepted only as a trailing run; any non-null scalar after a null
    /// is rejected, because the output is meant to be usable as a C string.
    ///
    /// - Parameters:
    ///   - type: The normalization to apply.
    ///   - codec: The encoding of the input code units.
    ///   - buffer: Destination storage for the UTF-8 output.
    ///   - nullTerminated: Whether to append a null byte when the input did not end in one.
    /// - Returns: The number of bytes written to `buffer`, including any null bytes.
    /// - Throws: A `DecompositionError` if the input is malformed, contains an embedded
    ///   null, or does not fit in `buffer`.
    fileprivate func _decomposed<T: UnicodeCodec>(_ type: String._NormalizationType, as codec: T.Type, into buffer: UnsafeMutableBufferPointer<UInt8>, nullTerminated: Bool = false) throws -> Int where Element == T.CodeUnit {
        let scalarSet = BuiltInUnicodeScalarSet(type: type.setType)
        var bufferIdx = 0
        let bufferLength = buffer.count
        // Scalars waiting to be ordered by canonical combining class before being written.
        var sortBuffer: [UnicodeScalar] = []
        var seenNullIdx: Int? = nil
        var decoder = T()
        var iterator = self.makeIterator()

        // Writes as many of `values` as fit; throws if the buffer filled up with bytes left over.
        func appendOutput(_ values: some Sequence<UInt8>) throws {
            let bufferPortion = UnsafeMutableBufferPointer(start: buffer.baseAddress!.advanced(by: bufferIdx), count: bufferLength - bufferIdx)
            var (leftOver, idx) = bufferPortion.initialize(from: values)
            bufferIdx += idx
            if bufferIdx == bufferLength && leftOver.next() != nil {
                throw DecompositionError.insufficientSpace
            }
        }

        // Writes a single byte, throwing if the buffer is already full.
        func appendOutput(_ value: UInt8) throws {
            guard bufferIdx < bufferLength else {
                throw DecompositionError.insufficientSpace
            }
            buffer.initializeElement(at: bufferIdx, to: value)
            bufferIdx += 1
        }

        func encodedScalar(_ scalar: UnicodeScalar) throws -> some Collection<UInt8> {
            guard let encoded = UTF8.encode(scalar) else {
                throw DecompositionError.illegalScalar
            }
            return encoded
        }

        // Orders the pending scalars by combining class and flushes them to the output.
        func fillFromSortBuffer() throws {
            guard !sortBuffer.isEmpty else { return }
            sortBuffer.sort {
                $0.properties.canonicalCombiningClass.rawValue < $1.properties.canonicalCombiningClass.rawValue
            }
            for scalar in sortBuffer {
                try appendOutput(encodedScalar(scalar))
            }
            sortBuffer.removeAll(keepingCapacity: true)
        }

        decodingLoop: while bufferIdx < bufferLength {
            let scalar: UnicodeScalar
            switch decoder.decode(&iterator) {
            // We've finished the input
            case .emptyInput: break decodingLoop
            case .error: throw DecompositionError.decodingError
            case .scalarValue(let v): scalar = v
            }

            let isNull = scalar.value == 0
            if !isNull && seenNullIdx != nil {
                // File system representations are C strings that do not support embedded null bytes
                throw DecompositionError.illegalScalar
            }

            let isASCII = scalar.isASCII
            if isASCII || scalar.properties.canonicalCombiningClass == .notReordered {
                try fillFromSortBuffer()
            }

            if isASCII {
                if isNull {
                    // Null bytes within the string are fine as long as they are at the end.
                    // Record the position only after the pending scalars have been flushed,
                    // so it reflects where the null byte actually lands in the output.
                    seenNullIdx = bufferIdx
                }
                try appendOutput(UInt8(scalar.value))
            } else {
#if FOUNDATION_FRAMEWORK
                // Only decompose scalars present in the declared set
                if scalarSet.contains(scalar) {
                    sortBuffer.append(contentsOf: String(scalar)._nfd)
                } else {
                    // Even if a scalar isn't decomposed, it may still need to be re-ordered
                    sortBuffer.append(scalar)
                }
#else
                // TODO: Implement Unicode decomposition in swift-foundation
                sortBuffer.append(scalar)
#endif
            }
        }
        try fillFromSortBuffer()

        // Ask the decoder, not the raw iterator, whether input remains: the decoder may
        // already have pulled trailing code units out of the iterator as lookahead.
        guard case .emptyInput = decoder.decode(&iterator) else {
            throw DecompositionError.insufficientSpace
        }

        if let seenNullIdx {
            // The input already ended in null byte(s); nothing more to append.
            return seenNullIdx + 1
        }
        if nullTerminated {
            try appendOutput(0)
        }
        return bufferIdx
    }
}
55224

225+
#if FOUNDATION_FRAMEWORK
226+
@objc
extension NSString {
    /// Fills `pointer` with the null-terminated file system representation of this string.
    ///
    /// - Parameters:
    ///   - pointer: Destination storage with room for at least `maxLength` elements.
    ///   - maxLength: Capacity of `pointer`, including space for the null terminator.
    /// - Returns: `true` if the representation and its terminator fit and the string has
    ///   no embedded (non-trailing) null characters; `false` otherwise.
    @objc
    func __swiftFillFileSystemRepresentation(pointer: UnsafeMutablePointer<CChar>, maxLength: Int) -> Bool {
        // A buffer with no room cannot even hold the terminator; a negative count would trap below.
        guard maxLength > 0 else {
            return false
        }
        let buffer = UnsafeMutableBufferPointer(start: pointer, count: maxLength)
        // See if we have a quick-access buffer we can just convert directly
        if let fastCharacters = self._fastCharacterContents() {
            // If we have quick access to UTF-16 contents, decompose from UTF-16
            let charsBuffer = UnsafeBufferPointer(start: fastCharacters, count: self.length)
            return (try? charsBuffer._decomposedRebinding(.hfsPlus, as: Unicode.UTF16.self, into: buffer, nullTerminated: true)) != nil
        } else if self.fastestEncoding == NSASCIIStringEncoding, let fastUTF8 = self._fastCStringContents(false) {
            // If we have quick access to ASCII contents, no need to decompose
            let utf8Buffer = UnsafeBufferPointer(start: fastUTF8, count: self.length)

            // We only allow embedded nulls if there are no non-null characters following the first null character
            if let embeddedNullIdx = utf8Buffer.firstIndex(of: 0) {
                if !utf8Buffer[embeddedNullIdx...].allSatisfy({ $0 == 0 }) {
                    return false
                }
            }

            // `initialize(fromContentsOf:)` requires the destination to hold every source
            // element, so check up front that the contents plus the terminator fit.
            guard utf8Buffer.count < buffer.count else {
                return false
            }
            let next = buffer.initialize(fromContentsOf: utf8Buffer)
            buffer[next] = 0
            return true
        } else {
            // Otherwise, bridge to a String which will create a UTF-8 buffer
            return String(self)._fileSystemRepresentation(into: buffer)
        }
    }
}
259+
#endif

Tests/FoundationEssentialsTests/StringTests.swift

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,42 @@ final class StringTests : XCTestCase {
325325
XCTAssertEqual(lineResult.end, string.endIndex)
326326
XCTAssertEqual(lineResult.contentsEnd, string.endIndex)
327327
}
328-
328+
329+
/// Verifies `withFileSystemRepresentation` for plain ASCII, non-ASCII whitespace,
/// Hangul (round-trip stability), and strings with trailing null characters.
func testFileSystemRepresentation() throws {
    func assertCString(_ ptr: UnsafePointer<CChar>, equals other: String, file: StaticString = #file, line: UInt = #line) {
        XCTAssertEqual(String(cString: ptr), other, file: file, line: line)
    }

    let original = "/Path1/Path Two/Path Three/Some Really Long File Name Section.txt"
    try original.withFileSystemRepresentation {
        // Unwrap through XCTest so a nil result fails the test instead of crashing the process.
        let ptr = try XCTUnwrap($0)
        assertCString(ptr, equals: original)
    }

    let withWhitespace = original + "\u{2000}\u{2001}"
    try withWhitespace.withFileSystemRepresentation {
        let ptr = try XCTUnwrap($0)
        assertCString(ptr, equals: withWhitespace)
    }

    let withHangul = original + "\u{AC00}\u{AC01}"
    try withHangul.withFileSystemRepresentation { buf1 in
        let first = try XCTUnwrap(buf1)
        // `String(decodingCString:)` reads through the terminator, so rebind it as well.
        try first.withMemoryRebound(to: UInt8.self, capacity: strlen(first) + 1) { buf1Rebound in
            let fsr = String(decodingCString: buf1Rebound, as: UTF8.self)
            // Converting an already-converted path must yield the same bytes.
            try fsr.withFileSystemRepresentation { buf2 in
                let second = try XCTUnwrap(buf2)
                XCTAssertEqual(strcmp(first, second), 0)
            }
        }
    }

    let withNullSuffix = original + "\u{0000}\u{0000}"
    try withNullSuffix.withFileSystemRepresentation {
        let ptr = try XCTUnwrap($0)
        assertCString(ptr, equals: original)
    }
}
329364
}
330365

331366

0 commit comments

Comments
 (0)