forked from swiftlang/swift-experimental-string-processing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDecoding.swift
152 lines (129 loc) · 4.33 KB
/
Decoding.swift
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/*
Provide very low-level interfaces for scalar decoding.
These can be faster if we assume certain invariants are
maintained. We assert, of course, because we're not monsters.
Thus they are unsafe in the following senses:
- They assume validly encoded contents, otherwise UB
- They assume any pointers passed in will be live and valid
during execution and not concurrently written to, otherwise UB
- They assume any pointer passed in has sufficient bounds
for decoding a scalar, otherwise UB.
String maintains these invariants for its in-memory storage.
*/
// TODO: Design an "unsafe" and "assumingValid" API convention
/// Namespace for low-level UTF-8 scalar decoding that assumes valid input.
///
/// The enum declares no cases, so no value of it can ever exist; every entry
/// point is therefore `static` and is called through the type, e.g.
/// `UnsafeAssumingValidUTF8.decode(buffer, startingAt: 0)`.
///
/// Nothing here validates its input. The only checks are
/// `_internalInvariant` calls; the buffer-based entry points read through
/// `subscript(_unchecked:)`.
enum UnsafeAssumingValidUTF8 {
  /// Decodes a one-byte scalar.
  ///
  /// - Parameter x: The code unit; the invariant requires `UTF8.isASCII(x)`.
  /// - Returns: The scalar whose value is `x`.
  @inlinable @inline(__always)
  public static func decode(_ x: UInt8) -> Unicode.Scalar {
    _internalInvariant(UTF8.isASCII(x))
    return Unicode.Scalar(_unchecked: UInt32(x))
  }

  /// Decodes a two-byte scalar.
  ///
  /// - Parameters:
  ///   - x: The lead byte; the invariant requires `scalarLength(x) == 2`.
  ///   - y: The continuation byte.
  /// - Returns: The scalar built from the low 5 bits of `x` followed by the
  ///   low 6 bits of `y`.
  @inlinable @inline(__always)
  public static func decode(
    _ x: UInt8, _ y: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 2)
    _internalInvariant(UTF8.isContinuation(y))
    let x = UInt32(x)
    let value = ((x & 0b0001_1111) &<< 6) | continuationPayload(y)
    return Unicode.Scalar(_unchecked: value)
  }

  /// Decodes a three-byte scalar.
  ///
  /// - Parameters:
  ///   - x: The lead byte; the invariant requires `scalarLength(x) == 3`.
  ///   - y: The first continuation byte.
  ///   - z: The second continuation byte.
  /// - Returns: The scalar built from the low 4 bits of `x` followed by the
  ///   low 6 bits of `y` and of `z`.
  @inlinable @inline(__always)
  public static func decode(
    _ x: UInt8, _ y: UInt8, _ z: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 3)
    _internalInvariant(UTF8.isContinuation(y) && UTF8.isContinuation(z))
    let x = UInt32(x)
    let value = ((x & 0b0000_1111) &<< 12)
      | (continuationPayload(y) &<< 6)
      | continuationPayload(z)
    return Unicode.Scalar(_unchecked: value)
  }

  /// Decodes a four-byte scalar.
  ///
  /// - Parameters:
  ///   - x: The lead byte; the invariant requires `scalarLength(x) == 4`.
  ///   - y: The first continuation byte.
  ///   - z: The second continuation byte.
  ///   - w: The third continuation byte.
  /// - Returns: The scalar built from the payload bits of `x` followed by the
  ///   low 6 bits of `y`, `z` and `w`.
  @inlinable @inline(__always)
  public static func decode(
    _ x: UInt8, _ y: UInt8, _ z: UInt8, _ w: UInt8
  ) -> Unicode.Scalar {
    _internalInvariant(scalarLength(x) == 4)
    _internalInvariant(
      UTF8.isContinuation(y) && UTF8.isContinuation(z)
      && UTF8.isContinuation(w))
    let x = UInt32(x)
    // `scalarLength(x) == 4` means `x` has exactly four leading one bits, so
    // bit 3 is zero and this 4-bit mask keeps only the 3 payload bits.
    let value = ((x & 0b0000_1111) &<< 18)
      | (continuationPayload(y) &<< 12)
      | (continuationPayload(z) &<< 6)
      | continuationPayload(w)
    return Unicode.Scalar(_unchecked: value)
  }

  // Also, assuming we can load from those bounds...

  /// Decodes the scalar whose lead byte is at offset `i`.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read from.
  ///   - i: Offset of the scalar's lead byte. Up to three following bytes
  ///     are read, depending on the length implied by the lead byte.
  /// - Returns: The decoded scalar and the number of bytes it occupies (1-4).
  @inlinable
  public static func decode(
    _ utf8: UnsafeByteBuffer, startingAt i: Int
  ) -> (Unicode.Scalar, scalarLength: Int) {
    let cu0 = utf8[_unchecked: i]
    let len = scalarLength(cu0)
    switch len {
    case 1: return (decode(cu0), len)
    case 2: return (decode(cu0, utf8[_unchecked: i &+ 1]), len)
    case 3: return (decode(
      cu0, utf8[_unchecked: i &+ 1], utf8[_unchecked: i &+ 2]), len)
    case 4:
      return (decode(
        cu0,
        utf8[_unchecked: i &+ 1],
        utf8[_unchecked: i &+ 2],
        utf8[_unchecked: i &+ 3]),
        len)
    default:
      // Only reachable when the lead byte is not a valid UTF-8 lead byte.
      fatalError("unreachable")//Builtin.unreachable()
    }
  }

  /// Decodes the scalar that ends immediately before offset `i`.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read from.
  ///   - i: Offset one past the scalar's last byte.
  /// - Returns: The decoded scalar and the number of bytes it occupies.
  @inlinable
  public static func decode(
    _ utf8: UnsafeByteBuffer, endingAt i: Int
  ) -> (Unicode.Scalar, scalarLength: Int) {
    let len = scalarLength(utf8, endingAt: i)
    let (scalar, scalarLen) = decode(utf8, startingAt: i &- len)
    // The backward scan and the lead byte must agree on the length.
    _internalInvariant(len == scalarLen)
    return (scalar, len)
  }

  /// Returns the encoded length, in bytes, implied by lead byte `x`.
  ///
  /// - Parameter x: A lead byte; the invariant rejects continuation bytes.
  /// - Returns: 1 for ASCII, otherwise the number of leading one bits in `x`.
  @inlinable @inline(__always)
  public static func scalarLength(_ x: UInt8) -> Int {
    _internalInvariant(!UTF8.isContinuation(x))
    if UTF8.isASCII(x) { return 1 }
    // TODO(String micro-performance): check codegen
    // Leading zeros of `~x` equal the leading ones of `x`.
    return (~x).leadingZeroBitCount
  }

  /// Returns the length, in bytes, of the scalar ending just before `i`.
  ///
  /// Scans backward from `i - 1` past continuation bytes until it reaches a
  /// byte that is not a continuation byte, which is taken as the lead byte.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read from.
  ///   - i: Offset one past the scalar's last byte.
  /// - Returns: The number of bytes scanned, including the lead byte.
  @inlinable @inline(__always)
  public static func scalarLength(
    _ utf8: UnsafeByteBuffer, endingAt i: Int
  ) -> Int {
    var len = 1
    while UTF8.isContinuation(utf8[_unchecked: i &- len]) {
      len &+= 1
    }
    _internalInvariant(len == scalarLength(utf8[i &- len]))
    return len
  }

  /// Returns the low 6 bits of `x`, widened to `UInt32`.
  ///
  /// - Parameter x: A code unit, used by the decoders for continuation bytes.
  @inlinable @inline(__always)
  public static func continuationPayload(_ x: UInt8) -> UInt32 {
    return UInt32(x & 0x3F)
  }

  /// Moves `idx` back to the nearest byte that is not a continuation byte.
  ///
  /// - Parameters:
  ///   - utf8: The buffer to read from.
  ///   - idx: The offset to align. `utf8.count` is returned unchanged without
  ///     reading the buffer.
  /// - Returns: The largest offset `<= idx` whose byte is not a continuation
  ///   byte, or `idx` itself when `idx == utf8.count`.
  @inlinable
  public static func scalarAlign(
    _ utf8: UnsafeByteBuffer, _ idx: Int
  ) -> Int {
    guard _fastPath(idx != utf8.count) else { return idx }

    var i = idx
    while _slowPath(UTF8.isContinuation(utf8[_unchecked: i])) {
      i &-= 1
      _internalInvariant(i >= 0,
        "Malformed contents: starts with continuation byte")
    }
    return i
  }
}
// TODO: Validating versions that remove that aspect of
// unsafety. Stdlib has stuff on _StringGuts that could be
// at least partially refactored.
// TODO: Consider UTF-16 support, but that's normally best
// handled as a transcoding concern.