@@ -1064,11 +1064,16 @@ extension Source {
1064
1064
}
1065
1065
1066
1066
mutating func lexCustomCCStart(
1067
+ context: ParsingContext
1067
1068
) throws -> Located < CustomCC . Start > ? {
1068
1069
recordLoc { src in
1069
- // POSIX named sets are atoms.
1070
- guard !src. starts ( with: " [: " ) else { return nil }
1071
-
1070
+ // Make sure we don't have a POSIX character property. This may require
1071
+ // walking to its ending to make sure we have a closing ':]', as otherwise
1072
+ // we have a custom character class.
1073
+ // TODO: This behavior seems subtle; could we warn?
1074
+ guard !src. canLexPOSIXCharacterProperty ( context: context) else {
1075
+ return nil
1076
+ }
1072
1077
if src. tryEat ( " [ " ) {
1073
1078
return src. tryEat ( " ^ " ) ? . inverted : . normal
1074
1079
}
@@ -1099,12 +1104,38 @@ extension Source {
1099
1104
}
1100
1105
1101
1106
private mutating func lexPOSIXCharacterProperty(
1107
+ context: ParsingContext
1102
1108
) throws -> Located < AST . Atom . CharacterProperty > ? {
1103
- try recordLoc { src in
1104
- guard src. tryEat ( sequence: " [: " ) else { return nil }
1105
- let inverted = src. tryEat ( " ^ " )
1106
- let prop = try src. lexCharacterPropertyContents ( end: " :] " ) . value
1107
- return . init( prop, isInverted: inverted, isPOSIX: true )
1109
+ // Only allowed in a custom character class.
1110
+ guard context. isInCustomCharacterClass else { return nil }
1111
+ return try recordLoc { src in
1112
+ try src. tryEating { src in
1113
+ guard src. tryEat ( sequence: " [: " ) else { return nil }
1114
+ let inverted = src. tryEat ( " ^ " )
1115
+
1116
+ // Note we lex the contents and ending *before* classifying, because we
1117
+ // want to bail with nil if we don't have the right ending. This allows
1118
+ // the lexing of a custom character class if we don't have a ':]'
1119
+ // ending.
1120
+ let ( key, value) = src. lexCharacterPropertyKeyValue ( )
1121
+ guard src. tryEat ( sequence: " :] " ) else { return nil }
1122
+
1123
+ let prop = try Source . classifyCharacterPropertyContents ( key: key,
1124
+ value: value)
1125
+ return . init( prop, isInverted: inverted, isPOSIX: true )
1126
+ }
1127
+ }
1128
+ }
1129
+
1130
+ private func canLexPOSIXCharacterProperty( context: ParsingContext ) -> Bool {
1131
+ do {
1132
+ var src = self
1133
+ return try src. lexPOSIXCharacterProperty ( context: context) != nil
1134
+ } catch {
1135
+ // We want to err on the side of lexing a POSIX character property, so
1136
+ // even if it is invalid in some way (e.g. invalid property names), still
1137
+ // try and lex it.
1138
+ return true
1108
1139
}
1109
1140
}
1110
1141
@@ -1129,26 +1160,52 @@ extension Source {
1129
1160
}
1130
1161
}
1131
1162
1132
- private mutating func lexCharacterPropertyContents(
1133
- end: String
1134
- ) throws -> Located < AST . Atom . CharacterProperty . Kind > {
1135
- try recordLoc { src in
1136
- // We should either have:
1137
- // - 'x=y' where 'x' is a property key, and 'y' is a value.
1138
- // - 'y' where 'y' is a value (or a bool key with an inferred value
1139
- // of true), and its key is inferred.
1140
- // TODO: We could have better recovery here if we only ate the characters
1141
- // that property keys and values can use.
1142
- let lhs = src. lexUntil {
1143
- $0. isEmpty || $0. peek ( ) == " = " || $0. starts ( with: end)
1144
- } . value
1145
- if src. tryEat ( " = " ) {
1146
- let rhs = try src. lexUntil ( eating: end) . value
1147
- return try Source . classifyCharacterProperty ( key: lhs, value: rhs)
1163
+ private mutating func lexCharacterPropertyKeyValue(
1164
+ ) -> ( key: String ? , value: String ) {
1165
+ func atPossibleEnding( _ src: inout Source ) -> Bool {
1166
+ guard let next = src. peek ( ) else { return true }
1167
+ switch next {
1168
+ case " = " :
1169
+ // End of a key.
1170
+ return true
1171
+ case " : " , " [ " , " ] " :
1172
+ // POSIX character property endings to cover ':]', ']', and '[' as the
1173
+ // start of a nested character class.
1174
+ return true
1175
+ case " } " :
1176
+ // Ending of '\p{'. We cover this for POSIX too as it's not a valid
1177
+ // character property name anyway, and it's nice not to have diverging
1178
+ // logic for these cases.
1179
+ return true
1180
+ default :
1181
+ // We may want to handle other metacharacters here, e.g. '{', '(', ')',
1182
+ // as they're not valid character property names. However for now
1183
+ // let's err on the side of forming an unknown property name in case
1184
+ // these characters are ever used in future character property names
1185
+ // (though it's very unlikely). Users can always escape e.g. the ':'
1186
+ // in '[:' if they definitely want a custom character class.
1187
+ return false
1148
1188
}
1149
- try src. expect ( sequence: end)
1150
- return try Source . classifyCharacterPropertyValueOnly ( lhs)
1151
1189
}
1190
+ // We should either have:
1191
+ // - 'x=y' where 'x' is a property key, and 'y' is a value.
1192
+ // - 'y' where 'y' is a value (or a bool key with an inferred value of true)
1193
+ // and its key is inferred.
1194
+ let lhs = lexUntil ( atPossibleEnding) . value
1195
+ if tryEat ( " = " ) {
1196
+ let rhs = lexUntil ( atPossibleEnding) . value
1197
+ return ( lhs, rhs)
1198
+ }
1199
+ return ( nil , lhs)
1200
+ }
1201
+
1202
+ private static func classifyCharacterPropertyContents(
1203
+ key: String ? , value: String
1204
+ ) throws -> AST . Atom . CharacterProperty . Kind {
1205
+ if let key = key {
1206
+ return try classifyCharacterProperty ( key: key, value: value)
1207
+ }
1208
+ return try classifyCharacterPropertyValueOnly ( value)
1152
1209
}
1153
1210
1154
1211
/// Try to consume a character property.
@@ -1164,7 +1221,10 @@ extension Source {
1164
1221
let isInverted = src. peek ( ) == " P "
1165
1222
src. advance ( 2 )
1166
1223
1167
- let prop = try src. lexCharacterPropertyContents ( end: " } " ) . value
1224
+ let ( key, value) = src. lexCharacterPropertyKeyValue ( )
1225
+ let prop = try Source . classifyCharacterPropertyContents ( key: key,
1226
+ value: value)
1227
+ try src. expect ( " } " )
1168
1228
return . init( prop, isInverted: isInverted, isPOSIX: false )
1169
1229
}
1170
1230
}
@@ -1758,11 +1818,8 @@ extension Source {
1758
1818
if !customCC && ( src. peek ( ) == " ) " || src. peek ( ) == " | " ) { return nil }
1759
1819
// TODO: Store customCC in the atom, if that's useful
1760
1820
1761
- // POSIX character property. This is only allowed in a custom character
1762
- // class.
1763
- // TODO: Can we try and recover and diagnose these outside character
1764
- // classes?
1765
- if customCC, let prop = try src. lexPOSIXCharacterProperty ( ) ? . value {
1821
+ // POSIX character property.
1822
+ if let prop = try src. lexPOSIXCharacterProperty ( context: context) ? . value {
1766
1823
return . property( prop)
1767
1824
}
1768
1825
0 commit comments