Skip to content

Commit 19adde4

Browse files
authored
Merge pull request #269 from hamishknight/posixbly
2 parents 969bd91 + 0454f13 commit 19adde4

File tree

3 files changed

+129
-35
lines changed

3 files changed

+129
-35
lines changed

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

+89-32
Original file line numberDiff line numberDiff line change
@@ -1064,11 +1064,16 @@ extension Source {
10641064
}
10651065

10661066
mutating func lexCustomCCStart(
1067+
context: ParsingContext
10671068
) throws -> Located<CustomCC.Start>? {
10681069
recordLoc { src in
1069-
// POSIX named sets are atoms.
1070-
guard !src.starts(with: "[:") else { return nil }
1071-
1070+
// Make sure we don't have a POSIX character property. This may require
1071+
// walking to its ending to make sure we have a closing ':]', as otherwise
1072+
// we have a custom character class.
1073+
// TODO: This behavior seems subtle, could we warn?
1074+
guard !src.canLexPOSIXCharacterProperty(context: context) else {
1075+
return nil
1076+
}
10721077
if src.tryEat("[") {
10731078
return src.tryEat("^") ? .inverted : .normal
10741079
}
@@ -1099,12 +1104,38 @@ extension Source {
10991104
}
11001105

11011106
private mutating func lexPOSIXCharacterProperty(
1107+
context: ParsingContext
11021108
) throws -> Located<AST.Atom.CharacterProperty>? {
1103-
try recordLoc { src in
1104-
guard src.tryEat(sequence: "[:") else { return nil }
1105-
let inverted = src.tryEat("^")
1106-
let prop = try src.lexCharacterPropertyContents(end: ":]").value
1107-
return .init(prop, isInverted: inverted, isPOSIX: true)
1109+
// Only allowed in a custom character class.
1110+
guard context.isInCustomCharacterClass else { return nil }
1111+
return try recordLoc { src in
1112+
try src.tryEating { src in
1113+
guard src.tryEat(sequence: "[:") else { return nil }
1114+
let inverted = src.tryEat("^")
1115+
1116+
// Note we lex the contents and ending *before* classifying, because we
1117+
// want to bail with nil if we don't have the right ending. This allows
1118+
// the lexing of a custom character class if we don't have a ':]'
1119+
// ending.
1120+
let (key, value) = src.lexCharacterPropertyKeyValue()
1121+
guard src.tryEat(sequence: ":]") else { return nil }
1122+
1123+
let prop = try Source.classifyCharacterPropertyContents(key: key,
1124+
value: value)
1125+
return .init(prop, isInverted: inverted, isPOSIX: true)
1126+
}
1127+
}
1128+
}
1129+
1130+
private func canLexPOSIXCharacterProperty(context: ParsingContext) -> Bool {
1131+
do {
1132+
var src = self
1133+
return try src.lexPOSIXCharacterProperty(context: context) != nil
1134+
} catch {
1135+
// We want to tend on the side of lexing a POSIX character property, so
1136+
// even if it is invalid in some way (e.g invalid property names), still
1137+
// try and lex it.
1138+
return true
11081139
}
11091140
}
11101141

@@ -1129,26 +1160,52 @@ extension Source {
11291160
}
11301161
}
11311162

1132-
private mutating func lexCharacterPropertyContents(
1133-
end: String
1134-
) throws -> Located<AST.Atom.CharacterProperty.Kind> {
1135-
try recordLoc { src in
1136-
// We should either have:
1137-
// - 'x=y' where 'x' is a property key, and 'y' is a value.
1138-
// - 'y' where 'y' is a value (or a bool key with an inferred value
1139-
// of true), and its key is inferred.
1140-
// TODO: We could have better recovery here if we only ate the characters
1141-
// that property keys and values can use.
1142-
let lhs = src.lexUntil {
1143-
$0.isEmpty || $0.peek() == "=" || $0.starts(with: end)
1144-
}.value
1145-
if src.tryEat("=") {
1146-
let rhs = try src.lexUntil(eating: end).value
1147-
return try Source.classifyCharacterProperty(key: lhs, value: rhs)
1163+
private mutating func lexCharacterPropertyKeyValue(
1164+
) -> (key: String?, value: String) {
1165+
func atPossibleEnding(_ src: inout Source) -> Bool {
1166+
guard let next = src.peek() else { return true }
1167+
switch next {
1168+
case "=":
1169+
// End of a key.
1170+
return true
1171+
case ":", "[", "]":
1172+
// POSIX character property endings to cover ':]', ']', and '[' as the
1173+
// start of a nested character class.
1174+
return true
1175+
case "}":
1176+
// Ending of '\p{'. We cover this for POSIX too as it's not a valid
1177+
// character property name anyway, and it's nice not to have diverging
1178+
// logic for these cases.
1179+
return true
1180+
default:
1181+
// We may want to handle other metacharacters here, e.g '{', '(', ')',
1182+
// as they're not valid character property names. However for now
1183+
// let's tend on the side of forming an unknown property name in case
1184+
// these characters are ever used in future character property names
1185+
// (though it's very unlikely). Users can always escape e.g the ':'
1186+
// in '[:' if they definitely want a custom character class.
1187+
return false
11481188
}
1149-
try src.expect(sequence: end)
1150-
return try Source.classifyCharacterPropertyValueOnly(lhs)
11511189
}
1190+
// We should either have:
1191+
// - 'x=y' where 'x' is a property key, and 'y' is a value.
1192+
// - 'y' where 'y' is a value (or a bool key with an inferred value of true)
1193+
// and its key is inferred.
1194+
let lhs = lexUntil(atPossibleEnding).value
1195+
if tryEat("=") {
1196+
let rhs = lexUntil(atPossibleEnding).value
1197+
return (lhs, rhs)
1198+
}
1199+
return (nil, lhs)
1200+
}
1201+
1202+
private static func classifyCharacterPropertyContents(
1203+
key: String?, value: String
1204+
) throws -> AST.Atom.CharacterProperty.Kind {
1205+
if let key = key {
1206+
return try classifyCharacterProperty(key: key, value: value)
1207+
}
1208+
return try classifyCharacterPropertyValueOnly(value)
11521209
}
11531210

11541211
/// Try to consume a character property.
@@ -1164,7 +1221,10 @@ extension Source {
11641221
let isInverted = src.peek() == "P"
11651222
src.advance(2)
11661223

1167-
let prop = try src.lexCharacterPropertyContents(end: "}").value
1224+
let (key, value) = src.lexCharacterPropertyKeyValue()
1225+
let prop = try Source.classifyCharacterPropertyContents(key: key,
1226+
value: value)
1227+
try src.expect("}")
11681228
return .init(prop, isInverted: isInverted, isPOSIX: false)
11691229
}
11701230
}
@@ -1758,11 +1818,8 @@ extension Source {
17581818
if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil }
17591819
// TODO: Store customCC in the atom, if that's useful
17601820

1761-
// POSIX character property. This is only allowed in a custom character
1762-
// class.
1763-
// TODO: Can we try and recover and diagnose these outside character
1764-
// classes?
1765-
if customCC, let prop = try src.lexPOSIXCharacterProperty()?.value {
1821+
// POSIX character property.
1822+
if let prop = try src.lexPOSIXCharacterProperty(context: context)?.value {
17661823
return .property(prop)
17671824
}
17681825

Sources/_RegexParser/Regex/Parse/Parse.swift

+2-2
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ extension Parser {
403403
}
404404

405405
// Check if we have the start of a custom character class '['.
406-
if let cccStart = try source.lexCustomCCStart() {
406+
if let cccStart = try source.lexCustomCCStart(context: context) {
407407
return .customCharacterClass(
408408
try parseCustomCharacterClass(cccStart))
409409
}
@@ -487,7 +487,7 @@ extension Parser {
487487
while source.peek() != "]" && source.peekCCBinOp() == nil {
488488

489489
// Nested custom character class.
490-
if let cccStart = try source.lexCustomCCStart() {
490+
if let cccStart = try source.lexCustomCCStart(context: context) {
491491
members.append(.custom(try parseCustomCharacterClass(cccStart)))
492492
continue
493493
}

Tests/RegexTests/ParseTests.swift

+38-1
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,26 @@ extension RegexTests {
474474
parseTest(
475475
"[a^]", charClass("a", "^"))
476476

477+
// These are custom character classes, not invalid POSIX character classes.
478+
// TODO: This behavior is subtle, we ought to warn.
479+
parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e")))
480+
parseTest("[:space:]", charClass(":", "s", "p", "a", "c", "e", ":"))
481+
parseTest("[:a]", charClass(":", "a"))
482+
parseTest("[a:]", charClass("a", ":"))
483+
parseTest("[:]", charClass(":"))
484+
parseTest("[::]", charClass(":", ":"))
485+
parseTest("[:=:]", charClass(":", "=", ":"))
486+
parseTest("[[:]]", charClass(charClass(":")))
487+
parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":")))
488+
489+
parseTest(#"[[:a[b]:]]"#, charClass(charClass(":", "a", charClass("b"), ":")))
490+
parseTest(#"[[:a]][:]"#, concat(charClass(charClass(":", "a")), charClass(":")))
491+
parseTest(#"[[:a]]"#, charClass(charClass(":", "a")))
492+
parseTest(#"[[:}]]"#, charClass(charClass(":", "}")))
493+
parseTest(#"[[:{]]"#, charClass(charClass(":", "{")))
494+
parseTest(#"[[:{:]]"#, charClass(posixProp_m(.other(key: nil, value: "{"))))
495+
parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":")))
496+
477497
parseTest(
478498
#"\D\S\W"#,
479499
concat(
@@ -1096,9 +1116,13 @@ extension RegexTests {
10961116
#"\p{C}+"#,
10971117
oneOrMore(of: prop(.generalCategory(.other))))
10981118

1119+
// TODO: Start erroring on these?
10991120
parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx")))
11001121
parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL")))
11011122
parseTest(#"\p{x=y}"#, prop(.other(key: "x", value: "y")))
1123+
parseTest(#"\p{aaa(b)}"#, prop(.other(key: nil, value: "aaa(b)")))
1124+
parseTest("[[:a():]]", charClass(posixProp_m(.other(key: nil, value: "a()"))))
1125+
parseTest(#"\p{aaa\p{b}}"#, concat(prop(.other(key: nil, value: #"aaa\p{b"#)), "}"))
11021126

11031127
// UAX44-LM3 means all of the below are equivalent.
11041128
let lowercaseLetter = prop(.generalCategory(.lowercaseLetter))
@@ -2183,7 +2207,11 @@ extension RegexTests {
21832207
diagnosticTest(#"\N{A"#, .expected("}"))
21842208
diagnosticTest(#"\N{U+A"#, .expected("}"))
21852209
diagnosticTest(#"\p{a"#, .expected("}"))
2186-
diagnosticTest(#"\p{a="#, .expected("}"))
2210+
diagnosticTest(#"\p{a="#, .emptyProperty)
2211+
diagnosticTest(#"\p{a=}"#, .emptyProperty)
2212+
diagnosticTest(#"\p{a=b"#, .expected("}"))
2213+
diagnosticTest(#"\p{aaa[b]}"#, .expected("}"))
2214+
diagnosticTest(#"\p{a=b=c}"#, .expected("}"))
21872215
diagnosticTest(#"(?#"#, .expected(")"))
21882216
diagnosticTest(#"(?x"#, .expected(")"))
21892217

@@ -2218,6 +2246,15 @@ extension RegexTests {
22182246
// the closing bracket.
22192247
diagnosticTest("[]", .expected("]"))
22202248

2249+
diagnosticTest("[:a", .expected("]"))
2250+
diagnosticTest("[:a:", .expected("]"))
2251+
diagnosticTest("[[:a", .expected("]"))
2252+
diagnosticTest("[[:a:", .expected("]"))
2253+
diagnosticTest("[[:a[:]", .expected("]"))
2254+
2255+
diagnosticTest("[[::]]", .emptyProperty)
2256+
diagnosticTest("[[:=:]]", .emptyProperty)
2257+
22212258
// MARK: Bad escapes
22222259

22232260
diagnosticTest("\\", .expectedEscape)

0 commit comments

Comments
 (0)