Skip to content

Commit 0454f13

Browse files
committedApr 12, 2022
Allow custom character classes to begin with :
ICU and Oniguruma allow custom character classes to begin with `:`, and only lex a POSIX character property if they detect a closing `:]`. However their behavior for this differs: - ICU will consider *any* `:]` in the regex as a closing delimiter, even e.g `[[:a]][:]`. - Oniguruma will stop if it hits a `]`, so `[[:a]][:]` is treated as a custom character class. However it only scans ahead 20 chars max, and doesn't stop for e.g a nested character class opening `[`. Our detection behavior for this is as follows: - When `[:` is encountered inside a custom character class, scan ahead to the closing `:]`. - While scanning, bail if we see any characters that are obviously invalid property names. Currently this includes `[`, `]`, `}`, as well as a second occurrence of `=`. - Otherwise, if we end on `:]`, consider that a POSIX character property. We could include more metacharacters to bail on, e.g `{`, `(`, `)`, but for now I'm tending on the side of lexing an invalid POSIX character property. We can always relax this in the future (as we'd be turning invalid code into valid code). Users can always escape the initial `:` in `[:` if they want a custom character class. In fact, we may want to suggest this via a warning, as this behavior can be pretty subtle.
1 parent fed4c53 commit 0454f13

File tree

3 files changed

+129
-35
lines changed

3 files changed

+129
-35
lines changed
 

‎Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 89 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,11 +1064,16 @@ extension Source {
10641064
}
10651065

10661066
mutating func lexCustomCCStart(
1067+
context: ParsingContext
10671068
) throws -> Located<CustomCC.Start>? {
10681069
recordLoc { src in
1069-
// POSIX named sets are atoms.
1070-
guard !src.starts(with: "[:") else { return nil }
1071-
1070+
// Make sure we don't have a POSIX character property. This may require
1071+
// walking to its ending to make sure we have a closing ':]', as otherwise
1072+
// we have a custom character class.
1073+
// TODO: This behavior seems subtle, could we warn?
1074+
guard !src.canLexPOSIXCharacterProperty(context: context) else {
1075+
return nil
1076+
}
10721077
if src.tryEat("[") {
10731078
return src.tryEat("^") ? .inverted : .normal
10741079
}
@@ -1099,12 +1104,38 @@ extension Source {
10991104
}
11001105

11011106
private mutating func lexPOSIXCharacterProperty(
1107+
context: ParsingContext
11021108
) throws -> Located<AST.Atom.CharacterProperty>? {
1103-
try recordLoc { src in
1104-
guard src.tryEat(sequence: "[:") else { return nil }
1105-
let inverted = src.tryEat("^")
1106-
let prop = try src.lexCharacterPropertyContents(end: ":]").value
1107-
return .init(prop, isInverted: inverted, isPOSIX: true)
1109+
// Only allowed in a custom character class.
1110+
guard context.isInCustomCharacterClass else { return nil }
1111+
return try recordLoc { src in
1112+
try src.tryEating { src in
1113+
guard src.tryEat(sequence: "[:") else { return nil }
1114+
let inverted = src.tryEat("^")
1115+
1116+
// Note we lex the contents and ending *before* classifying, because we
1117+
// want to bail with nil if we don't have the right ending. This allows
1118+
// the lexing of a custom character class if we don't have a ':]'
1119+
// ending.
1120+
let (key, value) = src.lexCharacterPropertyKeyValue()
1121+
guard src.tryEat(sequence: ":]") else { return nil }
1122+
1123+
let prop = try Source.classifyCharacterPropertyContents(key: key,
1124+
value: value)
1125+
return .init(prop, isInverted: inverted, isPOSIX: true)
1126+
}
1127+
}
1128+
}
1129+
1130+
private func canLexPOSIXCharacterProperty(context: ParsingContext) -> Bool {
1131+
do {
1132+
var src = self
1133+
return try src.lexPOSIXCharacterProperty(context: context) != nil
1134+
} catch {
1135+
// We want to tend on the side of lexing a POSIX character property, so
1136+
// even if it is invalid in some way (e.g invalid property names), still
1137+
// try and lex it.
1138+
return true
11081139
}
11091140
}
11101141

@@ -1129,26 +1160,52 @@ extension Source {
11291160
}
11301161
}
11311162

1132-
private mutating func lexCharacterPropertyContents(
1133-
end: String
1134-
) throws -> Located<AST.Atom.CharacterProperty.Kind> {
1135-
try recordLoc { src in
1136-
// We should either have:
1137-
// - 'x=y' where 'x' is a property key, and 'y' is a value.
1138-
// - 'y' where 'y' is a value (or a bool key with an inferred value
1139-
// of true), and its key is inferred.
1140-
// TODO: We could have better recovery here if we only ate the characters
1141-
// that property keys and values can use.
1142-
let lhs = src.lexUntil {
1143-
$0.isEmpty || $0.peek() == "=" || $0.starts(with: end)
1144-
}.value
1145-
if src.tryEat("=") {
1146-
let rhs = try src.lexUntil(eating: end).value
1147-
return try Source.classifyCharacterProperty(key: lhs, value: rhs)
1163+
private mutating func lexCharacterPropertyKeyValue(
1164+
) -> (key: String?, value: String) {
1165+
func atPossibleEnding(_ src: inout Source) -> Bool {
1166+
guard let next = src.peek() else { return true }
1167+
switch next {
1168+
case "=":
1169+
// End of a key.
1170+
return true
1171+
case ":", "[", "]":
1172+
// POSIX character property endings to cover ':]', ']', and '[' as the
1173+
// start of a nested character class.
1174+
return true
1175+
case "}":
1176+
// Ending of '\p{'. We cover this for POSIX too as it's not a valid
1177+
// character property name anyway, and it's nice not to have diverging
1178+
// logic for these cases.
1179+
return true
1180+
default:
1181+
// We may want to handle other metacharacters here, e.g '{', '(', ')',
1182+
// as they're not valid character property names. However for now
1183+
// let's tend on the side of forming an unknown property name in case
1184+
// these characters are ever used in future character property names
1185+
// (though it's very unlikely). Users can always escape e.g the ':'
1186+
// in '[:' if they definitely want a custom character class.
1187+
return false
11481188
}
1149-
try src.expect(sequence: end)
1150-
return try Source.classifyCharacterPropertyValueOnly(lhs)
11511189
}
1190+
// We should either have:
1191+
// - 'x=y' where 'x' is a property key, and 'y' is a value.
1192+
// - 'y' where 'y' is a value (or a bool key with an inferred value of true)
1193+
// and its key is inferred.
1194+
let lhs = lexUntil(atPossibleEnding).value
1195+
if tryEat("=") {
1196+
let rhs = lexUntil(atPossibleEnding).value
1197+
return (lhs, rhs)
1198+
}
1199+
return (nil, lhs)
1200+
}
1201+
1202+
private static func classifyCharacterPropertyContents(
1203+
key: String?, value: String
1204+
) throws -> AST.Atom.CharacterProperty.Kind {
1205+
if let key = key {
1206+
return try classifyCharacterProperty(key: key, value: value)
1207+
}
1208+
return try classifyCharacterPropertyValueOnly(value)
11521209
}
11531210

11541211
/// Try to consume a character property.
@@ -1164,7 +1221,10 @@ extension Source {
11641221
let isInverted = src.peek() == "P"
11651222
src.advance(2)
11661223

1167-
let prop = try src.lexCharacterPropertyContents(end: "}").value
1224+
let (key, value) = src.lexCharacterPropertyKeyValue()
1225+
let prop = try Source.classifyCharacterPropertyContents(key: key,
1226+
value: value)
1227+
try src.expect("}")
11681228
return .init(prop, isInverted: isInverted, isPOSIX: false)
11691229
}
11701230
}
@@ -1758,11 +1818,8 @@ extension Source {
17581818
if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil }
17591819
// TODO: Store customCC in the atom, if that's useful
17601820

1761-
// POSIX character property. This is only allowed in a custom character
1762-
// class.
1763-
// TODO: Can we try and recover and diagnose these outside character
1764-
// classes?
1765-
if customCC, let prop = try src.lexPOSIXCharacterProperty()?.value {
1821+
// POSIX character property.
1822+
if let prop = try src.lexPOSIXCharacterProperty(context: context)?.value {
17661823
return .property(prop)
17671824
}
17681825

‎Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ extension Parser {
403403
}
404404

405405
// Check if we have the start of a custom character class '['.
406-
if let cccStart = try source.lexCustomCCStart() {
406+
if let cccStart = try source.lexCustomCCStart(context: context) {
407407
return .customCharacterClass(
408408
try parseCustomCharacterClass(cccStart))
409409
}
@@ -487,7 +487,7 @@ extension Parser {
487487
while source.peek() != "]" && source.peekCCBinOp() == nil {
488488

489489
// Nested custom character class.
490-
if let cccStart = try source.lexCustomCCStart() {
490+
if let cccStart = try source.lexCustomCCStart(context: context) {
491491
members.append(.custom(try parseCustomCharacterClass(cccStart)))
492492
continue
493493
}

‎Tests/RegexTests/ParseTests.swift

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,26 @@ extension RegexTests {
474474
parseTest(
475475
"[a^]", charClass("a", "^"))
476476

477+
// These are custom character classes, not invalid POSIX character classes.
478+
// TODO: This behavior is subtle, we ought to warn.
479+
parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e")))
480+
parseTest("[:space:]", charClass(":", "s", "p", "a", "c", "e", ":"))
481+
parseTest("[:a]", charClass(":", "a"))
482+
parseTest("[a:]", charClass("a", ":"))
483+
parseTest("[:]", charClass(":"))
484+
parseTest("[::]", charClass(":", ":"))
485+
parseTest("[:=:]", charClass(":", "=", ":"))
486+
parseTest("[[:]]", charClass(charClass(":")))
487+
parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":")))
488+
489+
parseTest(#"[[:a[b]:]]"#, charClass(charClass(":", "a", charClass("b"), ":")))
490+
parseTest(#"[[:a]][:]"#, concat(charClass(charClass(":", "a")), charClass(":")))
491+
parseTest(#"[[:a]]"#, charClass(charClass(":", "a")))
492+
parseTest(#"[[:}]]"#, charClass(charClass(":", "}")))
493+
parseTest(#"[[:{]]"#, charClass(charClass(":", "{")))
494+
parseTest(#"[[:{:]]"#, charClass(posixProp_m(.other(key: nil, value: "{"))))
495+
parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":")))
496+
477497
parseTest(
478498
#"\D\S\W"#,
479499
concat(
@@ -1096,9 +1116,13 @@ extension RegexTests {
10961116
#"\p{C}+"#,
10971117
oneOrMore(of: prop(.generalCategory(.other))))
10981118

1119+
// TODO: Start erroring on these?
10991120
parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx")))
11001121
parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL")))
11011122
parseTest(#"\p{x=y}"#, prop(.other(key: "x", value: "y")))
1123+
parseTest(#"\p{aaa(b)}"#, prop(.other(key: nil, value: "aaa(b)")))
1124+
parseTest("[[:a():]]", charClass(posixProp_m(.other(key: nil, value: "a()"))))
1125+
parseTest(#"\p{aaa\p{b}}"#, concat(prop(.other(key: nil, value: #"aaa\p{b"#)), "}"))
11021126

11031127
// UAX44-LM3 means all of the below are equivalent.
11041128
let lowercaseLetter = prop(.generalCategory(.lowercaseLetter))
@@ -2183,7 +2207,11 @@ extension RegexTests {
21832207
diagnosticTest(#"\N{A"#, .expected("}"))
21842208
diagnosticTest(#"\N{U+A"#, .expected("}"))
21852209
diagnosticTest(#"\p{a"#, .expected("}"))
2186-
diagnosticTest(#"\p{a="#, .expected("}"))
2210+
diagnosticTest(#"\p{a="#, .emptyProperty)
2211+
diagnosticTest(#"\p{a=}"#, .emptyProperty)
2212+
diagnosticTest(#"\p{a=b"#, .expected("}"))
2213+
diagnosticTest(#"\p{aaa[b]}"#, .expected("}"))
2214+
diagnosticTest(#"\p{a=b=c}"#, .expected("}"))
21872215
diagnosticTest(#"(?#"#, .expected(")"))
21882216
diagnosticTest(#"(?x"#, .expected(")"))
21892217

@@ -2218,6 +2246,15 @@ extension RegexTests {
22182246
// the closing bracket.
22192247
diagnosticTest("[]", .expected("]"))
22202248

2249+
diagnosticTest("[:a", .expected("]"))
2250+
diagnosticTest("[:a:", .expected("]"))
2251+
diagnosticTest("[[:a", .expected("]"))
2252+
diagnosticTest("[[:a:", .expected("]"))
2253+
diagnosticTest("[[:a[:]", .expected("]"))
2254+
2255+
diagnosticTest("[[::]]", .emptyProperty)
2256+
diagnosticTest("[[:=:]]", .emptyProperty)
2257+
22212258
// MARK: Bad escapes
22222259

22232260
diagnosticTest("\\", .expectedEscape)

0 commit comments

Comments
 (0)