diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index e8b7e9e18..6a61ccdf7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1176,6 +1176,14 @@ extension Source { // character property name anyway, and it's nice not to have diverging // logic for these cases. return true + case "\\": + // An escape sequence, which may include e.g '\Q :] \E'. ICU bails here + // for all its known escape sequences (e.g '\a', '\e' '\f', ...). It + // seems character class escapes e.g '\d' are excluded, however it's not + // clear that is intentional. Let's apply the rule for any escape, as a + // backslash would never be a valid character property name, and we can + // diagnose any invalid escapes when parsing as a character class. + return true default: // We may want to handle other metacharacters here, e.g '{', '(', ')', // as they're not valid character property names. However for now diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 4043e4ccb..94c134853 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -503,6 +503,25 @@ extension RegexTests { parseTest(#"[[:{]]"#, charClass(charClass(":", "{"))) parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":"))) + parseTest( + #"[:[:space:]:]"#, + charClass(":", posixProp_m(.binary(.whitespace)), ":") + ) + parseTest( + #"[:a[:space:]b:]"#, + charClass(":", "a", posixProp_m(.binary(.whitespace)), "b", ":") + ) + + // ICU parses a custom character class if it sees any of its known escape + // sequences in a POSIX character property (though it appears to exclude + // character class escapes e.g '\d'). We do so for any escape sequence as + // '\' is not a valid character property character. + parseTest(#"[:\Q:]\E]"#, charClass(":", quote_m(":]"))) + parseTest(#"[:\a:]"#, charClass(":", atom_m(.escaped(.alarm)), ":")) + parseTest(#"[:\d:]"#, charClass(":", atom_m(.escaped(.decimalDigit)), ":")) + parseTest(#"[:\\:]"#, charClass(":", "\\", ":")) + parseTest(#"[:\:]"#, charClass(":", ":")) + parseTest( #"\D\S\W"#, concat( @@ -2319,7 +2338,7 @@ extension RegexTests { diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y")) diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)")) diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()")) - diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: #"aaa\p{b"#)) + diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) // MARK: Matching options