From 3f161701c9f76a2e18cb3c47ee95453911337b83 Mon Sep 17 00:00:00 2001 From: Hamish Knight <hamish_github@mediocremail.com> Date: Tue, 19 Apr 2022 11:46:59 +0100 Subject: [PATCH] Don't parse a character property containing a backslash Add backslash to the list of characters we don't consider valid for a character property name. This means that we'll bail when attempting to lex a POSIX character property and instead lex a custom character class. This allows e.g `[:\Q :] \E]` to be lexed as a custom character class. For `\p{...}` this just means we'll emit a truncated invalid property error, which is arguably more inline with what the user was expecting.. I noticed when digging through the ICU source code that it will bail out of parsing a POSIX character property if it encounters one of its known escape sequences (e.g `\a`, `\e`, `\f`, ...). Interestingly this doesn't cover character property escapes e.g `\d`, but it's not clear that is intentional. Given backslash is not a valid character property character anyway, it seems reasonable to broaden this behavior to bail on any backslash. --- .../Regex/Parse/LexicalAnalysis.swift | 8 +++++++ Tests/RegexTests/ParseTests.swift | 21 ++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index e8b7e9e18..6a61ccdf7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1176,6 +1176,14 @@ extension Source { // character property name anyway, and it's nice not to have diverging // logic for these cases. return true + case "\\": + // An escape sequence, which may include e.g '\Q :] \E'. ICU bails here + // for all its known escape sequences (e.g '\a', '\e' '\f', ...). It + // seems character class escapes e.g '\d' are excluded, however it's not + // clear that is intentional. Let's apply the rule for any escape, as a + // backslash would never be a valid character property name, and we can + // diagnose any invalid escapes when parsing as a character class. + return true default: // We may want to handle other metacharacters here, e.g '{', '(', ')', // as they're not valid character property names. However for now diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 4043e4ccb..94c134853 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -503,6 +503,25 @@ extension RegexTests { parseTest(#"[[:{]]"#, charClass(charClass(":", "{"))) parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":"))) + parseTest( + #"[:[:space:]:]"#, + charClass(":", posixProp_m(.binary(.whitespace)), ":") + ) + parseTest( + #"[:a[:space:]b:]"#, + charClass(":", "a", posixProp_m(.binary(.whitespace)), "b", ":") + ) + + // ICU parses a custom character class if it sees any of its known escape + // sequences in a POSIX character property (though it appears to exclude + // character class escapes e.g '\d'). We do so for any escape sequence as + // '\' is not a valid character property character. + parseTest(#"[:\Q:]\E]"#, charClass(":", quote_m(":]"))) + parseTest(#"[:\a:]"#, charClass(":", atom_m(.escaped(.alarm)), ":")) + parseTest(#"[:\d:]"#, charClass(":", atom_m(.escaped(.decimalDigit)), ":")) + parseTest(#"[:\\:]"#, charClass(":", "\\", ":")) + parseTest(#"[:\:]"#, charClass(":", ":")) + parseTest( #"\D\S\W"#, concat( @@ -2319,7 +2338,7 @@ extension RegexTests { diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y")) diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)")) diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()")) - diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: #"aaa\p{b"#)) + diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) // MARK: Matching options