Don't parse a character property containing a backslash

hamishknight · hamishknight · commit 81f135011398 · 2022-04-21T11:11:17.000+01:00
Add backslash to the list of characters we don't
consider valid for a character property name. This
means that we'll bail when attempting to lex a
POSIX character property and instead lex a custom
character class. This allows e.g `[:\Q :] \E]` to
be lexed as a custom character class. For `\p{...}`
this just means we'll emit a truncated invalid
property error, which is arguably more inline with
what the user was expecting..

I noticed when digging through the ICU source code
that it will bail out of parsing a POSIX character
property if it encounters one of its known escape
sequences (e.g `\a`, `\e`, `\f`, ...). Interestingly
this doesn't cover character property escapes e.g
`\d`, but it's not clear that is intentional. Given
backslash is not a valid character property character
anyway, it seems reasonable to broaden this behavior
to bail on any backslash.
diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -1176,6 +1176,14 @@ extension Source {
         // character property name anyway, and it's nice not to have diverging
         // logic for these cases.
         return true
+      case "\\":
+        // An escape sequence, which may include e.g '\Q :] \E'. ICU bails here
+        // for all its known escape sequences (e.g '\a', '\e' '\f', ...). It
+        // seems character class escapes e.g '\d' are excluded, however it's not
+        // clear that is intentional. Let's apply the rule for any escape, as a
+        // backslash would never be a valid character property name, and we can
+        // diagnose any invalid escapes when parsing as a character class.
+        return true
       default:
         // We may want to handle other metacharacters here, e.g '{', '(', ')',
         // as they're not valid character property names. However for now
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
@@ -503,6 +503,25 @@ extension RegexTests {
     parseTest(#"[[:{]]"#, charClass(charClass(":", "{")))
     parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":")))
 
+    parseTest(
+      #"[:[:space:]:]"#,
+      charClass(":", posixProp_m(.binary(.whitespace)), ":")
+    )
+    parseTest(
+      #"[:a[:space:]b:]"#,
+      charClass(":", "a", posixProp_m(.binary(.whitespace)), "b", ":")
+    )
+
+    // ICU parses a custom character class if it sees any of its known escape
+    // sequences in a POSIX character property (though it appears to exclude
+    // character class escapes e.g '\d'). We do so for any escape sequence as
+    // '\' is not a valid character property character.
+    parseTest(#"[:\Q:]\E]"#, charClass(":", quote_m(":]")))
+    parseTest(#"[:\a:]"#, charClass(":", atom_m(.escaped(.alarm)), ":"))
+    parseTest(#"[:\d:]"#, charClass(":", atom_m(.escaped(.decimalDigit)), ":"))
+    parseTest(#"[:\\:]"#, charClass(":", "\\", ":"))
+    parseTest(#"[:\:]"#, charClass(":", ":"))
+
     parseTest(
       #"\D\S\W"#,
       concat(
@@ -2319,7 +2338,7 @@ extension RegexTests {
     diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y"))
     diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)"))
     diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()"))
-    diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: #"aaa\p{b"#))
+    diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa"))
     diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{"))
 
     // MARK: Matching options