swiftlang · hamishknight · Apr 12, 2022 · Apr 12, 2022 · hamishknight · Apr 12, 2022
diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -1064,11 +1064,16 @@ extension Source {
   }
 
   mutating func lexCustomCCStart(
+    context: ParsingContext
   ) throws -> Located<CustomCC.Start>? {
     recordLoc { src in
-      // POSIX named sets are atoms.
-      guard !src.starts(with: "[:") else { return nil }
-
+      // Make sure we don't have a POSIX character property. This may require
+      // walking to its ending to make sure we have a closing ':]', as otherwise
+      // we have a custom character class.
+      // TODO: This behavior seems subtle, could we warn?
+      guard !src.canLexPOSIXCharacterProperty(context: context) else {
+        return nil
+      }
       if src.tryEat("[") {
         return src.tryEat("^") ? .inverted : .normal
       }
@@ -1099,12 +1104,38 @@ extension Source {
   }
 
   private mutating func lexPOSIXCharacterProperty(
+    context: ParsingContext
   ) throws -> Located<AST.Atom.CharacterProperty>? {
-    try recordLoc { src in
-      guard src.tryEat(sequence: "[:") else { return nil }
-      let inverted = src.tryEat("^")
-      let prop = try src.lexCharacterPropertyContents(end: ":]").value
-      return .init(prop, isInverted: inverted, isPOSIX: true)
+    // POSIX character properties are only allowed in a custom character class.
+    guard context.isInCustomCharacterClass else { return nil }
+    return try recordLoc { src in
+      try src.tryEating { src in
+        guard src.tryEat(sequence: "[:") else { return nil }
+        let inverted = src.tryEat("^")  // A leading '^' inverts the property.
+
+        // Note we lex the contents and ending *before* classifying, because we
+        // want to bail with nil (rather than throw from classification) if we
+        // don't have the right ':]' ending. This allows the lexing of a custom
+        // character class instead.
+        let (key, value) = src.lexCharacterPropertyKeyValue()
+        guard src.tryEat(sequence: ":]") else { return nil }
+
+        let prop = try Source.classifyCharacterPropertyContents(key: key,
+                                                                value: value)
+        return .init(prop, isInverted: inverted, isPOSIX: true)
+      }
+    }
+  }
+
+  private func canLexPOSIXCharacterProperty(context: ParsingContext) -> Bool {
+    do {
+      var src = self  // Lex on a local copy so that 'self' is not advanced.
+      return try src.lexPOSIXCharacterProperty(context: context) != nil
+    } catch {
+      // We want to err on the side of lexing a POSIX character property, so
+      // even if it is invalid in some way (e.g. invalid property names), still
+      // try to lex it.
+      return true
     }
   }
 
@@ -1129,26 +1160,52 @@ extension Source {
     }
   }
 
-  private mutating func lexCharacterPropertyContents(
-    end: String
-  ) throws -> Located<AST.Atom.CharacterProperty.Kind> {
-    try recordLoc { src in
-      // We should either have:
-      // - 'x=y' where 'x' is a property key, and 'y' is a value.
-      // - 'y' where 'y' is a value (or a bool key with an inferred value
-      //   of true), and its key is inferred.
-      // TODO: We could have better recovery here if we only ate the characters
-      // that property keys and values can use.
-      let lhs = src.lexUntil {
-        $0.isEmpty || $0.peek() == "=" || $0.starts(with: end)
-      }.value
-      if src.tryEat("=") {
-        let rhs = try src.lexUntil(eating: end).value
-        return try Source.classifyCharacterProperty(key: lhs, value: rhs)
+  private mutating func lexCharacterPropertyKeyValue(
+  ) -> (key: String?, value: String) {
+    func atPossibleEnding(_ src: inout Source) -> Bool {
+      guard let next = src.peek() else { return true }  // Nothing left to peek.
+      switch next {
+      case "=":
+        // End of a key (or of a value, as the value is lexed with this too).
+        return true
+      case ":", "[", "]":
+        // POSIX character property endings to cover ':]', ']', and '[' as the
+        // start of a nested character class.
+        return true
+      case "}":
+        // Ending of '\p{...}'. We cover this for POSIX too as '}' is not valid
+        // in a character property name anyway, and it's nice not to have
+        // diverging logic for these cases.
+        return true
+      default:
+        // We may want to handle other metacharacters here, e.g. '{', '(', ')',
+        // as they're not valid in character property names. However, for now
+        // let's err on the side of forming an unknown property name in case
+        // these characters are ever used in future character property names
+        // (though it's very unlikely). Users can always escape e.g. the ':'
+        // in '[:' if they definitely want a custom character class.
+        return false
       }
-      try src.expect(sequence: end)
-      return try Source.classifyCharacterPropertyValueOnly(lhs)
     }
+    // We should either have:
+    // - 'x=y' where 'x' is a property key, and 'y' is a value.
+    // - 'y' where 'y' is a value (or a bool key with an inferred value of true)
+    //   and its key is inferred.
+    let lhs = lexUntil(atPossibleEnding).value
+    if tryEat("=") {
+      let rhs = lexUntil(atPossibleEnding).value
+      return (lhs, rhs)
+    }
+    return (nil, lhs)  // No '=' followed, so the lone run is the value.
+  }
+
+  private static func classifyCharacterPropertyContents(
+    key: String?, value: String
+  ) throws -> AST.Atom.CharacterProperty.Kind {
+    if let key = key {  // Explicit 'key=value' form.
+      return try classifyCharacterProperty(key: key, value: value)
+    }
+    return try classifyCharacterPropertyValueOnly(value)  // No key was written.
   }
 
   /// Try to consume a character property.
@@ -1164,7 +1221,10 @@ extension Source {
       let isInverted = src.peek() == "P"
       src.advance(2)
 
-      let prop = try src.lexCharacterPropertyContents(end: "}").value
+      let (key, value) = src.lexCharacterPropertyKeyValue()
+      let prop = try Source.classifyCharacterPropertyContents(key: key,
+                                                              value: value)
+      try src.expect("}")
       return .init(prop, isInverted: isInverted, isPOSIX: false)
     }
   }
@@ -1758,11 +1818,8 @@ extension Source {
       if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil }
       // TODO: Store customCC in the atom, if that's useful
 
-      // POSIX character property. This is only allowed in a custom character
-      // class.
-      // TODO: Can we try and recover and diagnose these outside character
-      // classes?
-      if customCC, let prop = try src.lexPOSIXCharacterProperty()?.value {
+      // POSIX character property.
+      if let prop = try src.lexPOSIXCharacterProperty(context: context)?.value {
         return .property(prop)
       }
 

diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -403,7 +403,7 @@ extension Parser {
     }
 
     // Check if we have the start of a custom character class '['.
-    if let cccStart = try source.lexCustomCCStart() {
+    if let cccStart = try source.lexCustomCCStart(context: context) {
       return .customCharacterClass(
         try parseCustomCharacterClass(cccStart))
     }
@@ -487,7 +487,7 @@ extension Parser {
     while source.peek() != "]" && source.peekCCBinOp() == nil {
 
       // Nested custom character class.
-      if let cccStart = try source.lexCustomCCStart() {
+      if let cccStart = try source.lexCustomCCStart(context: context) {
         members.append(.custom(try parseCustomCharacterClass(cccStart)))
         continue
       }

diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
@@ -474,6 +474,26 @@ extension RegexTests {
     parseTest(
       "[a^]", charClass("a", "^"))
 
+    // These are custom character classes, not invalid POSIX character classes.
+    // TODO: This behavior is subtle, we ought to warn.
+    parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e")))
+    parseTest("[:space:]", charClass(":", "s", "p", "a", "c", "e", ":"))
+    parseTest("[:a]", charClass(":", "a"))
+    parseTest("[a:]", charClass("a", ":"))
+    parseTest("[:]", charClass(":"))
+    parseTest("[::]", charClass(":", ":"))
+    parseTest("[:=:]", charClass(":", "=", ":"))
+    parseTest("[[:]]", charClass(charClass(":")))
+    parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":")))
+
+    parseTest(#"[[:a[b]:]]"#, charClass(charClass(":", "a", charClass("b"), ":")))
+    parseTest(#"[[:a]][:]"#, concat(charClass(charClass(":", "a")), charClass(":")))
+    parseTest(#"[[:a]]"#, charClass(charClass(":", "a")))
+    parseTest(#"[[:}]]"#, charClass(charClass(":", "}")))
+    parseTest(#"[[:{]]"#, charClass(charClass(":", "{")))
+    parseTest(#"[[:{:]]"#, charClass(posixProp_m(.other(key: nil, value: "{"))))
+    parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":")))
+
     parseTest(
       #"\D\S\W"#,
       concat(
@@ -1096,9 +1116,13 @@ extension RegexTests {
       #"\p{C}+"#,
       oneOrMore(of: prop(.generalCategory(.other))))
 
+    // TODO: Start erroring on these?
     parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx")))
     parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL")))
     parseTest(#"\p{x=y}"#, prop(.other(key: "x", value: "y")))
+    parseTest(#"\p{aaa(b)}"#, prop(.other(key: nil, value: "aaa(b)")))
+    parseTest("[[:a():]]", charClass(posixProp_m(.other(key: nil, value: "a()"))))
+    parseTest(#"\p{aaa\p{b}}"#, concat(prop(.other(key: nil, value: #"aaa\p{b"#)), "}"))
 
     // UAX44-LM3 means all of the below are equivalent.
     let lowercaseLetter = prop(.generalCategory(.lowercaseLetter))
@@ -2183,7 +2207,11 @@ extension RegexTests {
     diagnosticTest(#"\N{A"#, .expected("}"))
     diagnosticTest(#"\N{U+A"#, .expected("}"))
     diagnosticTest(#"\p{a"#, .expected("}"))
-    diagnosticTest(#"\p{a="#, .expected("}"))
+    diagnosticTest(#"\p{a="#, .emptyProperty)
+    diagnosticTest(#"\p{a=}"#, .emptyProperty)
+    diagnosticTest(#"\p{a=b"#, .expected("}"))
+    diagnosticTest(#"\p{aaa[b]}"#, .expected("}"))
+    diagnosticTest(#"\p{a=b=c}"#, .expected("}"))
     diagnosticTest(#"(?#"#, .expected(")"))
     diagnosticTest(#"(?x"#, .expected(")"))
 
@@ -2218,6 +2246,15 @@ extension RegexTests {
     // the closing bracket.
     diagnosticTest("[]", .expected("]"))
 
+    diagnosticTest("[:a", .expected("]"))
+    diagnosticTest("[:a:", .expected("]"))
+    diagnosticTest("[[:a", .expected("]"))
+    diagnosticTest("[[:a:", .expected("]"))
+    diagnosticTest("[[:a[:]", .expected("]"))
+
+    diagnosticTest("[[::]]", .emptyProperty)
+    diagnosticTest("[[:=:]]", .emptyProperty)
+
     // MARK: Bad escapes
 
     diagnosticTest("\\", .expectedEscape)