swiftlang · hamishknight · Apr 7, 2022 · Apr 4, 2022 · Apr 4, 2022
diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift
@@ -631,6 +631,41 @@ extension AST.Atom {
   }
 }
 
+extension AST.Atom.EscapedBuiltin {
+  /// If the escape sequence represents a single Unicode scalar value (for
+  /// example `.tab` is U+0009), returns that value, otherwise `nil`.
+  public var scalarValue: UnicodeScalar? {
+    switch self {
+    // TODO: Should we separate these into a separate enum? Or move the
+    // specifics of the scalar to the DSL tree?
+    case .alarm:
+      return "\u{7}"  // U+0007
+    case .backspace:
+      return "\u{8}"  // U+0008
+    case .escape:
+      return "\u{1B}"  // U+001B
+    case .formfeed:
+      return "\u{C}"  // U+000C
+    case .newline:
+      return "\n"  // U+000A
+    case .carriageReturn:
+      return "\r"  // U+000D
+    case .tab:
+      return "\t"  // U+0009
+
+    case .singleDataUnit, .decimalDigit, .notDecimalDigit,
+        .horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
+        .newlineSequence, .whitespace, .notWhitespace, .verticalTab,
+        .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
+        .wordBoundary, .notWordBoundary, .startOfSubject,
+        .endOfSubjectBeforeNewline, .endOfSubject,
+        .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
+        .textSegment, .notTextSegment:
+      return nil  // no scalar value is reported for any of these builtins
+    }
+  }
+}
+
 extension AST.Atom {
   /// Retrieve the character value of the atom if it represents a literal
   /// character or unicode scalar, nil otherwise.
@@ -642,34 +677,7 @@ extension AST.Atom {
       return Character(s)
 
     case .escaped(let c):
-      switch c {
-      // TODO: Should we separate these into a separate enum? Or move the
-      // specifics of the scalar to the DSL tree?
-      case .alarm:
-        return "\u{7}"
-      case .backspace:
-        return "\u{8}"
-      case .escape:
-        return "\u{1B}"
-      case .formfeed:
-        return "\u{C}"
-      case .newline:
-        return "\n"
-      case .carriageReturn:
-        return "\r"
-      case .tab:
-        return "\t"
-
-      case .singleDataUnit, .decimalDigit, .notDecimalDigit,
-          .horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
-          .newlineSequence, .whitespace, .notWhitespace, .verticalTab,
-          .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
-          .wordBoundary, .notWordBoundary, .startOfSubject,
-          .endOfSubjectBeforeNewline, .endOfSubject,
-          .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
-          .textSegment, .notTextSegment:
-        return nil
-      }
+      return c.scalarValue.map(Character.init)
 
     case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
       // TODO: These should have unicode scalar values.

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -295,101 +295,6 @@ extension DSLTree.CustomCharacterClass.Member {
   }
 }
 
-extension AST.CustomCharacterClass.Member {
-  func generateConsumer(
-    _ opts: MatchingOptions
-  ) throws -> MEProgram<String>.ConsumeFunction {
-    switch self {
-    case .custom(let ccc):
-      return try ccc.generateConsumer(opts)
-
-    case .range(let r):
-      guard let lhs = r.lhs.literalCharacterValue else {
-        throw Unsupported("\(r.lhs) in range")
-      }
-      guard let rhs = r.rhs.literalCharacterValue else {
-        throw Unsupported("\(r.rhs) in range")
-      }
-
-      return { input, bounds in
-        // TODO: check for out of bounds?
-        let curIdx = bounds.lowerBound
-        if (lhs...rhs).contains(input[curIdx]) {
-          // TODO: semantic level
-          return input.index(after: curIdx)
-        }
-        return nil
-      }
-
-    case .atom(let atom):
-      guard let gen = try atom.generateConsumer(opts) else {
-        throw Unsupported("TODO")
-      }
-      return gen
-
-    case .quote(let q):
-      // TODO: Not optimal.
-      let consumers = try q.literal.map {
-        try AST.Atom(.char($0), .fake).generateConsumer(opts)!
-      }
-      return { input, bounds in
-        for consumer in consumers {
-          if let idx = consumer(input, bounds) {
-            return idx
-          }
-        }
-        return nil
-      }
-
-    case .trivia:
-      throw Unreachable(
-        "Should have been stripped by caller")
-
-    case .setOperation(let lhs, let op, let rhs):
-      // TODO: We should probably have a component type
-      // instead of a members array... for now we reconstruct
-      // an AST node...
-      let start = AST.Located(
-        faking: AST.CustomCharacterClass.Start.normal)
-
-      let lhs = try AST.CustomCharacterClass(
-        start, lhs, .fake
-      ).generateConsumer(opts)
-      let rhs = try AST.CustomCharacterClass(
-        start, rhs, .fake
-      ).generateConsumer(opts)
-
-      return { input, bounds in
-        // NOTE: Easy way to implement, not performant
-        let lhsIdxOpt = lhs(input, bounds)
-        let rhsIdxOpt = rhs(input, bounds)
-
-        // TODO: What if lengths don't line up?
-        assert(lhsIdxOpt == rhsIdxOpt || lhsIdxOpt == nil
-               || rhsIdxOpt == nil)
-
-        switch op.value {
-        case .subtraction:
-          guard rhsIdxOpt == nil else { return nil }
-          return lhsIdxOpt
-
-        case .intersection:
-          if let idx = lhsIdxOpt {
-            return rhsIdxOpt == nil ? nil : idx
-          }
-          return nil
-
-        case .symmetricDifference:
-          if let idx = lhsIdxOpt {
-            return rhsIdxOpt == nil ? idx : nil
-          }
-          return rhsIdxOpt
-        }
-      }
-    }
-  }
-}
-
 extension DSLTree.CustomCharacterClass {
   func generateConsumer(
     _ opts: MatchingOptions
@@ -413,29 +318,6 @@ extension DSLTree.CustomCharacterClass {
   }
 }
 
-extension AST.CustomCharacterClass {
-  func generateConsumer(
-    _ opts: MatchingOptions
-  ) throws -> MEProgram<String>.ConsumeFunction {
-    // NOTE: Easy way to implement, obviously not performant
-    let consumers = try strippingTriviaShallow.members.map {
-      try $0.generateConsumer(opts)
-    }
-    return { input, bounds in
-      for consumer in consumers {
-        if let idx = consumer(input, bounds) {
-          return isInverted ? nil : idx
-        }
-      }
-      if isInverted {
-        // FIXME: semantic level
-        return input.index(after: bounds.lowerBound)
-      }
-      return nil
-    }
-  }
-}
-
 // NOTE: Conveniences, though not most performant
 private func consumeScalarScript(
   _ s: Unicode.Script

diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift
@@ -211,6 +211,9 @@ extension AST.Atom {
     case .any:                  return .any
     case let .backreference(r): return .backreference(r)
 
+    case .escaped(let c) where c.scalarValue != nil:
+      return .scalar(c.scalarValue!)
+
     default: return .unconverted(self)
     }
   }

diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
@@ -281,6 +281,15 @@ extension RegexTests {
     // code point sequence
     firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
 
+    // Escape sequences that represent scalar values.
+    firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
+                   input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
+                   match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
+    firstMatchTest(#"[\a][\b][\e][\f][\n][\r][\t]"#,
+                   input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
+                   match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
+
+    firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n")
 
     // MARK: Quotes
 
@@ -596,24 +605,20 @@ extension RegexTests {
 
     func scalar(_ u: UnicodeScalar) -> UInt32 { u.value }
 
-    // Currently not supported in the matching engine.
     for s in scalar("\u{C}") ... scalar("\u{1B}") {
       let u = UnicodeScalar(s)!
-      firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)")
     }
     for u: UnicodeScalar in ["\u{7}", "\u{8}"] {
-      firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)")
     }
     for s in scalar("\u{A}") ... scalar("\u{D}") {
       let u = UnicodeScalar(s)!
-      firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)")
     }
-    firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}",
-                   xfail: true)
+    firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")
 
+    // Currently not supported in the matching engine.
     for c: UnicodeScalar in ["a", "b", "c"] {
       firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
                      xfail: true)