From 3f2832df8593c803ee85696daabb49fa4a93ca38 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 21 Apr 2022 11:11:12 +0100
Subject: [PATCH 01/11] Fix HexDigit definition in RegexSyntax.md

---
 Documentation/Evolution/RegexSyntaxRunTimeConstruction.md | 2 +-
 Tests/RegexTests/ParseTests.swift                         | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md
index f3dda59ba..7dd56b6a8 100644
--- a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md
+++ b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md
@@ -339,7 +339,7 @@ UnicodeScalar -> '\u{' HexDigit{1...} '}'
                | '\o{' OctalDigit{1...} '}'
                | '\0' OctalDigit{0...3}
 
-HexDigit   -> [0-9a-zA-Z]
+HexDigit   -> [0-9a-fA-F]
 OctalDigit -> [0-7]
 
 NamedScalar -> '\N{' ScalarName '}'
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index c6ff3e46d..5e3defb11 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -2313,6 +2313,10 @@ extension RegexTests {
     diagnosticTest("{5}", .quantifierRequiresOperand("{5}"))
     diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}"))
 
+    // MARK: Unicode scalars
+
+    diagnosticTest(#"\u{G}"#, .expectedNumber("G", kind: .hex))
+
     // MARK: Matching options
 
     diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret)

From 3cce15d40218349fbd6512f6c9482f476f339f52 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 21 Apr 2022 11:11:13 +0100
Subject: [PATCH 02/11] Remove AST CustomCharacterClass consumer generation

This isn't actually used, as we convert to a DSL
custom character class, and then use that consumer
logic.
---
 .../_StringProcessing/ConsumerInterface.swift | 118 ------------------
 1 file changed, 118 deletions(-)

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index 600234e0d..85e73801d 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -295,101 +295,6 @@ extension DSLTree.CustomCharacterClass.Member {
   }
 }
 
-extension AST.CustomCharacterClass.Member {
-  func generateConsumer(
-    _ opts: MatchingOptions
-  ) throws -> MEProgram<String>.ConsumeFunction {
-    switch self {
-    case .custom(let ccc):
-      return try ccc.generateConsumer(opts)
-
-    case .range(let r):
-      guard let lhs = r.lhs.literalCharacterValue else {
-        throw Unsupported("\(r.lhs) in range")
-      }
-      guard let rhs = r.rhs.literalCharacterValue else {
-        throw Unsupported("\(r.rhs) in range")
-      }
-
-      return { input, bounds in
-        // TODO: check for out of bounds?
-        let curIdx = bounds.lowerBound
-        if (lhs...rhs).contains(input[curIdx]) {
-          // TODO: semantic level
-          return input.index(after: curIdx)
-        }
-        return nil
-      }
-
-    case .atom(let atom):
-      guard let gen = try atom.generateConsumer(opts) else {
-        throw Unsupported("TODO")
-      }
-      return gen
-
-    case .quote(let q):
-      // TODO: Not optimal.
-      let consumers = try q.literal.map {
-        try AST.Atom(.char($0), .fake).generateConsumer(opts)!
-      }
-      return { input, bounds in
-        for consumer in consumers {
-          if let idx = consumer(input, bounds) {
-            return idx
-          }
-        }
-        return nil
-      }
-
-    case .trivia:
-      throw Unreachable(
-        "Should have been stripped by caller")
-
-    case .setOperation(let lhs, let op, let rhs):
-      // TODO: We should probably have a component type
-      // instead of a members array... for now we reconstruct
-      // an AST node...
-      let start = AST.Located(
-        faking: AST.CustomCharacterClass.Start.normal)
-
-      let lhs = try AST.CustomCharacterClass(
-        start, lhs, .fake
-      ).generateConsumer(opts)
-      let rhs = try AST.CustomCharacterClass(
-        start, rhs, .fake
-      ).generateConsumer(opts)
-
-      return { input, bounds in
-        // NOTE: Easy way to implement, not performant
-        let lhsIdxOpt = lhs(input, bounds)
-        let rhsIdxOpt = rhs(input, bounds)
-
-        // TODO: What if lengths don't line up?
-        assert(lhsIdxOpt == rhsIdxOpt || lhsIdxOpt == nil
-               || rhsIdxOpt == nil)
-
-        switch op.value {
-        case .subtraction:
-          guard rhsIdxOpt == nil else { return nil }
-          return lhsIdxOpt
-
-        case .intersection:
-          if let idx = lhsIdxOpt {
-            return rhsIdxOpt == nil ? nil : idx
-          }
-          return nil
-
-        case .symmetricDifference:
-          if let idx = lhsIdxOpt {
-            return rhsIdxOpt == nil ? idx : nil
-          }
-          return rhsIdxOpt
-        }
-      }
-    }
-  }
-}
-
 extension DSLTree.CustomCharacterClass {
   func generateConsumer(
     _ opts: MatchingOptions
@@ -413,29 +318,6 @@ extension DSLTree.CustomCharacterClass {
   }
 }
 
-extension AST.CustomCharacterClass {
-  func generateConsumer(
-    _ opts: MatchingOptions
-  ) throws -> MEProgram<String>.ConsumeFunction {
-    // NOTE: Easy way to implement, obviously not performant
-    let consumers = try strippingTriviaShallow.members.map {
-      try $0.generateConsumer(opts)
-    }
-    return { input, bounds in
-      for consumer in consumers {
-        if let idx = consumer(input, bounds) {
-          return isInverted ? nil : idx
-        }
-      }
-      if isInverted {
-        // FIXME: semantic level
-        return input.index(after: bounds.lowerBound)
-      }
-      return nil
-    }
-  }
-}
-
 // NOTE: Conveniences, though not most performant
 private func consumeScalarScript(
   _ s: Unicode.Script

From 577dc6e6f345ad1a5b80ee4444e27aa187142990 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 21 Apr 2022 11:11:13 +0100
Subject: [PATCH 03/11] Convert scalar escape sequences to DSL scalars

Convert AST escape sequences that represent a
scalar value (e.g `\f`, `n`, `\a`) into scalars in
the DSL tree. This allows the matching engine to
match against them.
---
 Sources/_RegexParser/Regex/AST/Atom.swift     | 64 +++++++++++--------
 .../Regex/ASTConversion.swift                 |  3 +
 Tests/RegexTests/MatchTests.swift             | 23 ++++---
 3 files changed, 53 insertions(+), 37 deletions(-)

diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift
index 0aa0951c5..6482c4042 100644
--- a/Sources/_RegexParser/Regex/AST/Atom.swift
+++ b/Sources/_RegexParser/Regex/AST/Atom.swift
@@ -631,6 +631,41 @@ extension AST.Atom {
   }
 }
 
+extension AST.Atom.EscapedBuiltin {
+  /// If the escape sequence represents a unicode scalar value, returns the
+  /// value, otherwise `nil`.
+  public var scalarValue: UnicodeScalar? {
+    switch self {
+    // TODO: Should we separate these into a separate enum? Or move the
+    // specifics of the scalar to the DSL tree?
+    case .alarm:
+      return "\u{7}"
+    case .backspace:
+      return "\u{8}"
+    case .escape:
+      return "\u{1B}"
+    case .formfeed:
+      return "\u{C}"
+    case .newline:
+      return "\n"
+    case .carriageReturn:
+      return "\r"
+    case .tab:
+      return "\t"
+
+    case .singleDataUnit, .decimalDigit, .notDecimalDigit,
+        .horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
+        .newlineSequence, .whitespace, .notWhitespace, .verticalTab,
+        .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
+        .wordBoundary, .notWordBoundary, .startOfSubject,
+        .endOfSubjectBeforeNewline, .endOfSubject,
+        .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
+        .textSegment, .notTextSegment:
+      return nil
+    }
+  }
+}
+
 extension AST.Atom {
   /// Retrieve the character value of the atom if it represents a literal
   /// character or unicode scalar, nil otherwise.
@@ -642,34 +677,7 @@ extension AST.Atom {
       return Character(s)
 
     case .escaped(let c):
-      switch c {
-      // TODO: Should we separate these into a separate enum? Or move the
-      // specifics of the scalar to the DSL tree?
-      case .alarm:
-        return "\u{7}"
-      case .backspace:
-        return "\u{8}"
-      case .escape:
-        return "\u{1B}"
-      case .formfeed:
-        return "\u{C}"
-      case .newline:
-        return "\n"
-      case .carriageReturn:
-        return "\r"
-      case .tab:
-        return "\t"
-
-      case .singleDataUnit, .decimalDigit, .notDecimalDigit,
-          .horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
-          .newlineSequence, .whitespace, .notWhitespace, .verticalTab,
-          .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
-          .wordBoundary, .notWordBoundary, .startOfSubject,
-          .endOfSubjectBeforeNewline, .endOfSubject,
-          .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
-          .textSegment, .notTextSegment:
-        return nil
-      }
+      return c.scalarValue.map(Character.init)
 
     case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
       // TODO: These should have unicode scalar values.
diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift
index e00dcd806..86041f528 100644
--- a/Sources/_StringProcessing/Regex/ASTConversion.swift
+++ b/Sources/_StringProcessing/Regex/ASTConversion.swift
@@ -211,6 +211,9 @@ extension AST.Atom {
     case .any:                  return .any
     case let .backreference(r): return .backreference(r)
 
+    case .escaped(let c) where c.scalarValue != nil:
+      return .scalar(c.scalarValue!)
+
     default: return .unconverted(self)
     }
   }
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index b1287fc8b..f808fa0e3 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -281,6 +281,15 @@ extension RegexTests {
     // code point sequence
     firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
 
+    // Escape sequences that represent scalar values.
+    firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
+                   input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
+                   match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
+    firstMatchTest(#"[\a][\b][\e][\f][\n][\r][\t]"#,
+                   input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
+                   match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
+
+    firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n")
 
     // MARK: Quotes
 
@@ -596,24 +605,20 @@ extension RegexTests {
 
     func scalar(_ u: UnicodeScalar) -> UInt32 { u.value }
 
-    // Currently not supported in the matching engine.
     for s in scalar("\u{C}") ... scalar("\u{1B}") {
       let u = UnicodeScalar(s)!
-      firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)")
     }
     for u: UnicodeScalar in ["\u{7}", "\u{8}"] {
-      firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)")
     }
     for s in scalar("\u{A}") ... scalar("\u{D}") {
       let u = UnicodeScalar(s)!
-      firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)")
     }
-    firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}",
-                   xfail: true)
+    firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")
 
+    // Currently not supported in the matching engine.
     for c: UnicodeScalar in ["a", "b", "c"] {
       firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
                      xfail: true)

From 2d1de9e74f6acfe2eaae34a8f904ecee82c06ad9 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 21 Apr 2022 11:11:14 +0100
Subject: [PATCH 04/11] Allow custom character classes to begin with `:`

ICU and Oniguruma allow custom character classes to
begin with `:`, and only lex a POSIX character
property if they detect a closing `:]`. However
their behavior for this differs:

- ICU will consider *any* `:]` in the regex as a
closing delimiter, even e.g `[[:a]][:]`.

- Oniguruma will stop if it hits a `]`, so
`[[:a]][:]` is treated as a custom character class.
However it only scans ahead 20 chars max, and doesn't
stop for e.g a nested character class opening `[`.

Our detection behavior for this is as follows:

- When `[:` is encountered inside a custom character
class, scan ahead to the closing `:]`.
- While scanning, bail if we see any characters
that are obviously invalid property names. Currently
this includes `[`, `]`, `}`, as well as a second
occurrence of `=`.
- Otherwise, if we end on `:]`, consider that a
POSIX character property.

We could include more metacharacters to bail on,
e.g `{`, `(`, `)`, but for now I'm tending on the
side of lexing an invalid POSIX character property.
We can always relax this in the future (as we'd be
turning invalid code into valid code). Users can
always escape the initial `:` in `[:` if they want
a custom character class. In fact, we may want to
suggest this via a warning, as this behavior can
be pretty subtle.
---
 .../Regex/Parse/LexicalAnalysis.swift         | 121 +++++++++++++-----
 Sources/_RegexParser/Regex/Parse/Parse.swift  |   4 +-
 Tests/RegexTests/ParseTests.swift             |  39 +++++-
 3 files changed, 129 insertions(+), 35 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index c48d53de9..bf53d0ab1 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -1064,11 +1064,16 @@ extension Source {
   }
 
   mutating func lexCustomCCStart(
+    context: ParsingContext
   ) throws -> Located<CustomCC.Start>? {
     recordLoc { src in
-      // POSIX named sets are atoms.
-      guard !src.starts(with: "[:") else { return nil }
-
+      // Make sure we don't have a POSIX character property. This may require
+      // walking to its ending to make sure we have a closing ':]', as otherwise
+      // we have a custom character class.
+      // TODO: This behavior seems subtle, could we warn?
+      guard !src.canLexPOSIXCharacterProperty(context: context) else {
+        return nil
+      }
       if src.tryEat("[") {
         return src.tryEat("^") ? .inverted : .normal
       }
@@ -1099,12 +1104,38 @@ extension Source {
   }
 
   private mutating func lexPOSIXCharacterProperty(
+    context: ParsingContext
   ) throws -> Located<AST.Atom.CharacterProperty>? {
-    try recordLoc { src in
-      guard src.tryEat(sequence: "[:") else { return nil }
-      let inverted = src.tryEat("^")
-      let prop = try src.lexCharacterPropertyContents(end: ":]").value
-      return .init(prop, isInverted: inverted, isPOSIX: true)
+    // Only allowed in a custom character class.
+    guard context.isInCustomCharacterClass else { return nil }
+    return try recordLoc { src in
+      try src.tryEating { src in
+        guard src.tryEat(sequence: "[:") else { return nil }
+        let inverted = src.tryEat("^")
+
+        // Note we lex the contents and ending *before* classifying, because we
+        // want to bail with nil if we don't have the right ending. This allows
+        // the lexing of a custom character class if we don't have a ':]'
+        // ending.
+        let (key, value) = src.lexCharacterPropertyKeyValue()
+        guard src.tryEat(sequence: ":]") else { return nil }
+
+        let prop = try Source.classifyCharacterPropertyContents(key: key,
+                                                                value: value)
+        return .init(prop, isInverted: inverted, isPOSIX: true)
+      }
+    }
+  }
+
+  private func canLexPOSIXCharacterProperty(context: ParsingContext) -> Bool {
+    do {
+      var src = self
+      return try src.lexPOSIXCharacterProperty(context: context) != nil
+    } catch {
+      // We want to tend on the side of lexing a POSIX character property, so
+      // even if it is invalid in some way (e.g invalid property names), still
+      // try and lex it.
+      return true
     }
   }
 
@@ -1129,26 +1160,52 @@ extension Source {
     }
   }
 
-  private mutating func lexCharacterPropertyContents(
-    end: String
-  ) throws -> Located<AST.Atom.CharacterProperty.Kind> {
-    try recordLoc { src in
-      // We should either have:
-      // - 'x=y' where 'x' is a property key, and 'y' is a value.
-      // - 'y' where 'y' is a value (or a bool key with an inferred value
-      //   of true), and its key is inferred.
-      // TODO: We could have better recovery here if we only ate the characters
-      // that property keys and values can use.
-      let lhs = src.lexUntil {
-        $0.isEmpty || $0.peek() == "=" || $0.starts(with: end)
-      }.value
-      if src.tryEat("=") {
-        let rhs = try src.lexUntil(eating: end).value
-        return try Source.classifyCharacterProperty(key: lhs, value: rhs)
+  private mutating func lexCharacterPropertyKeyValue(
+  ) -> (key: String?, value: String) {
+    func atPossibleEnding(_ src: inout Source) -> Bool {
+      guard let next = src.peek() else { return true }
+      switch next {
+      case "=":
+        // End of a key.
+        return true
+      case ":", "[", "]":
+        // POSIX character property endings to cover ':]', ']', and '[' as the
+        // start of a nested character class.
+        return true
+      case "}":
+        // Ending of '\p{'. We cover this for POSIX too as it's not a valid
+        // character property name anyway, and it's nice not to have diverging
+        // logic for these cases.
+        return true
+      default:
+        // We may want to handle other metacharacters here, e.g '{', '(', ')',
+        // as they're not valid character property names. However for now
+        // let's tend on the side of forming an unknown property name in case
+        // these characters are ever used in future character property names
+        // (though it's very unlikely). Users can always escape e.g the ':'
+        // in '[:' if they definitely want a custom character class.
+        return false
       }
-      try src.expect(sequence: end)
-      return try Source.classifyCharacterPropertyValueOnly(lhs)
     }
+    // We should either have:
+    // - 'x=y' where 'x' is a property key, and 'y' is a value.
+    // - 'y' where 'y' is a value (or a bool key with an inferred value of true)
+    //   and its key is inferred.
+    let lhs = lexUntil(atPossibleEnding).value
+    if tryEat("=") {
+      let rhs = lexUntil(atPossibleEnding).value
+      return (lhs, rhs)
+    }
+    return (nil, lhs)
+  }
+
+  private static func classifyCharacterPropertyContents(
+    key: String?, value: String
+  ) throws -> AST.Atom.CharacterProperty.Kind {
+    if let key = key {
+      return try classifyCharacterProperty(key: key, value: value)
+    }
+    return try classifyCharacterPropertyValueOnly(value)
   }
 
   /// Try to consume a character property.
@@ -1164,7 +1221,10 @@ extension Source {
       let isInverted = src.peek() == "P"
       src.advance(2)
 
-      let prop = try src.lexCharacterPropertyContents(end: "}").value
+      let (key, value) = src.lexCharacterPropertyKeyValue()
+      let prop = try Source.classifyCharacterPropertyContents(key: key,
+                                                              value: value)
+      try src.expect("}")
       return .init(prop, isInverted: isInverted, isPOSIX: false)
     }
   }
@@ -1758,11 +1818,8 @@ extension Source {
       if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil }
       // TODO: Store customCC in the atom, if that's useful
 
-      // POSIX character property. This is only allowed in a custom character
-      // class.
-      // TODO: Can we try and recover and diagnose these outside character
-      // classes?
-      if customCC, let prop = try src.lexPOSIXCharacterProperty()?.value {
+      // POSIX character property.
+      if let prop = try src.lexPOSIXCharacterProperty(context: context)?.value {
         return .property(prop)
       }
 
diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift
index c3aa3500b..d70a1f14f 100644
--- a/Sources/_RegexParser/Regex/Parse/Parse.swift
+++ b/Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -403,7 +403,7 @@ extension Parser {
     }
 
     // Check if we have the start of a custom character class '['.
-    if let cccStart = try source.lexCustomCCStart() {
+    if let cccStart = try source.lexCustomCCStart(context: context) {
       return .customCharacterClass(
         try parseCustomCharacterClass(cccStart))
     }
@@ -487,7 +487,7 @@ extension Parser {
     while source.peek() != "]" && source.peekCCBinOp() == nil {
 
       // Nested custom character class.
-      if let cccStart = try source.lexCustomCCStart() {
+      if let cccStart = try source.lexCustomCCStart(context: context) {
         members.append(.custom(try parseCustomCharacterClass(cccStart)))
         continue
       }
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index 5e3defb11..aa0ebbf98 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -474,6 +474,26 @@ extension RegexTests {
     parseTest(
       "[a^]", charClass("a", "^"))
 
+    // These are custom character classes, not invalid POSIX character classes.
+    // TODO: This behavior is subtle, we ought to warn.
+    parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e")))
+    parseTest("[:space:]", charClass(":", "s", "p", "a", "c", "e", ":"))
+    parseTest("[:a]", charClass(":", "a"))
+    parseTest("[a:]", charClass("a", ":"))
+    parseTest("[:]", charClass(":"))
+    parseTest("[::]", charClass(":", ":"))
+    parseTest("[:=:]", charClass(":", "=", ":"))
+    parseTest("[[:]]", charClass(charClass(":")))
+    parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":")))
+
+    parseTest(#"[[:a[b]:]]"#, charClass(charClass(":", "a", charClass("b"), ":")))
+    parseTest(#"[[:a]][:]"#, concat(charClass(charClass(":", "a")), charClass(":")))
+    parseTest(#"[[:a]]"#, charClass(charClass(":", "a")))
+    parseTest(#"[[:}]]"#, charClass(charClass(":", "}")))
+    parseTest(#"[[:{]]"#, charClass(charClass(":", "{")))
+    parseTest(#"[[:{:]]"#, charClass(posixProp_m(.other(key: nil, value: "{"))))
+    parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":")))
+
     parseTest(
       #"\D\S\W"#,
       concat(
@@ -1096,9 +1116,13 @@ extension RegexTests {
       #"\p{C}+"#,
       oneOrMore(of: prop(.generalCategory(.other))))
 
+    // TODO: Start erroring on these?
     parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx")))
     parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL")))
     parseTest(#"\p{x=y}"#, prop(.other(key: "x", value: "y")))
+    parseTest(#"\p{aaa(b)}"#, prop(.other(key: nil, value: "aaa(b)")))
+    parseTest("[[:a():]]", charClass(posixProp_m(.other(key: nil, value: "a()"))))
+    parseTest(#"\p{aaa\p{b}}"#, concat(prop(.other(key: nil, value: #"aaa\p{b"#)), "}"))
 
     // UAX44-LM3 means all of the below are equivalent.
     let lowercaseLetter = prop(.generalCategory(.lowercaseLetter))
@@ -2183,7 +2207,11 @@ extension RegexTests {
     diagnosticTest(#"\N{A"#, .expected("}"))
     diagnosticTest(#"\N{U+A"#, .expected("}"))
     diagnosticTest(#"\p{a"#, .expected("}"))
-    diagnosticTest(#"\p{a="#, .expected("}"))
+    diagnosticTest(#"\p{a="#, .emptyProperty)
+    diagnosticTest(#"\p{a=}"#, .emptyProperty)
+    diagnosticTest(#"\p{a=b"#, .expected("}"))
+    diagnosticTest(#"\p{aaa[b]}"#, .expected("}"))
+    diagnosticTest(#"\p{a=b=c}"#, .expected("}"))
     diagnosticTest(#"(?#"#, .expected(")"))
     diagnosticTest(#"(?x"#, .expected(")"))
 
@@ -2218,6 +2246,15 @@ extension RegexTests {
     // the closing bracket.
     diagnosticTest("[]", .expected("]"))
 
+    diagnosticTest("[:a", .expected("]"))
+    diagnosticTest("[:a:", .expected("]"))
+    diagnosticTest("[[:a", .expected("]"))
+    diagnosticTest("[[:a:", .expected("]"))
+    diagnosticTest("[[:a[:]", .expected("]"))
+
+    diagnosticTest("[[::]]", .emptyProperty)
+    diagnosticTest("[[:=:]]", .emptyProperty)
+
     // MARK: Bad escapes
 
     diagnosticTest("\\", .expectedEscape)

From 5912ab493f53fdefbbc2cdf5b2d85795d8fe9176 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 21 Apr 2022 11:11:14 +0100
Subject: [PATCH 05/11] Allow POSIX character properties outside of custom
 character classes

This matches the ICU behavior, and appears to be
suggested by UTS#18.
---
 .../Regex/Parse/LexicalAnalysis.swift           | 17 +++++++----------
 Sources/_RegexParser/Regex/Parse/Parse.swift    |  4 ++--
 .../_StringProcessing/Utility/ASTBuilder.swift  |  5 +++++
 Tests/RegexTests/ParseTests.swift               | 11 ++++++++---
 4 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index bf53d0ab1..f70989c9f 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -1064,14 +1064,13 @@ extension Source {
   }
 
   mutating func lexCustomCCStart(
-    context: ParsingContext
   ) throws -> Located<CustomCC.Start>? {
     recordLoc { src in
       // Make sure we don't have a POSIX character property. This may require
       // walking to its ending to make sure we have a closing ':]', as otherwise
       // we have a custom character class.
       // TODO: This behavior seems subtle, could we warn?
-      guard !src.canLexPOSIXCharacterProperty(context: context) else {
+      guard !src.canLexPOSIXCharacterProperty() else {
         return nil
       }
       if src.tryEat("[") {
@@ -1104,11 +1103,8 @@ extension Source {
   }
 
   private mutating func lexPOSIXCharacterProperty(
-    context: ParsingContext
   ) throws -> Located<AST.Atom.CharacterProperty>? {
-    // Only allowed in a custom character class.
-    guard context.isInCustomCharacterClass else { return nil }
-    return try recordLoc { src in
+    try recordLoc { src in
       try src.tryEating { src in
         guard src.tryEat(sequence: "[:") else { return nil }
         let inverted = src.tryEat("^")
@@ -1127,10 +1123,10 @@ extension Source {
     }
   }
 
-  private func canLexPOSIXCharacterProperty(context: ParsingContext) -> Bool {
+  private func canLexPOSIXCharacterProperty() -> Bool {
     do {
       var src = self
-      return try src.lexPOSIXCharacterProperty(context: context) != nil
+      return try src.lexPOSIXCharacterProperty() != nil
     } catch {
       // We want to tend on the side of lexing a POSIX character property, so
       // even if it is invalid in some way (e.g invalid property names), still
@@ -1818,8 +1814,9 @@ extension Source {
       if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil }
       // TODO: Store customCC in the atom, if that's useful
 
-      // POSIX character property.
-      if let prop = try src.lexPOSIXCharacterProperty(context: context)?.value {
+      // POSIX character property. Like \p{...} this is also allowed outside of
+      // a custom character class.
+      if let prop = try src.lexPOSIXCharacterProperty()?.value {
         return .property(prop)
       }
 
diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift
index d70a1f14f..c3aa3500b 100644
--- a/Sources/_RegexParser/Regex/Parse/Parse.swift
+++ b/Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -403,7 +403,7 @@ extension Parser {
     }
 
     // Check if we have the start of a custom character class '['.
-    if let cccStart = try source.lexCustomCCStart(context: context) {
+    if let cccStart = try source.lexCustomCCStart() {
       return .customCharacterClass(
         try parseCustomCharacterClass(cccStart))
     }
@@ -487,7 +487,7 @@ extension Parser {
     while source.peek() != "]" && source.peekCCBinOp() == nil {
 
       // Nested custom character class.
-      if let cccStart = try source.lexCustomCCStart(context: context) {
+      if let cccStart = try source.lexCustomCCStart() {
         members.append(.custom(try parseCustomCharacterClass(cccStart)))
         continue
       }
diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift
index be9f61517..961d0e23f 100644
--- a/Sources/_StringProcessing/Utility/ASTBuilder.swift
+++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift
@@ -354,6 +354,11 @@ func prop(
 ) -> AST.Node {
   atom(.property(.init(kind, isInverted: inverted, isPOSIX: false)))
 }
+func posixProp(
+  _ kind: AST.Atom.CharacterProperty.Kind, inverted: Bool = false
+) -> AST.Node {
+  atom(.property(.init(kind, isInverted: inverted, isPOSIX: true)))
+}
 
 // Raw atom constructing variant
 func atom_a(
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index aa0ebbf98..e9b422379 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -477,12 +477,9 @@ extension RegexTests {
     // These are custom character classes, not invalid POSIX character classes.
     // TODO: This behavior is subtle, we ought to warn.
     parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e")))
-    parseTest("[:space:]", charClass(":", "s", "p", "a", "c", "e", ":"))
     parseTest("[:a]", charClass(":", "a"))
     parseTest("[a:]", charClass("a", ":"))
     parseTest("[:]", charClass(":"))
-    parseTest("[::]", charClass(":", ":"))
-    parseTest("[:=:]", charClass(":", "=", ":"))
     parseTest("[[:]]", charClass(charClass(":")))
     parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":")))
 
@@ -522,6 +519,12 @@ extension RegexTests {
                         posixProp_m(.binary(.uppercase), inverted: true),
                         "c", "d"))
 
+    // Like ICU, we allow POSIX character properties outside of custom character
+    // classes. This also appears to be suggested by UTS#18.
+    // TODO: We should likely emit a warning.
+    parseTest("[:space:]", posixProp(.binary(.whitespace)))
+    parseTest("[:script=Greek:]", posixProp(.script(.greek)))
+
     parseTest("[[[:space:]]]", charClass(charClass(
       posixProp_m(.binary(.whitespace))
     )))
@@ -2252,6 +2255,8 @@ extension RegexTests {
     diagnosticTest("[[:a:", .expected("]"))
     diagnosticTest("[[:a[:]", .expected("]"))
 
+    diagnosticTest("[::]", .emptyProperty)
+    diagnosticTest("[:=:]", .emptyProperty)
     diagnosticTest("[[::]]", .emptyProperty)
     diagnosticTest("[[:=:]]", .emptyProperty)
 

From c63848655061d431b37715b1033f452a9195cdcf Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 21 Apr 2022 11:11:15 +0100
Subject: [PATCH 06/11] Fix character class trivia matching

Rather than matching and not advancing the input,
we should always return `nil` to never match
against the trivia.
---
 Sources/_StringProcessing/ConsumerInterface.swift |  5 +----
 Tests/RegexTests/MatchTests.swift                 | 10 ++++++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index 85e73801d..d1cc1d327 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -287,11 +287,8 @@ extension DSLTree.CustomCharacterClass.Member {
       }
     case .trivia:
       // TODO: Should probably strip this earlier...
-      return { _, bounds in
-        return bounds.lowerBound
-      }
+      return { _, _ in nil }
     }
-
   }
 }
 
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index f808fa0e3..9a8e89fe0 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -1208,6 +1208,16 @@ extension RegexTests {
       ("CaFe", true),
       ("EfAc", true))
   }
+
+  func testNonSemanticWhitespace() {
+    firstMatchTest(#" \t "#, input: " \t ", match: " \t ")
+    firstMatchTest(#"(?xx) \t "#, input: " \t ", match: "\t")
+
+    firstMatchTest(#"[ \t]+"#, input: " \t ", match: " \t ")
+    firstMatchTest(#"(?xx)[ \t]+"#, input: " \t ", match: "\t")
+    firstMatchTest(#"(?xx)[ \t]+"#, input: " \t\t ", match: "\t\t")
+    firstMatchTest(#"(?xx)[ \t]+"#, input: " \t \t", match: "\t")
+  }
   
   func testASCIIClasses() {
     // 'D' ASCII-only digits

From f053dc33adba44e56aa535385842d2701663dff8 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 21 Apr 2022 11:11:15 +0100
Subject: [PATCH 07/11] Fix trivia parsing for set operations and initial `]`
 cases

Previously we would check for an empty array of
members when deciding whether an initial `]` is
literal, or if the operands of a set operation are
invalid. Switch to checking whether we have any
semantic members instead.
---
 .../Regex/AST/CustomCharClass.swift           |  6 ++-
 Sources/_RegexParser/Regex/Parse/Parse.swift  | 29 ++++++-----
 .../_RegexParser/Regex/Printing/DumpAST.swift |  4 +-
 Tests/RegexTests/MatchTests.swift             |  3 ++
 Tests/RegexTests/ParseTests.swift             | 49 +++++++++++++++++--
 5 files changed, 73 insertions(+), 18 deletions(-)

diff --git a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift
index 614048f0a..19e72aef5 100644
--- a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift
+++ b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift
@@ -97,6 +97,10 @@ extension CustomCC.Member {
     if case .trivia = self { return true }
     return false
   }
+
+  public var isSemantic: Bool {
+    !isTrivia
+  }
 }
 
 extension AST.CustomCharacterClass {
@@ -104,7 +108,7 @@ extension AST.CustomCharacterClass {
   /// nested custom character classes.
   public var strippingTriviaShallow: Self {
     var copy = self
-    copy.members = copy.members.filter { !$0.isTrivia }
+    copy.members = copy.members.filter(\.isSemantic)
     return copy
   }
 }
diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift
index c3aa3500b..b24097b83 100644
--- a/Sources/_RegexParser/Regex/Parse/Parse.swift
+++ b/Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -438,16 +438,18 @@ extension Parser {
     defer { context.isInCustomCharacterClass = alreadyInCCC }
 
     typealias Member = CustomCC.Member
-    try source.expectNonEmpty()
-
     var members: Array<Member> = []
+    try parseCCCMembers(into: &members)
 
-    // We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty
-    // character classes, and assume an initial ']' is literal.
-    if let loc = source.tryEatWithLoc("]") {
-      members.append(.atom(.init(.char("]"), loc)))
+    // If we didn't parse any semantic members, we can eat a ']' character, as
+    // PCRE, Oniguruma, and ICU forbid empty character classes, and assume an
+    // initial ']' is literal.
+    if members.none(\.isSemantic) {
+      if let loc = source.tryEatWithLoc("]") {
+        members.append(.atom(.init(.char("]"), loc)))
+        try parseCCCMembers(into: &members)
+      }
     }
-    try parseCCCMembers(into: &members)
 
     // If we have a binary set operator, parse it and the next members. Note
     // that this means we left associate for a chain of operators.
@@ -458,8 +460,9 @@ extension Parser {
       var rhs: Array<Member> = []
       try parseCCCMembers(into: &rhs)
 
-      if members.isEmpty || rhs.isEmpty {
-        throw ParseError.expectedCustomCharacterClassMembers
+      if members.none(\.isSemantic) || rhs.none(\.isSemantic) {
+        throw Source.LocatedError(
+          ParseError.expectedCustomCharacterClassMembers, start.location)
       }
 
       // If we're done, bail early
@@ -472,8 +475,9 @@ extension Parser {
       // Otherwise it's just another member to accumulate
       members = [setOp]
     }
-    if members.isEmpty {
-      throw ParseError.expectedCustomCharacterClassMembers
+    if members.none(\.isSemantic) {
+      throw Source.LocatedError(
+        ParseError.expectedCustomCharacterClassMembers, start.location)
     }
     try source.expect("]")
     return CustomCC(start, members, loc(start.location.start))
@@ -484,7 +488,8 @@ extension Parser {
   ) throws {
     // Parse members until we see the end of the custom char class or an
     // operator.
-    while source.peek() != "]" && source.peekCCBinOp() == nil {
+    while !source.isEmpty && source.peek() != "]" &&
+          source.peekCCBinOp() == nil {
 
       // Nested custom character class.
       if let cccStart = try source.lexCustomCCStart() {
diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift
index 0e40ad2ce..986f3d86e 100644
--- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift
+++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift
@@ -312,7 +312,9 @@ extension AST.CustomCharacterClass.Member: _ASTPrintable {
     case .quote(let q): return "\(q)"
     case .trivia(let t): return "\(t)"
     case .setOperation(let lhs, let op, let rhs):
-      return "op \(lhs) \(op.value) \(rhs)"
+      // TODO: We should eventually have some way of filtering out trivia for
+      // tests, so that it can appear in regular dumps.
+      return "op \(lhs.filter(\.isSemantic)) \(op.value) \(rhs.filter(\.isSemantic))"
     }
   }
 }
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 9a8e89fe0..868426d41 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -1217,6 +1217,9 @@ extension RegexTests {
     firstMatchTest(#"(?xx)[ \t]+"#, input: " \t ", match: "\t")
     firstMatchTest(#"(?xx)[ \t]+"#, input: " \t\t ", match: "\t\t")
     firstMatchTest(#"(?xx)[ \t]+"#, input: " \t \t", match: "\t")
+
+    firstMatchTest("(?xx)[ a && ab ]+", input: " aaba ", match: "aa")
+    firstMatchTest("(?xx)[ ] a ]+", input: " a]]a ] ", match: "a]]a")
   }
   
   func testASCIIClasses() {
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index e9b422379..65dd6ed09 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -460,9 +460,25 @@ extension RegexTests {
 
     parseTest("[-]", charClass("-"))
 
-    // Empty character classes are forbidden, therefore this is a character
-    // class of literal ']'.
+    // Empty character classes are forbidden, therefore these are character
+    // classes containing literal ']'.
     parseTest("[]]", charClass("]"))
+    parseTest("[]a]", charClass("]", "a"))
+    parseTest(
+      "(?x)[ ]]", changeMatchingOptions(
+        matchingOptions(adding: .extended), isIsolated: true,
+        charClass("]"))
+    )
+    parseTest(
+      "(?x)[ ]  ]", changeMatchingOptions(
+        matchingOptions(adding: .extended), isIsolated: true,
+        charClass("]"))
+    )
+    parseTest(
+      "(?x)[ ] a ]", changeMatchingOptions(
+        matchingOptions(adding: .extended), isIsolated: true,
+        charClass("]", "a"))
+    )
 
     // These are metacharacters in certain contexts, but normal characters
     // otherwise.
@@ -613,6 +629,16 @@ extension RegexTests {
     parseTest(
       "~~*", concat("~", zeroOrMore(of: "~")))
 
+    parseTest(
+      "[ &&  ]", charClass(
+        .setOperation([" "], .init(faking: .intersection), [" ", " "]))
+    )
+    parseTest(
+      "(?x)[ a && b ]", changeMatchingOptions(
+        matchingOptions(adding: .extended), isIsolated: true, charClass(
+          .setOperation(["a"], .init(faking: .intersection), ["b"]))
+      ))
+
     // MARK: Quotes
 
     parseTest(
@@ -2205,6 +2231,9 @@ extension RegexTests {
     diagnosticTest(")))", .unbalancedEndOfGroup)
     diagnosticTest("())()", .unbalancedEndOfGroup)
 
+    diagnosticTest("[", .expectedCustomCharacterClassMembers)
+    diagnosticTest("[^", .expectedCustomCharacterClassMembers)
+
     diagnosticTest(#"\u{5"#, .expected("}"))
     diagnosticTest(#"\x{5"#, .expected("}"))
     diagnosticTest(#"\N{A"#, .expected("}"))
@@ -2245,9 +2274,21 @@ extension RegexTests {
     diagnosticTest("(?<a-b", .expected(">"))
     diagnosticTest("(?<a-b>", .expected(")"))
 
-    // The first ']' of a custom character class is literal, so this is missing
-    // the closing bracket.
+    // MARK: Character classes
+
+    diagnosticTest("[a", .expected("]"))
+
+    // The first ']' of a custom character class is literal, so these are
+    // missing the closing bracket.
     diagnosticTest("[]", .expected("]"))
+    diagnosticTest("(?x)[  ]", .expected("]"))
+
+    diagnosticTest("[&&]", .expectedCustomCharacterClassMembers)
+    diagnosticTest("[a&&]", .expectedCustomCharacterClassMembers)
+    diagnosticTest("[&&a]", .expectedCustomCharacterClassMembers)
+    diagnosticTest("(?x)[ && ]", .expectedCustomCharacterClassMembers)
+    diagnosticTest("(?x)[ &&a]", .expectedCustomCharacterClassMembers)
+    diagnosticTest("(?x)[a&& ]", .expectedCustomCharacterClassMembers)
 
     diagnosticTest("[:a", .expected("]"))
     diagnosticTest("[:a:", .expected("]"))

From e84c93d061d1b57c1917ad202ffad40e4e907548 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 21 Apr 2022 11:11:15 +0100
Subject: [PATCH 08/11] Throw error if we encounter stray opening '('

This should be unreachable, let's make sure of
that. Doing so requires generalizing the handling
of LocatedError a bit.
---
 .../Regex/Parse/LexicalAnalysis.swift         | 21 +++++++++++++------
 .../Regex/Parse/SourceLocation.swift          |  5 +++--
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index f70989c9f..daa629055 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -21,6 +21,16 @@ API convention:
 - eat() and tryEat() is still used by the parser as a character-by-character interface
 */
 
+extension Error {
+  func addingLocation(_ loc: Range<Source.Position>) -> Error {
+    // If we're already a LocatedError, don't change the location.
+    if self is _LocatedErrorProtocol {
+      return self
+    }
+    return Source.LocatedError<Self>(self, loc)
+  }
+}
+
 extension Source {
   // MARK: - recordLoc
 
@@ -51,12 +61,8 @@ extension Source {
     do {
       guard let result = try f(&self) else { return nil }
       return Located(result, start..<currentPosition)
-    } catch let e as Source.LocatedError<ParseError> {
-      throw e
-    } catch let e as ParseError {
-      throw LocatedError(e, start..<currentPosition)
-    } catch {
-      fatalError("FIXME: Let's not keep the boxed existential...")
+    } catch let e {
+      throw e.addingLocation(start..<currentPosition)
     }
   }
 
@@ -1841,6 +1847,9 @@ extension Source {
         }
         throw Unreachable("TODO: reason")
 
+      case "(" where !customCC:
+        throw Unreachable("Should have lexed a group or group-like atom")
+
       // (sometimes) special metacharacters
       case ".": return customCC ? .char(".") : .any
       case "^": return customCC ? .char("^") : .startOfLine
diff --git a/Sources/_RegexParser/Regex/Parse/SourceLocation.swift b/Sources/_RegexParser/Regex/Parse/SourceLocation.swift
index 0a1fd2136..96562d15c 100644
--- a/Sources/_RegexParser/Regex/Parse/SourceLocation.swift
+++ b/Sources/_RegexParser/Regex/Parse/SourceLocation.swift
@@ -56,9 +56,11 @@ extension Source {
   var currentPosition: Position { bounds.lowerBound }
 }
 
+protocol _LocatedErrorProtocol: Error {}
+
 extension Source {
   /// An error with source location info
-  public struct LocatedError<E: Error>: Error {
+  public struct LocatedError<E: Error>: Error, _LocatedErrorProtocol {
     public let error: E
     public let location: SourceLocation
 
@@ -70,7 +72,6 @@ extension Source {
       self.error = v
       self.location = Location(r)
     }
-
   }
 
   /// Located value: a value wrapped with a source range

From 1ea6f2043ef824b2314e95b1d22f3080094633b4 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 21 Apr 2022 11:11:16 +0100
Subject: [PATCH 09/11] Change matching option scoping behavior to match PCRE

Previously we would always parse a
"change matching option" sequence as a group, and
for the isolated syntax e.g `(?x)`, we would have
it implicitly wrap everything after in the same
group by having it do a recursive parse.

This matched the Oniguruma behavior for such
isolated groups, and was easy to implement, but
its behavior is quite unintuitive when it comes
to alternations, as e.g `a(?x)b|c` becomes
`a(?x:b|c)`, which may not be expected.

Instead, let's follow PCRE's behavior by having
such isolated cases affect the syntax options for
the remainder of the current group, including
across alternation branches. This is done by
lexing such cases as atoms (as they aren't really
group-like anymore), and having them change the
syntax options when we encounter them. The existing
scoping rules around groups take care of resetting
the options when we exit the scope.
---
 Sources/_RegexParser/Regex/AST/Atom.swift     |  10 +-
 Sources/_RegexParser/Regex/AST/Group.swift    |  19 +-
 .../Regex/Parse/LexicalAnalysis.swift         |  98 ++--
 Sources/_RegexParser/Regex/Parse/Parse.swift  |  64 ++-
 .../_RegexParser/Regex/Printing/DumpAST.swift |  34 +-
 Sources/_StringProcessing/ByteCodeGen.swift   |   8 +-
 .../_StringProcessing/ConsumerInterface.swift |   7 +-
 .../_StringProcessing/PrintAsPattern.swift    |   5 +
 .../Regex/ASTConversion.swift                 |   9 +-
 Sources/_StringProcessing/Regex/DSLTree.swift |   2 +
 Sources/_StringProcessing/Regex/Options.swift |   3 +-
 .../Utility/ASTBuilder.swift                  |   9 +-
 Tests/RegexTests/MatchTests.swift             |  11 +
 Tests/RegexTests/ParseTests.swift             | 471 +++++++++---------
 14 files changed, 404 insertions(+), 346 deletions(-)

diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift
index 6482c4042..9cc2e9a96 100644
--- a/Sources/_RegexParser/Regex/AST/Atom.swift
+++ b/Sources/_RegexParser/Regex/AST/Atom.swift
@@ -72,6 +72,9 @@ extension AST {
 
       // (*ACCEPT), (*FAIL), ...
       case backtrackingDirective(BacktrackingDirective)
+
+      // (?i), (?i-m), ...
+      case changeMatchingOptions(MatchingOptionSequence)
     }
   }
 }
@@ -91,6 +94,7 @@ extension AST.Atom {
     case .subpattern(let v):            return v
     case .callout(let v):               return v
     case .backtrackingDirective(let v): return v
+    case .changeMatchingOptions(let v): return v
     case .any:                          return nil
     case .startOfLine:                  return nil
     case .endOfLine:                    return nil
@@ -691,7 +695,7 @@ extension AST.Atom {
       return nil
 
     case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern,
-        .callout, .backtrackingDirective:
+        .callout, .backtrackingDirective, .changeMatchingOptions:
       return nil
     }
   }
@@ -731,7 +735,7 @@ extension AST.Atom {
 
     case .property, .escaped, .any, .startOfLine, .endOfLine,
         .backreference, .subpattern, .namedCharacter, .callout,
-        .backtrackingDirective:
+        .backtrackingDirective, .changeMatchingOptions:
       return nil
     }
   }
@@ -740,6 +744,8 @@ extension AST.Atom {
     switch kind {
     case .backtrackingDirective(let b):
       return b.isQuantifiable
+    case .changeMatchingOptions:
+      return false
     // TODO: Are callouts quantifiable?
     default:
       return true
diff --git a/Sources/_RegexParser/Regex/AST/Group.swift b/Sources/_RegexParser/Regex/AST/Group.swift
index 81e0931ad..a8c4f8b0f 100644
--- a/Sources/_RegexParser/Regex/AST/Group.swift
+++ b/Sources/_RegexParser/Regex/AST/Group.swift
@@ -68,9 +68,7 @@ extension AST {
       case atomicScriptRun
 
       // (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:)
-      // Isolated options are written as e.g (?i), and implicitly form a group
-      // containing all the following elements of the current group.
-      case changeMatchingOptions(MatchingOptionSequence, isIsolated: Bool)
+      case changeMatchingOptions(MatchingOptionSequence)
 
       // NOTE: Comments appear to be groups, but are not parsed
       // the same. They parse more like quotes, so are not
@@ -87,21 +85,6 @@ extension AST.Group.Kind {
     }
   }
 
-  /// Whether this is a group with an implicit scope, e.g isolated matching
-  /// options implicitly become parent groups for the rest of the elements in
-  /// the current group:
-  ///
-  ///      (a(?i)bc)de -> (a(?i:bc))de
-  ///
-  public var hasImplicitScope: Bool {
-    switch self {
-    case .changeMatchingOptions(_, let isIsolated):
-      return isIsolated
-    default:
-      return false
-    }
-  }
-
   /// If this is a named group, its name, `nil` otherwise.
   public var name: String? {
     switch self {
diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index daa629055..e8b7e9e18 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -712,6 +712,22 @@ extension Source {
     return .init(caretLoc: nil, adding: adding, minusLoc: nil, removing: [])
   }
 
+  /// A matching option changing atom.
+  ///
+  ///     '(?' MatchingOptionSeq ')'
+  ///
+  mutating func lexChangeMatchingOptionAtom(
+    context: ParsingContext
+  ) throws -> AST.MatchingOptionSequence? {
+    try tryEating { src in
+      guard src.tryEat(sequence: "(?"),
+            let seq = try src.lexMatchingOptionSequence(context: context)
+      else { return nil }
+      try src.expect(")")
+      return seq
+    }
+  }
+
   /// Try to consume explicitly spelled-out PCRE2 group syntax.
   mutating func lexExplicitPCRE2GroupStart() -> AST.Group.Kind? {
     tryEating { src in
@@ -852,7 +868,7 @@ extension Source {
         // otherwise a matching option specifier. Conversely, '(?P' can be the
         // start of a matching option sequence, or a reference if it is followed
         // by '=' or '<'.
-        guard !src.shouldLexGroupLikeAtom() else { return nil }
+        guard !src.shouldLexGroupLikeAtom(context: context) else { return nil }
 
         guard src.tryEat("(") else { return nil }
         if src.tryEat("?") {
@@ -877,22 +893,13 @@ extension Source {
 
           // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:).
           if let seq = try src.lexMatchingOptionSequence(context: context) {
-            if src.tryEat(":") {
-              return .changeMatchingOptions(seq, isIsolated: false)
-            }
-            // If this isn't start of an explicit group, we should have an
-            // implicit group that covers the remaining elements of the current
-            // group.
-            // TODO: This implicit scoping behavior matches Oniguruma, but PCRE
-            // also does it across alternations, which will require additional
-            // handling.
-            guard src.tryEat(")") else {
+            guard src.tryEat(":") else {
               if let next = src.peek() {
                 throw ParseError.invalidMatchingOption(next)
               }
               throw ParseError.expected(")")
             }
-            return .changeMatchingOptions(seq, isIsolated: true)
+            return .changeMatchingOptions(seq)
           }
 
           guard let next = src.peek() else {
@@ -1041,18 +1048,8 @@ extension Source {
     context: ParsingContext
   ) throws -> Located<AST.Group.Kind>? {
     try tryEating { src in
-      guard src.tryEat(sequence: "(?"),
-            let group = try src.lexGroupStart(context: context)
-      else { return nil }
-
-      // Implicitly scoped groups are not supported here.
-      guard !group.value.hasImplicitScope else {
-        throw LocatedError(
-          ParseError.unsupportedCondition("implicitly scoped group"),
-          group.location
-        )
-      }
-      return group
+      guard src.tryEat(sequence: "(?") else { return nil }
+      return try src.lexGroupStart(context: context)
     }
   }
 
@@ -1239,17 +1236,19 @@ extension Source {
     allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false
   ) throws -> AST.Reference? {
     let kind = try recordLoc { src -> AST.Reference.Kind? in
-      // Note this logic should match canLexNumberedReference.
-      if src.tryEat("+") {
-        return .relative(try src.expectNumber().value)
-      }
-      if src.tryEat("-") {
-        return .relative(try -src.expectNumber().value)
-      }
-      if let num = try src.lexNumber() {
-        return .absolute(num.value)
+      try src.tryEating { src in
+        // Note this logic should match canLexNumberedReference.
+        if src.tryEat("+"), let num = try src.lexNumber() {
+          return .relative(num.value)
+        }
+        if src.tryEat("-"), let num = try src.lexNumber() {
+          return .relative(-num.value)
+        }
+        if let num = try src.lexNumber() {
+          return .absolute(num.value)
+        }
+        return nil
       }
-      return nil
     }
     guard let kind = kind else { return nil }
     guard allowWholePatternRef || kind.value != .recurseWholePattern else {
@@ -1478,8 +1477,21 @@ extension Source {
     return src.canLexNumberedReference()
   }
 
+  private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool {
+    var src = self
+
+    // See if we can lex a matching option sequence that terminates in ')'. Such
+    // a sequence is an atom. If an error is thrown, there are invalid elements
+    // of the matching option sequence. In such a case, we can lex as a group
+    // and diagnose the invalid group kind.
+    guard (try? src.lexMatchingOptionSequence(context: context)) != nil else {
+      return false
+    }
+    return src.tryEat(")")
+  }
+
   /// Whether a group specifier should be lexed as an atom instead of a group.
-  private func shouldLexGroupLikeAtom() -> Bool {
+  private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool {
     var src = self
     guard src.tryEat("(") else { return false }
 
@@ -1493,6 +1505,9 @@ extension Source {
       // The start of an Oniguruma 'of-contents' callout.
       if src.tryEat("{") { return true }
 
+      // A matching option atom (?x), (?i), ...
+      if src.canLexMatchingOptionsAsAtom(context: context) { return true }
+
       return false
     }
     // The start of a backreference directive or Oniguruma named callout.
@@ -1753,13 +1768,20 @@ extension Source {
   ///
   ///     GroupLikeAtom -> GroupLikeReference | Callout | BacktrackingDirective
   ///
-  mutating func expectGroupLikeAtom() throws -> AST.Atom.Kind {
+  mutating func expectGroupLikeAtom(
+    context: ParsingContext
+  ) throws -> AST.Atom.Kind {
     try recordLoc { src in
       // References that look like groups, e.g (?R), (?1), ...
       if let ref = try src.lexGroupLikeReference() {
         return ref.value
       }
 
+      // Change matching options atom (?i), (?x-i), ...
+      if let seq = try src.lexChangeMatchingOptionAtom(context: context) {
+        return .changeMatchingOptions(seq)
+      }
+
       // (*ACCEPT), (*FAIL), (*MARK), ...
       if let b = try src.lexBacktrackingDirective() {
         return .backtrackingDirective(b)
@@ -1828,8 +1850,8 @@ extension Source {
 
       // If we have group syntax that was skipped over in lexGroupStart, we
       // need to handle it as an atom, or throw an error.
-      if !customCC && src.shouldLexGroupLikeAtom() {
-        return try src.expectGroupLikeAtom()
+      if !customCC && src.shouldLexGroupLikeAtom(context: context) {
+        return try src.expectGroupLikeAtom(context: context)
       }
 
       // A quantifier here is invalid.
diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift
index b24097b83..975012546 100644
--- a/Sources/_RegexParser/Regex/Parse/Parse.swift
+++ b/Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -282,42 +282,53 @@ extension Parser {
       loc(start)))
   }
 
+  /// Apply the syntax options of a given matching option sequence to the
+  /// current set of options.
+  private mutating func applySyntaxOptions(
+    of opts: AST.MatchingOptionSequence
+  ) {
+    // We skip this for multi-line, as extended syntax is always enabled there.
+    if context.syntax.contains(.multilineExtendedSyntax) { return }
+
+    // Check if we're introducing or removing extended syntax.
+    // TODO: PCRE differentiates between (?x) and (?xx) where only the latter
+    // handles non-semantic whitespace in a custom character class. Other
+    // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
+    // treat (?x) and (?xx) as the same option here. If we ever get a strict
+    // PCRE mode, we will need to change this to handle that.
+    if opts.resetsCurrentOptions {
+      context.syntax.remove(.extendedSyntax)
+    }
+    if opts.adding.contains(where: \.isAnyExtended) {
+      context.syntax.insert(.extendedSyntax)
+    }
+    if opts.removing.contains(where: \.isAnyExtended) {
+      context.syntax.remove(.extendedSyntax)
+    }
+  }
+
+  /// Apply the syntax options of a matching option changing group to the
+  /// current set of options.
+  private mutating func applySyntaxOptions(of group: AST.Group.Kind) {
+    if case .changeMatchingOptions(let seq) = group {
+      applySyntaxOptions(of: seq)
+    }
+  }
+
   /// Perform a recursive parse for the body of a group.
   mutating func parseGroupBody(
     start: Source.Position, _ kind: AST.Located<AST.Group.Kind>
   ) throws -> AST.Group {
     context.recordGroup(kind.value)
 
-    // Check if we're introducing or removing extended syntax. We skip this for
-    // multi-line, as extended syntax is always enabled there.
-    // TODO: PCRE differentiates between (?x) and (?xx) where only the latter
-    // handles non-semantic whitespace in a custom character class. Other
-    // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
-    // treat (?x) and (?xx) as the same option here. If we ever get a strict
-    // PCRE mode, we will need to change this to handle that.
     let currentSyntax = context.syntax
-    if !context.syntax.contains(.multilineExtendedSyntax) {
-      if case .changeMatchingOptions(let c, isIsolated: _) = kind.value {
-        if c.resetsCurrentOptions {
-          context.syntax.remove(.extendedSyntax)
-        }
-        if c.adding.contains(where: \.isAnyExtended) {
-          context.syntax.insert(.extendedSyntax)
-        }
-        if c.removing.contains(where: \.isAnyExtended) {
-          context.syntax.remove(.extendedSyntax)
-        }
-      }
-    }
+    applySyntaxOptions(of: kind.value)
     defer {
       context.syntax = currentSyntax
     }
 
     let child = try parseNode()
-    // An implicit scoped group has already consumed its closing paren.
-    if !kind.value.hasImplicitScope {
-      try source.expect(")")
-    }
+    try source.expect(")")
     return .init(kind, child, loc(start))
   }
 
@@ -409,6 +420,11 @@ extension Parser {
     }
 
     if let atom = try source.lexAtom(context: context) {
+      // If we have a change matching options atom, apply the syntax options. We
+      // already take care of scoping syntax options within a group.
+      if case .changeMatchingOptions(let opts) = atom.kind {
+        applySyntaxOptions(of: opts)
+      }
       // TODO: track source locations
       return .atom(atom)
     }
diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift
index 986f3d86e..8565b14e9 100644
--- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift
+++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift
@@ -156,6 +156,9 @@ extension AST.Atom {
 
     case .backtrackingDirective(let d): return "\(d)"
 
+    case .changeMatchingOptions(let opts):
+      return "changeMatchingOptions<\(opts)>"
+
     case .char, .scalar:
       fatalError("Unreachable")
     }
@@ -225,22 +228,21 @@ extension AST.Reference: _ASTPrintable {
 extension AST.Group.Kind: _ASTPrintable {
   public var _dumpBase: String {
     switch self {
-    case .capture:                return "capture"
-    case .namedCapture(let s):    return "capture<\(s.value)>"
-    case .balancedCapture(let b): return "balanced capture \(b)"
-    case .nonCapture:             return "nonCapture"
-    case .nonCaptureReset:        return "nonCaptureReset"
-    case .atomicNonCapturing:     return "atomicNonCapturing"
-    case .lookahead:              return "lookahead"
-    case .negativeLookahead:      return "negativeLookahead"
-    case .nonAtomicLookahead:     return "nonAtomicLookahead"
-    case .lookbehind:             return "lookbehind"
-    case .negativeLookbehind:     return "negativeLookbehind"
-    case .nonAtomicLookbehind:    return "nonAtomicLookbehind"
-    case .scriptRun:              return "scriptRun"
-    case .atomicScriptRun:        return "atomicScriptRun"
-    case .changeMatchingOptions(let seq, let isIsolated):
-      return "changeMatchingOptions<\(seq), \(isIsolated)>"
+    case .capture:                        return "capture"
+    case .namedCapture(let s):            return "capture<\(s.value)>"
+    case .balancedCapture(let b):         return "balanced capture \(b)"
+    case .nonCapture:                     return "nonCapture"
+    case .nonCaptureReset:                return "nonCaptureReset"
+    case .atomicNonCapturing:             return "atomicNonCapturing"
+    case .lookahead:                      return "lookahead"
+    case .negativeLookahead:              return "negativeLookahead"
+    case .nonAtomicLookahead:             return "nonAtomicLookahead"
+    case .lookbehind:                     return "lookbehind"
+    case .negativeLookbehind:             return "negativeLookbehind"
+    case .nonAtomicLookbehind:            return "nonAtomicLookbehind"
+    case .scriptRun:                      return "scriptRun"
+    case .atomicScriptRun:                return "atomicScriptRun"
+    case .changeMatchingOptions(let seq): return "changeMatchingOptions<\(seq)>"
     }
   }
 }
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
index 3cc2d4039..b6f9b4732 100644
--- a/Sources/_StringProcessing/ByteCodeGen.swift
+++ b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -34,6 +34,9 @@ extension Compiler.ByteCodeGen {
     case let .symbolicReference(id):
       builder.buildUnresolvedReference(id: id)
 
+    case let .changeMatchingOptions(optionSequence):
+      options.apply(optionSequence)
+
     case let .unconverted(astAtom):
       if let consumer = try astAtom.generateConsumer(options) {
         builder.buildConsume(by: consumer)
@@ -349,7 +352,7 @@ extension Compiler.ByteCodeGen {
     case .capture, .namedCapture, .balancedCapture:
       throw Unreachable("These should produce a capture node")
 
-    case .changeMatchingOptions(let optionSequence, _):
+    case .changeMatchingOptions(let optionSequence):
       options.apply(optionSequence)
       try emitNode(child)
 
@@ -585,6 +588,9 @@ extension Compiler.ByteCodeGen {
       }
 
     case let .capture(_, refId, child):
+      options.beginScope()
+      defer { options.endScope() }
+
       let cap = builder.makeCapture(id: refId)
       switch child {
       case let .matcher(_, m):
diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index d1cc1d327..ecb7d1356 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -100,6 +100,10 @@ extension DSLTree.Atom {
       // TODO: Should we handle?
       return nil
 
+    case .changeMatchingOptions:
+      // TODO: Should we handle?
+      return nil
+
     case let .unconverted(a):
       return try a.generateConsumer(opts)
     }
@@ -178,7 +182,8 @@ extension AST.Atom {
       return nil
 
     case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl,
-      .backreference, .subpattern, .callout, .backtrackingDirective:
+        .backreference, .subpattern, .callout, .backtrackingDirective,
+        .changeMatchingOptions:
       // FIXME: implement
       return nil
     }
diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift
index 3afc19836..4d135898b 100644
--- a/Sources/_StringProcessing/PrintAsPattern.swift
+++ b/Sources/_StringProcessing/PrintAsPattern.swift
@@ -137,6 +137,8 @@ extension PrettyPrinter {
         print("/* TOOD: backreferences */")
       case .symbolicReference:
         print("/* TOOD: symbolic references */")
+      case .changeMatchingOptions:
+        print("/* TODO: change matching options */")
       }
 
     case .trivia:
@@ -320,6 +322,9 @@ extension AST.Atom {
 
     case .backtrackingDirective:
       return " /* TODO: backtracking directive */"
+
+    case .changeMatchingOptions:
+      return "/* TODO: change matching options */"
     }
   }
 }
diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift
index 86041f528..8acbd3b1b 100644
--- a/Sources/_StringProcessing/Regex/ASTConversion.swift
+++ b/Sources/_StringProcessing/Regex/ASTConversion.swift
@@ -206,10 +206,11 @@ extension AST.Atom {
     }
 
     switch self.kind {
-    case let .char(c):          return .char(c)
-    case let .scalar(s):        return .scalar(s)
-    case .any:                  return .any
-    case let .backreference(r): return .backreference(r)
+    case let .char(c):                    return .char(c)
+    case let .scalar(s):                  return .scalar(s)
+    case .any:                            return .any
+    case let .backreference(r):           return .backreference(r)
+    case let .changeMatchingOptions(seq): return .changeMatchingOptions(seq)
 
     case .escaped(let c) where c.scalarValue != nil:
       return .scalar(c.scalarValue!)
diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift
index 7b2ecd515..51f5ea36f 100644
--- a/Sources/_StringProcessing/Regex/DSLTree.swift
+++ b/Sources/_StringProcessing/Regex/DSLTree.swift
@@ -166,6 +166,8 @@ extension DSLTree {
     case backreference(AST.Reference)
     case symbolicReference(ReferenceID)
 
+    case changeMatchingOptions(AST.MatchingOptionSequence)
+
     case unconverted(AST.Atom)
   }
 }
diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift
index 6f38956e7..623589b54 100644
--- a/Sources/_StringProcessing/Regex/Options.swift
+++ b/Sources/_StringProcessing/Regex/Options.swift
@@ -195,7 +195,6 @@ extension RegexComponent {
       ? AST.MatchingOptionSequence(adding: [.init(option, location: .fake)])
       : AST.MatchingOptionSequence(removing: [.init(option, location: .fake)])
     return Regex(node: .nonCapturingGroup(
-      .changeMatchingOptions(sequence, isIsolated: false),
-      regex.root))
+      .changeMatchingOptions(sequence), regex.root))
   }
 }
diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift
index 961d0e23f..51d4f8bfc 100644
--- a/Sources/_StringProcessing/Utility/ASTBuilder.swift
+++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift
@@ -119,9 +119,14 @@ func atomicScriptRun(_ child: AST.Node) -> AST.Node {
   group(.atomicScriptRun, child)
 }
 func changeMatchingOptions(
-  _ seq: AST.MatchingOptionSequence, isIsolated: Bool, _ child: AST.Node
+  _ seq: AST.MatchingOptionSequence, _ child: AST.Node
 ) -> AST.Node {
-  group(.changeMatchingOptions(seq, isIsolated: isIsolated), child)
+  group(.changeMatchingOptions(seq), child)
+}
+func changeMatchingOptions(
+  _ seq: AST.MatchingOptionSequence
+) -> AST.Node {
+  atom(.changeMatchingOptions(seq))
 }
 
 func matchingOptions(
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 868426d41..e00c77f56 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -1319,6 +1319,17 @@ extension RegexTests {
     firstMatchTest(#"(((?s)a)).b"#, input: "a\nb", match: nil)
     firstMatchTest(#"(?s)(((?-s)a)).b"#, input: "a\nb", match: "a\nb")
     firstMatchTest(#"(?s)((?-s)((?i)a)).b"#, input: "a\nb", match: "a\nb")
+
+    // Matching option changing persists across alternations.
+    firstMatchTest(#"a(?s)b|c|.d"#, input: "abc", match: "ab")
+    firstMatchTest(#"a(?s)b|c|.d"#, input: "c", match: "c")
+    firstMatchTest(#"a(?s)b|c|.d"#, input: "a\nd", match: "\nd")
+    firstMatchTest(#"a(?s)(?^)b|c|.d"#, input: "a\nd", match: nil)
+    firstMatchTest(#"a(?s)b|.c(?-s)|.d"#, input: "a\nd", match: nil)
+    firstMatchTest(#"a(?s)b|.c(?-s)|.d"#, input: "a\nc", match: "\nc")
+    firstMatchTest(#"a(?s)b|c(?-s)|(?^s).d"#, input: "a\nd", match: "\nd")
+    firstMatchTest(#"a(?:(?s).b)|.c|.d"#, input: "a\nb", match: "a\nb")
+    firstMatchTest(#"a(?:(?s).b)|.c"#, input: "a\nc", match: nil)
   }
   
   func testOptionMethods() throws {
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index 65dd6ed09..bdae250ba 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -464,21 +464,18 @@ extension RegexTests {
     // classes containing literal ']'.
     parseTest("[]]", charClass("]"))
     parseTest("[]a]", charClass("]", "a"))
-    parseTest(
-      "(?x)[ ]]", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        charClass("]"))
-    )
-    parseTest(
-      "(?x)[ ]  ]", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        charClass("]"))
-    )
-    parseTest(
-      "(?x)[ ] a ]", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        charClass("]", "a"))
-    )
+    parseTest("(?x)[ ]]", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      charClass("]")
+    ))
+    parseTest("(?x)[ ]  ]", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      charClass("]")
+    ))
+    parseTest("(?x)[ ] a ]", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      charClass("]", "a")
+    ))
 
     // These are metacharacters in certain contexts, but normal characters
     // otherwise.
@@ -630,14 +627,13 @@ extension RegexTests {
       "~~*", concat("~", zeroOrMore(of: "~")))
 
     parseTest(
-      "[ &&  ]", charClass(
-        .setOperation([" "], .init(faking: .intersection), [" ", " "]))
+      "[ &&  ]",
+      charClass(.setOperation([" "], .init(faking: .intersection), [" ", " "]))
     )
-    parseTest(
-      "(?x)[ a && b ]", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true, charClass(
-          .setOperation(["a"], .init(faking: .intersection), ["b"]))
-      ))
+    parseTest("(?x)[ a && b ]", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      charClass(.setOperation(["a"], .init(faking: .intersection), ["b"]))
+    ))
 
     // MARK: Quotes
 
@@ -832,81 +828,67 @@ extension RegexTests {
 
     // Matching option changing groups.
     parseTest("(?-)", changeMatchingOptions(
-      matchingOptions(), isIsolated: true, empty())
-    )
+      matchingOptions()
+    ))
     parseTest("(?i)", changeMatchingOptions(
-      matchingOptions(adding: .caseInsensitive),
-      isIsolated: true, empty())
-    )
+      matchingOptions(adding: .caseInsensitive)
+    ))
     parseTest("(?m)", changeMatchingOptions(
-      matchingOptions(adding: .multiline),
-      isIsolated: true, empty())
-    )
+      matchingOptions(adding: .multiline)
+    ))
     parseTest("(?x)", changeMatchingOptions(
-      matchingOptions(adding: .extended),
-      isIsolated: true, empty())
-    )
+      matchingOptions(adding: .extended)
+    ))
     parseTest("(?xx)", changeMatchingOptions(
-      matchingOptions(adding: .extraExtended),
-      isIsolated: true, empty())
-    )
+      matchingOptions(adding: .extraExtended)
+    ))
     parseTest("(?xxx)", changeMatchingOptions(
-      matchingOptions(adding: .extraExtended, .extended),
-      isIsolated: true, empty())
-    )
+      matchingOptions(adding: .extraExtended, .extended)
+    ))
     parseTest("(?P)", changeMatchingOptions(
-      matchingOptions(adding: .asciiOnlyPOSIXProps), isIsolated: true, empty())
-    )
+      matchingOptions(adding: .asciiOnlyPOSIXProps)
+    ))
     parseTest("(?-i)", changeMatchingOptions(
-      matchingOptions(removing: .caseInsensitive),
-      isIsolated: true, empty())
-    )
+      matchingOptions(removing: .caseInsensitive)
+    ))
     parseTest("(?i-s)", changeMatchingOptions(
-      matchingOptions(adding: .caseInsensitive, removing: .singleLine),
-      isIsolated: true, empty())
-    )
+      matchingOptions(adding: .caseInsensitive, removing: .singleLine)
+    ))
     parseTest("(?i-is)", changeMatchingOptions(
       matchingOptions(adding: .caseInsensitive,
-                      removing: .caseInsensitive, .singleLine),
-      isIsolated: true, empty())
-    )
+                      removing: .caseInsensitive, .singleLine)
+    ))
 
     parseTest("(?:)", nonCapture(empty()))
     parseTest("(?-:)", changeMatchingOptions(
-      matchingOptions(), isIsolated: false, empty())
-    )
+      matchingOptions(), empty()
+    ))
     parseTest("(?i:)", changeMatchingOptions(
-      matchingOptions(adding: .caseInsensitive),
-      isIsolated: false, empty())
-    )
+      matchingOptions(adding: .caseInsensitive), empty()
+    ))
     parseTest("(?-i:)", changeMatchingOptions(
-      matchingOptions(removing: .caseInsensitive),
-      isIsolated: false, empty())
-    )
+      matchingOptions(removing: .caseInsensitive), empty()
+    ))
     parseTest("(?P:)", changeMatchingOptions(
-      matchingOptions(adding: .asciiOnlyPOSIXProps), isIsolated: false, empty())
-    )
+      matchingOptions(adding: .asciiOnlyPOSIXProps), empty()
+    ))
 
     parseTest("(?^)", changeMatchingOptions(
-      unsetMatchingOptions(),
-      isIsolated: true, empty())
-    )
+      unsetMatchingOptions()
+    ))
     parseTest("(?^:)", changeMatchingOptions(
-      unsetMatchingOptions(),
-      isIsolated: false, empty())
-    )
+      unsetMatchingOptions(), empty()
+    ))
     parseTest("(?^ims:)", changeMatchingOptions(
       unsetMatchingOptions(adding: .caseInsensitive, .multiline, .singleLine),
-      isIsolated: false, empty())
-    )
+      empty()
+    ))
     parseTest("(?^J:)", changeMatchingOptions(
-      unsetMatchingOptions(adding: .allowDuplicateGroupNames),
-      isIsolated: false, empty())
-    )
+      unsetMatchingOptions(adding: .allowDuplicateGroupNames), empty()
+    ))
     parseTest("(?^y{w}:)", changeMatchingOptions(
-      unsetMatchingOptions(adding: .textSegmentWordMode),
-      isIsolated: false, empty())
-    )
+      unsetMatchingOptions(adding: .textSegmentWordMode), empty()
+    ))
 
     let allOptions: [AST.MatchingOption.Kind] = [
       .caseInsensitive, .allowDuplicateGroupNames, .multiline, .noAutoCapture,
@@ -917,50 +899,64 @@ extension RegexTests {
       .byteSemantics
     ]
     parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW)", changeMatchingOptions(
-      matchingOptions(
-        adding: allOptions,
-        removing: allOptions.dropLast(5)
-      ),
-      isIsolated: true, empty())
-    )
+      matchingOptions(adding: allOptions, removing: allOptions.dropLast(5))
+    ))
     parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW:)", changeMatchingOptions(
-      matchingOptions(
-        adding: allOptions,
-        removing: allOptions.dropLast(5)
-      ),
-      isIsolated: false, empty())
-    )
+      matchingOptions(adding: allOptions, removing: allOptions.dropLast(5)), empty()
+    ))
 
     parseTest(
-      "a(b(?i)c)d", concat("a", capture(concat("b", changeMatchingOptions(
-        matchingOptions(adding: .caseInsensitive),
-        isIsolated: true, "c"))), "d"),
+      "a(b(?i)c)d", concat(
+        "a",
+        capture(concat(
+          "b",
+          changeMatchingOptions(matchingOptions(adding: .caseInsensitive)),
+          "c"
+        )),
+        "d"
+      ),
       captures: .atom()
     )
     parseTest(
-      "(a(?i)b(c)d)", capture(concat("a", changeMatchingOptions(
-        matchingOptions(adding: .caseInsensitive),
-        isIsolated: true, concat("b", capture("c"), "d")))),
+      "(a(?i)b(c)d)", capture(concat(
+        "a",
+        changeMatchingOptions(matchingOptions(adding: .caseInsensitive)),
+        "b",
+        capture("c"),
+        "d"
+      )),
       captures: .tuple(.atom(), .atom())
     )
     parseTest(
-      "(a(?i)b(?#hello)c)", capture(concat("a", changeMatchingOptions(
-        matchingOptions(adding: .caseInsensitive),
-        isIsolated: true, concat("b", "c")))),
+      "(a(?i)b(?#hello)c)", capture(concat(
+        "a",
+        changeMatchingOptions(matchingOptions(adding: .caseInsensitive)),
+        "b",
+        "c"
+      )),
       captures: .atom()
     )
 
-    // TODO: This is Oniguruma's behavior, but PCRE treats it as:
-    //     ab(?i:c)|(?i:def)|(?i:gh)
-    // instead. We ought to have a mode to emulate that.
-    parseTest("ab(?i)c|def|gh", concat("a", "b", changeMatchingOptions(
-      matchingOptions(adding: .caseInsensitive), isIsolated: true,
-      alt("c", concat("d", "e", "f"), concat("g", "h")))))
+    parseTest("ab(?i)c|def|gh", alt(
+      concat(
+        "a",
+        "b",
+        changeMatchingOptions(matchingOptions(adding: .caseInsensitive)),
+        "c"
+      ),
+      concat("d", "e", "f"),
+      concat("g", "h")
+    ))
 
-    parseTest("(a|b(?i)c|d)", capture(alt("a", concat("b", changeMatchingOptions(
-      matchingOptions(adding: .caseInsensitive), isIsolated: true,
-      alt("c", "d"))))),
-      captures: .atom())
+    parseTest("(a|b(?i)c|d)", capture(alt(
+      "a",
+      concat(
+        "b",
+        changeMatchingOptions(matchingOptions(adding: .caseInsensitive)),
+        "c"
+      ),
+      "d"
+    )), captures: .atom())
 
     // MARK: References
 
@@ -1479,149 +1475,149 @@ extension RegexTests {
     parseTest("[(?#abc)]", charClass("(", "?", "#", "a", "b", "c", ")"))
     parseTest("# abc", concat("#", " ", "a", "b", "c"))
 
-    parseTest("(?x) # hello", changeMatchingOptions(matchingOptions(
-      adding: .extended), isIsolated: true, empty()))
-    parseTest("(?xx) # hello", changeMatchingOptions(matchingOptions(
-      adding: .extraExtended), isIsolated: true, empty()))
-    parseTest("(?x) \\# abc", changeMatchingOptions(matchingOptions(
-      adding: .extended), isIsolated: true, concat("#", "a", "b", "c")))
-    parseTest("(?xx) \\ ", changeMatchingOptions(matchingOptions(
-      adding: .extraExtended), isIsolated: true, concat(" ")))
+    // MARK: Matching option changing
 
     parseTest(
-      "(?x) a (?^) b",
-      changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat(
-          "a",
-          changeMatchingOptions(
-            unsetMatchingOptions(), isIsolated: true, concat(" ", "b"))
-        )
-      )
+      "(?x) # hello",
+      changeMatchingOptions(matchingOptions(adding: .extended))
     )
-
-    // End of line comments aren't applicable in custom char classes.
-    // TODO: ICU supports this.
     parseTest(
-      "(?x)[ # abc]", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        charClass("#", "a", "b", "c"))
+      "(?xx) # hello",
+      changeMatchingOptions(matchingOptions(adding: .extraExtended))
     )
+    parseTest("(?x) \\# abc", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "#", "a", "b", "c"
+    ))
+    parseTest("(?xx) \\ ", concat(
+      changeMatchingOptions(matchingOptions(adding: .extraExtended)), " "
+    ))
 
     parseTest(
-      "(?x)a b c[d e f]", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c", charClass("d", "e", "f")))
-    )
-    parseTest(
-      "(?xx)a b c[d e f]", changeMatchingOptions(
-        matchingOptions(adding: .extraExtended), isIsolated: true,
-        concat("a", "b", "c", charClass("d", "e", "f")))
-    )
-    parseTest(
-      "(?x)a b c(?-x)d e f", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c",
-               changeMatchingOptions(matchingOptions(removing: .extended),
-                                     isIsolated: true, concat("d", " ", "e", " ", "f"))))
-    )
-    parseTest(
-      "(?x)a b c(?-xx)d e f", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c",
-               changeMatchingOptions(matchingOptions(removing: .extraExtended),
-                                     isIsolated: true, concat("d", " ", "e", " ", "f"))))
-    )
-    parseTest(
-      "(?xx)a b c(?-x)d e f", changeMatchingOptions(
-        matchingOptions(adding: .extraExtended), isIsolated: true,
-        concat("a", "b", "c",
-               changeMatchingOptions(matchingOptions(removing: .extended),
-                                     isIsolated: true, concat("d", " ", "e", " ", "f"))))
-    )
-    parseTest(
-      "(?x)a b c(?^i)d e f", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c",
-               changeMatchingOptions(unsetMatchingOptions(adding: .caseInsensitive),
-                                     isIsolated: true, concat("d", " ", "e", " ", "f"))))
-    )
-    parseTest(
-      "(?x)a b c(?^x)d e f", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c",
-               changeMatchingOptions(unsetMatchingOptions(adding: .extended),
-                                     isIsolated: true, concat("d", "e", "f"))))
-    )
-    parseTest(
-      "(?:(?x)a b c)d e f", concat(nonCapture(changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c"))), "d", " ", "e", " ", "f")
-    )
-    parseTest(
-      "(?x:a b c)# hi", concat(changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: false,
-        concat("a", "b", "c")), "#", " ", "h", "i")
+      "(?x) a (?^) b", concat(
+        changeMatchingOptions(matchingOptions(adding: .extended)),
+        "a",
+        changeMatchingOptions(unsetMatchingOptions()),
+        " ", "b"
+      )
     )
 
-    parseTest(
-      "(?x-x)a b c", changeMatchingOptions(
-        matchingOptions(adding: .extended, removing: .extended), isIsolated: true,
-        concat("a", " ", "b", " ", "c"))
-    )
-    parseTest(
-      "(?xxx-x)a b c", changeMatchingOptions(
-        matchingOptions(adding: .extraExtended, .extended, removing: .extended), isIsolated: true,
-        concat("a", " ", "b", " ", "c"))
-    )
-    parseTest(
-      "(?xx-i)a b c", changeMatchingOptions(
-        matchingOptions(adding: .extraExtended, removing: .caseInsensitive), isIsolated: true,
-        concat("a", "b", "c"))
+    // End of line comments aren't applicable in custom char classes.
+    // TODO: ICU supports this.
+    parseTest("(?x)[ # abc]", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      charClass("#", "a", "b", "c")
+    ))
+
+    parseTest("(?x)a b c[d e f]", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "b", "c", charClass("d", "e", "f")
+    ))
+    parseTest("(?xx)a b c[d e f]", concat(
+      changeMatchingOptions(matchingOptions(adding: .extraExtended)),
+      "a", "b", "c", charClass("d", "e", "f")
+    ))
+    parseTest("(?x)a b c(?-x)d e f", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "b", "c",
+      changeMatchingOptions(matchingOptions(removing: .extended)),
+      "d", " ", "e", " ", "f"
+    ))
+    parseTest("(?x)a b c(?-xx)d e f", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "b", "c",
+      changeMatchingOptions(matchingOptions(removing: .extraExtended)),
+      "d", " ", "e", " ", "f"
+    ))
+    parseTest("(?xx)a b c(?-x)d e f", concat(
+      changeMatchingOptions(matchingOptions(adding: .extraExtended)),
+      "a", "b", "c",
+      changeMatchingOptions(matchingOptions(removing: .extended)),
+      "d", " ", "e", " ", "f"
+    ))
+    parseTest("(?x)a b c(?^i)d e f", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "b", "c",
+      changeMatchingOptions(unsetMatchingOptions(adding: .caseInsensitive)),
+      "d", " ", "e", " ", "f"
+    ))
+    parseTest("(?x)a b c(?^x)d e f", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "b", "c",
+      changeMatchingOptions(unsetMatchingOptions(adding: .extended)),
+      "d", "e", "f"
+    ))
+    parseTest("(?:(?x)a b c)d e f", concat(
+      nonCapture(concat(
+        changeMatchingOptions(matchingOptions(adding: .extended)),
+        "a", "b", "c"
+      )),
+      "d", " ", "e", " ", "f"
+    ))
+    parseTest("(?x:a b c)# hi", concat(changeMatchingOptions(
+      matchingOptions(adding: .extended),
+      concat("a", "b", "c")), "#", " ", "h", "i")
     )
 
-    // PCRE states that whitespace seperating quantifiers is permitted under
-    // extended syntax http://pcre.org/current/doc/html/pcre2api.html#SEC20
-    parseTest(
-      "(?x)a *",
+    parseTest("(?x-x)a b c", concat(
       changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        zeroOrMore(of: "a"))
-    )
-    parseTest(
-      "(?x)a + ?",
+        matchingOptions(adding: .extended, removing: .extended)
+      ),
+      "a", " ", "b", " ", "c"
+    ))
+    parseTest("(?xxx-x)a b c", concat(
       changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        oneOrMore(.reluctant, of: "a"))
-    )
-    parseTest(
-      "(?x)a {2,4}",
+        matchingOptions(adding: .extraExtended, .extended, removing: .extended)
+      ),
+      "a", " ", "b", " ", "c"
+    ))
+    parseTest("(?xx-i)a b c", concat(
       changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        quantRange(2 ... 4, of: "a"))
-    )
+        matchingOptions(adding: .extraExtended, removing: .caseInsensitive)
+      ),
+      "a", "b", "c"
+    ))
+
+    // PCRE states that whitespace seperating quantifiers is permitted under
+    // extended syntax http://pcre.org/current/doc/html/pcre2api.html#SEC20
+    parseTest("(?x)a *", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      zeroOrMore(of: "a")
+    ))
+    parseTest("(?x)a + ?", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      oneOrMore(.reluctant, of: "a")
+    ))
+    parseTest("(?x)a {2,4}", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      quantRange(2 ... 4, of: "a")
+    ))
 
     // PCRE states that whitespace won't be ignored within a range.
     // http://pcre.org/current/doc/html/pcre2api.html#SEC20
     // TODO: We ought to warn on this, and produce a range anyway.
-    parseTest(
-      "(?x)a{1, 3}",
-      changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "{", "1", ",", "3", "}"))
-    )
+    parseTest("(?x)a{1, 3}", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "{", "1", ",", "3", "}"
+    ))
 
     // Test that we cover the list of whitespace characters covered by PCRE.
     parseTest(
       "(?x)a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b",
-      changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true, concat("a", "b"))
-    )
+      concat(
+        changeMatchingOptions(matchingOptions(adding: .extended)),
+        "a", "b"
+      ))
     parseTest(
       "(?x)[a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b]",
-      changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true, charClass("a", "b"))
-    )
+      concat(
+        changeMatchingOptions(matchingOptions(adding: .extended)),
+        charClass("a", "b")
+      ))
+
+    parseTest(#"(?i:)?"#, zeroOrOne(of: changeMatchingOptions(
+      matchingOptions(adding: .caseInsensitive), empty()
+    )))
 
     // Test multi-line comment handling.
     parseTest(
@@ -1843,10 +1839,10 @@ extension RegexTests {
 
     parseWithDelimitersTest("#|[a b]|#", charClass("a", "b"))
     parseWithDelimitersTest(
-      "#|(?-x)[a b]|#", changeMatchingOptions(
-        matchingOptions(removing: .extended), isIsolated: true,
-        charClass("a", " ", "b"))
-    )
+      "#|(?-x)[a b]|#", concat(
+        changeMatchingOptions(matchingOptions(removing: .extended)),
+        charClass("a", " ", "b")
+      ))
     parseWithDelimitersTest("#|[[a ] b]|#", charClass(charClass("a"), "b"))
 
     // Non-semantic whitespace between quantifier characters for consistency
@@ -1856,8 +1852,7 @@ extension RegexTests {
     // End-of-line comments aren't enabled by default in experimental syntax.
     parseWithDelimitersTest("#|#abc|#", concat("#", "a", "b", "c"))
     parseWithDelimitersTest("#|(?x)#abc|#", changeMatchingOptions(
-      matchingOptions(adding: .extended), isIsolated: true,
-      empty())
+      matchingOptions(adding: .extended))
     )
 
     parseWithDelimitersTest("#|||#", alt(empty(), empty()))
@@ -1913,8 +1908,7 @@ extension RegexTests {
       (?^)
       # comment
       /#
-      """, changeMatchingOptions(
-        unsetMatchingOptions(), isIsolated: true, empty())
+      """, changeMatchingOptions(unsetMatchingOptions())
     )
 
     // (?x) has no effect.
@@ -1923,8 +1917,7 @@ extension RegexTests {
       (?x)
       # comment
       /#
-      """, changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true, empty())
+      """, changeMatchingOptions(matchingOptions(adding: .extended))
     )
 
     // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
@@ -2328,17 +2321,18 @@ extension RegexTests {
     diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é"))
     diagnosticTest(#"\˂"#, .invalidEscape("˂"))
 
-    // MARK: Text Segment options
+    // MARK: Matching options
 
     diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions)
     diagnosticTest("(?-y{w})", .cannotRemoveTextSegmentOptions)
 
-    // MARK: Semantic Level options
-
     diagnosticTest("(?-X)", .cannotRemoveSemanticsOptions)
     diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions)
     diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions)
 
+    diagnosticTest("(?a)", .unknownGroupKind("?a"))
+    diagnosticTest("(?y{)", .expected("g"))
+
     // Extended syntax may not be removed in multi-line mode.
     diagnosticWithDelimitersTest("""
       #/
@@ -2406,6 +2400,7 @@ extension RegexTests {
     diagnosticTest(#"(?^-)"#, .cannotRemoveMatchingOptionsAfterCaret)
     diagnosticTest(#"(?^i-"#, .cannotRemoveMatchingOptionsAfterCaret)
     diagnosticTest(#"(?^i-m)"#, .cannotRemoveMatchingOptionsAfterCaret)
+    diagnosticTest(#"(?i)?"#, .notQuantifiable)
 
     // MARK: References
 
@@ -2438,7 +2433,7 @@ extension RegexTests {
 
     diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3))
     diagnosticTest(#"(?(1)||)"#, .tooManyBranchesInConditional(3))
-    diagnosticTest(#"(?(?i))"#, .unsupportedCondition("implicitly scoped group"))
+    diagnosticTest(#"(?(?i))"#, .unknownGroupKind("?("))
 
     // MARK: Callouts
 

From 5cc0ea0b9ff6c8b882696a58b6d3afaf532f5812 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 21 Apr 2022 11:11:16 +0100
Subject: [PATCH 10/11] Error on unknown character properties

Previously we would form an `.other` character
property kind for any unclassified properties,
which crash at runtime as unsupported. Instead,
switch to erroring on them. Eventually it would be
nice if we could version this based on what the
runtime being targeted supports.
---
 Sources/_RegexParser/Regex/AST/Atom.swift     |  3 ---
 .../CharacterPropertyClassification.swift     |  9 ++++---
 .../Regex/Parse/Diagnostics.swift             |  9 ++++---
 .../_StringProcessing/ConsumerInterface.swift |  4 ---
 Tests/RegexTests/ParseTests.swift             | 27 ++++++++++---------
 5 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift
index 9cc2e9a96..1f6043d72 100644
--- a/Sources/_RegexParser/Regex/AST/Atom.swift
+++ b/Sources/_RegexParser/Regex/AST/Atom.swift
@@ -401,9 +401,6 @@ extension AST.Atom.CharacterProperty {
     /// Some special properties implemented by PCRE and Oniguruma.
     case pcreSpecial(PCRESpecialCategory)
     case onigurumaSpecial(OnigurumaSpecialProperty)
-
-    /// Unhandled properties.
-    case other(key: String?, value: String)
   }
 
   // TODO: erm, separate out or fold into something? splat it in?
diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift
index e5b65a46c..911312121 100644
--- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift
+++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift
@@ -397,8 +397,9 @@ extension Source {
       return .pcreSpecial(pcreSpecial)
     }
 
-    // Otherwise we don't know what this is.
-    return .other(key: nil, value: value)
+    // TODO: This should be versioned, and do we want a more lax behavior for
+    // the runtime?
+    throw ParseError.unknownProperty(key: nil, value: value)
   }
 
   static func classifyCharacterProperty(
@@ -435,6 +436,8 @@ extension Source {
     if let match = match {
       return match
     }
-    return .other(key: key, value: value)
+    // TODO: This should be versioned, and do we want a more lax behavior for
+    // the runtime?
+    throw ParseError.unknownProperty(key: key, value: value)
   }
 }
diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift
index 621d6ea11..c3d74c30b 100644
--- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift
+++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift
@@ -57,8 +57,8 @@ enum ParseError: Error, Hashable {
   case expectedCustomCharacterClassMembers
   case invalidCharacterClassRangeOperand
 
-  case invalidPOSIXSetName(String)
   case emptyProperty
+  case unknownProperty(key: String?, value: String)
 
   case expectedGroupSpecifier
   case unbalancedEndOfGroup
@@ -142,10 +142,13 @@ extension ParseError: CustomStringConvertible {
       return "expected custom character class members"
     case .invalidCharacterClassRangeOperand:
       return "invalid character class range"
-    case let .invalidPOSIXSetName(n):
-      return "invalid character set name: '\(n)'"
     case .emptyProperty:
       return "empty property"
+    case .unknownProperty(let key, let value):
+      if let key = key {
+        return "unknown character property '\(key)=\(value)'"
+      }
+      return "unknown character property '\(value)'"
     case .expectedGroupSpecifier:
       return "expected group specifier"
     case .unbalancedEndOfGroup:
diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index ecb7d1356..f77cd322f 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -423,10 +423,6 @@ extension AST.Atom.CharacterProperty {
 
       case .onigurumaSpecial(let s):
         throw Unsupported("TODO: map Oniguruma special: \(s)")
-
-      case let .other(key, value):
-        throw Unsupported(
-          "TODO: map other \(key ?? "")=\(value)")
       }
     }()
 
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index bdae250ba..4043e4ccb 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -501,7 +501,6 @@ extension RegexTests {
     parseTest(#"[[:a]]"#, charClass(charClass(":", "a")))
     parseTest(#"[[:}]]"#, charClass(charClass(":", "}")))
     parseTest(#"[[:{]]"#, charClass(charClass(":", "{")))
-    parseTest(#"[[:{:]]"#, charClass(posixProp_m(.other(key: nil, value: "{"))))
     parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":")))
 
     parseTest(
@@ -1141,14 +1140,6 @@ extension RegexTests {
       #"\p{C}+"#,
       oneOrMore(of: prop(.generalCategory(.other))))
 
-    // TODO: Start erroring on these?
-    parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx")))
-    parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL")))
-    parseTest(#"\p{x=y}"#, prop(.other(key: "x", value: "y")))
-    parseTest(#"\p{aaa(b)}"#, prop(.other(key: nil, value: "aaa(b)")))
-    parseTest("[[:a():]]", charClass(posixProp_m(.other(key: nil, value: "a()"))))
-    parseTest(#"\p{aaa\p{b}}"#, concat(prop(.other(key: nil, value: #"aaa\p{b"#)), "}"))
-
     // UAX44-LM3 means all of the below are equivalent.
     let lowercaseLetter = prop(.generalCategory(.lowercaseLetter))
     parseTest(#"\p{ll}"#, lowercaseLetter)
@@ -2231,12 +2222,12 @@ extension RegexTests {
     diagnosticTest(#"\x{5"#, .expected("}"))
     diagnosticTest(#"\N{A"#, .expected("}"))
     diagnosticTest(#"\N{U+A"#, .expected("}"))
-    diagnosticTest(#"\p{a"#, .expected("}"))
+    diagnosticTest(#"\p{a"#, .unknownProperty(key: nil, value: "a"))
     diagnosticTest(#"\p{a="#, .emptyProperty)
     diagnosticTest(#"\p{a=}"#, .emptyProperty)
-    diagnosticTest(#"\p{a=b"#, .expected("}"))
-    diagnosticTest(#"\p{aaa[b]}"#, .expected("}"))
-    diagnosticTest(#"\p{a=b=c}"#, .expected("}"))
+    diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b"))
+    diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa"))
+    diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b"))
     diagnosticTest(#"(?#"#, .expected(")"))
     diagnosticTest(#"(?x"#, .expected(")"))
 
@@ -2321,6 +2312,16 @@ extension RegexTests {
     diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é"))
     diagnosticTest(#"\˂"#, .invalidEscape("˂"))
 
+    // MARK: Character properties
+
+    diagnosticTest(#"\p{Lx}"#, .unknownProperty(key: nil, value: "Lx"))
+    diagnosticTest(#"\p{gcL}"#, .unknownProperty(key: nil, value: "gcL"))
+    diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y"))
+    diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)"))
+    diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()"))
+    diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: #"aaa\p{b"#))
+    diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{"))
+
     // MARK: Matching options
 
     diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions)

From 771e7353bc14c83eab16c2a368f888e18aef06fb Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Thu, 21 Apr 2022 11:11:17 +0100
Subject: [PATCH 11/11] Don't parse a character property containing a backslash

Add backslash to the list of characters we don't
consider valid for a character property name. This
means that we'll bail when attempting to lex a
POSIX character property and instead lex a custom
character class. This allows e.g `[:\Q :] \E]` to
be lexed as a custom character class. For `\p{...}`
this just means we'll emit a truncated invalid
property error, which is arguably more inline with
what the user was expecting..

I noticed when digging through the ICU source code
that it will bail out of parsing a POSIX character
property if it encounters one of its known escape
sequences (e.g `\a`, `\e`, `\f`, ...). Interestingly
this doesn't cover character property escapes e.g
`\d`, but it's not clear that is intentional. Given
backslash is not a valid character property character
anyway, it seems reasonable to broaden this behavior
to bail on any backslash.
---
 .../Regex/Parse/LexicalAnalysis.swift         |  8 +++++++
 Tests/RegexTests/ParseTests.swift             | 21 ++++++++++++++++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index e8b7e9e18..6a61ccdf7 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -1176,6 +1176,14 @@ extension Source {
         // character property name anyway, and it's nice not to have diverging
         // logic for these cases.
         return true
+      case "\\":
+        // An escape sequence, which may include e.g '\Q :] \E'. ICU bails here
+        // for all its known escape sequences (e.g '\a', '\e' '\f', ...). It
+        // seems character class escapes e.g '\d' are excluded, however it's not
+        // clear that is intentional. Let's apply the rule for any escape, as a
+        // backslash would never be a valid character property name, and we can
+        // diagnose any invalid escapes when parsing as a character class.
+        return true
       default:
         // We may want to handle other metacharacters here, e.g '{', '(', ')',
         // as they're not valid character property names. However for now
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index 4043e4ccb..94c134853 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -503,6 +503,25 @@ extension RegexTests {
     parseTest(#"[[:{]]"#, charClass(charClass(":", "{")))
     parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":")))
 
+    parseTest(
+      #"[:[:space:]:]"#,
+      charClass(":", posixProp_m(.binary(.whitespace)), ":")
+    )
+    parseTest(
+      #"[:a[:space:]b:]"#,
+      charClass(":", "a", posixProp_m(.binary(.whitespace)), "b", ":")
+    )
+
+    // ICU parses a custom character class if it sees any of its known escape
+    // sequences in a POSIX character property (though it appears to exclude
+    // character class escapes e.g '\d'). We do so for any escape sequence as
+    // '\' is not a valid character property character.
+    parseTest(#"[:\Q:]\E]"#, charClass(":", quote_m(":]")))
+    parseTest(#"[:\a:]"#, charClass(":", atom_m(.escaped(.alarm)), ":"))
+    parseTest(#"[:\d:]"#, charClass(":", atom_m(.escaped(.decimalDigit)), ":"))
+    parseTest(#"[:\\:]"#, charClass(":", "\\", ":"))
+    parseTest(#"[:\:]"#, charClass(":", ":"))
+
     parseTest(
       #"\D\S\W"#,
       concat(
@@ -2319,7 +2338,7 @@ extension RegexTests {
     diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y"))
     diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)"))
     diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()"))
-    diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: #"aaa\p{b"#))
+    diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa"))
     diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{"))
 
     // MARK: Matching options