From 3f2832df8593c803ee85696daabb49fa4a93ca38 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 21 Apr 2022 11:11:12 +0100 Subject: [PATCH 01/11] Fix HexDigit definition in RegexSyntax.md --- Documentation/Evolution/RegexSyntaxRunTimeConstruction.md | 2 +- Tests/RegexTests/ParseTests.swift | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md index f3dda59ba..7dd56b6a8 100644 --- a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md +++ b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md @@ -339,7 +339,7 @@ UnicodeScalar -> '\u{' HexDigit{1...} '}' | '\o{' OctalDigit{1...} '}' | '\0' OctalDigit{0...3} -HexDigit -> [0-9a-zA-Z] +HexDigit -> [0-9a-fA-F] OctalDigit -> [0-7] NamedScalar -> '\N{' ScalarName '}' diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index c6ff3e46d..5e3defb11 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2313,6 +2313,10 @@ extension RegexTests { diagnosticTest("{5}", .quantifierRequiresOperand("{5}")) diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}")) + // MARK: Unicode scalars + + diagnosticTest(#"\u{G}"#, .expectedNumber("G", kind: .hex)) + // MARK: Matching options diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret) From 3cce15d40218349fbd6512f6c9482f476f339f52 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 21 Apr 2022 11:11:13 +0100 Subject: [PATCH 02/11] Remove AST CustomCharacterClass consumer generation This isn't actually used, as we convert to a DSL custom character class, and then use that consumer logic. --- .../_StringProcessing/ConsumerInterface.swift | 118 ------------------ 1 file changed, 118 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 600234e0d..85e73801d 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -295,101 +295,6 @@ extension DSLTree.CustomCharacterClass.Member { } } -extension AST.CustomCharacterClass.Member { - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { - switch self { - case .custom(let ccc): - return try ccc.generateConsumer(opts) - - case .range(let r): - guard let lhs = r.lhs.literalCharacterValue else { - throw Unsupported("\(r.lhs) in range") - } - guard let rhs = r.rhs.literalCharacterValue else { - throw Unsupported("\(r.rhs) in range") - } - - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhs...rhs).contains(input[curIdx]) { - // TODO: semantic level - return input.index(after: curIdx) - } - return nil - } - - case .atom(let atom): - guard let gen = try atom.generateConsumer(opts) else { - throw Unsupported("TODO") - } - return gen - - case .quote(let q): - // TODO: Not optimal. - let consumers = try q.literal.map { - try AST.Atom(.char($0), .fake).generateConsumer(opts)! - } - return { input, bounds in - for consumer in consumers { - if let idx = consumer(input, bounds) { - return idx - } - } - return nil - } - - case .trivia: - throw Unreachable( - "Should have been stripped by caller") - - case .setOperation(let lhs, let op, let rhs): - // TODO: We should probably have a component type - // instead of a members array... for now we reconstruct - // an AST node... - let start = AST.Located( - faking: AST.CustomCharacterClass.Start.normal) - - let lhs = try AST.CustomCharacterClass( - start, lhs, .fake - ).generateConsumer(opts) - let rhs = try AST.CustomCharacterClass( - start, rhs, .fake - ).generateConsumer(opts) - - return { input, bounds in - // NOTE: Easy way to implement, not performant - let lhsIdxOpt = lhs(input, bounds) - let rhsIdxOpt = rhs(input, bounds) - - // TODO: What if lengths don't line up? - assert(lhsIdxOpt == rhsIdxOpt || lhsIdxOpt == nil - || rhsIdxOpt == nil) - - switch op.value { - case .subtraction: - guard rhsIdxOpt == nil else { return nil } - return lhsIdxOpt - - case .intersection: - if let idx = lhsIdxOpt { - return rhsIdxOpt == nil ? nil : idx - } - return nil - - case .symmetricDifference: - if let idx = lhsIdxOpt { - return rhsIdxOpt == nil ? idx : nil - } - return rhsIdxOpt - } - } - } - } -} - extension DSLTree.CustomCharacterClass { func generateConsumer( _ opts: MatchingOptions @@ -413,29 +318,6 @@ extension DSLTree.CustomCharacterClass { } } -extension AST.CustomCharacterClass { - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { - // NOTE: Easy way to implement, obviously not performant - let consumers = try strippingTriviaShallow.members.map { - try $0.generateConsumer(opts) - } - return { input, bounds in - for consumer in consumers { - if let idx = consumer(input, bounds) { - return isInverted ? nil : idx - } - } - if isInverted { - // FIXME: semantic level - return input.index(after: bounds.lowerBound) - } - return nil - } - } -} - // NOTE: Conveniences, though not most performant private func consumeScalarScript( _ s: Unicode.Script From 577dc6e6f345ad1a5b80ee4444e27aa187142990 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 21 Apr 2022 11:11:13 +0100 Subject: [PATCH 03/11] Convert scalar escape sequences to DSL scalars Convert AST escape sequences that represent a scalar value (e.g `\f`, `n`, `\a`) into scalars in the DSL tree. This allows the matching engine to match against them. --- Sources/_RegexParser/Regex/AST/Atom.swift | 64 +++++++++++-------- .../Regex/ASTConversion.swift | 3 + Tests/RegexTests/MatchTests.swift | 23 ++++--- 3 files changed, 53 insertions(+), 37 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 0aa0951c5..6482c4042 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -631,6 +631,41 @@ extension AST.Atom { } } +extension AST.Atom.EscapedBuiltin { + /// If the escape sequence represents a unicode scalar value, returns the + /// value, otherwise `nil`. + public var scalarValue: UnicodeScalar? { + switch self { + // TODO: Should we separate these into a separate enum? Or move the + // specifics of the scalar to the DSL tree? + case .alarm: + return "\u{7}" + case .backspace: + return "\u{8}" + case .escape: + return "\u{1B}" + case .formfeed: + return "\u{C}" + case .newline: + return "\n" + case .carriageReturn: + return "\r" + case .tab: + return "\t" + + case .singleDataUnit, .decimalDigit, .notDecimalDigit, + .horizontalWhitespace, .notHorizontalWhitespace, .notNewline, + .newlineSequence, .whitespace, .notWhitespace, .verticalTab, + .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster, + .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, + .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar, + .textSegment, .notTextSegment: + return nil + } + } +} + extension AST.Atom { /// Retrieve the character value of the atom if it represents a literal /// character or unicode scalar, nil otherwise. @@ -642,34 +677,7 @@ extension AST.Atom { return Character(s) case .escaped(let c): - switch c { - // TODO: Should we separate these into a separate enum? Or move the - // specifics of the scalar to the DSL tree? - case .alarm: - return "\u{7}" - case .backspace: - return "\u{8}" - case .escape: - return "\u{1B}" - case .formfeed: - return "\u{C}" - case .newline: - return "\n" - case .carriageReturn: - return "\r" - case .tab: - return "\t" - - case .singleDataUnit, .decimalDigit, .notDecimalDigit, - .horizontalWhitespace, .notHorizontalWhitespace, .notNewline, - .newlineSequence, .whitespace, .notWhitespace, .verticalTab, - .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster, - .wordBoundary, .notWordBoundary, .startOfSubject, - .endOfSubjectBeforeNewline, .endOfSubject, - .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar, - .textSegment, .notTextSegment: - return nil - } + return c.scalarValue.map(Character.init) case .keyboardControl, .keyboardMeta, .keyboardMetaControl: // TODO: These should have unicode scalar values. diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index e00dcd806..86041f528 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -211,6 +211,9 @@ extension AST.Atom { case .any: return .any case let .backreference(r): return .backreference(r) + case .escaped(let c) where c.scalarValue != nil: + return .scalar(c.scalarValue!) + default: return .unconverted(self) } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index b1287fc8b..f808fa0e3 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -281,6 +281,15 @@ extension RegexTests { // code point sequence firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true) + // Escape sequences that represent scalar values. + firstMatchTest(#"\a[\b]\e\f\n\r\t"#, + input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", + match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t") + firstMatchTest(#"[\a][\b][\e][\f][\n][\r][\t]"#, + input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", + match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t") + + firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n") // MARK: Quotes @@ -596,24 +605,20 @@ extension RegexTests { func scalar(_ u: UnicodeScalar) -> UInt32 { u.value } - // Currently not supported in the matching engine. for s in scalar("\u{C}") ... scalar("\u{1B}") { let u = UnicodeScalar(s)! - firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)", - xfail: true) + firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)") } for u: UnicodeScalar in ["\u{7}", "\u{8}"] { - firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)", - xfail: true) + firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)") } for s in scalar("\u{A}") ... scalar("\u{D}") { let u = UnicodeScalar(s)! - firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)", - xfail: true) + firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)") } - firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}", - xfail: true) + firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}") + // Currently not supported in the matching engine. for c: UnicodeScalar in ["a", "b", "c"] { firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)", xfail: true) From 2d1de9e74f6acfe2eaae34a8f904ecee82c06ad9 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 21 Apr 2022 11:11:14 +0100 Subject: [PATCH 04/11] Allow custom character classes to begin with `:` ICU and Oniguruma allow custom character classes to begin with `:`, and only lex a POSIX character property if they detect a closing `:]`. However their behavior for this differs: - ICU will consider *any* `:]` in the regex as a closing delimiter, even e.g `[[:a]][:]`. - Oniguruma will stop if it hits a `]`, so `[[:a]][:]` is treated as a custom character class. However it only scans ahead 20 chars max, and doesn't stop for e.g a nested character class opening `[`. Our detection behavior for this is as follows: - When `[:` is encountered inside a custom character class, scan ahead to the closing `:]`. - While scanning, bail if we see any characters that are obviously invalid property names. Currently this includes `[`, `]`, `}`, as well as a second occurrence of `=`. - Otherwise, if we end on `:]`, consider that a POSIX character property. We could include more metacharacters to bail on, e.g `{`, `(`, `)`, but for now I'm tending on the side of lexing an invalid POSIX character property. We can always relax this in the future (as we'd be turning invalid code into valid code). Users can always escape the initial `:` in `[:` if they want a custom character class. In fact, we may want to suggest this via a warning, as this behavior can be pretty subtle. --- .../Regex/Parse/LexicalAnalysis.swift | 121 +++++++++++++----- Sources/_RegexParser/Regex/Parse/Parse.swift | 4 +- Tests/RegexTests/ParseTests.swift | 39 +++++- 3 files changed, 129 insertions(+), 35 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index c48d53de9..bf53d0ab1 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1064,11 +1064,16 @@ extension Source { } mutating func lexCustomCCStart( + context: ParsingContext ) throws -> Located? { recordLoc { src in - // POSIX named sets are atoms. - guard !src.starts(with: "[:") else { return nil } - + // Make sure we don't have a POSIX character property. This may require + // walking to its ending to make sure we have a closing ':]', as otherwise + // we have a custom character class. + // TODO: This behavior seems subtle, could we warn? + guard !src.canLexPOSIXCharacterProperty(context: context) else { + return nil + } if src.tryEat("[") { return src.tryEat("^") ? .inverted : .normal } @@ -1099,12 +1104,38 @@ extension Source { } private mutating func lexPOSIXCharacterProperty( + context: ParsingContext ) throws -> Located? { - try recordLoc { src in - guard src.tryEat(sequence: "[:") else { return nil } - let inverted = src.tryEat("^") - let prop = try src.lexCharacterPropertyContents(end: ":]").value - return .init(prop, isInverted: inverted, isPOSIX: true) + // Only allowed in a custom character class. + guard context.isInCustomCharacterClass else { return nil } + return try recordLoc { src in + try src.tryEating { src in + guard src.tryEat(sequence: "[:") else { return nil } + let inverted = src.tryEat("^") + + // Note we lex the contents and ending *before* classifying, because we + // want to bail with nil if we don't have the right ending. This allows + // the lexing of a custom character class if we don't have a ':]' + // ending. + let (key, value) = src.lexCharacterPropertyKeyValue() + guard src.tryEat(sequence: ":]") else { return nil } + + let prop = try Source.classifyCharacterPropertyContents(key: key, + value: value) + return .init(prop, isInverted: inverted, isPOSIX: true) + } + } + } + + private func canLexPOSIXCharacterProperty(context: ParsingContext) -> Bool { + do { + var src = self + return try src.lexPOSIXCharacterProperty(context: context) != nil + } catch { + // We want to tend on the side of lexing a POSIX character property, so + // even if it is invalid in some way (e.g invalid property names), still + // try and lex it. + return true } } @@ -1129,26 +1160,52 @@ extension Source { } } - private mutating func lexCharacterPropertyContents( - end: String - ) throws -> Located { - try recordLoc { src in - // We should either have: - // - 'x=y' where 'x' is a property key, and 'y' is a value. - // - 'y' where 'y' is a value (or a bool key with an inferred value - // of true), and its key is inferred. - // TODO: We could have better recovery here if we only ate the characters - // that property keys and values can use. - let lhs = src.lexUntil { - $0.isEmpty || $0.peek() == "=" || $0.starts(with: end) - }.value - if src.tryEat("=") { - let rhs = try src.lexUntil(eating: end).value - return try Source.classifyCharacterProperty(key: lhs, value: rhs) + private mutating func lexCharacterPropertyKeyValue( + ) -> (key: String?, value: String) { + func atPossibleEnding(_ src: inout Source) -> Bool { + guard let next = src.peek() else { return true } + switch next { + case "=": + // End of a key. + return true + case ":", "[", "]": + // POSIX character property endings to cover ':]', ']', and '[' as the + // start of a nested character class. + return true + case "}": + // Ending of '\p{'. We cover this for POSIX too as it's not a valid + // character property name anyway, and it's nice not to have diverging + // logic for these cases. + return true + default: + // We may want to handle other metacharacters here, e.g '{', '(', ')', + // as they're not valid character property names. However for now + // let's tend on the side of forming an unknown property name in case + // these characters are ever used in future character property names + // (though it's very unlikely). Users can always escape e.g the ':' + // in '[:' if they definitely want a custom character class. + return false } - try src.expect(sequence: end) - return try Source.classifyCharacterPropertyValueOnly(lhs) } + // We should either have: + // - 'x=y' where 'x' is a property key, and 'y' is a value. + // - 'y' where 'y' is a value (or a bool key with an inferred value of true) + // and its key is inferred. + let lhs = lexUntil(atPossibleEnding).value + if tryEat("=") { + let rhs = lexUntil(atPossibleEnding).value + return (lhs, rhs) + } + return (nil, lhs) + } + + private static func classifyCharacterPropertyContents( + key: String?, value: String + ) throws -> AST.Atom.CharacterProperty.Kind { + if let key = key { + return try classifyCharacterProperty(key: key, value: value) + } + return try classifyCharacterPropertyValueOnly(value) } /// Try to consume a character property. @@ -1164,7 +1221,10 @@ extension Source { let isInverted = src.peek() == "P" src.advance(2) - let prop = try src.lexCharacterPropertyContents(end: "}").value + let (key, value) = src.lexCharacterPropertyKeyValue() + let prop = try Source.classifyCharacterPropertyContents(key: key, + value: value) + try src.expect("}") return .init(prop, isInverted: isInverted, isPOSIX: false) } } @@ -1758,11 +1818,8 @@ extension Source { if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil } // TODO: Store customCC in the atom, if that's useful - // POSIX character property. This is only allowed in a custom character - // class. - // TODO: Can we try and recover and diagnose these outside character - // classes? - if customCC, let prop = try src.lexPOSIXCharacterProperty()?.value { + // POSIX character property. + if let prop = try src.lexPOSIXCharacterProperty(context: context)?.value { return .property(prop) } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index c3aa3500b..d70a1f14f 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -403,7 +403,7 @@ extension Parser { } // Check if we have the start of a custom character class '['. - if let cccStart = try source.lexCustomCCStart() { + if let cccStart = try source.lexCustomCCStart(context: context) { return .customCharacterClass( try parseCustomCharacterClass(cccStart)) } @@ -487,7 +487,7 @@ extension Parser { while source.peek() != "]" && source.peekCCBinOp() == nil { // Nested custom character class. - if let cccStart = try source.lexCustomCCStart() { + if let cccStart = try source.lexCustomCCStart(context: context) { members.append(.custom(try parseCustomCharacterClass(cccStart))) continue } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 5e3defb11..aa0ebbf98 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -474,6 +474,26 @@ extension RegexTests { parseTest( "[a^]", charClass("a", "^")) + // These are custom character classes, not invalid POSIX character classes. + // TODO: This behavior is subtle, we ought to warn. + parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e"))) + parseTest("[:space:]", charClass(":", "s", "p", "a", "c", "e", ":")) + parseTest("[:a]", charClass(":", "a")) + parseTest("[a:]", charClass("a", ":")) + parseTest("[:]", charClass(":")) + parseTest("[::]", charClass(":", ":")) + parseTest("[:=:]", charClass(":", "=", ":")) + parseTest("[[:]]", charClass(charClass(":"))) + parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":"))) + + parseTest(#"[[:a[b]:]]"#, charClass(charClass(":", "a", charClass("b"), ":"))) + parseTest(#"[[:a]][:]"#, concat(charClass(charClass(":", "a")), charClass(":"))) + parseTest(#"[[:a]]"#, charClass(charClass(":", "a"))) + parseTest(#"[[:}]]"#, charClass(charClass(":", "}"))) + parseTest(#"[[:{]]"#, charClass(charClass(":", "{"))) + parseTest(#"[[:{:]]"#, charClass(posixProp_m(.other(key: nil, value: "{")))) + parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":"))) + parseTest( #"\D\S\W"#, concat( @@ -1096,9 +1116,13 @@ extension RegexTests { #"\p{C}+"#, oneOrMore(of: prop(.generalCategory(.other)))) + // TODO: Start erroring on these? parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx"))) parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL"))) parseTest(#"\p{x=y}"#, prop(.other(key: "x", value: "y"))) + parseTest(#"\p{aaa(b)}"#, prop(.other(key: nil, value: "aaa(b)"))) + parseTest("[[:a():]]", charClass(posixProp_m(.other(key: nil, value: "a()")))) + parseTest(#"\p{aaa\p{b}}"#, concat(prop(.other(key: nil, value: #"aaa\p{b"#)), "}")) // UAX44-LM3 means all of the below are equivalent. let lowercaseLetter = prop(.generalCategory(.lowercaseLetter)) @@ -2183,7 +2207,11 @@ extension RegexTests { diagnosticTest(#"\N{A"#, .expected("}")) diagnosticTest(#"\N{U+A"#, .expected("}")) diagnosticTest(#"\p{a"#, .expected("}")) - diagnosticTest(#"\p{a="#, .expected("}")) + diagnosticTest(#"\p{a="#, .emptyProperty) + diagnosticTest(#"\p{a=}"#, .emptyProperty) + diagnosticTest(#"\p{a=b"#, .expected("}")) + diagnosticTest(#"\p{aaa[b]}"#, .expected("}")) + diagnosticTest(#"\p{a=b=c}"#, .expected("}")) diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) @@ -2218,6 +2246,15 @@ extension RegexTests { // the closing bracket. diagnosticTest("[]", .expected("]")) + diagnosticTest("[:a", .expected("]")) + diagnosticTest("[:a:", .expected("]")) + diagnosticTest("[[:a", .expected("]")) + diagnosticTest("[[:a:", .expected("]")) + diagnosticTest("[[:a[:]", .expected("]")) + + diagnosticTest("[[::]]", .emptyProperty) + diagnosticTest("[[:=:]]", .emptyProperty) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) From 5912ab493f53fdefbbc2cdf5b2d85795d8fe9176 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 21 Apr 2022 11:11:14 +0100 Subject: [PATCH 05/11] Allow POSIX character properties outside of custom character classes This matches the ICU behavior, and appears to be suggested by UTS#18. --- .../Regex/Parse/LexicalAnalysis.swift | 17 +++++++---------- Sources/_RegexParser/Regex/Parse/Parse.swift | 4 ++-- .../_StringProcessing/Utility/ASTBuilder.swift | 5 +++++ Tests/RegexTests/ParseTests.swift | 11 ++++++++--- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index bf53d0ab1..f70989c9f 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1064,14 +1064,13 @@ extension Source { } mutating func lexCustomCCStart( - context: ParsingContext ) throws -> Located? { recordLoc { src in // Make sure we don't have a POSIX character property. This may require // walking to its ending to make sure we have a closing ':]', as otherwise // we have a custom character class. // TODO: This behavior seems subtle, could we warn? - guard !src.canLexPOSIXCharacterProperty(context: context) else { + guard !src.canLexPOSIXCharacterProperty() else { return nil } if src.tryEat("[") { @@ -1104,11 +1103,8 @@ extension Source { } private mutating func lexPOSIXCharacterProperty( - context: ParsingContext ) throws -> Located? { - // Only allowed in a custom character class. - guard context.isInCustomCharacterClass else { return nil } - return try recordLoc { src in + try recordLoc { src in try src.tryEating { src in guard src.tryEat(sequence: "[:") else { return nil } let inverted = src.tryEat("^") @@ -1127,10 +1123,10 @@ extension Source { } } - private func canLexPOSIXCharacterProperty(context: ParsingContext) -> Bool { + private func canLexPOSIXCharacterProperty() -> Bool { do { var src = self - return try src.lexPOSIXCharacterProperty(context: context) != nil + return try src.lexPOSIXCharacterProperty() != nil } catch { // We want to tend on the side of lexing a POSIX character property, so // even if it is invalid in some way (e.g invalid property names), still @@ -1818,8 +1814,9 @@ extension Source { if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil } // TODO: Store customCC in the atom, if that's useful - // POSIX character property. - if let prop = try src.lexPOSIXCharacterProperty(context: context)?.value { + // POSIX character property. Like \p{...} this is also allowed outside of + // a custom character class. + if let prop = try src.lexPOSIXCharacterProperty()?.value { return .property(prop) } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index d70a1f14f..c3aa3500b 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -403,7 +403,7 @@ extension Parser { } // Check if we have the start of a custom character class '['. - if let cccStart = try source.lexCustomCCStart(context: context) { + if let cccStart = try source.lexCustomCCStart() { return .customCharacterClass( try parseCustomCharacterClass(cccStart)) } @@ -487,7 +487,7 @@ extension Parser { while source.peek() != "]" && source.peekCCBinOp() == nil { // Nested custom character class. - if let cccStart = try source.lexCustomCCStart(context: context) { + if let cccStart = try source.lexCustomCCStart() { members.append(.custom(try parseCustomCharacterClass(cccStart))) continue } diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index be9f61517..961d0e23f 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -354,6 +354,11 @@ func prop( ) -> AST.Node { atom(.property(.init(kind, isInverted: inverted, isPOSIX: false))) } +func posixProp( + _ kind: AST.Atom.CharacterProperty.Kind, inverted: Bool = false +) -> AST.Node { + atom(.property(.init(kind, isInverted: inverted, isPOSIX: true))) +} // Raw atom constructing variant func atom_a( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index aa0ebbf98..e9b422379 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -477,12 +477,9 @@ extension RegexTests { // These are custom character classes, not invalid POSIX character classes. // TODO: This behavior is subtle, we ought to warn. parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e"))) - parseTest("[:space:]", charClass(":", "s", "p", "a", "c", "e", ":")) parseTest("[:a]", charClass(":", "a")) parseTest("[a:]", charClass("a", ":")) parseTest("[:]", charClass(":")) - parseTest("[::]", charClass(":", ":")) - parseTest("[:=:]", charClass(":", "=", ":")) parseTest("[[:]]", charClass(charClass(":"))) parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":"))) @@ -522,6 +519,12 @@ extension RegexTests { posixProp_m(.binary(.uppercase), inverted: true), "c", "d")) + // Like ICU, we allow POSIX character properties outside of custom character + // classes. This also appears to be suggested by UTS#18. + // TODO: We should likely emit a warning. + parseTest("[:space:]", posixProp(.binary(.whitespace))) + parseTest("[:script=Greek:]", posixProp(.script(.greek))) + parseTest("[[[:space:]]]", charClass(charClass( posixProp_m(.binary(.whitespace)) ))) @@ -2252,6 +2255,8 @@ extension RegexTests { diagnosticTest("[[:a:", .expected("]")) diagnosticTest("[[:a[:]", .expected("]")) + diagnosticTest("[::]", .emptyProperty) + diagnosticTest("[:=:]", .emptyProperty) diagnosticTest("[[::]]", .emptyProperty) diagnosticTest("[[:=:]]", .emptyProperty) From c63848655061d431b37715b1033f452a9195cdcf Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 21 Apr 2022 11:11:15 +0100 Subject: [PATCH 06/11] Fix character class trivia matching Rather than matching and not advancing the input, we should always return `nil` to never match against the trivia. --- Sources/_StringProcessing/ConsumerInterface.swift | 5 +---- Tests/RegexTests/MatchTests.swift | 10 ++++++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 85e73801d..d1cc1d327 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -287,11 +287,8 @@ extension DSLTree.CustomCharacterClass.Member { } case .trivia: // TODO: Should probably strip this earlier... - return { _, bounds in - return bounds.lowerBound - } + return { _, _ in nil } } - } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index f808fa0e3..9a8e89fe0 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1208,6 +1208,16 @@ extension RegexTests { ("CaFe", true), ("EfAc", true)) } + + func testNonSemanticWhitespace() { + firstMatchTest(#" \t "#, input: " \t ", match: " \t ") + firstMatchTest(#"(?xx) \t "#, input: " \t ", match: "\t") + + firstMatchTest(#"[ \t]+"#, input: " \t ", match: " \t ") + firstMatchTest(#"(?xx)[ \t]+"#, input: " \t ", match: "\t") + firstMatchTest(#"(?xx)[ \t]+"#, input: " \t\t ", match: "\t\t") + firstMatchTest(#"(?xx)[ \t]+"#, input: " \t \t", match: "\t") + } func testASCIIClasses() { // 'D' ASCII-only digits From f053dc33adba44e56aa535385842d2701663dff8 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 21 Apr 2022 11:11:15 +0100 Subject: [PATCH 07/11] Fix trivia parsing for set operations and initial `]` cases Previously we would check for an empty array of members when deciding whether an initial `]` is literal, or if the operands of a set operation are invalid. Switch to checking whether we have any semantic members instead. --- .../Regex/AST/CustomCharClass.swift | 6 ++- Sources/_RegexParser/Regex/Parse/Parse.swift | 29 ++++++----- .../_RegexParser/Regex/Printing/DumpAST.swift | 4 +- Tests/RegexTests/MatchTests.swift | 3 ++ Tests/RegexTests/ParseTests.swift | 49 +++++++++++++++++-- 5 files changed, 73 insertions(+), 18 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift index 614048f0a..19e72aef5 100644 --- a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift +++ b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift @@ -97,6 +97,10 @@ extension CustomCC.Member { if case .trivia = self { return true } return false } + + public var isSemantic: Bool { + !isTrivia + } } extension AST.CustomCharacterClass { @@ -104,7 +108,7 @@ extension AST.CustomCharacterClass { /// nested custom character classes. public var strippingTriviaShallow: Self { var copy = self - copy.members = copy.members.filter { !$0.isTrivia } + copy.members = copy.members.filter(\.isSemantic) return copy } } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index c3aa3500b..b24097b83 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -438,16 +438,18 @@ extension Parser { defer { context.isInCustomCharacterClass = alreadyInCCC } typealias Member = CustomCC.Member - try source.expectNonEmpty() - var members: Array = [] + try parseCCCMembers(into: &members) - // We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty - // character classes, and assume an initial ']' is literal. - if let loc = source.tryEatWithLoc("]") { - members.append(.atom(.init(.char("]"), loc))) + // If we didn't parse any semantic members, we can eat a ']' character, as + // PCRE, Oniguruma, and ICU forbid empty character classes, and assume an + // initial ']' is literal. + if members.none(\.isSemantic) { + if let loc = source.tryEatWithLoc("]") { + members.append(.atom(.init(.char("]"), loc))) + try parseCCCMembers(into: &members) + } } - try parseCCCMembers(into: &members) // If we have a binary set operator, parse it and the next members. Note // that this means we left associate for a chain of operators. @@ -458,8 +460,9 @@ extension Parser { var rhs: Array = [] try parseCCCMembers(into: &rhs) - if members.isEmpty || rhs.isEmpty { - throw ParseError.expectedCustomCharacterClassMembers + if members.none(\.isSemantic) || rhs.none(\.isSemantic) { + throw Source.LocatedError( + ParseError.expectedCustomCharacterClassMembers, start.location) } // If we're done, bail early @@ -472,8 +475,9 @@ extension Parser { // Otherwise it's just another member to accumulate members = [setOp] } - if members.isEmpty { - throw ParseError.expectedCustomCharacterClassMembers + if members.none(\.isSemantic) { + throw Source.LocatedError( + ParseError.expectedCustomCharacterClassMembers, start.location) } try source.expect("]") return CustomCC(start, members, loc(start.location.start)) @@ -484,7 +488,8 @@ extension Parser { ) throws { // Parse members until we see the end of the custom char class or an // operator. - while source.peek() != "]" && source.peekCCBinOp() == nil { + while !source.isEmpty && source.peek() != "]" && + source.peekCCBinOp() == nil { // Nested custom character class. if let cccStart = try source.lexCustomCCStart() { diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 0e40ad2ce..986f3d86e 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -312,7 +312,9 @@ extension AST.CustomCharacterClass.Member: _ASTPrintable { case .quote(let q): return "\(q)" case .trivia(let t): return "\(t)" case .setOperation(let lhs, let op, let rhs): - return "op \(lhs) \(op.value) \(rhs)" + // TODO: We should eventually have some way of filtering out trivia for + // tests, so that it can appear in regular dumps. + return "op \(lhs.filter(\.isSemantic)) \(op.value) \(rhs.filter(\.isSemantic))" } } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 9a8e89fe0..868426d41 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1217,6 +1217,9 @@ extension RegexTests { firstMatchTest(#"(?xx)[ \t]+"#, input: " \t ", match: "\t") firstMatchTest(#"(?xx)[ \t]+"#, input: " \t\t ", match: "\t\t") firstMatchTest(#"(?xx)[ \t]+"#, input: " \t \t", match: "\t") + + firstMatchTest("(?xx)[ a && ab ]+", input: " aaba ", match: "aa") + firstMatchTest("(?xx)[ ] a ]+", input: " a]]a ] ", match: "a]]a") } func testASCIIClasses() { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e9b422379..65dd6ed09 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -460,9 +460,25 @@ extension RegexTests { parseTest("[-]", charClass("-")) - // Empty character classes are forbidden, therefore this is a character - // class of literal ']'. + // Empty character classes are forbidden, therefore these are character + // classes containing literal ']'. parseTest("[]]", charClass("]")) + parseTest("[]a]", charClass("]", "a")) + parseTest( + "(?x)[ ]]", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + charClass("]")) + ) + parseTest( + "(?x)[ ] ]", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + charClass("]")) + ) + parseTest( + "(?x)[ ] a ]", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + charClass("]", "a")) + ) // These are metacharacters in certain contexts, but normal characters // otherwise. @@ -613,6 +629,16 @@ extension RegexTests { parseTest( "~~*", concat("~", zeroOrMore(of: "~"))) + parseTest( + "[ && ]", charClass( + .setOperation([" "], .init(faking: .intersection), [" ", " "])) + ) + parseTest( + "(?x)[ a && b ]", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, charClass( + .setOperation(["a"], .init(faking: .intersection), ["b"])) + )) + // MARK: Quotes parseTest( @@ -2205,6 +2231,9 @@ extension RegexTests { diagnosticTest(")))", .unbalancedEndOfGroup) diagnosticTest("())()", .unbalancedEndOfGroup) + diagnosticTest("[", .expectedCustomCharacterClassMembers) + diagnosticTest("[^", .expectedCustomCharacterClassMembers) + diagnosticTest(#"\u{5"#, .expected("}")) diagnosticTest(#"\x{5"#, .expected("}")) diagnosticTest(#"\N{A"#, .expected("}")) @@ -2245,9 +2274,21 @@ extension RegexTests { diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) - // The first ']' of a custom character class is literal, so this is missing - // the closing bracket. + // MARK: Character classes + + diagnosticTest("[a", .expected("]")) + + // The first ']' of a custom character class is literal, so these are + // missing the closing bracket. diagnosticTest("[]", .expected("]")) + diagnosticTest("(?x)[ ]", .expected("]")) + + diagnosticTest("[&&]", .expectedCustomCharacterClassMembers) + diagnosticTest("[a&&]", .expectedCustomCharacterClassMembers) + diagnosticTest("[&&a]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ && ]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ &&a]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[a&& ]", .expectedCustomCharacterClassMembers) diagnosticTest("[:a", .expected("]")) diagnosticTest("[:a:", .expected("]")) From e84c93d061d1b57c1917ad202ffad40e4e907548 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 21 Apr 2022 11:11:15 +0100 Subject: [PATCH 08/11] Throw error if we encounter stray opening '(' This should be unreachable, let's make sure of that. Doing so requires generalizing the handling of LocatedError a bit. --- .../Regex/Parse/LexicalAnalysis.swift | 21 +++++++++++++------ .../Regex/Parse/SourceLocation.swift | 5 +++-- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index f70989c9f..daa629055 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -21,6 +21,16 @@ API convention: - eat() and tryEat() is still used by the parser as a character-by-character interface */ +extension Error { + func addingLocation(_ loc: Range) -> Error { + // If we're already a LocatedError, don't change the location. + if self is _LocatedErrorProtocol { + return self + } + return Source.LocatedError(self, loc) + } +} + extension Source { // MARK: - recordLoc @@ -51,12 +61,8 @@ extension Source { do { guard let result = try f(&self) else { return nil } return Located(result, start.. { - throw e - } catch let e as ParseError { - throw LocatedError(e, start..: Error { + public struct LocatedError: Error, _LocatedErrorProtocol { public let error: E public let location: SourceLocation @@ -70,7 +72,6 @@ extension Source { self.error = v self.location = Location(r) } - } /// Located value: a value wrapped with a source range From 1ea6f2043ef824b2314e95b1d22f3080094633b4 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 21 Apr 2022 11:11:16 +0100 Subject: [PATCH 09/11] Change matching option scoping behavior to match PCRE Previously we would always parse a "change matching option" sequence as a group, and for the isolated syntax e.g `(?x)`, we would have it implicitly wrap everything after in the same group by having it do a recursive parse. This matched the Oniguruma behavior for such isolated groups, and was easy to implement, but its behavior is quite unintuitive when it comes to alternations, as e.g `a(?x)b|c` becomes `a(?x:b|c)`, which may not be expected. Instead, let's follow PCRE's behavior by having such isolated cases affect the syntax options for the remainder of the current group, including across alternation branches. This is done by lexing such cases as atoms (as they aren't really group-like anymore), and having them change the syntax options when we encounter them. The existing scoping rules around groups take care of resetting the options when we exit the scope. --- Sources/_RegexParser/Regex/AST/Atom.swift | 10 +- Sources/_RegexParser/Regex/AST/Group.swift | 19 +- .../Regex/Parse/LexicalAnalysis.swift | 98 ++-- Sources/_RegexParser/Regex/Parse/Parse.swift | 64 ++- .../_RegexParser/Regex/Printing/DumpAST.swift | 34 +- Sources/_StringProcessing/ByteCodeGen.swift | 8 +- .../_StringProcessing/ConsumerInterface.swift | 7 +- .../_StringProcessing/PrintAsPattern.swift | 5 + .../Regex/ASTConversion.swift | 9 +- Sources/_StringProcessing/Regex/DSLTree.swift | 2 + Sources/_StringProcessing/Regex/Options.swift | 3 +- .../Utility/ASTBuilder.swift | 9 +- Tests/RegexTests/MatchTests.swift | 11 + Tests/RegexTests/ParseTests.swift | 471 +++++++++--------- 14 files changed, 404 insertions(+), 346 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 6482c4042..9cc2e9a96 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -72,6 +72,9 @@ extension AST { // (*ACCEPT), (*FAIL), ... case backtrackingDirective(BacktrackingDirective) + + // (?i), (?i-m), ... + case changeMatchingOptions(MatchingOptionSequence) } } } @@ -91,6 +94,7 @@ extension AST.Atom { case .subpattern(let v): return v case .callout(let v): return v case .backtrackingDirective(let v): return v + case .changeMatchingOptions(let v): return v case .any: return nil case .startOfLine: return nil case .endOfLine: return nil @@ -691,7 +695,7 @@ extension AST.Atom { return nil case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, - .callout, .backtrackingDirective: + .callout, .backtrackingDirective, .changeMatchingOptions: return nil } } @@ -731,7 +735,7 @@ extension AST.Atom { case .property, .escaped, .any, .startOfLine, .endOfLine, .backreference, .subpattern, .namedCharacter, .callout, - .backtrackingDirective: + .backtrackingDirective, .changeMatchingOptions: return nil } } @@ -740,6 +744,8 @@ extension AST.Atom { switch kind { case .backtrackingDirective(let b): return b.isQuantifiable + case .changeMatchingOptions: + return false // TODO: Are callouts quantifiable? default: return true diff --git a/Sources/_RegexParser/Regex/AST/Group.swift b/Sources/_RegexParser/Regex/AST/Group.swift index 81e0931ad..a8c4f8b0f 100644 --- a/Sources/_RegexParser/Regex/AST/Group.swift +++ b/Sources/_RegexParser/Regex/AST/Group.swift @@ -68,9 +68,7 @@ extension AST { case atomicScriptRun // (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:) - // Isolated options are written as e.g (?i), and implicitly form a group - // containing all the following elements of the current group. - case changeMatchingOptions(MatchingOptionSequence, isIsolated: Bool) + case changeMatchingOptions(MatchingOptionSequence) // NOTE: Comments appear to be groups, but are not parsed // the same. They parse more like quotes, so are not @@ -87,21 +85,6 @@ extension AST.Group.Kind { } } - /// Whether this is a group with an implicit scope, e.g isolated matching - /// options implicitly become parent groups for the rest of the elements in - /// the current group: - /// - /// (a(?i)bc)de -> (a(?i:bc))de - /// - public var hasImplicitScope: Bool { - switch self { - case .changeMatchingOptions(_, let isIsolated): - return isIsolated - default: - return false - } - } - /// If this is a named group, its name, `nil` otherwise. public var name: String? { switch self { diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index daa629055..e8b7e9e18 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -712,6 +712,22 @@ extension Source { return .init(caretLoc: nil, adding: adding, minusLoc: nil, removing: []) } + /// A matching option changing atom. + /// + /// '(?' MatchingOptionSeq ')' + /// + mutating func lexChangeMatchingOptionAtom( + context: ParsingContext + ) throws -> AST.MatchingOptionSequence? { + try tryEating { src in + guard src.tryEat(sequence: "(?"), + let seq = try src.lexMatchingOptionSequence(context: context) + else { return nil } + try src.expect(")") + return seq + } + } + /// Try to consume explicitly spelled-out PCRE2 group syntax. mutating func lexExplicitPCRE2GroupStart() -> AST.Group.Kind? { tryEating { src in @@ -852,7 +868,7 @@ extension Source { // otherwise a matching option specifier. Conversely, '(?P' can be the // start of a matching option sequence, or a reference if it is followed // by '=' or '<'. - guard !src.shouldLexGroupLikeAtom() else { return nil } + guard !src.shouldLexGroupLikeAtom(context: context) else { return nil } guard src.tryEat("(") else { return nil } if src.tryEat("?") { @@ -877,22 +893,13 @@ extension Source { // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:). if let seq = try src.lexMatchingOptionSequence(context: context) { - if src.tryEat(":") { - return .changeMatchingOptions(seq, isIsolated: false) - } - // If this isn't start of an explicit group, we should have an - // implicit group that covers the remaining elements of the current - // group. - // TODO: This implicit scoping behavior matches Oniguruma, but PCRE - // also does it across alternations, which will require additional - // handling. - guard src.tryEat(")") else { + guard src.tryEat(":") else { if let next = src.peek() { throw ParseError.invalidMatchingOption(next) } throw ParseError.expected(")") } - return .changeMatchingOptions(seq, isIsolated: true) + return .changeMatchingOptions(seq) } guard let next = src.peek() else { @@ -1041,18 +1048,8 @@ extension Source { context: ParsingContext ) throws -> Located? { try tryEating { src in - guard src.tryEat(sequence: "(?"), - let group = try src.lexGroupStart(context: context) - else { return nil } - - // Implicitly scoped groups are not supported here. - guard !group.value.hasImplicitScope else { - throw LocatedError( - ParseError.unsupportedCondition("implicitly scoped group"), - group.location - ) - } - return group + guard src.tryEat(sequence: "(?") else { return nil } + return try src.lexGroupStart(context: context) } } @@ -1239,17 +1236,19 @@ extension Source { allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false ) throws -> AST.Reference? { let kind = try recordLoc { src -> AST.Reference.Kind? in - // Note this logic should match canLexNumberedReference. - if src.tryEat("+") { - return .relative(try src.expectNumber().value) - } - if src.tryEat("-") { - return .relative(try -src.expectNumber().value) - } - if let num = try src.lexNumber() { - return .absolute(num.value) + try src.tryEating { src in + // Note this logic should match canLexNumberedReference. + if src.tryEat("+"), let num = try src.lexNumber() { + return .relative(num.value) + } + if src.tryEat("-"), let num = try src.lexNumber() { + return .relative(-num.value) + } + if let num = try src.lexNumber() { + return .absolute(num.value) + } + return nil } - return nil } guard let kind = kind else { return nil } guard allowWholePatternRef || kind.value != .recurseWholePattern else { @@ -1478,8 +1477,21 @@ extension Source { return src.canLexNumberedReference() } + private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool { + var src = self + + // See if we can lex a matching option sequence that terminates in ')'. Such + // a sequence is an atom. If an error is thrown, there are invalid elements + // of the matching option sequence. In such a case, we can lex as a group + // and diagnose the invalid group kind. + guard (try? src.lexMatchingOptionSequence(context: context)) != nil else { + return false + } + return src.tryEat(")") + } + /// Whether a group specifier should be lexed as an atom instead of a group. - private func shouldLexGroupLikeAtom() -> Bool { + private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool { var src = self guard src.tryEat("(") else { return false } @@ -1493,6 +1505,9 @@ extension Source { // The start of an Oniguruma 'of-contents' callout. if src.tryEat("{") { return true } + // A matching option atom (?x), (?i), ... + if src.canLexMatchingOptionsAsAtom(context: context) { return true } + return false } // The start of a backreference directive or Oniguruma named callout. @@ -1753,13 +1768,20 @@ extension Source { /// /// GroupLikeAtom -> GroupLikeReference | Callout | BacktrackingDirective /// - mutating func expectGroupLikeAtom() throws -> AST.Atom.Kind { + mutating func expectGroupLikeAtom( + context: ParsingContext + ) throws -> AST.Atom.Kind { try recordLoc { src in // References that look like groups, e.g (?R), (?1), ... if let ref = try src.lexGroupLikeReference() { return ref.value } + // Change matching options atom (?i), (?x-i), ... + if let seq = try src.lexChangeMatchingOptionAtom(context: context) { + return .changeMatchingOptions(seq) + } + // (*ACCEPT), (*FAIL), (*MARK), ... if let b = try src.lexBacktrackingDirective() { return .backtrackingDirective(b) @@ -1828,8 +1850,8 @@ extension Source { // If we have group syntax that was skipped over in lexGroupStart, we // need to handle it as an atom, or throw an error. - if !customCC && src.shouldLexGroupLikeAtom() { - return try src.expectGroupLikeAtom() + if !customCC && src.shouldLexGroupLikeAtom(context: context) { + return try src.expectGroupLikeAtom(context: context) } // A quantifier here is invalid. diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index b24097b83..975012546 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -282,42 +282,53 @@ extension Parser { loc(start))) } + /// Apply the syntax options of a given matching option sequence to the + /// current set of options. + private mutating func applySyntaxOptions( + of opts: AST.MatchingOptionSequence + ) { + // We skip this for multi-line, as extended syntax is always enabled there. + if context.syntax.contains(.multilineExtendedSyntax) { return } + + // Check if we're introducing or removing extended syntax. + // TODO: PCRE differentiates between (?x) and (?xx) where only the latter + // handles non-semantic whitespace in a custom character class. Other + // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, + // treat (?x) and (?xx) as the same option here. If we ever get a strict + // PCRE mode, we will need to change this to handle that. + if opts.resetsCurrentOptions { + context.syntax.remove(.extendedSyntax) + } + if opts.adding.contains(where: \.isAnyExtended) { + context.syntax.insert(.extendedSyntax) + } + if opts.removing.contains(where: \.isAnyExtended) { + context.syntax.remove(.extendedSyntax) + } + } + + /// Apply the syntax options of a matching option changing group to the + /// current set of options. + private mutating func applySyntaxOptions(of group: AST.Group.Kind) { + if case .changeMatchingOptions(let seq) = group { + applySyntaxOptions(of: seq) + } + } + /// Perform a recursive parse for the body of a group. mutating func parseGroupBody( start: Source.Position, _ kind: AST.Located ) throws -> AST.Group { context.recordGroup(kind.value) - // Check if we're introducing or removing extended syntax. We skip this for - // multi-line, as extended syntax is always enabled there. - // TODO: PCRE differentiates between (?x) and (?xx) where only the latter - // handles non-semantic whitespace in a custom character class. Other - // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, - // treat (?x) and (?xx) as the same option here. If we ever get a strict - // PCRE mode, we will need to change this to handle that. let currentSyntax = context.syntax - if !context.syntax.contains(.multilineExtendedSyntax) { - if case .changeMatchingOptions(let c, isIsolated: _) = kind.value { - if c.resetsCurrentOptions { - context.syntax.remove(.extendedSyntax) - } - if c.adding.contains(where: \.isAnyExtended) { - context.syntax.insert(.extendedSyntax) - } - if c.removing.contains(where: \.isAnyExtended) { - context.syntax.remove(.extendedSyntax) - } - } - } + applySyntaxOptions(of: kind.value) defer { context.syntax = currentSyntax } let child = try parseNode() - // An implicit scoped group has already consumed its closing paren. - if !kind.value.hasImplicitScope { - try source.expect(")") - } + try source.expect(")") return .init(kind, child, loc(start)) } @@ -409,6 +420,11 @@ extension Parser { } if let atom = try source.lexAtom(context: context) { + // If we have a change matching options atom, apply the syntax options. We + // already take care of scoping syntax options within a group. + if case .changeMatchingOptions(let opts) = atom.kind { + applySyntaxOptions(of: opts) + } // TODO: track source locations return .atom(atom) } diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 986f3d86e..8565b14e9 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -156,6 +156,9 @@ extension AST.Atom { case .backtrackingDirective(let d): return "\(d)" + case .changeMatchingOptions(let opts): + return "changeMatchingOptions<\(opts)>" + case .char, .scalar: fatalError("Unreachable") } @@ -225,22 +228,21 @@ extension AST.Reference: _ASTPrintable { extension AST.Group.Kind: _ASTPrintable { public var _dumpBase: String { switch self { - case .capture: return "capture" - case .namedCapture(let s): return "capture<\(s.value)>" - case .balancedCapture(let b): return "balanced capture \(b)" - case .nonCapture: return "nonCapture" - case .nonCaptureReset: return "nonCaptureReset" - case .atomicNonCapturing: return "atomicNonCapturing" - case .lookahead: return "lookahead" - case .negativeLookahead: return "negativeLookahead" - case .nonAtomicLookahead: return "nonAtomicLookahead" - case .lookbehind: return "lookbehind" - case .negativeLookbehind: return "negativeLookbehind" - case .nonAtomicLookbehind: return "nonAtomicLookbehind" - case .scriptRun: return "scriptRun" - case .atomicScriptRun: return "atomicScriptRun" - case .changeMatchingOptions(let seq, let isIsolated): - return "changeMatchingOptions<\(seq), \(isIsolated)>" + case .capture: return "capture" + case .namedCapture(let s): return "capture<\(s.value)>" + case .balancedCapture(let b): return "balanced capture \(b)" + case .nonCapture: return "nonCapture" + case .nonCaptureReset: return "nonCaptureReset" + case .atomicNonCapturing: return "atomicNonCapturing" + case .lookahead: return "lookahead" + case .negativeLookahead: return "negativeLookahead" + case .nonAtomicLookahead: return "nonAtomicLookahead" + case .lookbehind: return "lookbehind" + case .negativeLookbehind: return "negativeLookbehind" + case .nonAtomicLookbehind: return "nonAtomicLookbehind" + case .scriptRun: return "scriptRun" + case .atomicScriptRun: return "atomicScriptRun" + case .changeMatchingOptions(let seq): return "changeMatchingOptions<\(seq)>" } } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 3cc2d4039..b6f9b4732 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -34,6 +34,9 @@ extension Compiler.ByteCodeGen { case let .symbolicReference(id): builder.buildUnresolvedReference(id: id) + case let .changeMatchingOptions(optionSequence): + options.apply(optionSequence) + case let .unconverted(astAtom): if let consumer = try astAtom.generateConsumer(options) { builder.buildConsume(by: consumer) @@ -349,7 +352,7 @@ extension Compiler.ByteCodeGen { case .capture, .namedCapture, .balancedCapture: throw Unreachable("These should produce a capture node") - case .changeMatchingOptions(let optionSequence, _): + case .changeMatchingOptions(let optionSequence): options.apply(optionSequence) try emitNode(child) @@ -585,6 +588,9 @@ extension Compiler.ByteCodeGen { } case let .capture(_, refId, child): + options.beginScope() + defer { options.endScope() } + let cap = builder.makeCapture(id: refId) switch child { case let .matcher(_, m): diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index d1cc1d327..ecb7d1356 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -100,6 +100,10 @@ extension DSLTree.Atom { // TODO: Should we handle? return nil + case .changeMatchingOptions: + // TODO: Should we handle? + return nil + case let .unconverted(a): return try a.generateConsumer(opts) } @@ -178,7 +182,8 @@ extension AST.Atom { return nil case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, - .backreference, .subpattern, .callout, .backtrackingDirective: + .backreference, .subpattern, .callout, .backtrackingDirective, + .changeMatchingOptions: // FIXME: implement return nil } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 3afc19836..4d135898b 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -137,6 +137,8 @@ extension PrettyPrinter { print("/* TOOD: backreferences */") case .symbolicReference: print("/* TOOD: symbolic references */") + case .changeMatchingOptions: + print("/* TODO: change matching options */") } case .trivia: @@ -320,6 +322,9 @@ extension AST.Atom { case .backtrackingDirective: return " /* TODO: backtracking directive */" + + case .changeMatchingOptions: + return "/* TODO: change matching options */" } } } diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 86041f528..8acbd3b1b 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -206,10 +206,11 @@ extension AST.Atom { } switch self.kind { - case let .char(c): return .char(c) - case let .scalar(s): return .scalar(s) - case .any: return .any - case let .backreference(r): return .backreference(r) + case let .char(c): return .char(c) + case let .scalar(s): return .scalar(s) + case .any: return .any + case let .backreference(r): return .backreference(r) + case let .changeMatchingOptions(seq): return .changeMatchingOptions(seq) case .escaped(let c) where c.scalarValue != nil: return .scalar(c.scalarValue!) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 7b2ecd515..51f5ea36f 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -166,6 +166,8 @@ extension DSLTree { case backreference(AST.Reference) case symbolicReference(ReferenceID) + case changeMatchingOptions(AST.MatchingOptionSequence) + case unconverted(AST.Atom) } } diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index 6f38956e7..623589b54 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -195,7 +195,6 @@ extension RegexComponent { ? AST.MatchingOptionSequence(adding: [.init(option, location: .fake)]) : AST.MatchingOptionSequence(removing: [.init(option, location: .fake)]) return Regex(node: .nonCapturingGroup( - .changeMatchingOptions(sequence, isIsolated: false), - regex.root)) + .changeMatchingOptions(sequence), regex.root)) } } diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 961d0e23f..51d4f8bfc 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -119,9 +119,14 @@ func atomicScriptRun(_ child: AST.Node) -> AST.Node { group(.atomicScriptRun, child) } func changeMatchingOptions( - _ seq: AST.MatchingOptionSequence, isIsolated: Bool, _ child: AST.Node + _ seq: AST.MatchingOptionSequence, _ child: AST.Node ) -> AST.Node { - group(.changeMatchingOptions(seq, isIsolated: isIsolated), child) + group(.changeMatchingOptions(seq), child) +} +func changeMatchingOptions( + _ seq: AST.MatchingOptionSequence +) -> AST.Node { + atom(.changeMatchingOptions(seq)) } func matchingOptions( diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 868426d41..e00c77f56 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1319,6 +1319,17 @@ extension RegexTests { firstMatchTest(#"(((?s)a)).b"#, input: "a\nb", match: nil) firstMatchTest(#"(?s)(((?-s)a)).b"#, input: "a\nb", match: "a\nb") firstMatchTest(#"(?s)((?-s)((?i)a)).b"#, input: "a\nb", match: "a\nb") + + // Matching option changing persists across alternations. + firstMatchTest(#"a(?s)b|c|.d"#, input: "abc", match: "ab") + firstMatchTest(#"a(?s)b|c|.d"#, input: "c", match: "c") + firstMatchTest(#"a(?s)b|c|.d"#, input: "a\nd", match: "\nd") + firstMatchTest(#"a(?s)(?^)b|c|.d"#, input: "a\nd", match: nil) + firstMatchTest(#"a(?s)b|.c(?-s)|.d"#, input: "a\nd", match: nil) + firstMatchTest(#"a(?s)b|.c(?-s)|.d"#, input: "a\nc", match: "\nc") + firstMatchTest(#"a(?s)b|c(?-s)|(?^s).d"#, input: "a\nd", match: "\nd") + firstMatchTest(#"a(?:(?s).b)|.c|.d"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"a(?:(?s).b)|.c"#, input: "a\nc", match: nil) } func testOptionMethods() throws { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 65dd6ed09..bdae250ba 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -464,21 +464,18 @@ extension RegexTests { // classes containing literal ']'. parseTest("[]]", charClass("]")) parseTest("[]a]", charClass("]", "a")) - parseTest( - "(?x)[ ]]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("]")) - ) - parseTest( - "(?x)[ ] ]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("]")) - ) - parseTest( - "(?x)[ ] a ]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("]", "a")) - ) + parseTest("(?x)[ ]]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]") + )) + parseTest("(?x)[ ] ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]") + )) + parseTest("(?x)[ ] a ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]", "a") + )) // These are metacharacters in certain contexts, but normal characters // otherwise. @@ -630,14 +627,13 @@ extension RegexTests { "~~*", concat("~", zeroOrMore(of: "~"))) parseTest( - "[ && ]", charClass( - .setOperation([" "], .init(faking: .intersection), [" ", " "])) + "[ && ]", + charClass(.setOperation([" "], .init(faking: .intersection), [" ", " "])) ) - parseTest( - "(?x)[ a && b ]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, charClass( - .setOperation(["a"], .init(faking: .intersection), ["b"])) - )) + parseTest("(?x)[ a && b ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(.setOperation(["a"], .init(faking: .intersection), ["b"])) + )) // MARK: Quotes @@ -832,81 +828,67 @@ extension RegexTests { // Matching option changing groups. parseTest("(?-)", changeMatchingOptions( - matchingOptions(), isIsolated: true, empty()) - ) + matchingOptions() + )) parseTest("(?i)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, empty()) - ) + matchingOptions(adding: .caseInsensitive) + )) parseTest("(?m)", changeMatchingOptions( - matchingOptions(adding: .multiline), - isIsolated: true, empty()) - ) + matchingOptions(adding: .multiline) + )) parseTest("(?x)", changeMatchingOptions( - matchingOptions(adding: .extended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extended) + )) parseTest("(?xx)", changeMatchingOptions( - matchingOptions(adding: .extraExtended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extraExtended) + )) parseTest("(?xxx)", changeMatchingOptions( - matchingOptions(adding: .extraExtended, .extended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extraExtended, .extended) + )) parseTest("(?P)", changeMatchingOptions( - matchingOptions(adding: .asciiOnlyPOSIXProps), isIsolated: true, empty()) - ) + matchingOptions(adding: .asciiOnlyPOSIXProps) + )) parseTest("(?-i)", changeMatchingOptions( - matchingOptions(removing: .caseInsensitive), - isIsolated: true, empty()) - ) + matchingOptions(removing: .caseInsensitive) + )) parseTest("(?i-s)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive, removing: .singleLine), - isIsolated: true, empty()) - ) + matchingOptions(adding: .caseInsensitive, removing: .singleLine) + )) parseTest("(?i-is)", changeMatchingOptions( matchingOptions(adding: .caseInsensitive, - removing: .caseInsensitive, .singleLine), - isIsolated: true, empty()) - ) + removing: .caseInsensitive, .singleLine) + )) parseTest("(?:)", nonCapture(empty())) parseTest("(?-:)", changeMatchingOptions( - matchingOptions(), isIsolated: false, empty()) - ) + matchingOptions(), empty() + )) parseTest("(?i:)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: false, empty()) - ) + matchingOptions(adding: .caseInsensitive), empty() + )) parseTest("(?-i:)", changeMatchingOptions( - matchingOptions(removing: .caseInsensitive), - isIsolated: false, empty()) - ) + matchingOptions(removing: .caseInsensitive), empty() + )) parseTest("(?P:)", changeMatchingOptions( - matchingOptions(adding: .asciiOnlyPOSIXProps), isIsolated: false, empty()) - ) + matchingOptions(adding: .asciiOnlyPOSIXProps), empty() + )) parseTest("(?^)", changeMatchingOptions( - unsetMatchingOptions(), - isIsolated: true, empty()) - ) + unsetMatchingOptions() + )) parseTest("(?^:)", changeMatchingOptions( - unsetMatchingOptions(), - isIsolated: false, empty()) - ) + unsetMatchingOptions(), empty() + )) parseTest("(?^ims:)", changeMatchingOptions( unsetMatchingOptions(adding: .caseInsensitive, .multiline, .singleLine), - isIsolated: false, empty()) - ) + empty() + )) parseTest("(?^J:)", changeMatchingOptions( - unsetMatchingOptions(adding: .allowDuplicateGroupNames), - isIsolated: false, empty()) - ) + unsetMatchingOptions(adding: .allowDuplicateGroupNames), empty() + )) parseTest("(?^y{w}:)", changeMatchingOptions( - unsetMatchingOptions(adding: .textSegmentWordMode), - isIsolated: false, empty()) - ) + unsetMatchingOptions(adding: .textSegmentWordMode), empty() + )) let allOptions: [AST.MatchingOption.Kind] = [ .caseInsensitive, .allowDuplicateGroupNames, .multiline, .noAutoCapture, @@ -917,50 +899,64 @@ extension RegexTests { .byteSemantics ] parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW)", changeMatchingOptions( - matchingOptions( - adding: allOptions, - removing: allOptions.dropLast(5) - ), - isIsolated: true, empty()) - ) + matchingOptions(adding: allOptions, removing: allOptions.dropLast(5)) + )) parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW:)", changeMatchingOptions( - matchingOptions( - adding: allOptions, - removing: allOptions.dropLast(5) - ), - isIsolated: false, empty()) - ) + matchingOptions(adding: allOptions, removing: allOptions.dropLast(5)), empty() + )) parseTest( - "a(b(?i)c)d", concat("a", capture(concat("b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, "c"))), "d"), + "a(b(?i)c)d", concat( + "a", + capture(concat( + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + )), + "d" + ), captures: .atom() ) parseTest( - "(a(?i)b(c)d)", capture(concat("a", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, concat("b", capture("c"), "d")))), + "(a(?i)b(c)d)", capture(concat( + "a", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "b", + capture("c"), + "d" + )), captures: .tuple(.atom(), .atom()) ) parseTest( - "(a(?i)b(?#hello)c)", capture(concat("a", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, concat("b", "c")))), + "(a(?i)b(?#hello)c)", capture(concat( + "a", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "b", + "c" + )), captures: .atom() ) - // TODO: This is Oniguruma's behavior, but PCRE treats it as: - // ab(?i:c)|(?i:def)|(?i:gh) - // instead. We ought to have a mode to emulate that. - parseTest("ab(?i)c|def|gh", concat("a", "b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), isIsolated: true, - alt("c", concat("d", "e", "f"), concat("g", "h"))))) + parseTest("ab(?i)c|def|gh", alt( + concat( + "a", + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + ), + concat("d", "e", "f"), + concat("g", "h") + )) - parseTest("(a|b(?i)c|d)", capture(alt("a", concat("b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), isIsolated: true, - alt("c", "d"))))), - captures: .atom()) + parseTest("(a|b(?i)c|d)", capture(alt( + "a", + concat( + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + ), + "d" + )), captures: .atom()) // MARK: References @@ -1479,149 +1475,149 @@ extension RegexTests { parseTest("[(?#abc)]", charClass("(", "?", "#", "a", "b", "c", ")")) parseTest("# abc", concat("#", " ", "a", "b", "c")) - parseTest("(?x) # hello", changeMatchingOptions(matchingOptions( - adding: .extended), isIsolated: true, empty())) - parseTest("(?xx) # hello", changeMatchingOptions(matchingOptions( - adding: .extraExtended), isIsolated: true, empty())) - parseTest("(?x) \\# abc", changeMatchingOptions(matchingOptions( - adding: .extended), isIsolated: true, concat("#", "a", "b", "c"))) - parseTest("(?xx) \\ ", changeMatchingOptions(matchingOptions( - adding: .extraExtended), isIsolated: true, concat(" "))) + // MARK: Matching option changing parseTest( - "(?x) a (?^) b", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat( - "a", - changeMatchingOptions( - unsetMatchingOptions(), isIsolated: true, concat(" ", "b")) - ) - ) + "(?x) # hello", + changeMatchingOptions(matchingOptions(adding: .extended)) ) - - // End of line comments aren't applicable in custom char classes. - // TODO: ICU supports this. parseTest( - "(?x)[ # abc]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("#", "a", "b", "c")) + "(?xx) # hello", + changeMatchingOptions(matchingOptions(adding: .extraExtended)) ) + parseTest("(?x) \\# abc", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "#", "a", "b", "c" + )) + parseTest("(?xx) \\ ", concat( + changeMatchingOptions(matchingOptions(adding: .extraExtended)), " " + )) parseTest( - "(?x)a b c[d e f]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", charClass("d", "e", "f"))) - ) - parseTest( - "(?xx)a b c[d e f]", changeMatchingOptions( - matchingOptions(adding: .extraExtended), isIsolated: true, - concat("a", "b", "c", charClass("d", "e", "f"))) - ) - parseTest( - "(?x)a b c(?-x)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(matchingOptions(removing: .extended), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?x)a b c(?-xx)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(matchingOptions(removing: .extraExtended), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?xx)a b c(?-x)d e f", changeMatchingOptions( - matchingOptions(adding: .extraExtended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(matchingOptions(removing: .extended), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?x)a b c(?^i)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(unsetMatchingOptions(adding: .caseInsensitive), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?x)a b c(?^x)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(unsetMatchingOptions(adding: .extended), - isIsolated: true, concat("d", "e", "f")))) - ) - parseTest( - "(?:(?x)a b c)d e f", concat(nonCapture(changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c"))), "d", " ", "e", " ", "f") - ) - parseTest( - "(?x:a b c)# hi", concat(changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: false, - concat("a", "b", "c")), "#", " ", "h", "i") + "(?x) a (?^) b", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", + changeMatchingOptions(unsetMatchingOptions()), + " ", "b" + ) ) - parseTest( - "(?x-x)a b c", changeMatchingOptions( - matchingOptions(adding: .extended, removing: .extended), isIsolated: true, - concat("a", " ", "b", " ", "c")) - ) - parseTest( - "(?xxx-x)a b c", changeMatchingOptions( - matchingOptions(adding: .extraExtended, .extended, removing: .extended), isIsolated: true, - concat("a", " ", "b", " ", "c")) - ) - parseTest( - "(?xx-i)a b c", changeMatchingOptions( - matchingOptions(adding: .extraExtended, removing: .caseInsensitive), isIsolated: true, - concat("a", "b", "c")) + // End of line comments aren't applicable in custom char classes. + // TODO: ICU supports this. + parseTest("(?x)[ # abc]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("#", "a", "b", "c") + )) + + parseTest("(?x)a b c[d e f]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", charClass("d", "e", "f") + )) + parseTest("(?xx)a b c[d e f]", concat( + changeMatchingOptions(matchingOptions(adding: .extraExtended)), + "a", "b", "c", charClass("d", "e", "f") + )) + parseTest("(?x)a b c(?-x)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extended)), + "d", " ", "e", " ", "f" + )) + parseTest("(?x)a b c(?-xx)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extraExtended)), + "d", " ", "e", " ", "f" + )) + parseTest("(?xx)a b c(?-x)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extraExtended)), + "a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extended)), + "d", " ", "e", " ", "f" + )) + parseTest("(?x)a b c(?^i)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(unsetMatchingOptions(adding: .caseInsensitive)), + "d", " ", "e", " ", "f" + )) + parseTest("(?x)a b c(?^x)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(unsetMatchingOptions(adding: .extended)), + "d", "e", "f" + )) + parseTest("(?:(?x)a b c)d e f", concat( + nonCapture(concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c" + )), + "d", " ", "e", " ", "f" + )) + parseTest("(?x:a b c)# hi", concat(changeMatchingOptions( + matchingOptions(adding: .extended), + concat("a", "b", "c")), "#", " ", "h", "i") ) - // PCRE states that whitespace seperating quantifiers is permitted under - // extended syntax http://pcre.org/current/doc/html/pcre2api.html#SEC20 - parseTest( - "(?x)a *", + parseTest("(?x-x)a b c", concat( changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - zeroOrMore(of: "a")) - ) - parseTest( - "(?x)a + ?", + matchingOptions(adding: .extended, removing: .extended) + ), + "a", " ", "b", " ", "c" + )) + parseTest("(?xxx-x)a b c", concat( changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - oneOrMore(.reluctant, of: "a")) - ) - parseTest( - "(?x)a {2,4}", + matchingOptions(adding: .extraExtended, .extended, removing: .extended) + ), + "a", " ", "b", " ", "c" + )) + parseTest("(?xx-i)a b c", concat( changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - quantRange(2 ... 4, of: "a")) - ) + matchingOptions(adding: .extraExtended, removing: .caseInsensitive) + ), + "a", "b", "c" + )) + + // PCRE states that whitespace seperating quantifiers is permitted under + // extended syntax http://pcre.org/current/doc/html/pcre2api.html#SEC20 + parseTest("(?x)a *", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + zeroOrMore(of: "a") + )) + parseTest("(?x)a + ?", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + oneOrMore(.reluctant, of: "a") + )) + parseTest("(?x)a {2,4}", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + quantRange(2 ... 4, of: "a") + )) // PCRE states that whitespace won't be ignored within a range. // http://pcre.org/current/doc/html/pcre2api.html#SEC20 // TODO: We ought to warn on this, and produce a range anyway. - parseTest( - "(?x)a{1, 3}", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "{", "1", ",", "3", "}")) - ) + parseTest("(?x)a{1, 3}", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "{", "1", ",", "3", "}" + )) // Test that we cover the list of whitespace characters covered by PCRE. parseTest( "(?x)a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, concat("a", "b")) - ) + concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b" + )) parseTest( "(?x)[a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b]", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, charClass("a", "b")) - ) + concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("a", "b") + )) + + parseTest(#"(?i:)?"#, zeroOrOne(of: changeMatchingOptions( + matchingOptions(adding: .caseInsensitive), empty() + ))) // Test multi-line comment handling. parseTest( @@ -1843,10 +1839,10 @@ extension RegexTests { parseWithDelimitersTest("#|[a b]|#", charClass("a", "b")) parseWithDelimitersTest( - "#|(?-x)[a b]|#", changeMatchingOptions( - matchingOptions(removing: .extended), isIsolated: true, - charClass("a", " ", "b")) - ) + "#|(?-x)[a b]|#", concat( + changeMatchingOptions(matchingOptions(removing: .extended)), + charClass("a", " ", "b") + )) parseWithDelimitersTest("#|[[a ] b]|#", charClass(charClass("a"), "b")) // Non-semantic whitespace between quantifier characters for consistency @@ -1856,8 +1852,7 @@ extension RegexTests { // End-of-line comments aren't enabled by default in experimental syntax. parseWithDelimitersTest("#|#abc|#", concat("#", "a", "b", "c")) parseWithDelimitersTest("#|(?x)#abc|#", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - empty()) + matchingOptions(adding: .extended)) ) parseWithDelimitersTest("#|||#", alt(empty(), empty())) @@ -1913,8 +1908,7 @@ extension RegexTests { (?^) # comment /# - """, changeMatchingOptions( - unsetMatchingOptions(), isIsolated: true, empty()) + """, changeMatchingOptions(unsetMatchingOptions()) ) // (?x) has no effect. @@ -1923,8 +1917,7 @@ extension RegexTests { (?x) # comment /# - """, changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, empty()) + """, changeMatchingOptions(matchingOptions(adding: .extended)) ) // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter @@ -2328,17 +2321,18 @@ extension RegexTests { diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é")) diagnosticTest(#"\˂"#, .invalidEscape("˂")) - // MARK: Text Segment options + // MARK: Matching options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) diagnosticTest("(?-y{w})", .cannotRemoveTextSegmentOptions) - // MARK: Semantic Level options - diagnosticTest("(?-X)", .cannotRemoveSemanticsOptions) diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions) diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions) + diagnosticTest("(?a)", .unknownGroupKind("?a")) + diagnosticTest("(?y{)", .expected("g")) + // Extended syntax may not be removed in multi-line mode. diagnosticWithDelimitersTest(""" #/ @@ -2406,6 +2400,7 @@ extension RegexTests { diagnosticTest(#"(?^-)"#, .cannotRemoveMatchingOptionsAfterCaret) diagnosticTest(#"(?^i-"#, .cannotRemoveMatchingOptionsAfterCaret) diagnosticTest(#"(?^i-m)"#, .cannotRemoveMatchingOptionsAfterCaret) + diagnosticTest(#"(?i)?"#, .notQuantifiable) // MARK: References @@ -2438,7 +2433,7 @@ extension RegexTests { diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3)) diagnosticTest(#"(?(1)||)"#, .tooManyBranchesInConditional(3)) - diagnosticTest(#"(?(?i))"#, .unsupportedCondition("implicitly scoped group")) + diagnosticTest(#"(?(?i))"#, .unknownGroupKind("?(")) // MARK: Callouts From 5cc0ea0b9ff6c8b882696a58b6d3afaf532f5812 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 21 Apr 2022 11:11:16 +0100 Subject: [PATCH 10/11] Error on unknown character properties Previously we would form an `.other` character property kind for any unclassified properties, which crash at runtime as unsupported. Instead, switch to erroring on them. Eventually it would be nice if we could version this based on what the runtime being targeted supports. --- Sources/_RegexParser/Regex/AST/Atom.swift | 3 --- .../CharacterPropertyClassification.swift | 9 ++++--- .../Regex/Parse/Diagnostics.swift | 9 ++++--- .../_StringProcessing/ConsumerInterface.swift | 4 --- Tests/RegexTests/ParseTests.swift | 27 ++++++++++--------- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 9cc2e9a96..1f6043d72 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -401,9 +401,6 @@ extension AST.Atom.CharacterProperty { /// Some special properties implemented by PCRE and Oniguruma. case pcreSpecial(PCRESpecialCategory) case onigurumaSpecial(OnigurumaSpecialProperty) - - /// Unhandled properties. - case other(key: String?, value: String) } // TODO: erm, separate out or fold into something? splat it in? diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index e5b65a46c..911312121 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -397,8 +397,9 @@ extension Source { return .pcreSpecial(pcreSpecial) } - // Otherwise we don't know what this is. - return .other(key: nil, value: value) + // TODO: This should be versioned, and do we want a more lax behavior for + // the runtime? + throw ParseError.unknownProperty(key: nil, value: value) } static func classifyCharacterProperty( @@ -435,6 +436,8 @@ extension Source { if let match = match { return match } - return .other(key: key, value: value) + // TODO: This should be versioned, and do we want a more lax behavior for + // the runtime? + throw ParseError.unknownProperty(key: key, value: value) } } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 621d6ea11..c3d74c30b 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -57,8 +57,8 @@ enum ParseError: Error, Hashable { case expectedCustomCharacterClassMembers case invalidCharacterClassRangeOperand - case invalidPOSIXSetName(String) case emptyProperty + case unknownProperty(key: String?, value: String) case expectedGroupSpecifier case unbalancedEndOfGroup @@ -142,10 +142,13 @@ extension ParseError: CustomStringConvertible { return "expected custom character class members" case .invalidCharacterClassRangeOperand: return "invalid character class range" - case let .invalidPOSIXSetName(n): - return "invalid character set name: '\(n)'" case .emptyProperty: return "empty property" + case .unknownProperty(let key, let value): + if let key = key { + return "unknown character property '\(key)=\(value)'" + } + return "unknown character property '\(value)'" case .expectedGroupSpecifier: return "expected group specifier" case .unbalancedEndOfGroup: diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index ecb7d1356..f77cd322f 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -423,10 +423,6 @@ extension AST.Atom.CharacterProperty { case .onigurumaSpecial(let s): throw Unsupported("TODO: map Oniguruma special: \(s)") - - case let .other(key, value): - throw Unsupported( - "TODO: map other \(key ?? "")=\(value)") } }() diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index bdae250ba..4043e4ccb 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -501,7 +501,6 @@ extension RegexTests { parseTest(#"[[:a]]"#, charClass(charClass(":", "a"))) parseTest(#"[[:}]]"#, charClass(charClass(":", "}"))) parseTest(#"[[:{]]"#, charClass(charClass(":", "{"))) - parseTest(#"[[:{:]]"#, charClass(posixProp_m(.other(key: nil, value: "{")))) parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":"))) parseTest( @@ -1141,14 +1140,6 @@ extension RegexTests { #"\p{C}+"#, oneOrMore(of: prop(.generalCategory(.other)))) - // TODO: Start erroring on these? - parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx"))) - parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL"))) - parseTest(#"\p{x=y}"#, prop(.other(key: "x", value: "y"))) - parseTest(#"\p{aaa(b)}"#, prop(.other(key: nil, value: "aaa(b)"))) - parseTest("[[:a():]]", charClass(posixProp_m(.other(key: nil, value: "a()")))) - parseTest(#"\p{aaa\p{b}}"#, concat(prop(.other(key: nil, value: #"aaa\p{b"#)), "}")) - // UAX44-LM3 means all of the below are equivalent. let lowercaseLetter = prop(.generalCategory(.lowercaseLetter)) parseTest(#"\p{ll}"#, lowercaseLetter) @@ -2231,12 +2222,12 @@ extension RegexTests { diagnosticTest(#"\x{5"#, .expected("}")) diagnosticTest(#"\N{A"#, .expected("}")) diagnosticTest(#"\N{U+A"#, .expected("}")) - diagnosticTest(#"\p{a"#, .expected("}")) + diagnosticTest(#"\p{a"#, .unknownProperty(key: nil, value: "a")) diagnosticTest(#"\p{a="#, .emptyProperty) diagnosticTest(#"\p{a=}"#, .emptyProperty) - diagnosticTest(#"\p{a=b"#, .expected("}")) - diagnosticTest(#"\p{aaa[b]}"#, .expected("}")) - diagnosticTest(#"\p{a=b=c}"#, .expected("}")) + diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b")) + diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa")) + diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b")) diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) @@ -2321,6 +2312,16 @@ extension RegexTests { diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é")) diagnosticTest(#"\˂"#, .invalidEscape("˂")) + // MARK: Character properties + + diagnosticTest(#"\p{Lx}"#, .unknownProperty(key: nil, value: "Lx")) + diagnosticTest(#"\p{gcL}"#, .unknownProperty(key: nil, value: "gcL")) + diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y")) + diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)")) + diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()")) + diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: #"aaa\p{b"#)) + diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) + // MARK: Matching options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) From 771e7353bc14c83eab16c2a368f888e18aef06fb Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 21 Apr 2022 11:11:17 +0100 Subject: [PATCH 11/11] Don't parse a character property containing a backslash Add backslash to the list of characters we don't consider valid for a character property name. This means that we'll bail when attempting to lex a POSIX character property and instead lex a custom character class. This allows e.g `[:\Q :] \E]` to be lexed as a custom character class. For `\p{...}` this just means we'll emit a truncated invalid property error, which is arguably more inline with what the user was expecting.. I noticed when digging through the ICU source code that it will bail out of parsing a POSIX character property if it encounters one of its known escape sequences (e.g `\a`, `\e`, `\f`, ...). Interestingly this doesn't cover character property escapes e.g `\d`, but it's not clear that is intentional. Given backslash is not a valid character property character anyway, it seems reasonable to broaden this behavior to bail on any backslash. --- .../Regex/Parse/LexicalAnalysis.swift | 8 +++++++ Tests/RegexTests/ParseTests.swift | 21 ++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index e8b7e9e18..6a61ccdf7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1176,6 +1176,14 @@ extension Source { // character property name anyway, and it's nice not to have diverging // logic for these cases. return true + case "\\": + // An escape sequence, which may include e.g '\Q :] \E'. ICU bails here + // for all its known escape sequences (e.g '\a', '\e' '\f', ...). It + // seems character class escapes e.g '\d' are excluded, however it's not + // clear that is intentional. Let's apply the rule for any escape, as a + // backslash would never be a valid character property name, and we can + // diagnose any invalid escapes when parsing as a character class. + return true default: // We may want to handle other metacharacters here, e.g '{', '(', ')', // as they're not valid character property names. However for now diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 4043e4ccb..94c134853 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -503,6 +503,25 @@ extension RegexTests { parseTest(#"[[:{]]"#, charClass(charClass(":", "{"))) parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":"))) + parseTest( + #"[:[:space:]:]"#, + charClass(":", posixProp_m(.binary(.whitespace)), ":") + ) + parseTest( + #"[:a[:space:]b:]"#, + charClass(":", "a", posixProp_m(.binary(.whitespace)), "b", ":") + ) + + // ICU parses a custom character class if it sees any of its known escape + // sequences in a POSIX character property (though it appears to exclude + // character class escapes e.g '\d'). We do so for any escape sequence as + // '\' is not a valid character property character. + parseTest(#"[:\Q:]\E]"#, charClass(":", quote_m(":]"))) + parseTest(#"[:\a:]"#, charClass(":", atom_m(.escaped(.alarm)), ":")) + parseTest(#"[:\d:]"#, charClass(":", atom_m(.escaped(.decimalDigit)), ":")) + parseTest(#"[:\\:]"#, charClass(":", "\\", ":")) + parseTest(#"[:\:]"#, charClass(":", ":")) + parseTest( #"\D\S\W"#, concat( @@ -2319,7 +2338,7 @@ extension RegexTests { diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y")) diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)")) diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()")) - diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: #"aaa\p{b"#)) + diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) // MARK: Matching options