diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift
index 0aa0951c5..6482c4042 100644
--- a/Sources/_RegexParser/Regex/AST/Atom.swift
+++ b/Sources/_RegexParser/Regex/AST/Atom.swift
@@ -631,6 +631,41 @@ extension AST.Atom {
   }
 }
 
+extension AST.Atom.EscapedBuiltin {
+  /// If the escape sequence represents a unicode scalar value, returns the
+  /// value, otherwise `nil`.
+  public var scalarValue: UnicodeScalar? {
+    switch self {
+    // TODO: Should we separate these into a separate enum? Or move the
+    // specifics of the scalar to the DSL tree?
+    case .alarm:
+      return "\u{7}"
+    case .backspace:
+      return "\u{8}"
+    case .escape:
+      return "\u{1B}"
+    case .formfeed:
+      return "\u{C}"
+    case .newline:
+      return "\n"
+    case .carriageReturn:
+      return "\r"
+    case .tab:
+      return "\t"
+
+    case .singleDataUnit, .decimalDigit, .notDecimalDigit,
+        .horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
+        .newlineSequence, .whitespace, .notWhitespace, .verticalTab,
+        .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
+        .wordBoundary, .notWordBoundary, .startOfSubject,
+        .endOfSubjectBeforeNewline, .endOfSubject,
+        .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
+        .textSegment, .notTextSegment:
+      return nil
+    }
+  }
+}
+
 extension AST.Atom {
   /// Retrieve the character value of the atom if it represents a literal
   /// character or unicode scalar, nil otherwise.
@@ -642,34 +677,7 @@ extension AST.Atom {
       return Character(s)
 
     case .escaped(let c):
-      switch c {
-      // TODO: Should we separate these into a separate enum? Or move the
-      // specifics of the scalar to the DSL tree?
-      case .alarm:
-        return "\u{7}"
-      case .backspace:
-        return "\u{8}"
-      case .escape:
-        return "\u{1B}"
-      case .formfeed:
-        return "\u{C}"
-      case .newline:
-        return "\n"
-      case .carriageReturn:
-        return "\r"
-      case .tab:
-        return "\t"
-
-      case .singleDataUnit, .decimalDigit, .notDecimalDigit,
-          .horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
-          .newlineSequence, .whitespace, .notWhitespace, .verticalTab,
-          .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
-          .wordBoundary, .notWordBoundary, .startOfSubject,
-          .endOfSubjectBeforeNewline, .endOfSubject,
-          .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
-          .textSegment, .notTextSegment:
-        return nil
-      }
+      return c.scalarValue.map(Character.init)
 
     case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
       // TODO: These should have unicode scalar values.
diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index 0a2d93ff1..e372280f2 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -295,101 +295,6 @@ extension DSLTree.CustomCharacterClass.Member {
   }
 }
 
-extension AST.CustomCharacterClass.Member {
-  func generateConsumer(
-    _ opts: MatchingOptions
-  ) throws -> MEProgram<String>.ConsumeFunction {
-    switch self {
-    case .custom(let ccc):
-      return try ccc.generateConsumer(opts)
-
-    case .range(let r):
-      guard let lhs = r.lhs.literalCharacterValue else {
-        throw Unsupported("\(r.lhs) in range")
-      }
-      guard let rhs = r.rhs.literalCharacterValue else {
-        throw Unsupported("\(r.rhs) in range")
-      }
-
-      return { input, bounds in
-        // TODO: check for out of bounds?
-        let curIdx = bounds.lowerBound
-        if (lhs...rhs).contains(input[curIdx]) {
-          // TODO: semantic level
-          return input.index(after: curIdx)
-        }
-        return nil
-      }
-
-    case .atom(let atom):
-      guard let gen = try atom.generateConsumer(opts) else {
-        throw Unsupported("TODO")
-      }
-      return gen
-
-    case .quote(let q):
-      // TODO: Not optimal.
-      let consumers = try q.literal.map {
-        try AST.Atom(.char($0), .fake).generateConsumer(opts)!
-      }
-      return { input, bounds in
-        for consumer in consumers {
-          if let idx = consumer(input, bounds) {
-            return idx
-          }
-        }
-        return nil
-      }
-
-    case .trivia:
-      throw Unreachable(
-        "Should have been stripped by caller")
-
-    case .setOperation(let lhs, let op, let rhs):
-      // TODO: We should probably have a component type
-      // instead of a members array... for now we reconstruct
-      // an AST node...
-      let start = AST.Located(
-        faking: AST.CustomCharacterClass.Start.normal)
-
-      let lhs = try AST.CustomCharacterClass(
-        start, lhs, .fake
-      ).generateConsumer(opts)
-      let rhs = try AST.CustomCharacterClass(
-        start, rhs, .fake
-      ).generateConsumer(opts)
-
-      return { input, bounds in
-        // NOTE: Easy way to implement, not performant
-        let lhsIdxOpt = lhs(input, bounds)
-        let rhsIdxOpt = rhs(input, bounds)
-
-        // TODO: What if lengths don't line up?
-        assert(lhsIdxOpt == rhsIdxOpt || lhsIdxOpt == nil
-               || rhsIdxOpt == nil)
-
-        switch op.value {
-        case .subtraction:
-          guard rhsIdxOpt == nil else { return nil }
-          return lhsIdxOpt
-
-        case .intersection:
-          if let idx = lhsIdxOpt {
-            return rhsIdxOpt == nil ? nil : idx
-          }
-          return nil
-
-        case .symmetricDifference:
-          if let idx = lhsIdxOpt {
-            return rhsIdxOpt == nil ? idx : nil
-          }
-          return rhsIdxOpt
-        }
-      }
-    }
-  }
-}
-
 extension DSLTree.CustomCharacterClass {
   func generateConsumer(
     _ opts: MatchingOptions
@@ -413,29 +318,6 @@ extension DSLTree.CustomCharacterClass {
   }
 }
 
-extension AST.CustomCharacterClass {
-  func generateConsumer(
-    _ opts: MatchingOptions
-  ) throws -> MEProgram<String>.ConsumeFunction {
-    // NOTE: Easy way to implement, obviously not performant
-    let consumers = try strippingTriviaShallow.members.map {
-      try $0.generateConsumer(opts)
-    }
-    return { input, bounds in
-      for consumer in consumers {
-        if let idx = consumer(input, bounds) {
-          return isInverted ? nil : idx
-        }
-      }
-      if isInverted {
-        // FIXME: semantic level
-        return input.index(after: bounds.lowerBound)
-      }
-      return nil
-    }
-  }
-}
-
 // NOTE: Conveniences, though not most performant
 private func consumeScalarScript(
   _ s: Unicode.Script
diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift
index 5336a1892..f773bd275 100644
--- a/Sources/_StringProcessing/Regex/ASTConversion.swift
+++ b/Sources/_StringProcessing/Regex/ASTConversion.swift
@@ -211,6 +211,12 @@ extension AST.Atom {
     case .any: return .any
     case let .backreference(r): return .backreference(r)
 
+    case .escaped(let c):
+      // Escapes with a Unicode scalar value lower to `.scalar`; every other
+      // escape stays unconverted, matching the `default` case below.
+      guard let scalar = c.scalarValue else { return .unconverted(self) }
+      return .scalar(scalar)
+
     default: return .unconverted(self)
     }
   }
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 67412d262..0ecbd5ad3 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -281,6 +281,15 @@ extension RegexTests {
     // code point sequence
     firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc",
                    xfail: true)
+    // Escape sequences that represent scalar values.
+    firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
+                   input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
+                   match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
+    firstMatchTest(#"[\a][\b][\e][\f][\n][\r][\t]"#,
+                   input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
+                   match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
+
+    firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n")
 
     // MARK: Quotes
 
@@ -596,24 +605,20 @@ extension RegexTests {
 
     func scalar(_ u: UnicodeScalar) -> UInt32 { u.value }
 
-    // Currently not supported in the matching engine.
     for s in scalar("\u{C}") ... scalar("\u{1B}") {
       let u = UnicodeScalar(s)!
-      firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)")
     }
     for u: UnicodeScalar in ["\u{7}", "\u{8}"] {
-      firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)")
     }
     for s in scalar("\u{A}") ... scalar("\u{D}") {
       let u = UnicodeScalar(s)!
-      firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)")
     }
-    firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}",
-                   xfail: true)
+    firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")
 
+    // Currently not supported in the matching engine.
     for c: UnicodeScalar in ["a", "b", "c"] {
       firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
                      xfail: true)