Skip to content

Convert scalar escape sequences to DSL scalars #245

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 36 additions & 28 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,41 @@ extension AST.Atom {
}
}

extension AST.Atom.EscapedBuiltin {
  /// The unicode scalar this escape sequence stands for, or `nil` when the
  /// escape does not denote a single scalar value.
  public var scalarValue: UnicodeScalar? {
    // TODO: Should we separate these into a separate enum? Or move the
    // specifics of the scalar to the DSL tree?
    switch self {
    // Escapes that map onto exactly one scalar.
    case .alarm:          return "\u{7}"
    case .backspace:      return "\u{8}"
    case .escape:         return "\u{1B}"
    case .formfeed:       return "\u{C}"
    case .newline:        return "\n"
    case .carriageReturn: return "\r"
    case .tab:            return "\t"

    // Every remaining escape has no scalar value. The cases are listed
    // explicitly (no `default`) so that adding a new case forces a decision
    // here.
    case .singleDataUnit,
         .decimalDigit, .notDecimalDigit,
         .horizontalWhitespace, .notHorizontalWhitespace,
         .notNewline, .newlineSequence,
         .whitespace, .notWhitespace,
         .verticalTab, .notVerticalTab,
         .wordCharacter, .notWordCharacter,
         .graphemeCluster,
         .wordBoundary, .notWordBoundary,
         .startOfSubject, .endOfSubjectBeforeNewline, .endOfSubject,
         .firstMatchingPositionInSubject, .resetStartOfMatch,
         .trueAnychar,
         .textSegment, .notTextSegment:
      return nil
    }
  }
}

extension AST.Atom {
/// Retrieve the character value of the atom if it represents a literal
/// character or unicode scalar, nil otherwise.
Expand All @@ -642,34 +677,7 @@ extension AST.Atom {
return Character(s)

case .escaped(let c):
switch c {
// TODO: Should we separate these into a separate enum? Or move the
// specifics of the scalar to the DSL tree?
case .alarm:
return "\u{7}"
case .backspace:
return "\u{8}"
case .escape:
return "\u{1B}"
case .formfeed:
return "\u{C}"
case .newline:
return "\n"
case .carriageReturn:
return "\r"
case .tab:
return "\t"

case .singleDataUnit, .decimalDigit, .notDecimalDigit,
.horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
.newlineSequence, .whitespace, .notWhitespace, .verticalTab,
.notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
.wordBoundary, .notWordBoundary, .startOfSubject,
.endOfSubjectBeforeNewline, .endOfSubject,
.firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
.textSegment, .notTextSegment:
return nil
}
return c.scalarValue.map(Character.init)

case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
// TODO: These should have unicode scalar values.
Expand Down
118 changes: 0 additions & 118 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -295,101 +295,6 @@ extension DSLTree.CustomCharacterClass.Member {
}
}

extension AST.CustomCharacterClass.Member {
  /// Builds the consume function for a single member of a custom character
  /// class.
  ///
  /// - Parameter opts: Matching options, forwarded unchanged to every nested
  ///   `generateConsumer` call.
  /// - Returns: A closure over `(input, bounds)` that yields an index when the
  ///   member matches and `nil` when it does not.
  /// - Throws: `Unsupported` when a range endpoint has no literal character
  ///   value or when an atom produces no consumer; `Unreachable` for trivia;
  ///   plus anything thrown by the nested `generateConsumer` calls.
  func generateConsumer(
    _ opts: MatchingOptions
  ) throws -> MEProgram<String>.ConsumeFunction {
    switch self {
    case .trivia:
      throw Unreachable(
        "Should have been stripped by caller")

    case .custom(let ccc):
      // A nested custom character class builds its own consumer.
      return try ccc.generateConsumer(opts)

    case .atom(let atom):
      if let consumer = try atom.generateConsumer(opts) {
        return consumer
      }
      throw Unsupported("TODO")

    case .range(let r):
      // Both endpoints must resolve to a literal character; the lower bound
      // is checked (and reported) first.
      guard let lower = r.lhs.literalCharacterValue else {
        throw Unsupported("\(r.lhs) in range")
      }
      guard let upper = r.rhs.literalCharacterValue else {
        throw Unsupported("\(r.rhs) in range")
      }

      return { input, bounds in
        // TODO: check for out of bounds?
        let start = bounds.lowerBound
        // TODO: semantic level
        return (lower...upper).contains(input[start])
          ? input.index(after: start)
          : nil
      }

    case .quote(let q):
      // TODO: Not optimal.
      // One consumer per quoted character; the first one that matches wins.
      let charConsumers = try q.literal.map {
        try AST.Atom(.char($0), .fake).generateConsumer(opts)!
      }
      return { input, bounds in
        for consumer in charConsumers {
          guard let end = consumer(input, bounds) else { continue }
          return end
        }
        return nil
      }

    case .setOperation(let lhsMembers, let op, let rhsMembers):
      // TODO: We should probably have a component type
      // instead of a members array... for now we reconstruct
      // an AST node...
      let start = AST.Located(
        faking: AST.CustomCharacterClass.Start.normal)

      let lhs = try AST.CustomCharacterClass(
        start, lhsMembers, .fake
      ).generateConsumer(opts)
      let rhs = try AST.CustomCharacterClass(
        start, rhsMembers, .fake
      ).generateConsumer(opts)

      return { input, bounds in
        // NOTE: Easy way to implement, not performant
        // Both operands are always evaluated, left operand first.
        let lhsEnd = lhs(input, bounds)
        let rhsEnd = rhs(input, bounds)

        // TODO: What if lengths don't line up?
        assert(lhsEnd == rhsEnd || lhsEnd == nil
               || rhsEnd == nil)

        switch op.value {
        case .subtraction:
          // Left operand's result, unless the right operand matched too.
          return rhsEnd == nil ? lhsEnd : nil

        case .intersection:
          // Both operands must match; the left operand's index is returned.
          guard let end = lhsEnd, rhsEnd != nil else { return nil }
          return end

        case .symmetricDifference:
          // Exactly one operand must match.
          guard let end = lhsEnd else { return rhsEnd }
          return rhsEnd == nil ? end : nil
        }
      }
    }
  }
}

extension DSLTree.CustomCharacterClass {
func generateConsumer(
_ opts: MatchingOptions
Expand All @@ -413,29 +318,6 @@ extension DSLTree.CustomCharacterClass {
}
}

extension AST.CustomCharacterClass {
  /// Builds the consume function for the whole custom character class by
  /// combining the consumers of its members (after `strippingTriviaShallow`).
  ///
  /// - Parameter opts: Matching options, forwarded to each member's
  ///   `generateConsumer`.
  /// - Returns: A closure over `(input, bounds)`. When not inverted it yields
  ///   the index from the first member that matches, or `nil` if none do.
  ///   When inverted it yields `nil` if any member matches, and otherwise the
  ///   index after `bounds.lowerBound`.
  /// - Throws: Whatever a member's `generateConsumer` throws.
  func generateConsumer(
    _ opts: MatchingOptions
  ) throws -> MEProgram<String>.ConsumeFunction {
    // NOTE: Easy way to implement, obviously not performant
    let memberConsumers = try strippingTriviaShallow.members.map {
      try $0.generateConsumer(opts)
    }
    return { input, bounds in
      // Members are tried in order; the first one that matches decides.
      for member in memberConsumers {
        guard let end = member(input, bounds) else { continue }
        return isInverted ? nil : end
      }
      // No member matched.
      // FIXME: semantic level
      return isInverted ? input.index(after: bounds.lowerBound) : nil
    }
  }
}

// NOTE: Conveniences, though not most performant
private func consumeScalarScript(
_ s: Unicode.Script
Expand Down
3 changes: 3 additions & 0 deletions Sources/_StringProcessing/Regex/ASTConversion.swift
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,9 @@ extension AST.Atom {
case .any: return .any
case let .backreference(r): return .backreference(r)

case .escaped(let c) where c.scalarValue != nil:
return .scalar(c.scalarValue!)

default: return .unconverted(self)
}
}
Expand Down
23 changes: 14 additions & 9 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,15 @@ extension RegexTests {
// code point sequence
firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)

// Escape sequences that represent scalar values.
firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
firstMatchTest(#"[\a][\b][\e][\f][\n][\r][\t]"#,
input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")

firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n")

// MARK: Quotes

Expand Down Expand Up @@ -596,24 +605,20 @@ extension RegexTests {

func scalar(_ u: UnicodeScalar) -> UInt32 { u.value }

// Currently not supported in the matching engine.
for s in scalar("\u{C}") ... scalar("\u{1B}") {
let u = UnicodeScalar(s)!
firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)",
xfail: true)
firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)")
}
for u: UnicodeScalar in ["\u{7}", "\u{8}"] {
firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)",
xfail: true)
firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)")
}
for s in scalar("\u{A}") ... scalar("\u{D}") {
let u = UnicodeScalar(s)!
firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)",
xfail: true)
firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)")
}
firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}",
xfail: true)
firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")

// Currently not supported in the matching engine.
for c: UnicodeScalar in ["a", "b", "c"] {
firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
xfail: true)
Expand Down