Skip to content

Escape sequence + empty char class tweaks #226

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 55 additions & 5 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -641,17 +641,67 @@ extension AST.Atom {
case .scalar(let s):
return Character(s)

case .escaped(let c):
switch c {
// TODO: Should we separate these into a separate enum? Or move the
// specifics of the scalar to the DSL tree?
case .alarm:
return "\u{7}"
case .backspace:
return "\u{8}"
case .escape:
return "\u{1B}"
case .formfeed:
return "\u{C}"
case .newline:
return "\n"
case .carriageReturn:
return "\r"
case .tab:
return "\t"

case .singleDataUnit, .decimalDigit, .notDecimalDigit,
.horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
.newlineSequence, .whitespace, .notWhitespace, .verticalTab,
.notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
.wordBoundary, .notWordBoundary, .startOfSubject,
.endOfSubjectBeforeNewline, .endOfSubject,
.firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
.textSegment, .notTextSegment:
return nil
}

case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
// TODO: Not a character per se, what should we do?
fallthrough
// TODO: These should have unicode scalar values.
return nil

case .property, .escaped, .any, .startOfLine, .endOfLine,
.backreference, .subpattern, .namedCharacter, .callout,
.backtrackingDirective:
case .namedCharacter:
// TODO: This should have a unicode scalar value depending on the name
// given.
// TODO: Do we want to validate and assign a scalar value when building
// the AST? Or defer for the matching engine?
return nil

case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern,
.callout, .backtrackingDirective:
return nil
}
}

/// Whether this atom may appear as either end of a range inside a custom
/// character class.
///
/// An atom qualifies if it has a literal character value, or if its kind is
/// one of the keyboard control/meta escapes or a named character.
public var isValidCharacterClassRangeBound: Bool {
  // Any atom that yields a literal character is usable as a bound.
  guard literalCharacterValue == nil else { return true }

  // Otherwise only these escape kinds are accepted:
  // \cx, \C-x, \M-x, \M-\C-x, \N{...}
  switch kind {
  case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:
    return true
  default:
    return false
  }
}

/// Produce a string literal representation of the atom, if possible
///
/// Individual characters will be returned, Unicode scalars will be
Expand Down
3 changes: 3 additions & 0 deletions Sources/_RegexParser/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ enum ParseError: Error, Hashable {

case expectedNonEmptyContents
case expectedEscape
case invalidEscape(Character)

case cannotReferToWholePattern

Expand Down Expand Up @@ -107,6 +108,8 @@ extension ParseError: CustomStringConvertible {
return "expected non-empty contents"
case .expectedEscape:
return "expected escape sequence"
case .invalidEscape(let c):
return "invalid escape sequence '\\\(c)'"
case .cannotReferToWholePattern:
return "cannot refer to whole pattern here"
case .notQuantifiable:
Expand Down
11 changes: 10 additions & 1 deletion Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1489,8 +1489,17 @@ extension Source {
return try .scalar(
src.expectUnicodeScalar(escapedCharacter: char).value)
default:
return .char(char)
break
}

// We only allow unknown escape sequences for non-letter ASCII, and
// non-ASCII whitespace.
guard (char.isASCII && !char.isLetter) ||
(!char.isASCII && char.isWhitespace)
else {
throw ParseError.invalidEscape(char)
}
return .char(char)
}
}

Expand Down
11 changes: 9 additions & 2 deletions Sources/_RegexParser/Regex/Parse/Parse.swift
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,12 @@ extension Parser {
try source.expectNonEmpty()

var members: Array<Member> = []

// We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty
// character classes, and assume an initial ']' is literal.
if let loc = source.tryEatWithLoc("]") {
members.append(.atom(.init(.char("]"), loc)))
}
try parseCCCMembers(into: &members)

// If we have a binary set operator, parse it and the next members. Note
Expand Down Expand Up @@ -489,10 +495,11 @@ extension Parser {
// Range between atoms.
if let (dashLoc, rhs) =
try source.lexCustomCharClassRangeEnd(context: context) {
guard atom.literalCharacterValue != nil &&
rhs.literalCharacterValue != nil else {
guard atom.isValidCharacterClassRangeBound &&
rhs.isValidCharacterClassRangeBound else {
throw ParseError.invalidCharacterClassRangeOperand
}
// TODO: Validate lower <= upper?
members.append(.range(.init(atom, dashLoc, rhs)))
continue
}
Expand Down
75 changes: 0 additions & 75 deletions Sources/_StringProcessing/CharacterClass.swift
Original file line number Diff line number Diff line change
Expand Up @@ -319,21 +319,6 @@ extension CharacterClass {
}
}

extension AST.Node {
  /// The character class this node represents, whether built-in or custom,
  /// or `nil` if the node has no character class representation.
  ///
  /// TODO: Not sure if this is the right model type, but I suspect we'll want
  /// to produce something like this on demand.
  var characterClass: CharacterClass? {
    // Custom character classes are converted through their model form.
    if case let .customCharacterClass(custom) = self {
      return custom.modelCharacterClass
    }
    // Atoms report their own character class, if any.
    if case let .atom(atom) = self {
      return atom.characterClass
    }
    // No other node kind has a character class representation.
    return nil
  }
}

extension DSLTree.Node {
var characterClass: CharacterClass? {
switch self {
Expand Down Expand Up @@ -502,66 +487,6 @@ extension DSLTree.CustomCharacterClass {
}
}

extension AST.CustomCharacterClass {
  /// The model character class for this custom character class.
  ///
  /// Returns `nil` when any member cannot be represented in the model: a
  /// nested custom class or set-operation operand without a model, an atom
  /// with neither a character class nor a literal character value, or a range
  /// whose bounds lack literal character values or are out of order.
  var modelCharacterClass: CharacterClass? {
    typealias Component = CharacterClass.CharacterSetComponent

    // Converts `members` into model components, or returns `nil` as soon as
    // one member turns out not to be representable.
    func getComponents(_ members: [Member]) -> [Component]? {
      var result: [Component] = []
      for m in members {
        switch m {
        case .custom(let cc):
          guard let cc = cc.modelCharacterClass else {
            return nil
          }
          result.append(.characterClass(cc))

        case .range(let r):
          // A range operand is not guaranteed to have a literal character
          // value, and `...` traps when the lower bound exceeds the upper
          // bound. Treat both situations as "not representable" rather than
          // crashing.
          guard let lower = r.lhs.literalCharacterValue,
                let upper = r.rhs.literalCharacterValue,
                lower <= upper
          else {
            return nil
          }
          result.append(.range(lower ... upper))

        case .atom(let a):
          // Prefer a character class representation; fall back to a single
          // literal character.
          if let cc = a.characterClass {
            result.append(.characterClass(cc))
          } else if let lit = a.literalCharacterValue {
            result.append(.character(lit))
          } else {
            return nil
          }

        case .quote(let q):
          // Decompose quoted literal into literal characters.
          result += q.literal.map { .character($0) }

        case .trivia:
          // Not semantically important.
          break

        case .setOperation(let lhs, let op, let rhs):
          // FIXME: CharacterClass wasn't designed for set operations with
          // multiple components in each operand, we should fix that. For now,
          // just produce custom components.
          guard let lhs = getComponents(lhs),
                let rhs = getComponents(rhs)
          else {
            return nil
          }
          result.append(.setOperation(.init(
            lhs: .characterClass(.custom(lhs)),
            op: op.value,
            rhs: .characterClass(.custom(rhs)))))
        }
      }
      return result
    }

    guard let comps = getComponents(members) else {
      return nil
    }
    let cc = CharacterClass.custom(comps)
    return self.isInverted ? cc.inverted : cc
  }
}

extension CharacterClass {
// FIXME: Calling on inverted sets won't be the same as the
// inverse of a boundary if at the start or end of the
Expand Down
29 changes: 29 additions & 0 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,35 @@ extension RegexTests {

firstMatchTest("[[:script=Greek:]]", input: "123αβγxyz", match: "α")

func scalar(_ u: UnicodeScalar) -> UInt32 { u.value }

// Currently not supported in the matching engine.
for s in scalar("\u{C}") ... scalar("\u{1B}") {
let u = UnicodeScalar(s)!
firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)",
xfail: true)
}
for u: UnicodeScalar in ["\u{7}", "\u{8}"] {
firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)",
xfail: true)
}
for s in scalar("\u{A}") ... scalar("\u{D}") {
let u = UnicodeScalar(s)!
firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)",
xfail: true)
}
firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}",
xfail: true)

for c: UnicodeScalar in ["a", "b", "c"] {
firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
xfail: true)
}
for c: UnicodeScalar in ["$", "%", "&", "'"] {
firstMatchTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#,
input: "#()\(c)", match: "\(c)", xfail: true)
}

// MARK: Operators

firstMatchTest(
Expand Down
81 changes: 71 additions & 10 deletions Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,10 @@ extension RegexTests {

parseTest("[-]", charClass("-"))

// Empty character classes are forbidden, therefore this is a character
// class of literal ']'.
parseTest("[]]", charClass("]"))

// These are metacharacters in certain contexts, but normal characters
// otherwise.
parseTest(
Expand Down Expand Up @@ -494,6 +498,25 @@ extension RegexTests {
parseTest("[*]", charClass("*"))
parseTest("[{0}]", charClass("{", "0", "}"))

parseTest(#"[\f-\e]"#, charClass(
range_m(.escaped(.formfeed), .escaped(.escape))))
parseTest(#"[\a-\b]"#, charClass(
range_m(.escaped(.alarm), .escaped(.backspace))))
parseTest(#"[\n-\r]"#, charClass(
range_m(.escaped(.newline), .escaped(.carriageReturn))))
parseTest(#"[\t-\t]"#, charClass(
range_m(.escaped(.tab), .escaped(.tab))))

parseTest(#"[\cX-\cY\C-A-\C-B\M-\C-A-\M-\C-B\M-A-\M-B]"#, charClass(
range_m(.keyboardControl("X"), .keyboardControl("Y")),
range_m(.keyboardControl("A"), .keyboardControl("B")),
range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")),
range_m(.keyboardMeta("A"), .keyboardMeta("B"))
))

parseTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass(
range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE"))))

// MARK: Operators

parseTest(
Expand Down Expand Up @@ -544,9 +567,8 @@ extension RegexTests {
#"a\Q \Q \\.\Eb"#,
concat("a", quote(#" \Q \\."#), "b"))

// These follow the PCRE behavior.
// This follows the PCRE behavior.
parseTest(#"\Q\\E"#, quote("\\"))
parseTest(#"\E"#, "E")

parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
syntax: .experimental)
Expand All @@ -566,6 +588,25 @@ extension RegexTests {

parseTest(#"["-"]"#, charClass(range_m("\"", "\"")))

// MARK: Escapes

// Not metachars, but we allow their escape as ASCII.
parseTest(#"\<"#, "<")
parseTest(#"\ "#, " ")
parseTest(#"\\"#, "\\")

// Escaped U+3000 IDEOGRAPHIC SPACE.
parseTest(#"\\#u{3000}"#, "\u{3000}")

// Control and meta controls.
parseTest(#"\c "#, atom(.keyboardControl(" ")))
parseTest(#"\c!"#, atom(.keyboardControl("!")))
parseTest(#"\c~"#, atom(.keyboardControl("~")))
parseTest(#"\C--"#, atom(.keyboardControl("-")))
parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a")))
parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-")))
parseTest(#"\M-a"#, atom(.keyboardMeta("a")))

// MARK: Comments

parseTest(
Expand Down Expand Up @@ -989,13 +1030,6 @@ extension RegexTests {
// Backreferences are not valid in custom character classes.
parseTest(#"[\8]"#, charClass("8"))
parseTest(#"[\9]"#, charClass("9"))
parseTest(#"[\g]"#, charClass("g"))
parseTest(#"[\g+30]"#, charClass("g", "+", "3", "0"))
parseTest(#"[\g{1}]"#, charClass("g", "{", "1", "}"))
parseTest(#"[\k'a']"#, charClass("k", "'", "a", "'"))

parseTest(#"\g"#, atom(.char("g")))
parseTest(#"\k"#, atom(.char("k")))

// MARK: Character names.

Expand Down Expand Up @@ -1526,7 +1560,7 @@ extension RegexTests {
parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x"))

parseWithDelimitersTest(#"re'🔥🇩🇰'"#, concat("🔥", "🇩🇰"))
parseWithDelimitersTest(#"re'\🔥✅'"#, concat("🔥", "✅"))
parseWithDelimitersTest(#"re'🔥✅'"#, concat("🔥", "✅"))

// Printable ASCII characters.
delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
Expand Down Expand Up @@ -1871,10 +1905,37 @@ extension RegexTests {
diagnosticTest("(?<a-b", .expected(">"))
diagnosticTest("(?<a-b>", .expected(")"))

// The first ']' of a custom character class is literal, so this is missing
// the closing bracket.
diagnosticTest("[]", .expected("]"))

// MARK: Bad escapes

diagnosticTest("\\", .expectedEscape)

// TODO: Custom diagnostic for control sequence
diagnosticTest(#"\c"#, .unexpectedEndOfInput)

// TODO: Custom diagnostic for expected backref
diagnosticTest(#"\g"#, .invalidEscape("g"))
diagnosticTest(#"\k"#, .invalidEscape("k"))

// TODO: Custom diagnostic for backref in custom char class
diagnosticTest(#"[\g]"#, .invalidEscape("g"))
diagnosticTest(#"[\g+30]"#, .invalidEscape("g"))
diagnosticTest(#"[\g{1}]"#, .invalidEscape("g"))
diagnosticTest(#"[\k'a']"#, .invalidEscape("k"))

// TODO: Custom diagnostic for missing '\Q'
diagnosticTest(#"\E"#, .invalidEscape("E"))

// Non-ASCII non-whitespace cases.
diagnosticTest(#"\🔥"#, .invalidEscape("🔥"))
diagnosticTest(#"\🇩🇰"#, .invalidEscape("🇩🇰"))
diagnosticTest(#"\e\#u{301}"#, .invalidEscape("e\u{301}"))
diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é"))
diagnosticTest(#"\˂"#, .invalidEscape("˂"))

// MARK: Text Segment options

diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions)
Expand Down