Skip to content

[5.7] Cherry-pick parser changes to 5.7 #309

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Documentation/Evolution/RegexSyntaxRunTimeConstruction.md
Original file line number Diff line number Diff line change
@@ -339,7 +339,7 @@ UnicodeScalar -> '\u{' HexDigit{1...} '}'
| '\o{' OctalDigit{1...} '}'
| '\0' OctalDigit{0...3}
HexDigit -> [0-9a-zA-Z]
HexDigit -> [0-9a-fA-F]
OctalDigit -> [0-7]
NamedScalar -> '\N{' ScalarName '}'
77 changes: 44 additions & 33 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
@@ -72,6 +72,9 @@ extension AST {

// (*ACCEPT), (*FAIL), ...
case backtrackingDirective(BacktrackingDirective)

// (?i), (?i-m), ...
case changeMatchingOptions(MatchingOptionSequence)
}
}
}
@@ -91,6 +94,7 @@ extension AST.Atom {
case .subpattern(let v): return v
case .callout(let v): return v
case .backtrackingDirective(let v): return v
case .changeMatchingOptions(let v): return v
case .any: return nil
case .startOfLine: return nil
case .endOfLine: return nil
@@ -397,9 +401,6 @@ extension AST.Atom.CharacterProperty {
/// Some special properties implemented by PCRE and Oniguruma.
case pcreSpecial(PCRESpecialCategory)
case onigurumaSpecial(OnigurumaSpecialProperty)

/// Unhandled properties.
case other(key: String?, value: String)
}

// TODO: erm, separate out or fold into something? splat it in?
@@ -631,6 +632,41 @@ extension AST.Atom {
}
}

extension AST.Atom.EscapedBuiltin {
  /// The single Unicode scalar denoted by this escape sequence, if any.
  ///
  /// Only `.alarm`, `.backspace`, `.escape`, `.formfeed`, `.newline`,
  /// `.carriageReturn`, and `.tab` map to a scalar. Every other builtin
  /// escape yields `nil`.
  ///
  /// The switch is deliberately exhaustive (no `default`), so adding a new
  /// case to the enum forces a decision here at compile time.
  public var scalarValue: UnicodeScalar? {
    // TODO: Should we separate these into a separate enum? Or move the
    // specifics of the scalar to the DSL tree?
    let scalar: UnicodeScalar?

    switch self {
    // Escapes that stand for exactly one scalar.
    case .alarm:
      scalar = "\u{7}"
    case .backspace:
      scalar = "\u{8}"
    case .escape:
      scalar = "\u{1B}"
    case .formfeed:
      scalar = "\u{C}"
    case .newline:
      scalar = "\n"
    case .carriageReturn:
      scalar = "\r"
    case .tab:
      scalar = "\t"

    // Everything else has no scalar value.
    case .singleDataUnit, .decimalDigit, .notDecimalDigit,
         .horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
         .newlineSequence, .whitespace, .notWhitespace, .verticalTab,
         .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
         .wordBoundary, .notWordBoundary, .startOfSubject,
         .endOfSubjectBeforeNewline, .endOfSubject,
         .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
         .textSegment, .notTextSegment:
      scalar = nil
    }

    return scalar
  }
}

extension AST.Atom {
/// Retrieve the character value of the atom if it represents a literal
/// character or unicode scalar, nil otherwise.
@@ -642,34 +678,7 @@ extension AST.Atom {
return Character(s)

case .escaped(let c):
switch c {
// TODO: Should we separate these into a separate enum? Or move the
// specifics of the scalar to the DSL tree?
case .alarm:
return "\u{7}"
case .backspace:
return "\u{8}"
case .escape:
return "\u{1B}"
case .formfeed:
return "\u{C}"
case .newline:
return "\n"
case .carriageReturn:
return "\r"
case .tab:
return "\t"

case .singleDataUnit, .decimalDigit, .notDecimalDigit,
.horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
.newlineSequence, .whitespace, .notWhitespace, .verticalTab,
.notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
.wordBoundary, .notWordBoundary, .startOfSubject,
.endOfSubjectBeforeNewline, .endOfSubject,
.firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
.textSegment, .notTextSegment:
return nil
}
return c.scalarValue.map(Character.init)

case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
// TODO: These should have unicode scalar values.
@@ -683,7 +692,7 @@ extension AST.Atom {
return nil

case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern,
.callout, .backtrackingDirective:
.callout, .backtrackingDirective, .changeMatchingOptions:
return nil
}
}
@@ -723,7 +732,7 @@ extension AST.Atom {

case .property, .escaped, .any, .startOfLine, .endOfLine,
.backreference, .subpattern, .namedCharacter, .callout,
.backtrackingDirective:
.backtrackingDirective, .changeMatchingOptions:
return nil
}
}
@@ -732,6 +741,8 @@ extension AST.Atom {
switch kind {
case .backtrackingDirective(let b):
return b.isQuantifiable
case .changeMatchingOptions:
return false
// TODO: Are callouts quantifiable?
default:
return true
6 changes: 5 additions & 1 deletion Sources/_RegexParser/Regex/AST/CustomCharClass.swift
Original file line number Diff line number Diff line change
@@ -97,14 +97,18 @@ extension CustomCC.Member {
if case .trivia = self { return true }
return false
}

/// Whether this member is anything other than trivia; always the logical
/// opposite of `isTrivia`.
public var isSemantic: Bool {
  return isTrivia == false
}
}

extension AST.CustomCharacterClass {
/// Strip trivia from the character class members. This does not recurse into
/// nested custom character classes.
public var strippingTriviaShallow: Self {
var copy = self
copy.members = copy.members.filter { !$0.isTrivia }
copy.members = copy.members.filter(\.isSemantic)
return copy
}
}
19 changes: 1 addition & 18 deletions Sources/_RegexParser/Regex/AST/Group.swift
Original file line number Diff line number Diff line change
@@ -68,9 +68,7 @@ extension AST {
case atomicScriptRun

// (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:)
// Isolated options are written as e.g (?i), and implicitly form a group
// containing all the following elements of the current group.
case changeMatchingOptions(MatchingOptionSequence, isIsolated: Bool)
case changeMatchingOptions(MatchingOptionSequence)

// NOTE: Comments appear to be groups, but are not parsed
// the same. They parse more like quotes, so are not
@@ -87,21 +85,6 @@ extension AST.Group.Kind {
}
}

/// Whether this is a group with an implicit scope, e.g isolated matching
/// options implicitly become parent groups for the rest of the elements in
/// the current group:
///
/// (a(?i)bc)de -> (a(?i:bc))de
///
public var hasImplicitScope: Bool {
switch self {
case .changeMatchingOptions(_, let isIsolated):
return isIsolated
default:
return false
}
}

/// If this is a named group, its name, `nil` otherwise.
public var name: String? {
switch self {
Original file line number Diff line number Diff line change
@@ -397,8 +397,9 @@ extension Source {
return .pcreSpecial(pcreSpecial)
}

// Otherwise we don't know what this is.
return .other(key: nil, value: value)
// TODO: This should be versioned, and do we want a more lax behavior for
// the runtime?
throw ParseError.unknownProperty(key: nil, value: value)
}

static func classifyCharacterProperty(
@@ -435,6 +436,8 @@ extension Source {
if let match = match {
return match
}
return .other(key: key, value: value)
// TODO: This should be versioned, and do we want a more lax behavior for
// the runtime?
throw ParseError.unknownProperty(key: key, value: value)
}
}
9 changes: 6 additions & 3 deletions Sources/_RegexParser/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
@@ -57,8 +57,8 @@ enum ParseError: Error, Hashable {
case expectedCustomCharacterClassMembers
case invalidCharacterClassRangeOperand

case invalidPOSIXSetName(String)
case emptyProperty
case unknownProperty(key: String?, value: String)

case expectedGroupSpecifier
case unbalancedEndOfGroup
@@ -142,10 +142,13 @@ extension ParseError: CustomStringConvertible {
return "expected custom character class members"
case .invalidCharacterClassRangeOperand:
return "invalid character class range"
case let .invalidPOSIXSetName(n):
return "invalid character set name: '\(n)'"
case .emptyProperty:
return "empty property"
case .unknownProperty(let key, let value):
if let key = key {
return "unknown character property '\(key)=\(value)'"
}
return "unknown character property '\(value)'"
case .expectedGroupSpecifier:
return "expected group specifier"
case .unbalancedEndOfGroup:
Loading