Skip to content

[swift/main] Allow captures in lookahead and atomic groups #735

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 50 additions & 24 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -308,51 +308,77 @@ fileprivate extension Compiler.ByteCodeGen {
try emitNode(node)
}

mutating func emitLookaround(
_ kind: (forwards: Bool, positive: Bool),
_ child: DSLTree.Node
) throws {
guard kind.forwards else {
throw Unsupported("backwards assertions")
}
/// Emits the instruction sequence for a positive lookahead over `child`.
///
/// The sub-pattern is emitted between two save points. When it matches, the
/// closing `fail(preservingCaptures: true)` resumes at `success` without
/// resetting stored captures, so captures made inside the lookahead remain
/// available to the rest of the pattern. When it does not match, control
/// arrives at `intercept`, which removes the `success` save point and
/// propagates the failure.
///
/// - Parameter child: The sub-pattern that must match for the lookahead to
///   succeed.
/// - Throws: Whatever `emitNode(_:)` throws while emitting `child`.
mutating func emitPositiveLookahead(_ child: DSLTree.Node) throws {
  /*
    save(restoringAt: success)
    save(restoringAt: intercept)
    <sub-pattern>           // failure restores at intercept
    clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern>
    fail(preservingCaptures: true) // ->success
  intercept:
    clearSavePoint          // remove success
    fail                    // propagate failure
  success:
    ...
  */
  let intercept = builder.makeAddress()
  let success = builder.makeAddress()

  builder.buildSave(success)
  builder.buildSave(intercept)
  try emitNode(child)
  builder.buildClearThrough(intercept)
  builder.buildFail(preservingCaptures: true) // Lookahead succeeds here

  builder.label(intercept)
  builder.buildClear()
  builder.buildFail()

  builder.label(success)
}

/// Emits the instruction sequence for a negative lookahead over `child`.
///
/// The sub-pattern is emitted between two save points. When it matches, the
/// lookahead has failed: both save points are removed and the failure is
/// propagated. When it does not match, control arrives at `intercept`, whose
/// plain `fail` resumes at `success`.
///
/// - Parameter child: The sub-pattern that must NOT match for the lookahead
///   to succeed.
/// - Throws: Whatever `emitNode(_:)` throws while emitting `child`.
mutating func emitNegativeLookahead(_ child: DSLTree.Node) throws {
  /*
    save(restoringAt: success)
    save(restoringAt: intercept)
    <sub-pattern>           // failure restores at intercept
    clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern>
    clearSavePoint          // remove success
    fail                    // propagate failure
  intercept:
    fail                    // ->success
  success:
    ...
  */
  let intercept = builder.makeAddress()
  let success = builder.makeAddress()

  builder.buildSave(success)
  builder.buildSave(intercept)
  try emitNode(child)
  builder.buildClearThrough(intercept)
  // The sub-pattern matched, so drop the `success` save point exactly once
  // and let the failure propagate outwards.
  builder.buildClear()
  builder.buildFail()

  builder.label(intercept)
  // The sub-pattern failed, so the only remaining save point is `success`;
  // failing here resumes there.
  builder.buildFail()

  builder.label(success)
}

/// Dispatches a lookaround assertion to the matching emitter.
///
/// - Parameters:
///   - kind: Direction (`forwards`) and polarity (`positive`) of the assertion.
///   - child: The sub-pattern being asserted.
/// - Throws: `Unsupported` for backwards assertions, plus anything thrown by
///   the selected lookahead emitter.
mutating func emitLookaround(
  _ kind: (forwards: Bool, positive: Bool),
  _ child: DSLTree.Node
) throws {
  switch kind {
  case (false, _):
    // Lookbehind is rejected regardless of polarity.
    throw Unsupported("backwards assertions")
  case (true, true):
    try emitPositiveLookahead(child)
  case (true, false):
    try emitNegativeLookahead(child)
  }
}

mutating func emitAtomicNoncapturingGroup(
_ child: DSLTree.Node
Expand All @@ -361,8 +387,8 @@ fileprivate extension Compiler.ByteCodeGen {
save(continuingAt: success)
save(restoringAt: intercept)
<sub-pattern> // failure restores at intercept
clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern>
fail // ->success
clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern>
fail(preservingCaptures: true) // ->success
intercept:
clearSavePoint // remove success
fail // propagate failure
Expand All @@ -377,7 +403,7 @@ fileprivate extension Compiler.ByteCodeGen {
builder.buildSave(intercept)
try emitNode(child)
builder.buildClearThrough(intercept)
builder.buildFail()
builder.buildFail(preservingCaptures: true) // Atomic group succeeds here

builder.label(intercept)
builder.buildClear()
Expand Down
8 changes: 4 additions & 4 deletions Sources/_StringProcessing/Engine/InstPayload.swift
Original file line number Diff line number Diff line change
Expand Up @@ -211,11 +211,11 @@ extension Instruction.Payload {
self.rawValue == 1
}

init(bool: BoolRegister) {
self.init(bool)
init(bool: Bool) {
self.init(bool ? 1 : 0, 0)
}
var bool: BoolRegister {
interpret()
var boolPayload: Bool {
interpret(as: TypedInt<Bool>.self) == 1
}

init(element: ElementRegister, isCaseInsensitive: Bool) {
Expand Down
4 changes: 2 additions & 2 deletions Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,8 @@ extension MEProgram.Builder {
instructions.append(.init(.clearThrough))
fixup(to: t)
}
mutating func buildFail() {
instructions.append(.init(.fail))
mutating func buildFail(preservingCaptures: Bool = false) {
instructions.append(.init(.fail, .init(bool: preservingCaptures)))
}

mutating func buildAdvance(_ n: Distance) {
Expand Down
2 changes: 1 addition & 1 deletion Sources/_StringProcessing/Engine/MECapture.swift
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ extension Processor {
var value: Any? = nil

// An in-progress capture start
fileprivate var currentCaptureBegin: Position? = nil
var currentCaptureBegin: Position? = nil

fileprivate func _invariantCheck() {
if range == nil {
Expand Down
11 changes: 8 additions & 3 deletions Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ extension Processor {
return true
}

mutating func signalFailure() {
mutating func signalFailure(preservingCaptures: Bool = false) {
guard !savePoints.isEmpty else {
state = .fail
return
Expand Down Expand Up @@ -362,10 +362,14 @@ extension Processor {
controller.pc = pc
currentPosition = pos ?? currentPosition
callStack.removeLast(callStack.count - stackEnd.rawValue)
storedCaptures = capEnds
registers.ints = intRegisters
registers.positions = posRegisters

if !preservingCaptures {
// Reset all capture information
storedCaptures = capEnds
}

metrics.addBacktrack()
}

Expand Down Expand Up @@ -479,7 +483,8 @@ extension Processor {
tryAccept()

case .fail:
signalFailure()
let preservingCaptures = payload.boolPayload
signalFailure(preservingCaptures: preservingCaptures)

case .advance:
let (isScalar, distance) = payload.distance
Expand Down
22 changes: 22 additions & 0 deletions Tests/RegexBuilderTests/AlgorithmsTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,28 @@ class AlgorithmsResultBuilderTests: XCTestCase {
"+"
int
}

let ref1 = Reference<Substring>()
let ref2 = Reference<Substring>()
try expectMatch(
.first,
("ABBAB", ("ABBAB", "A", "B")),
("defABBAdefB", ("defABBAdefB", "A", "B")),
matchType: (Substring, Substring, Substring).self,
equivalence: ==
) {
Anchor.startOfSubject
Lookahead {
ZeroOrMore(.any)
Capture(as: ref1) { One(.any) }
Capture(as: ref2) { One(.any) }
ref2
ref1
}
OneOrMore(.any)
ref2
Anchor.endOfSubject
}
}

func testStartsAndContains() throws {
Expand Down
24 changes: 19 additions & 5 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1804,8 +1804,7 @@ extension RegexTests {
firstMatchTests(
#"(?>(\d+))\w+\1"#,
(input: "23x23", match: "23x23"),
(input: "123x23", match: "23x23"),
xfail: true)
(input: "123x23", match: "23x23"))

// Backreferences in scalar mode
// In scalar mode the backreference should not match
Expand All @@ -1823,12 +1822,10 @@ extension RegexTests {
(input: "abbba", match: nil),
(input: "ABBA", match: nil),
(input: "defABBAdef", match: nil))
// FIXME: Backreferences don't escape positive lookaheads
firstMatchTests(
#"^(?=.*(.)(.)\2\1).+\2$"#,
(input: "ABBAB", match: "ABBAB"),
(input: "defABBAdefB", match: "defABBAdefB"),
xfail: true)
(input: "defABBAdefB", match: "defABBAdefB"))

firstMatchTests(
#"^(?!.*(.)(.)\2\1).+$"#,
Expand Down Expand Up @@ -2771,6 +2768,23 @@ extension RegexTests {
}
}

/// Regression test for captures made inside a lookahead being visible in the
/// match output.
///
/// Match counts are asserted before any element is subscripted, so a
/// regression is recorded as a test failure instead of trapping the whole
/// test process with an out-of-range index.
func testIssue713() throws {
  // Original report from https://github.com/apple/swift-experimental-string-processing/issues/713
  let originalInput = "Something 9a"
  let originalRegex = #/(?=([1-9]|(a|b)))/#
  let originalOutput = originalInput.matches(of: originalRegex).map(\.output)
  // The lookahead succeeds only just before "9" and just before "a".
  XCTAssertEqual(originalOutput.count, 2)
  if originalOutput.count == 2 {
    XCTAssert(originalOutput[0] == ("", "9", nil))
    XCTAssert(originalOutput[1] == ("", "a", "a"))
  }

  let simplifiedRegex = #/(?=(9))/#
  let simplifiedOutput = originalInput.matches(of: simplifiedRegex).map(\.output)
  XCTAssertEqual(simplifiedOutput.count, 1)
  if let simplifiedFirst = simplifiedOutput.first {
    XCTAssert(simplifiedFirst == ("", "9"))
  }

  let additionalRegex = #/(a+)b(a+)/#
  let additionalInput = "abaaba"
  XCTAssertNil(additionalInput.wholeMatch(of: additionalRegex))
}

func testNSRECompatibility() throws {
// NSRE-compatibility includes scalar matching, so `[\r\n]` should match
// either `\r` or `\n`.
Expand Down