Skip to content

Commit db96f7c

Browse files
authored
[swift/main] Allow captures in lookahead and atomic groups (#735)
This fixes an issue where capture groups inside a positive lookahead were being reset even upon successful matching of the lookahead. For example, with the pattern `/(?=(\d))/`, matching against a string like `"abc1"` should result in the output `("", "1")`. However, accessing the output traps instead, since the range data for capture 1 is missing even on success. This change resolves the issue by adding a boolean payload to the `fail` instruction that indicates whether to preserve captures when resetting the matching state, which allows any captures inside a positive lookahead or an atomic group to persist after success. Fixes #713.
1 parent d09a6b6 commit db96f7c

File tree

7 files changed

+106
-39
lines changed

7 files changed

+106
-39
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

+50-24
Original file line numberDiff line numberDiff line change
@@ -308,51 +308,77 @@ fileprivate extension Compiler.ByteCodeGen {
308308
try emitNode(node)
309309
}
310310

311-
mutating func emitLookaround(
312-
_ kind: (forwards: Bool, positive: Bool),
313-
_ child: DSLTree.Node
314-
) throws {
315-
guard kind.forwards else {
316-
throw Unsupported("backwards assertions")
317-
}
311+
mutating func emitPositiveLookahead(_ child: DSLTree.Node) throws {
312+
/*
313+
save(restoringAt: success)
314+
save(restoringAt: intercept)
315+
<sub-pattern> // failure restores at intercept
316+
clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern>
317+
fail(preservingCaptures: true) // ->success
318+
intercept:
319+
clearSavePoint // remove success
320+
fail // propagate failure
321+
success:
322+
...
323+
*/
324+
let intercept = builder.makeAddress()
325+
let success = builder.makeAddress()
326+
327+
builder.buildSave(success)
328+
builder.buildSave(intercept)
329+
try emitNode(child)
330+
builder.buildClearThrough(intercept)
331+
builder.buildFail(preservingCaptures: true) // Lookahead succeeds here
332+
333+
builder.label(intercept)
334+
builder.buildClear()
335+
builder.buildFail()
318336

319-
let positive = kind.positive
337+
builder.label(success)
338+
}
339+
340+
mutating func emitNegativeLookahead(_ child: DSLTree.Node) throws {
320341
/*
321342
save(restoringAt: success)
322343
save(restoringAt: intercept)
323344
<sub-pattern> // failure restores at intercept
324345
clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern>
325-
<if negative>:
326-
clearSavePoint // remove success
327-
fail // positive->success, negative propagates
346+
clearSavePoint // remove success
347+
fail // propagate failure
328348
intercept:
329-
<if positive>:
330-
clearSavePoint // remove success
331-
fail // positive propagates, negative->success
349+
fail // ->success
332350
success:
333351
...
334352
*/
335-
336353
let intercept = builder.makeAddress()
337354
let success = builder.makeAddress()
338355

339356
builder.buildSave(success)
340357
builder.buildSave(intercept)
341358
try emitNode(child)
342359
builder.buildClearThrough(intercept)
343-
if !positive {
344-
builder.buildClear()
345-
}
360+
builder.buildClear()
346361
builder.buildFail()
347362

348363
builder.label(intercept)
349-
if positive {
350-
builder.buildClear()
351-
}
352364
builder.buildFail()
353365

354366
builder.label(success)
355367
}
368+
369+
mutating func emitLookaround(
370+
_ kind: (forwards: Bool, positive: Bool),
371+
_ child: DSLTree.Node
372+
) throws {
373+
guard kind.forwards else {
374+
throw Unsupported("backwards assertions")
375+
}
376+
if kind.positive {
377+
try emitPositiveLookahead(child)
378+
} else {
379+
try emitNegativeLookahead(child)
380+
}
381+
}
356382

357383
mutating func emitAtomicNoncapturingGroup(
358384
_ child: DSLTree.Node
@@ -361,8 +387,8 @@ fileprivate extension Compiler.ByteCodeGen {
361387
save(continuingAt: success)
362388
save(restoringAt: intercept)
363389
<sub-pattern> // failure restores at intercept
364-
clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern>
365-
fail // ->success
390+
clearThrough(intercept) // remove intercept and any leftovers from <sub-pattern>
391+
fail(preservingCaptures: true) // ->success
366392
intercept:
367393
clearSavePoint // remove success
368394
fail // propagate failure
@@ -377,7 +403,7 @@ fileprivate extension Compiler.ByteCodeGen {
377403
builder.buildSave(intercept)
378404
try emitNode(child)
379405
builder.buildClearThrough(intercept)
380-
builder.buildFail()
406+
builder.buildFail(preservingCaptures: true) // Atomic group succeeds here
381407

382408
builder.label(intercept)
383409
builder.buildClear()

Sources/_StringProcessing/Engine/InstPayload.swift

+4-4
Original file line numberDiff line numberDiff line change
@@ -211,11 +211,11 @@ extension Instruction.Payload {
211211
self.rawValue == 1
212212
}
213213

214-
init(bool: BoolRegister) {
215-
self.init(bool)
214+
init(bool: Bool) {
215+
self.init(bool ? 1 : 0, 0)
216216
}
217-
var bool: BoolRegister {
218-
interpret()
217+
var boolPayload: Bool {
218+
interpret(as: TypedInt<Bool>.self) == 1
219219
}
220220

221221
init(element: ElementRegister, isCaseInsensitive: Bool) {

Sources/_StringProcessing/Engine/MEBuilder.swift

+2-2
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,8 @@ extension MEProgram.Builder {
146146
instructions.append(.init(.clearThrough))
147147
fixup(to: t)
148148
}
149-
mutating func buildFail() {
150-
instructions.append(.init(.fail))
149+
mutating func buildFail(preservingCaptures: Bool = false) {
150+
instructions.append(.init(.fail, .init(bool: preservingCaptures)))
151151
}
152152

153153
mutating func buildAdvance(_ n: Distance) {

Sources/_StringProcessing/Engine/MECapture.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ extension Processor {
3737
var value: Any? = nil
3838

3939
// An in-progress capture start
40-
fileprivate var currentCaptureBegin: Position? = nil
40+
var currentCaptureBegin: Position? = nil
4141

4242
fileprivate func _invariantCheck() {
4343
if range == nil {

Sources/_StringProcessing/Engine/Processor.swift

+8-3
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ extension Processor {
331331
return true
332332
}
333333

334-
mutating func signalFailure() {
334+
mutating func signalFailure(preservingCaptures: Bool = false) {
335335
guard !savePoints.isEmpty else {
336336
state = .fail
337337
return
@@ -362,10 +362,14 @@ extension Processor {
362362
controller.pc = pc
363363
currentPosition = pos ?? currentPosition
364364
callStack.removeLast(callStack.count - stackEnd.rawValue)
365-
storedCaptures = capEnds
366365
registers.ints = intRegisters
367366
registers.positions = posRegisters
368367

368+
if !preservingCaptures {
369+
// Reset all capture information
370+
storedCaptures = capEnds
371+
}
372+
369373
metrics.addBacktrack()
370374
}
371375

@@ -479,7 +483,8 @@ extension Processor {
479483
tryAccept()
480484

481485
case .fail:
482-
signalFailure()
486+
let preservingCaptures = payload.boolPayload
487+
signalFailure(preservingCaptures: preservingCaptures)
483488

484489
case .advance:
485490
let (isScalar, distance) = payload.distance

Tests/RegexBuilderTests/AlgorithmsTests.swift

+22
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,28 @@ class AlgorithmsResultBuilderTests: XCTestCase {
256256
"+"
257257
int
258258
}
259+
260+
let ref1 = Reference<Substring>()
261+
let ref2 = Reference<Substring>()
262+
try expectMatch(
263+
.first,
264+
("ABBAB", ("ABBAB", "A", "B")),
265+
("defABBAdefB", ("defABBAdefB", "A", "B")),
266+
matchType: (Substring, Substring, Substring).self,
267+
equivalence: ==
268+
) {
269+
Anchor.startOfSubject
270+
Lookahead {
271+
ZeroOrMore(.any)
272+
Capture(as: ref1) { One(.any) }
273+
Capture(as: ref2) { One(.any) }
274+
ref2
275+
ref1
276+
}
277+
OneOrMore(.any)
278+
ref2
279+
Anchor.endOfSubject
280+
}
259281
}
260282

261283
func testStartsAndContains() throws {

Tests/RegexTests/MatchTests.swift

+19-5
Original file line numberDiff line numberDiff line change
@@ -1804,8 +1804,7 @@ extension RegexTests {
18041804
firstMatchTests(
18051805
#"(?>(\d+))\w+\1"#,
18061806
(input: "23x23", match: "23x23"),
1807-
(input: "123x23", match: "23x23"),
1808-
xfail: true)
1807+
(input: "123x23", match: "23x23"))
18091808

18101809
// Backreferences in scalar mode
18111810
// In scalar mode the backreference should not match
@@ -1823,12 +1822,10 @@ extension RegexTests {
18231822
(input: "abbba", match: nil),
18241823
(input: "ABBA", match: nil),
18251824
(input: "defABBAdef", match: nil))
1826-
// FIXME: Backreferences don't escape positive lookaheads
18271825
firstMatchTests(
18281826
#"^(?=.*(.)(.)\2\1).+\2$"#,
18291827
(input: "ABBAB", match: "ABBAB"),
1830-
(input: "defABBAdefB", match: "defABBAdefB"),
1831-
xfail: true)
1828+
(input: "defABBAdefB", match: "defABBAdefB"))
18321829

18331830
firstMatchTests(
18341831
#"^(?!.*(.)(.)\2\1).+$"#,
@@ -2771,6 +2768,23 @@ extension RegexTests {
27712768
}
27722769
}
27732770

2771+
func testIssue713() throws {
2772+
// Original report from https://github.com/apple/swift-experimental-string-processing/issues/713
2773+
let originalInput = "Something 9a"
2774+
let originalRegex = #/(?=([1-9]|(a|b)))/#
2775+
let originalOutput = originalInput.matches(of: originalRegex).map(\.output)
2776+
XCTAssert(originalOutput[0] == ("", "9", nil))
2777+
XCTAssert(originalOutput[1] == ("", "a", "a"))
2778+
2779+
let simplifiedRegex = #/(?=(9))/#
2780+
let simplifiedOutput = originalInput.matches(of: simplifiedRegex).map(\.output)
2781+
XCTAssert(simplifiedOutput[0] == ("", "9"))
2782+
2783+
let additionalRegex = #/(a+)b(a+)/#
2784+
let additionalInput = "abaaba"
2785+
XCTAssertNil(additionalInput.wholeMatch(of: additionalRegex))
2786+
}
2787+
27742788
func testNSRECompatibility() throws {
27752789
// NSRE-compatibility includes scalar matching, so `[\r\n]` should match
27762790
// either `\r` or `\n`.

0 commit comments

Comments
 (0)