Skip to content

Treat capture 0 (i.e. the whole match) specially #777

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,18 @@ extension Compiler {

extension Compiler.ByteCodeGen {
mutating func emitRoot(_ root: DSLTree.Node) throws -> MEProgram {
// The whole match (`.0` element of output) is equivalent to an implicit
// capture over the entire regex.
try emitNode(.capture(name: nil, reference: nil, root))
// If the whole regex is a matcher, then the whole-match value
// is the constructed value. Denote that the current value
// register is the processor's value output.
switch root {
case .matcher:
builder.denoteCurrentValueIsWholeMatchValue()
default:
break
}

try emitNode(root)

builder.canOnlyMatchAtStart = root.canOnlyMatchAtStart()
builder.buildAccept()
return try builder.assemble()
Expand Down Expand Up @@ -149,8 +158,9 @@ fileprivate extension Compiler.ByteCodeGen {
guard let i = n.value else {
throw Unreachable("Expected a value")
}
let cap = builder.captureRegister(forBackreference: i)
builder.buildBackreference(
.init(i), isScalarMode: options.semanticLevel == .unicodeScalar)
cap, isScalarMode: options.semanticLevel == .unicodeScalar)
case .named(let name):
try builder.buildNamedReference(
name, isScalarMode: options.semanticLevel == .unicodeScalar)
Expand Down
39 changes: 32 additions & 7 deletions Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,18 @@ extension MEProgram {

// Registers
var nextIntRegister = IntRegister(0)
var nextCaptureRegister = CaptureRegister(0)
var nextValueRegister = ValueRegister(0)
var nextPositionRegister = PositionRegister(0)

// Set to non-nil when a value register holds the whole-match
// value (i.e. when a regex consists entirely of a custom matcher)
var wholeMatchValue: ValueRegister? = nil

// Note: Capture 0 (i.e. the whole match) is handled specially
// by the engine, so capture register `n` here corresponds to
// the regex AST's capture `n + 1`.
var nextCaptureRegister = CaptureRegister(0)

// Special addresses or instructions
var failAddressToken: AddressToken? = nil

Expand Down Expand Up @@ -70,6 +78,24 @@ extension MEProgram.Builder {
self.second = b
}
}

// Maps a named capture in the AST to the capture register that stores it.
//
// The name is looked up in `captureList`, and the register number is the
// resulting index minus one: capture 0 (the whole match) is handled
// specially by the engine, so capture registers are offset by one from
// the capture list's indices.
//
// - Parameter name: The capture name to look up.
// - Returns: The capture register numbered `index - 1`.
// - Throws: `RegexCompilationError.uncapturedReference` when `captureList`
//   has no capture with the given name.
//
// NOTE(review): a name resolving to index 0 would produce register -1;
// presumably names never resolve to the whole-match capture — confirm.
func captureRegister(named name: String) throws -> CaptureRegister {
guard let index = captureList.indexOfCapture(named: name) else {
throw RegexCompilationError.uncapturedReference
}
return .init(index - 1)
}

// Maps an AST backreference number to a capture register.
//
// Returns the register numbered `i - 1`: capture 0 (the whole match) is
// handled specially by the engine, so register `n` corresponds to the
// AST's capture `n + 1`.
//
// - Parameter i: The backreference number as written in the AST.
// - Returns: The capture register numbered `i - 1`.
//
// NOTE(review): `i == 0` would produce register -1; presumably callers
// never pass a reference to the whole match — confirm.
func captureRegister(forBackreference i: Int) -> CaptureRegister {
.init(i - 1)
}

// Records `nextValueRegister` as the register holding the whole-match
// value.
//
// Note that despite the "current value" wording, the register recorded is
// `nextValueRegister`. `emitRoot` calls this before emitting a root
// `.matcher` node, so this is presumably the register that matcher is
// about to claim — TODO confirm against the matcher emission path.
mutating func denoteCurrentValueIsWholeMatchValue() {
// A whole-match value register must not have been recorded already.
assert(wholeMatchValue == nil)
wholeMatchValue = nextValueRegister
}
}

extension MEProgram.Builder {
Expand Down Expand Up @@ -337,10 +363,8 @@ extension MEProgram.Builder {
}

mutating func buildNamedReference(_ name: String, isScalarMode: Bool) throws {
guard let index = captureList.indexOfCapture(named: name) else {
throw RegexCompilationError.uncapturedReference
}
buildBackreference(.init(index), isScalarMode: isScalarMode)
let cap = try captureRegister(named: name)
buildBackreference(cap, isScalarMode: isScalarMode)
}

// TODO: Mutating because of fail address fixup, drop when
Expand Down Expand Up @@ -401,6 +425,7 @@ extension MEProgram.Builder {
regInfo.transformFunctions = transformFunctions.count
regInfo.matcherFunctions = matcherFunctions.count
regInfo.captures = nextCaptureRegister.rawValue
regInfo.wholeMatchValue = wholeMatchValue?.rawValue

return MEProgram(
instructions: InstructionList(instructions),
Expand Down Expand Up @@ -514,8 +539,8 @@ extension MEProgram.Builder {
assert(preexistingValue == nil)
}
if let name = name {
let index = captureList.indexOfCapture(named: name)
assert(index == nextCaptureRegister.rawValue)
let cap = try? captureRegister(named: name)
assert(cap == nextCaptureRegister)
}
assert(nextCaptureRegister.rawValue < captureList.captures.count)
return nextCaptureRegister
Expand Down
14 changes: 0 additions & 14 deletions Sources/_StringProcessing/Engine/MECapture.swift
Original file line number Diff line number Diff line change
Expand Up @@ -84,17 +84,3 @@ extension Processor {
}
}
}

/// The processor's stored captures, paired with the offsets of captures
/// that are referenced by ID.
struct MECaptureList {
  /// One stored capture per element.
  var values: [Processor._StoredCapture]
  /// Maps a reference ID to an integer capture offset.
  var referencedCaptureOffsets: [ReferenceID: Int]

  /// Slices `input` by each stored capture's range.
  ///
  /// - Parameter input: The string that the stored ranges index into.
  /// - Returns: One entry per element of `values`: the substring of `input`
  ///   covered by that capture's range, or `nil` when the capture has no
  ///   range.
  func latestUntyped(from input: String) -> [Substring?] {
    values.map { stored in
      stored.range.map { input[$0] }
    }
  }
}
4 changes: 4 additions & 0 deletions Sources/_StringProcessing/Engine/Registers.swift
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,10 @@ extension MEProgram {
var positionStackAddresses = 0
var savePointAddresses = 0
var captures = 0

// The value register holding the whole-match value, if there
// is one
var wholeMatchValue: Int? = nil
}
}

Expand Down
34 changes: 25 additions & 9 deletions Sources/_StringProcessing/Engine/Structuralize.swift
Original file line number Diff line number Diff line change
@@ -1,20 +1,36 @@
internal import _RegexParser

extension CaptureList {
@available(SwiftStdlib 5.7, *)
func createElements(
_ list: MECaptureList
@available(SwiftStdlib 5.7, *)
extension Executor {
static func createExistentialElements(
_ program: MEProgram,
matchRange: Range<String.Index>,
storedCaptures: [Processor._StoredCapture],
wholeMatchValue: Any?
) -> [AnyRegexOutput.ElementRepresentation] {
assert(list.values.count == captures.count)

let capList = program.captureList
let capOffsets = program.referencedCaptureOffsets

// Formal captures include the entire match
assert(storedCaptures.count + 1 == capList.captures.count)

var result = [AnyRegexOutput.ElementRepresentation]()

for (i, (cap, meStored)) in zip(captures, list.values).enumerated() {
result.reserveCapacity(1 + capList.captures.count)
result.append(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should be able to reserve the correct capacity in result before this…

Suggested change
result.append(
result.reserveCapacity(capList.captures.count)
result.append(

AnyRegexOutput.ElementRepresentation(
optionalDepth: 0,
content: (matchRange, wholeMatchValue),
visibleInTypedOutput: capList.captures[0].visibleInTypedOutput)
)

for (i, (cap, meStored)) in zip(
capList.captures.dropFirst(), storedCaptures
).enumerated() {
let element = AnyRegexOutput.ElementRepresentation(
optionalDepth: cap.optionalDepth,
content: meStored.deconstructed,
name: cap.name,
referenceID: list.referencedCaptureOffsets.first { $1 == i }?.key,
referenceID: capOffsets.first { $1 == i }?.key,
visibleInTypedOutput: cap.visibleInTypedOutput
)

Expand Down
19 changes: 13 additions & 6 deletions Sources/_StringProcessing/Executor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -190,15 +190,22 @@ extension Executor {
guard let endIdx = try cpu.run() else {
return nil
}
let capList = MECaptureList(
values: cpu.storedCaptures,
referencedCaptureOffsets: program.referencedCaptureOffsets)

let range = startPosition..<endIdx
let caps = program.captureList.createElements(capList)

let wholeMatchValue: Any?
if let val = program.registerInfo.wholeMatchValue {
wholeMatchValue = cpu.registers.values[val]
} else {
wholeMatchValue = nil
}
let aroElements = Executor.createExistentialElements(
program,
matchRange: startPosition..<endIdx,
storedCaptures: cpu.storedCaptures,
wholeMatchValue: wholeMatchValue)

let anyRegexOutput = AnyRegexOutput(
input: cpu.input, elements: caps)
input: cpu.input, elements: aroElements)
return .init(anyRegexOutput: anyRegexOutput, range: range)
}}

Expand Down
5 changes: 4 additions & 1 deletion Sources/_StringProcessing/Regex/Match.swift
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ extension Regex.Match {
let typeErasedMatch = anyRegexOutput.existentialOutput(
from: anyRegexOutput.input
)
return typeErasedMatch as! Output
guard let output = typeErasedMatch as? Output else {
fatalError("Internal error: existential cast failed")
}
return output
}

/// Accesses a capture by its name or number.
Expand Down
32 changes: 32 additions & 0 deletions Tests/RegexBuilderTests/CustomTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,38 @@ class CustomRegexComponentTests: XCTestCase {
("x10x", nil, IntParser.ParseError()),
("30", 30, nil)
)
customTest(
Regex {
Optionally {
IntParser()
}
},
("zzz", nil, IntParser.ParseError()),
("x10x", nil, IntParser.ParseError()),
("30", "30", nil)
)
customTest(
Regex {
Regex {
IntParser()
}
},
("zzz", nil, IntParser.ParseError()),
("x10x", nil, IntParser.ParseError()),
("30", 30, nil)
)
customTest(
Regex {
Regex {
IntParser()
}
"x"
},
("zzz", nil, IntParser.ParseError()),
("x10x", nil, IntParser.ParseError()),
("30", nil, nil),
("30x", "30x", nil)
)

customTest(
Regex {
Expand Down