diff --git a/Package.swift b/Package.swift index b9b9d6d71..1f5e10f0a 100644 --- a/Package.swift +++ b/Package.swift @@ -59,9 +59,9 @@ let package = Package( name: "VariadicsGenerator", targets: ["VariadicsGenerator"]), // Disable to work around rdar://126877024 -// .executable( -// name: "RegexBenchmark", -// targets: ["RegexBenchmark"]) + .executable( + name: "RegexBenchmark", + targets: ["RegexBenchmark"]) ], dependencies: [ .package(url: "https://github.com/apple/swift-argument-parser", from: "1.0.0"), @@ -143,17 +143,17 @@ let package = Package( "_StringProcessing" ], swiftSettings: [availabilityDefinition]), -// .executableTarget( -// name: "RegexBenchmark", -// dependencies: [ -// .product(name: "ArgumentParser", package: "swift-argument-parser"), -// "_RegexParser", -// "_StringProcessing", -// "RegexBuilder" -// ], -// swiftSettings: [ -// .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]), -// ]), + .executableTarget( + name: "RegexBenchmark", + dependencies: [ + .product(name: "ArgumentParser", package: "swift-argument-parser"), + "_RegexParser", + "_StringProcessing", + "RegexBuilder" + ], + swiftSettings: [ + .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]), + ]), // MARK: Exercises .target( diff --git a/Sources/RegexBenchmark/Benchmark.swift b/Sources/RegexBenchmark/Benchmark.swift index 3a967c022..bcf8fa42a 100644 --- a/Sources/RegexBenchmark/Benchmark.swift +++ b/Sources/RegexBenchmark/Benchmark.swift @@ -153,6 +153,8 @@ struct CrossBenchmark { /// Whether to also run scalar-semantic mode var alsoRunScalarSemantic: Bool = true + var alsoRunSimpleWordBoundaries: Bool = false + func register(_ runner: inout BenchmarkRunner) { if isWhole { runner.registerCrossBenchmark( @@ -160,14 +162,16 @@ struct CrossBenchmark { input: input, pattern: regex, .whole, - alsoRunScalarSemantic: alsoRunScalarSemantic) + alsoRunScalarSemantic: alsoRunScalarSemantic, + alsoRunSimpleWordBoundaries: alsoRunSimpleWordBoundaries) } else { 
runner.registerCrossBenchmark( nameBase: baseName, input: input, pattern: regex, .allMatches, - alsoRunScalarSemantic: alsoRunScalarSemantic) + alsoRunScalarSemantic: alsoRunScalarSemantic, + alsoRunSimpleWordBoundaries: alsoRunSimpleWordBoundaries) if includeFirst || runner.includeFirstOverride { runner.registerCrossBenchmark( @@ -175,7 +179,8 @@ struct CrossBenchmark { input: input, pattern: regex, .first, - alsoRunScalarSemantic: alsoRunScalarSemantic) + alsoRunScalarSemantic: alsoRunScalarSemantic, + alsoRunSimpleWordBoundaries: alsoRunSimpleWordBoundaries) } } } diff --git a/Sources/RegexBenchmark/BenchmarkRegistration.swift b/Sources/RegexBenchmark/BenchmarkRegistration.swift index a3abef8e4..e12502e99 100644 --- a/Sources/RegexBenchmark/BenchmarkRegistration.swift +++ b/Sources/RegexBenchmark/BenchmarkRegistration.swift @@ -18,6 +18,8 @@ extension BenchmarkRunner { self.addDiceNotation() self.addErrorMessages() self.addIpAddress() + + self.addURLWithWordBoundaries() // -- end of registrations -- } } diff --git a/Sources/RegexBenchmark/BenchmarkRunner.swift b/Sources/RegexBenchmark/BenchmarkRunner.swift index b067b9679..6abee43aa 100644 --- a/Sources/RegexBenchmark/BenchmarkRunner.swift +++ b/Sources/RegexBenchmark/BenchmarkRunner.swift @@ -33,7 +33,8 @@ struct BenchmarkRunner { input: String, pattern: String, _ type: Benchmark.MatchType, - alsoRunScalarSemantic: Bool = true + alsoRunScalarSemantic: Bool = true, + alsoRunSimpleWordBoundaries: Bool ) { let swiftRegex = try! 
Regex(pattern) let nsRegex: NSRegularExpression @@ -58,6 +59,16 @@ struct BenchmarkRunner { type: .init(type), target: input)) + if alsoRunSimpleWordBoundaries { + register( + Benchmark( + name: nameBase + nameSuffix + "_SimpleWordBoundaries", + regex: swiftRegex.wordBoundaryKind(.simple), + pattern: pattern, + type: type, + target: input)) + } + if alsoRunScalarSemantic { register( Benchmark( diff --git a/Sources/RegexBenchmark/Inputs/URL.swift b/Sources/RegexBenchmark/Inputs/URL.swift new file mode 100644 index 000000000..b1b03f53d --- /dev/null +++ b/Sources/RegexBenchmark/Inputs/URL.swift @@ -0,0 +1,22 @@ +extension Inputs { + static let url: String = { + let element = """ + Item 1 | Item 2® •Item 3 Item4 + + + \t\t\t + + Check it out here: http://www.test.com/this-is-a-fake-url-that-should-be-replaced?a=1 + Check it out here: https://www.test.com/this-is-a-fake-url-that-should-be-replaced?a=1 + This is not a web link ftp://user@host:domain.com/path + This is a link without a scheme www.apple.com/mac + + This is some good text and should not be removed. + Thanks. 
+ 😀🩷🤵🏿 + """ + let multiplier = 30 + return Array(repeating: element, count: multiplier).joined() + }() + +} diff --git a/Sources/RegexBenchmark/Suite/URLRegex.swift b/Sources/RegexBenchmark/Suite/URLRegex.swift new file mode 100644 index 000000000..e5f00f4e7 --- /dev/null +++ b/Sources/RegexBenchmark/Suite/URLRegex.swift @@ -0,0 +1,14 @@ +import _StringProcessing + +extension BenchmarkRunner { + mutating func addURLWithWordBoundaries() { + let urlRegex = #"https?://([-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6})\b[-a-zA-Z0-9()@:%_+.~#?&=]*"# + let url = CrossBenchmark( + baseName: "URLWithWordBoundaries", + regex: urlRegex, + input: Inputs.url, + alsoRunSimpleWordBoundaries: true + ) + url.register(&self) + } +} diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift index 3f9b8d49a..57834a324 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift @@ -11,107 +11,33 @@ // MARK: `RangesCollection` -struct RangesCollection { - public typealias Base = Searcher.Searched - - let base: Base +struct RangesSequence { + let input: Searcher.Searched let searcher: Searcher - private(set) public var startIndex: Index - init(base: Base, searcher: Searcher) { - self.base = base + init(input: Searcher.Searched, searcher: Searcher) { + self.input = input self.searcher = searcher - - var state = searcher.state(for: base, in: base.startIndex..: IteratorProtocol { - public typealias Base = Searcher.Searched - - let base: Base - let searcher: Searcher - var state: Searcher.State - - init(base: Base, searcher: Searcher) { - self.base = base - self.searcher = searcher - self.state = searcher.state(for: base, in: base.startIndex.. Range? 
{ - searcher.search(base, &state) - } -} - -extension RangesCollection: Sequence { - public func makeIterator() -> RangesIterator { - Iterator(base: base, searcher: searcher) - } -} -extension RangesCollection: Collection { - // TODO: Custom `SubSequence` for the sake of more efficient slice iteration - - public struct Index { - var range: Range? + struct Iterator: IteratorProtocol { + let base: RangesSequence var state: Searcher.State - } - - public var endIndex: Index { - // TODO: Avoid calling `state(for:startingAt)` here - Index( - range: nil, - state: searcher.state(for: base, in: base.startIndex.. Index { - var index = index - formIndex(after: &index) - return index - } - public subscript(index: Index) -> Range { - guard let range = index.range else { - fatalError("Cannot subscript using endIndex") + init(_ base: RangesSequence) { + self.base = base + self.state = base.searcher.state(for: base.input, in: base.input.startIndex.. Bool { - switch (lhs.range, rhs.range) { - case (nil, nil): - return true - case (nil, _?), (_?, nil): - return false - case (let lhs?, let rhs?): - return lhs.lowerBound == rhs.lowerBound + public mutating func next() -> Range? 
{ + base.searcher.search(base.input, &state) } } +} - static func < (lhs: Self, rhs: Self) -> Bool { - switch (lhs.range, rhs.range) { - case (nil, _): - return false - case (_, nil): - return true - case (let lhs?, let rhs?): - return lhs.lowerBound < rhs.lowerBound - } +extension RangesSequence: Sequence { + public func makeIterator() -> Iterator { + Iterator(self) } } @@ -122,8 +48,8 @@ extension RangesCollection.Index: Comparable { extension Collection { func _ranges( of searcher: S - ) -> RangesCollection where S.Searched == Self { - RangesCollection(base: self, searcher: searcher) + ) -> RangesSequence where S.Searched == Self { + RangesSequence(input: self, searcher: searcher) } } @@ -132,7 +58,7 @@ extension Collection { extension Collection where Element: Equatable { func _ranges( of other: C - ) -> RangesCollection> where C.Element == Element { + ) -> RangesSequence> where C.Element == Element { _ranges(of: ZSearcher(pattern: Array(other), by: ==)) } @@ -163,8 +89,8 @@ extension Collection where Element: Equatable { } @available(SwiftStdlib 5.7, *) -struct RegexRangesCollection { - let base: RegexMatchesCollection +struct RegexRangesSequence { + let base: RegexMatchesSequence init( input: String, @@ -181,9 +107,9 @@ struct RegexRangesCollection { } @available(SwiftStdlib 5.7, *) -extension RegexRangesCollection: Sequence { +extension RegexRangesSequence: Sequence { struct Iterator: IteratorProtocol { - var matchesBase: RegexMatchesCollection.Iterator + var matchesBase: RegexMatchesSequence.Iterator mutating func next() -> Range? 
{ matchesBase.next().map(\.range) @@ -195,16 +121,6 @@ extension RegexRangesCollection: Sequence { } } -@available(SwiftStdlib 5.7, *) -extension RegexRangesCollection: Collection { - typealias Index = RegexMatchesCollection.Index - - var startIndex: Index { base.startIndex } - var endIndex: Index { base.endIndex } - func index(after i: Index) -> Index { base.index(after: i) } - subscript(position: Index) -> Range { base[position].range } -} - // MARK: Regex algorithms extension Collection where SubSequence == Substring { @@ -214,8 +130,8 @@ extension Collection where SubSequence == Substring { of regex: R, subjectBounds: Range, searchBounds: Range - ) -> RegexRangesCollection { - RegexRangesCollection( + ) -> RegexRangesSequence { + RegexRangesSequence( input: self[...].base, subjectBounds: subjectBounds, searchBounds: searchBounds, @@ -226,7 +142,7 @@ extension Collection where SubSequence == Substring { @_disfavoredOverload func _ranges( of regex: R - ) -> RegexRangesCollection { + ) -> RegexRangesSequence { _ranges( of: regex, subjectBounds: startIndex..( + func _replacing( _ ranges: Ranges, with replacement: Replacement, maxReplacements: Int = .max @@ -49,13 +49,15 @@ extension RangeReplaceableCollection { var result = Self() var index = startIndex - - // `maxRanges` is a workaround for https://github.com/apple/swift/issues/59522 - let maxRanges = ranges.prefix(maxReplacements) - for range in maxRanges { + var replacements = 0 + + for range in ranges { + if replacements == maxReplacements { break } + result.append(contentsOf: self[index.. 
{ - public typealias Base = Searcher.Searched +struct SplitSequence { + public typealias Input = Searcher.Searched - let ranges: RangesCollection + let ranges: RangesSequence var maxSplits: Int var omittingEmptySubsequences: Bool init( - ranges: RangesCollection, + ranges: RangesSequence, maxSplits: Int, omittingEmptySubsequences: Bool) { @@ -29,53 +29,53 @@ struct SplitCollection { } init( - base: Base, + input: Input, searcher: Searcher, maxSplits: Int, omittingEmptySubsequences: Bool) { - self.ranges = base._ranges(of: searcher) + self.ranges = input._ranges(of: searcher) self.maxSplits = maxSplits self.omittingEmptySubsequences = omittingEmptySubsequences } } -extension SplitCollection: Sequence { +extension SplitSequence: Sequence { public struct Iterator: IteratorProtocol { - let base: Base - var index: Base.Index - var ranges: RangesCollection.Iterator - var maxSplits: Int - var omittingEmptySubsequences: Bool + var ranges: RangesSequence.Iterator + var index: Input.Index + var maxSplits: Int var splitCounter = 0 + var omittingEmptySubsequences: Bool var isDone = false + var input: Input { ranges.base.input } + init( - ranges: RangesCollection, + ranges: RangesSequence, maxSplits: Int, omittingEmptySubsequences: Bool ) { - self.base = ranges.base - self.index = base.startIndex + self.index = ranges.input.startIndex self.ranges = ranges.makeIterator() self.maxSplits = maxSplits self.omittingEmptySubsequences = omittingEmptySubsequences } - public mutating func next() -> Base.SubSequence? { + public mutating func next() -> Input.SubSequence? { guard !isDone else { return nil } /// Return the rest of base if it's non-empty or we're including /// empty subsequences. - func finish() -> Base.SubSequence? { + func finish() -> Input.SubSequence? { isDone = true - return index == base.endIndex && omittingEmptySubsequences + return index == input.endIndex && omittingEmptySubsequences ? nil - : base[index...] + : input[index...] 
} - if index == base.endIndex { + if index == input.endIndex { return finish() } @@ -96,7 +96,7 @@ extension SplitCollection: Sequence { } splitCounter += 1 - return base[index..( + func _split( by separator: Searcher, maxSplits: Int, omittingEmptySubsequences: Bool - ) -> SplitCollection where Searcher.Searched == Self { - SplitCollection( - base: self, + ) -> SplitSequence where Searcher.Searched == Self { + SplitSequence( + input: self, searcher: separator, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) @@ -126,12 +126,12 @@ extension Collection { extension Collection where Element: Equatable { @_disfavoredOverload - func split( + func _split( by separator: C, maxSplits: Int, omittingEmptySubsequences: Bool - ) -> SplitCollection> where C.Element == Element { - split(by: ZSearcher(pattern: Array(separator), by: ==), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) + ) -> SplitSequence> where C.Element == Element { + _split(by: ZSearcher(pattern: Array(separator), by: ==), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) } // FIXME: Return `some Collection` for SE-0346 @@ -159,7 +159,7 @@ extension Collection where Element: Equatable { return str._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! 
[SubSequence] default: - return Array(split( + return Array(_split( by: ZSearcher(pattern: Array(separator), by: ==), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences)) @@ -186,7 +186,7 @@ extension StringProtocol where SubSequence == Substring { maxSplits: Int = .max, omittingEmptySubsequences: Bool = true ) -> [Substring] { - Array(self[...].split( + Array(self[...]._split( by: SubstringSearcher(text: "" as Substring, pattern: separator[...]), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences)) @@ -199,7 +199,7 @@ extension StringProtocol where SubSequence == Substring { maxSplits: Int = .max, omittingEmptySubsequences: Bool = true ) -> [Substring] { - Array(self[...].split( + Array(self[...]._split( by: SubstringSearcher(text: "" as Substring, pattern: separator[...]), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences)) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Trim.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Trim.swift index e870e1493..ff385856c 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Trim.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Trim.swift @@ -44,7 +44,7 @@ extension RangeReplaceableCollection { // MARK: Predicate algorithms extension Collection { - fileprivate func endOfPrefix(while predicate: (Element) throws -> Bool) rethrows -> Index { + fileprivate func _endOfPrefix(while predicate: (Element) throws -> Bool) rethrows -> Index { try firstIndex(where: { try !predicate($0) }) ?? endIndex } @@ -52,7 +52,7 @@ extension Collection { public func trimmingPrefix( while predicate: (Element) throws -> Bool ) rethrows -> SubSequence { - let end = try endOfPrefix(while: predicate) + let end = try _endOfPrefix(while: predicate) return self[end...] 
} } @@ -62,7 +62,7 @@ extension Collection where SubSequence == Self { public mutating func trimPrefix( while predicate: (Element) throws -> Bool ) throws { - let end = try endOfPrefix(while: predicate) + let end = try _endOfPrefix(while: predicate) self = self[end...] } } @@ -73,7 +73,7 @@ extension RangeReplaceableCollection { public mutating func trimPrefix( while predicate: (Element) throws -> Bool ) rethrows { - let end = try endOfPrefix(while: predicate) + let end = try _endOfPrefix(while: predicate) removeSubrange(startIndex.. - ) -> Consumed.Index? -} - -extension BidirectionalCollectionConsumer { - func consumingBack(_ consumed: Consumed) -> Consumed.Index? { - consumingBack(consumed, in: consumed.startIndex.. Bool - where Consumed.SubSequence == Consumed - { - guard let index = consumingBack(consumed) else { return false } - consumed = consumed[.. - ) -> Consumed.Index? { - var index = range.upperBound - var patternIndex = pattern.endIndex - - while true { - if patternIndex == pattern.startIndex { - return index - } - - if index == range.lowerBound { - return nil - } - - consumed.formIndex(before: &index) - pattern.formIndex(before: &patternIndex) - - if consumed[index] != pattern[patternIndex] { - return nil - } - } - } -} diff --git a/Sources/_StringProcessing/Algorithms/Consumers/ManyConsumer.swift b/Sources/_StringProcessing/Algorithms/Consumers/ManyConsumer.swift deleted file mode 100644 index 10d9fd5c3..000000000 --- a/Sources/_StringProcessing/Algorithms/Consumers/ManyConsumer.swift +++ /dev/null @@ -1,47 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -struct ManyConsumer { - let base: Base -} - -extension ManyConsumer: CollectionConsumer { - typealias Consumed = Base.Consumed - - func consuming( - _ consumed: Base.Consumed, - in range: Range - ) -> Base.Consumed.Index? { - var result = range.lowerBound - while let index = base.consuming(consumed, in: result.. - ) -> Base.Consumed.Index? { - var result = range.upperBound - while let index = base.consumingBack( - consumed, - in: range.lowerBound.. { - let predicate: (Consumed.Element) -> Bool -} - -extension PredicateConsumer: CollectionConsumer { - public func consuming( - _ consumed: Consumed, - in range: Range - ) -> Consumed.Index? { - let start = range.lowerBound - guard start != range.upperBound && predicate(consumed[start]) else { - return nil - } - return consumed.index(after: start) - } -} - -extension PredicateConsumer: BidirectionalCollectionConsumer - where Consumed: BidirectionalCollection -{ - func consumingBack( - _ consumed: Consumed, - in range: Range - ) -> Consumed.Index? { - let end = range.upperBound - guard end != range.lowerBound else { return nil } - let previous = consumed.index(before: end) - return predicate(consumed[previous]) ? previous : nil - } -} - -extension PredicateConsumer: StatelessCollectionSearcher { - public typealias Searched = Consumed - - public func search( - _ searched: Searched, - in range: Range - ) -> Range? { - // TODO: Make this reusable - guard let index = searched[range].firstIndex(where: predicate) else { - return nil - } - return index.. - ) -> Range? { - // TODO: Make this reusable - guard let index = searched[range].lastIndex(where: predicate) else { - return nil - } - return index..( - of searcher: S - ) -> _MatchResult? 
where S.Searched == Self { - var state = searcher.state(for: self, in: startIndex..( - of searcher: S - ) -> _BackwardMatchResult? - where S.BackwardSearched == Self - { - var state = searcher.backwardState(for: self, in: startIndex..( - _ searcher: Searcher, - with replacement: (_MatchResult) throws -> Replacement, - subrange: Range, - maxReplacements: Int = .max - ) rethrows -> Self where Searcher.Searched == SubSequence, - Replacement.Element == Element - { - precondition(maxReplacements >= 0) - - var index = subrange.lowerBound - var result = Self() - result.append(contentsOf: self[..( - _ searcher: Searcher, - with replacement: (_MatchResult) throws -> Replacement, - maxReplacements: Int = .max - ) rethrows -> Self where Searcher.Searched == SubSequence, - Replacement.Element == Element - { - try _replacing( - searcher, - with: replacement, - subrange: startIndex..( - _ searcher: Searcher, - with replacement: (_MatchResult) throws -> Replacement, - maxReplacements: Int = .max - ) rethrows where Searcher.Searched == SubSequence, - Replacement.Element == Element - { - self = try _replacing( - searcher, - with: replacement, - maxReplacements: maxReplacements) - } -} - // MARK: Regex algorithms extension RangeReplaceableCollection where SubSequence == Substring { diff --git a/Sources/_StringProcessing/Algorithms/Matching/MatchResult.swift b/Sources/_StringProcessing/Algorithms/Matching/MatchResult.swift index 94e6d8c3b..7d8157045 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/MatchResult.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/MatchResult.swift @@ -17,12 +17,3 @@ struct _MatchResult { match.startIndex.. { - let match: S.BackwardSearched.SubSequence - let result: S.Match - - var range: Range { - match.startIndex.. 
{ - public typealias Base = Searcher.Searched - - let base: Base - let searcher: Searcher - private(set) public var startIndex: Index - - init(base: Base, searcher: Searcher) { - self.base = base - self.searcher = searcher - - var state = searcher.state(for: base, in: base.startIndex..: IteratorProtocol { - public typealias Base = Searcher.Searched - - let base: Base - let searcher: Searcher - var state: Searcher.State - - init(base: Base, searcher: Searcher) { - self.base = base - self.searcher = searcher - self.state = searcher.state(for: base, in: base.startIndex.. _MatchResult? { - searcher.matchingSearch(base, &state).map { range, result in - _MatchResult(match: base[range], result: result) - } - } -} - -extension MatchesCollection: Sequence { - public func makeIterator() -> MatchesIterator { - Iterator(base: base, searcher: searcher) - } -} - -extension MatchesCollection: Collection { - // TODO: Custom `SubSequence` for the sake of more efficient slice iteration - - struct Index { - var match: (range: Range, match: Searcher.Match)? - var state: Searcher.State - } - - public var endIndex: Index { - // TODO: Avoid calling `state(for:startingAt)` here - Index( - match: nil, - state: searcher.state(for: base, in: base.startIndex.. 
Index { - var index = index - formIndex(after: &index) - return index - } - - public subscript(index: Index) -> _MatchResult { - guard let (range, result) = index.match else { - fatalError("Cannot subscript using endIndex") - } - return _MatchResult(match: base[range], result: result) - } -} - -extension MatchesCollection.Index: Comparable { - public static func == (lhs: Self, rhs: Self) -> Bool { - switch (lhs.match?.range, rhs.match?.range) { - case (nil, nil): - return true - case (nil, _?), (_?, nil): - return false - case (let lhs?, let rhs?): - return lhs.lowerBound == rhs.lowerBound - } - } - - public static func < (lhs: Self, rhs: Self) -> Bool { - switch (lhs.match?.range, rhs.match?.range) { - case (nil, _): - return false - case (_, nil): - return true - case (let lhs?, let rhs?): - return lhs.lowerBound < rhs.lowerBound - } - } -} - -// MARK: `ReversedMatchesCollection` -// TODO: reversed matches - -struct ReversedMatchesCollection< - Searcher: BackwardMatchingCollectionSearcher -> { - public typealias Base = Searcher.BackwardSearched - - let base: Base - let searcher: Searcher - - init(base: Base, searcher: Searcher) { - self.base = base - self.searcher = searcher - } -} - -extension ReversedMatchesCollection: Sequence { - struct Iterator: IteratorProtocol { - let base: Base - let searcher: Searcher - var state: Searcher.BackwardState - - init(base: Base, searcher: Searcher) { - self.base = base - self.searcher = searcher - self.state = searcher.backwardState( - for: base, in: base.startIndex.. _BackwardMatchResult? 
{ - searcher.matchingSearchBack(base, &state).map { range, result in - _BackwardMatchResult(match: base[range], result: result) - } - } - } - - public func makeIterator() -> Iterator { - Iterator(base: base, searcher: searcher) - } -} - -// TODO: `Collection` conformance - -// MARK: `CollectionSearcher` algorithms - -extension Collection { - func _matches( - of searcher: S - ) -> MatchesCollection where S.Searched == Self { - MatchesCollection(base: self, searcher: searcher) - } -} - -extension BidirectionalCollection { - func _matchesFromBack( - of searcher: S - ) -> ReversedMatchesCollection where S.BackwardSearched == Self { - ReversedMatchesCollection(base: self, searcher: searcher) - } -} - // MARK: Regex algorithms @available(SwiftStdlib 5.7, *) -struct RegexMatchesCollection { +struct RegexMatchesSequence { let input: String let subjectBounds: Range let searchBounds: Range let regex: Regex - let startIndex: Index - + init( input: String, subjectBounds: Range, @@ -201,15 +28,11 @@ struct RegexMatchesCollection { self.subjectBounds = subjectBounds self.searchBounds = searchBounds self.regex = regex - self.startIndex = (try? regex._firstMatch( - input, - subjectBounds: subjectBounds, - searchBounds: searchBounds)).map(Index.match) ?? .end } } @available(SwiftStdlib 5.7, *) -extension RegexMatchesCollection: Sequence { +extension RegexMatchesSequence: Sequence { /// Returns the index to start searching for the next match after `match`. fileprivate func searchIndex(after match: Regex.Match) -> String.Index? { if !match.range.isEmpty { @@ -218,7 +41,7 @@ extension RegexMatchesCollection: Sequence { // If the last match was an empty match, advance by one position and // run again, unless at the end of `input`. 
- if match.range.lowerBound == input.endIndex { + guard match.range.lowerBound < subjectBounds.upperBound else { return nil } @@ -231,38 +54,30 @@ extension RegexMatchesCollection: Sequence { } struct Iterator: IteratorProtocol { - let base: RegexMatchesCollection - - // Because `RegexMatchesCollection` eagerly computes the first match for - // its `startIndex`, the iterator can use that match for its initial - // iteration. For subsequent calls to `next()`, this value is `false`, and - // `nextStart` is used to search for the next match. - var initialIteration = true - var nextStart: String.Index? - - init(_ matches: RegexMatchesCollection) { + let base: RegexMatchesSequence + + // Set to nil when iteration is finished (because some regex can empty-match + // at the end of the subject). + var currentPosition: String.Index? + + init(_ matches: RegexMatchesSequence) { self.base = matches - self.nextStart = base.startIndex.match.flatMap(base.searchIndex(after:)) + self.currentPosition = base.subjectBounds.lowerBound } mutating func next() -> Regex.Match? { - // Initial case with pre-computed first match - if initialIteration { - initialIteration = false - return base.startIndex.match - } - - // `nextStart` is `nil` when iteration has completed - guard let start = nextStart, start <= base.searchBounds.upperBound else { + // `currentPosition` is `nil` when iteration has completed + guard let position = currentPosition, position <= base.searchBounds.upperBound else { return nil } - + // Otherwise, find the next match (if any) and compute `nextStart` - let match = try? base.regex._firstMatch( + let match = try? Executor.firstMatch( + base.regex.program.loweredProgram, base.input, - subjectBounds: base.subjectBounds, - searchBounds: start...Match) - case end - - var match: Regex.Match? 
{ - switch self { - case .match(let match): return match - case .end: return nil - } - } - - static func == (lhs: Self, rhs: Self) -> Bool { - switch (lhs, rhs) { - case (.match(let lhs), .match(let rhs)): - return lhs.range == rhs.range - case (.end, .end): - return true - case (.end, .match), (.match, .end): - return false - } - } - - static func < (lhs: Self, rhs: Self) -> Bool { - switch (lhs, rhs) { - case (.match(let lhs), .match(let rhs)): - // This implementation uses a tuple comparison so that an empty - // range `i.. Index { - guard let currentMatch = i.match else { - fatalError("Can't advance past the 'endIndex' of a match collection.") - } - - guard - let start = searchIndex(after: currentMatch), - start <= searchBounds.upperBound, - let nextMatch = try? regex._firstMatch( - input, - subjectBounds: subjectBounds, - searchBounds: start.. Regex.Match { - guard let match = position.match else { - fatalError("Can't subscript the 'endIndex' of a match collection.") - } - return match - } -} - extension BidirectionalCollection where SubSequence == Substring { @available(SwiftStdlib 5.7, *) @_disfavoredOverload func _matches( of regex: R - ) -> RegexMatchesCollection { - RegexMatchesCollection( + ) -> RegexMatchesSequence { + RegexMatchesSequence( input: self[...].base, subjectBounds: startIndex...Match> var result = Array.Match>() + + + for match in _matches(of: r) { result.append(match) } diff --git a/Sources/_StringProcessing/Algorithms/Matching/MatchingCollectionConsumer.swift b/Sources/_StringProcessing/Algorithms/Matching/MatchingCollectionConsumer.swift deleted file mode 100644 index ae3a24d4e..000000000 --- a/Sources/_StringProcessing/Algorithms/Matching/MatchingCollectionConsumer.swift +++ /dev/null @@ -1,48 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -protocol MatchingCollectionConsumer: CollectionConsumer { - associatedtype Match - func matchingConsuming( - _ consumed: Consumed, - in range: Range - ) -> (upperBound: Consumed.Index, match: Match)? -} - -extension MatchingCollectionConsumer { - func consuming( - _ consumed: Consumed, - in range: Range - ) -> Consumed.Index? { - matchingConsuming(consumed, in: range)?.upperBound - } -} - -// MARK: Consuming from the back - -protocol BidirectionalMatchingCollectionConsumer: - MatchingCollectionConsumer, BidirectionalCollectionConsumer -{ - func matchingConsumingBack( - _ consumed: Consumed, - in range: Range - ) -> (lowerBound: Consumed.Index, match: Match)? -} - -extension BidirectionalMatchingCollectionConsumer { - func consumingBack( - _ consumed: Consumed, - in range: Range - ) -> Consumed.Index? { - matchingConsumingBack(consumed, in: range)?.lowerBound - } -} - diff --git a/Sources/_StringProcessing/Algorithms/Matching/MatchingCollectionSearcher.swift b/Sources/_StringProcessing/Algorithms/Matching/MatchingCollectionSearcher.swift index 902d94591..b75f30c73 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/MatchingCollectionSearcher.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/MatchingCollectionSearcher.swift @@ -25,107 +25,3 @@ extension MatchingCollectionSearcher { matchingSearch(searched, &state)?.range } } - -protocol MatchingStatelessCollectionSearcher: - MatchingCollectionSearcher, StatelessCollectionSearcher -{ - func matchingSearch( - _ searched: Searched, - in range: Range - ) -> (range: Range, match: Match)? 
-} - -extension MatchingStatelessCollectionSearcher { - // for disambiguation between the `MatchingCollectionSearcher` and - // `StatelessCollectionSearcher` overloads - func search( - _ searched: Searched, - _ state: inout State - ) -> Range? { - matchingSearch(searched, &state)?.range - } - - func matchingSearch( - _ searched: Searched, - _ state: inout State - ) -> (range: Range, match: Match)? { - // TODO: deduplicate this logic with `StatelessCollectionSearcher`? - - guard - case .index(let index) = state.position, - let (range, value) = matchingSearch(searched, in: index.. - ) -> Range? { - matchingSearch(searched, in: range)?.range - } -} - -// MARK: Searching from the back - -protocol BackwardMatchingCollectionSearcher: BackwardCollectionSearcher { - associatedtype Match - func matchingSearchBack( - _ searched: BackwardSearched, - _ state: inout BackwardState - ) -> (range: Range, match: Match)? -} - -protocol BackwardMatchingStatelessCollectionSearcher: - BackwardMatchingCollectionSearcher, BackwardStatelessCollectionSearcher -{ - func matchingSearchBack( - _ searched: BackwardSearched, - in range: Range - ) -> (range: Range, match: Match)? -} - -extension BackwardMatchingStatelessCollectionSearcher { - func searchBack( - _ searched: BackwardSearched, - in range: Range - ) -> Range? { - matchingSearchBack(searched, in: range)?.range - } - - func matchingSearchBack( - _ searched: BackwardSearched, - _ state: inout BackwardState) -> (range: Range, match: Match)? - { - // TODO: deduplicate this logic with `StatelessBackwardCollectionSearcher`? - - guard - case .index(let index) = state.position, - let (range, value) = matchingSearchBack(searched, in: state.end..) -> BackwardState - func searchBack( - _ searched: BackwardSearched, - _ state: inout BackwardState - ) -> Range? 
-} - -protocol BackwardStatelessCollectionSearcher: BackwardCollectionSearcher - where BackwardState == DefaultSearcherState -{ - func searchBack( - _ searched: BackwardSearched, - in range: Range - ) -> Range? -} - -extension BackwardStatelessCollectionSearcher { - func backwardState( - for searched: BackwardSearched, - in range: Range - ) -> BackwardState { - BackwardState(position: .index(range.upperBound), end: range.lowerBound) - } - - func searchBack( - _ searched: BackwardSearched, - _ state: inout BackwardState) -> Range? { - guard - case .index(let index) = state.position, - let range = searchBack(searched, in: state.end.. { - let consumer: Consumer -} - -extension ConsumerSearcher: StatelessCollectionSearcher { - typealias Searched = Consumer.Consumed - - func search( - _ searched: Searched, - in range: Range - ) -> Range? { - var start = range.lowerBound - while true { - if let end = consumer.consuming(searched, in: start.. - ) -> Range? { - var end = range.upperBound - while true { - if let start = consumer.consumingBack( - searched, in: range.lowerBound.. - ) -> (range: Range, match: Consumer.Match)? { - var start = range.lowerBound - while true { - if let (end, value) = consumer.matchingConsuming( - searched, - in: start.. - ) -> (range: Range, match: Match)? { - var end = range.upperBound - while true { - if let (start, value) = consumer.matchingConsumingBack( - searched, in: range.lowerBound.. - where Searched.Element: Equatable, Pattern.Element == Searched.Element -{ - let pattern: Pattern -} - -extension NaivePatternSearcher: StatelessCollectionSearcher { - func search( - _ searched: Searched, - in range: Range - ) -> Range? { - var searchStart = range.lowerBound - - guard let patternFirst = pattern.first else { - return searchStart.. - ) -> Range? { - var searchEnd = range.upperBound - - guard let otherLastIndex = pattern.indices.last else { - return searchEnd.. { - let searcher: Searcher? 
-} - -extension PatternOrEmpty: CollectionSearcher { - typealias Searched = Searcher.Searched - - struct State { - enum Representation { - case state(Searcher.State) - case empty(index: Searched.Index, end: Searched.Index) - case emptyDone - } - - let representation: Representation - } - - func state( - for searched: Searcher.Searched, - in range: Range - ) -> State { - if let searcher = searcher { - return State( - representation: .state(searcher.state(for: searched, in: range))) - } else { - return State( - representation: .empty(index: range.lowerBound, end: range.upperBound)) - } - } - - func search( - _ searched: Searched, - _ state: inout State - ) -> Range? { - switch state.representation { - case .state(var s): - // TODO: Avoid a potential copy-on-write copy here - let result = searcher!.search(searched, &s) - state = State(representation: .state(s)) - return result - case .empty(let index, let end): - if index == end { - state = State(representation: .emptyDone) - } else { - state = State( - representation: .empty(index: searched.index(after: index), end: end)) - } - return index.. { - let predicate: (Searched.Element) -> Bool -} - -extension PredicateSearcher: StatelessCollectionSearcher { - func search( - _ searched: Searched, - in range: Range - ) -> Range? { - guard let index = searched[range].firstIndex(where: predicate) else { - return nil - } - return index.. - ) -> Range? { - guard let index = searched[range].lastIndex(where: predicate) else { - return nil - } - return index.. 
Executor { +) throws -> MEProgram { let ast = try parse(regex, syntax) let dsl: DSLTree @@ -104,7 +104,7 @@ func _compileRegex( dsl = ast.dslTree } let program = try Compiler(tree: dsl).emit() - return Executor(program: program) + return program } @_spi(RegexBenchmark) diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift deleted file mode 100644 index 6af973919..000000000 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ /dev/null @@ -1,58 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -var checkComments = true - -extension Engine { - func makeProcessor( - input: String, bounds: Range, matchMode: MatchMode - ) -> Processor { - Processor( - program: program, - input: input, - subjectBounds: bounds, - searchBounds: bounds, - matchMode: matchMode, - isTracingEnabled: enableTracing, - shouldMeasureMetrics: enableMetrics) - } - - func makeFirstMatchProcessor( - input: String, - subjectBounds: Range, - searchBounds: Range - ) -> Processor { - Processor( - program: program, - input: input, - subjectBounds: subjectBounds, - searchBounds: searchBounds, - matchMode: .partialFromFront, - isTracingEnabled: enableTracing, - shouldMeasureMetrics: enableMetrics) - } -} - -extension Processor { - // TODO: Should we throw here? - mutating func consume() -> Input.Index? 
{ - while true { - switch self.state { - case .accept: - return self.currentPosition - case .fail: - return nil - case .inProgress: self.cycle() - } - } - } -} - diff --git a/Sources/_StringProcessing/Engine/Engine.swift b/Sources/_StringProcessing/Engine/Engine.swift deleted file mode 100644 index a5cb11bd6..000000000 --- a/Sources/_StringProcessing/Engine/Engine.swift +++ /dev/null @@ -1,37 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -// Currently, engine binds the type and consume binds an instance. -// But, we can play around with this. -struct Engine { - - let program: MEProgram - - // TODO: Pre-allocated register banks - - var instructions: InstructionList { program.instructions } - - var enableTracing: Bool { program.enableTracing } - var enableMetrics: Bool { program.enableMetrics } - - init(_ program: MEProgram) { - self.program = program - } -} - -struct AsyncEngine { /* ... */ } - -extension Engine: CustomStringConvertible { - var description: String { - // TODO: better description - return program.description - } -} diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index d6b2cfe0c..fec198902 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -18,7 +18,7 @@ enum MatchMode { /// A concrete CU. 
Somehow will run the concrete logic and /// feed stuff back to generic code -struct Controller { +struct Controller: Equatable { var pc: InstructionAddress mutating func step() { @@ -48,6 +48,16 @@ struct Processor { /// `input.startIndex.. + let matchMode: MatchMode + let instructions: InstructionList + + // MARK: Update-only state + + var wordIndexCache: Set? = nil + var wordIndexMaxIndex: String.Index? = nil + + // MARK: Resettable state + /// The bounds within the subject for an individual search. /// /// `searchBounds` is equal to `subjectBounds` in some cases, but can be a @@ -57,12 +67,7 @@ struct Processor { /// Anchors like `^` and `.startOfSubject` use `subjectBounds` instead of /// `searchBounds`. The "start of matching" anchor `\G` uses `searchBounds` /// as its starting point. - let searchBounds: Range - - let matchMode: MatchMode - let instructions: InstructionList - - // MARK: Resettable state + var searchBounds: Range /// The current search position while processing. /// @@ -80,9 +85,6 @@ struct Processor { var storedCaptures: Array<_StoredCapture> - var wordIndexCache: Set? = nil - var wordIndexMaxIndex: String.Index? = nil - var state: State = .inProgress var failureReason: Error? 
= nil @@ -103,9 +105,7 @@ extension Processor { input: Input, subjectBounds: Range, searchBounds: Range, - matchMode: MatchMode, - isTracingEnabled: Bool, - shouldMeasureMetrics: Bool + matchMode: MatchMode ) { self.controller = Controller(pc: 0) self.instructions = program.instructions @@ -115,8 +115,8 @@ extension Processor { self.matchMode = matchMode self.metrics = ProcessorMetrics( - isTracingEnabled: isTracingEnabled, - shouldMeasureMetrics: shouldMeasureMetrics) + isTracingEnabled: program.enableTracing, + shouldMeasureMetrics: program.enableMetrics) self.currentPosition = searchBounds.lowerBound @@ -128,8 +128,12 @@ extension Processor { _checkInvariants() } - mutating func reset(currentPosition: Position) { + mutating func reset( + currentPosition: Position, + searchBounds: Range + ) { self.currentPosition = currentPosition + self.searchBounds = searchBounds self.controller = Controller(pc: 0) @@ -149,6 +153,22 @@ extension Processor { _checkInvariants() } + // Check that resettable state has been reset. Note that `reset()` + // takes a new current position and search bounds. 
+ func isReset() -> Bool { + _checkInvariants() + guard self.controller == Controller(pc: 0), + self.savePoints.isEmpty, + self.callStack.isEmpty, + self.storedCaptures.allSatisfy({ $0.range == nil }), + self.state == .inProgress, + self.failureReason == nil + else { + return false + } + return true + } + func _checkInvariants() { assert(searchBounds.lowerBound >= subjectBounds.lowerBound) assert(searchBounds.upperBound <= subjectBounds.upperBound) diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 5cf702514..f97ac9f74 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -11,93 +11,225 @@ internal import _RegexParser -struct Executor { - // TODO: consider let, for now lets us toggle tracing - var engine: Engine +/// `Executor` encapsulates the execution of the regex engine post-compilation. +/// It doesn't know anything about the `Regex` type or how to compile a regex. +@available(SwiftStdlib 5.7, *) +enum Executor { + static func prefixMatch( + _ program: MEProgram, + _ input: String, + subjectBounds: Range, + searchBounds: Range + ) throws -> Regex.Match? { + try Executor._run( + program, + input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + mode: .partialFromFront) + } - init(program: MEProgram) { - self.engine = Engine(program) + static func wholeMatch( + _ program: MEProgram, + _ input: String, + subjectBounds: Range, + searchBounds: Range + ) throws -> Regex.Match? { + try Executor._run( + program, + input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + mode: .wholeString) } - @available(SwiftStdlib 5.7, *) - func firstMatch( + static func firstMatch( + _ program: MEProgram, _ input: String, subjectBounds: Range, - searchBounds: Range, - graphemeSemantic: Bool + searchBounds: Range ) throws -> Regex.Match? 
{ - var cpu = engine.makeFirstMatchProcessor( + var cpu = Processor( + program: program, input: input, subjectBounds: subjectBounds, - searchBounds: searchBounds) -#if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } -#endif - var low = searchBounds.lowerBound - let high = searchBounds.upperBound + searchBounds: searchBounds, + matchMode: .partialFromFront) + return try Executor._firstMatch( + program, + using: &cpu) + } + + static func _firstMatch( + _ program: MEProgram, + using cpu: inout Processor + ) throws -> Regex.Match? { + let isGraphemeSemantic = program.initialOptions.semanticLevel == .graphemeCluster + + var low = cpu.searchBounds.lowerBound + let high = cpu.searchBounds.upperBound while true { - if let m: Regex.Match = try _match( - input, from: low, using: &cpu - ) { + if let m = try Executor._run(program, &cpu) { return m } - if low >= high { return nil } - if graphemeSemantic { - low = input.index( - low, offsetBy: 1, limitedBy: searchBounds.upperBound) ?? searchBounds.upperBound + // Fast-path for start-anchored regex + if program.canOnlyMatchAtStart { + return nil + } + if low == high { return nil } + if isGraphemeSemantic { + cpu.input.formIndex(after: &low) } else { - input.unicodeScalars.formIndex(after: &low) + cpu.input.unicodeScalars.formIndex(after: &low) + } + guard low <= high else { + return nil } - cpu.reset(currentPosition: low) + cpu.reset(currentPosition: low, searchBounds: cpu.searchBounds) } } - @available(SwiftStdlib 5.7, *) - func match( + static func allMatches( + _ program: MEProgram, _ input: String, - in subjectBounds: Range, - _ mode: MatchMode - ) throws -> Regex.Match? 
{ - var cpu = engine.makeProcessor( - input: input, bounds: subjectBounds, matchMode: mode) -#if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } -#endif - return try _match(input, from: subjectBounds.lowerBound, using: &cpu) + subjectBounds: Range, + searchBounds: Range + ) -> Matches { + fatalError() + } +} + +@available(SwiftStdlib 5.7, *) +extension Executor { + struct Matches: Sequence { + var program: MEProgram + var input: String + var subjectBounds: Range + var searchBounds: Range + + struct Iterator: IteratorProtocol { + var program: MEProgram + var processor: Processor + } + + func makeIterator() -> Iterator { + Iterator( + program: program, + processor: Processor( + program: program, + input: input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + matchMode: .partialFromFront)) + } + } +} + +@available(SwiftStdlib 5.7, *) +extension Executor.Matches.Iterator { + func nextSearchIndex( + after range: Range + ) -> String.Index? { + if !range.isEmpty { + return range.upperBound + } + + // If the last match was an empty match, advance by one position and + // run again, unless at the end of `input`. + guard range.lowerBound < processor.subjectBounds.upperBound else { + return nil + } + + switch program.initialOptions.semanticLevel { + case .graphemeCluster: + return processor.input.index(after: range.upperBound) + case .unicodeScalar: + return processor.input.unicodeScalars.index(after: range.upperBound) + } + } + + mutating func next() -> Regex.Match? { + guard let match = try? Executor._firstMatch( + program, using: &processor + ) else { + return nil + } + + // If there's more input to process, advance our position + // and search bounds. Otherwise, set to fail fast. 
+ if let currentPosition = nextSearchIndex(after: match.range) { + processor.reset( + currentPosition: currentPosition, + searchBounds: currentPosition..( +@available(SwiftStdlib 5.7, *) +extension Executor { + static func _run( + _ program: MEProgram, _ input: String, - from currentPosition: String.Index, - using cpu: inout Processor + subjectBounds: Range, + searchBounds: Range, + mode: MatchMode ) throws -> Regex.Match? { - // FIXME: currentPosition is already encapsulated in cpu, don't pass in - // FIXME: cpu.consume() should return the matched range, not the upper bound - guard let endIdx = cpu.consume() else { - if let e = cpu.failureReason { - throw e - } + var cpu = Processor( + program: program, + input: input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + matchMode: mode) + return try _run(program, &cpu) + } + + static func _run( + _ program: MEProgram, + _ cpu: inout Processor + ) throws -> Regex.Match? { + + let startPosition = cpu.currentPosition + guard let endIdx = try cpu.run() else { return nil } - let capList = MECaptureList( values: cpu.storedCaptures, - referencedCaptureOffsets: engine.program.referencedCaptureOffsets) + referencedCaptureOffsets: program.referencedCaptureOffsets) - let range = currentPosition.., - _ mode: MatchMode - ) throws -> Regex.Match? { - try match(input, in: subjectBounds, mode) +extension Processor { + fileprivate mutating func run() throws -> Input.Index? 
{ +#if PROCESSOR_MEASUREMENTS_ENABLED + defer { if self.metrics.shouldMeasureMetrics { self.printMetrics() } } +#endif + if self.state == .fail { + if let e = failureReason { + throw e + } + return nil + } + assert(isReset()) + while true { + switch self.state { + case .accept: + return self.currentPosition + case .fail: + if let e = failureReason { + throw e + } + return nil + case .inProgress: self.cycle() + } + } } } diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 0b0b2e797..26e7c130e 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -109,7 +109,12 @@ extension Regex { /// - Returns: The match, if this regex matches the entirety of `string`; /// otherwise, `nil`. public func wholeMatch(in string: String) throws -> Regex.Match? { - try _match(string, in: string.startIndex.. Regex.Match? { - try _match(string, in: string.startIndex.. Regex.Match? { - try _firstMatch(string, in: string.startIndex.. Regex.Match? { - try _match(string.base, in: string.startIndex.. Regex.Match? { - try _match(string.base, in: string.startIndex.. Regex.Match? { - try _firstMatch(string.base, in: string.startIndex.., - mode: MatchMode = .wholeString - ) throws -> Regex.Match? { - let executor = Executor(program: regex.program.loweredProgram) - return try executor.match(input, in: subjectBounds, mode) + let bounds = string.startIndex.., +// mode: MatchMode = .wholeString +// ) throws -> Regex.Match? { +// +// +// let executor = Executor(program: regex.program.loweredProgram) +// return try executor.match(input, in: subjectBounds, mode) +// } - func _firstMatch( - _ input: String, - in subjectBounds: Range - ) throws -> Regex.Match? { - try regex.program.loweredProgram.canOnlyMatchAtStart - ? 
_match(input, in: subjectBounds, mode: .partialFromFront) - : _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds) - } +// func _firstMatch( +// _ input: String, +// in subjectBounds: Range +// ) throws -> Regex.Match? { +// try Executor.firstMatch(self.program.loweredProgram, input, subjectBounds: subjectBounds, searchBounds: subjectBounds) +// } - func _firstMatch( - _ input: String, - subjectBounds: Range, - searchBounds: Range - ) throws -> Regex.Match? { - let executor = Executor(program: regex.program.loweredProgram) - let graphemeSemantic = regex.initialOptions.semanticLevel == .graphemeCluster - return try executor.firstMatch( - input, - subjectBounds: subjectBounds, - searchBounds: searchBounds, - graphemeSemantic: graphemeSemantic) - } +// func _firstMatch( +// _ input: String, +// subjectBounds: Range, +// searchBounds: Range +// ) throws -> Regex.Match? { +// let executor = Executor(program: regex.program.loweredProgram) +// let graphemeSemantic = regex.initialOptions.semanticLevel == .graphemeCluster +// return try executor.firstMatch( +// input, +// subjectBounds: subjectBounds, +// searchBounds: searchBounds, +// graphemeSemantic: graphemeSemantic) +// } } @available(SwiftStdlib 5.7, *) diff --git a/Tests/RegexTests/AlgorithmsInternalsTests.swift b/Tests/RegexTests/AlgorithmsInternalsTests.swift index af0007bbe..6a604e26f 100644 --- a/Tests/RegexTests/AlgorithmsInternalsTests.swift +++ b/Tests/RegexTests/AlgorithmsInternalsTests.swift @@ -54,21 +54,8 @@ extension AlgorithmTests { ] // Make sure we're getting the right collection type - let _: RegexMatchesCollection = matches + let _: RegexMatchesSequence = matches XCTAssertEqual(matches.map(\.output), expected) - - let i = matches.index(matches.startIndex, offsetBy: 3) - XCTAssertEqual(matches[i].output, expected[3]) - let j = matches.index(i, offsetBy: 5) - XCTAssertEqual(j, matches.endIndex) - - var index = matches.startIndex - while index < matches.endIndex { - 
XCTAssertEqual( - matches[index].output, - expected[matches.distance(from: matches.startIndex, to: index)]) - matches.formIndex(after: &index) - } } } diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 85aecd210..63ee266ec 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -128,11 +128,8 @@ extension StringCapture { // TODO: Move `flatCaptureTest`s over here too... -func compile(_ ast: AST) -> Executor { - let tree = ast.dslTree - let prog = try! Compiler(tree: tree).emit() - let executor = Executor(program: prog) - return executor +func compile(_ ast: AST) -> MEProgram { + try! Compiler(tree: ast.dslTree).emit() } func captureTest( @@ -184,8 +181,11 @@ func captureTest( for (input, output) in tests { let inputRange = input.startIndex...wholeMatch( + compile(ast), + input, + subjectBounds: inputRange, + searchBounds: inputRange ) else { XCTFail("No match", file: file, line: line) return diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index d0500847b..05212388d 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -154,7 +154,7 @@ extension RegexTests { ) throws { assert(!equivs.isEmpty) let progs = try equivs.map { - try _compileRegex($0).engine.program + try _compileRegex($0) } let ref = progs.first! for (prog, equiv) in zip(progs, equivs).dropFirst() { @@ -325,7 +325,7 @@ extension RegexTests { do { let prog = try _compileRegex(regex, syntax, semanticLevel) var found: Set = [] - for inst in prog.engine.instructions { + for inst in prog.instructions { let decoded = DecodedInstr.decode(inst) found.insert(decoded)