From 2fadbbed8e1d1e9a2e6c9aa411edfb867728f419 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 31 Jan 2024 15:11:29 -0600 Subject: [PATCH 01/13] Add a string-specific search algorithm This adds a Boyer-Moore substring search algorithm, and updates the `firstRange(of:)` and `ranges(of:)` methods to use that when both pieces of the search are strings/substrings. Still need to look at availability and switch the "replacing" methods to use this new search algorithm. --- .../Algorithms/Algorithms/FirstRange.swift | 50 ++++++++++-- .../Algorithms/Algorithms/Ranges.swift | 22 ++++- .../Algorithms/SubstringSearcher.swift | 81 +++++++++++++++++++ 3 files changed, 147 insertions(+), 6 deletions(-) create mode 100644 Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift index d0fb8673d..f2209a8fc 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift @@ -21,8 +21,25 @@ extension Collection { } // MARK: Fixed pattern algorithms +extension Substring { + @usableFromInline + func _firstRangeSubstring( + of other: Substring + ) -> Range? { + var searcher = SubstringSearcher(text: self, pattern: other) + return searcher.next() + } +} extension Collection where Element: Equatable { + @usableFromInline + func _firstRangeGeneric( + of other: C + ) -> Range? where C.Element == Element { + let searcher = ZSearcher(pattern: Array(other), by: ==) + return searcher.search(self[...], in: startIndex..( of other: C ) -> Range? where C.Element == Element { - // TODO: Use a more efficient search algorithm - let searcher = ZSearcher(pattern: Array(other), by: ==) - return searcher.search(self[...], in: startIndex..? + case (let str as Substring, let other as String): + return str._firstRangeSubstring(of: other[...]) as! Range? + case (let str as String, let other as Substring): + return str[...]._firstRangeSubstring(of: other) as! Range? + case (let str as Substring, let other as Substring): + return str._firstRangeSubstring(of: other) as! Range? + + default: + return _firstRangeGeneric(of: other) + } } } @@ -47,11 +75,23 @@ extension BidirectionalCollection where Element: Comparable { /// - Returns: A range in the collection of the first occurrence of `sequence`. /// Returns `nil` if `sequence` is not found. @available(SwiftStdlib 5.7, *) + @inline(__always) public func firstRange( of other: C ) -> Range? where C.Element == Element { - let searcher = ZSearcher(pattern: Array(other), by: ==) - return searcher.search(self[...], in: startIndex..? + case (let str as Substring, let other as String): + return str._firstRangeSubstring(of: other[...]) as! Range? + case (let str as String, let other as Substring): + return str[...]._firstRangeSubstring(of: other) as! Range? + case (let str as Substring, let other as Substring): + return str._firstRangeSubstring(of: other) as! Range? + + default: + return _firstRangeGeneric(of: other) + } } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift index 030ce1f64..e2ab4bbee 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift @@ -135,6 +135,13 @@ extension Collection where Element: Equatable { ) -> RangesCollection> where C.Element == Element { _ranges(of: ZSearcher(pattern: Array(other), by: ==)) } + + @usableFromInline + func _rangesGeneric( + of other: C + ) -> [Range] where C.Element == Element { + Array(_ranges(of: other)) + } // FIXME: Return `some Collection>` for SE-0346 /// Finds and returns the ranges of the all occurrences of a given sequence @@ -143,10 +150,23 @@ extension Collection where Element: Equatable { /// - Returns: A collection of ranges of all occurrences of `other`. Returns /// an empty collection if `other` is not found. @available(SwiftStdlib 5.7, *) + @inline(__always) public func ranges( of other: C ) -> [Range] where C.Element == Element { - Array(_ranges(of: other)) + switch (self, other) { + case (let str as String, let other as String): + return Array(SubstringSearcher(text: str[...], pattern: other[...])) as! [Range] + case (let str as Substring, let other as String): + return Array(SubstringSearcher(text: str, pattern: other[...])) as! [Range] + case (let str as String, let other as Substring): + return Array(SubstringSearcher(text: str[...], pattern: other)) as! [Range] + case (let str as Substring, let other as Substring): + return Array(SubstringSearcher(text: str, pattern: other)) as! [Range] + + default: + return _rangesGeneric(of: other) + } } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift new file mode 100644 index 000000000..2f44eee5b --- /dev/null +++ b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +/// An implementation of the Boyer-Moore algorithm, for string-specific +/// searching. +@usableFromInline +struct SubstringSearcher: Sequence, IteratorProtocol { + @usableFromInline + let text: Substring + @usableFromInline + let pattern: Substring + @usableFromInline + let badCharacterOffsets: [Character: Int] + @usableFromInline + let patternCount: Int + @usableFromInline + var endOfSearch: String.Index? + + @usableFromInline + init(text: Substring, pattern: Substring) { + self.text = text + self.pattern = pattern + self.patternCount = pattern.count + self.endOfSearch = text.index( + text.startIndex, offsetBy: patternCount, limitedBy: text.endIndex) + self.badCharacterOffsets = Dictionary( + zip(pattern, 0...), uniquingKeysWith: { _, last in last }) + } + + @inlinable + mutating func next() -> Range? { + while let end = endOfSearch { + // Empty pattern matches at every position. + if patternCount == 0 { + endOfSearch = end == text.endIndex ? nil : text.index(after: end) + return end..= pattern.startIndex + && pattern[patternCursor] == text[textCursor] + { + patternOffset -= 1 + + // Success! + if patternCursor == pattern.startIndex { + // Calculate the offset for the next search. + endOfSearch = text.index(end, offsetBy: patternCount, limitedBy: text.endIndex) + return textCursor.. text.startIndex) + text.formIndex(before: &textCursor) + pattern.formIndex(before: &patternCursor) + } + + // Match failed - calculate the end index of the next possible + // candidate, based on the `badCharacterOffsets` table and the + // current position in the pattern. + let shiftOffset = Swift.max( + 1, + patternOffset - (badCharacterOffsets[text[textCursor]] ?? 0)) + endOfSearch = text.index( + end, offsetBy: shiftOffset, limitedBy: text.endIndex) + } + return nil + } +} + From 714c2a0d018757c28adab210aa2cb238938538ed Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 15 Feb 2024 16:42:18 -0600 Subject: [PATCH 02/13] ... --- .../Algorithms/SubstringSearcher.swift | 170 ++++++++++++++---- 1 file changed, 131 insertions(+), 39 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift index 2f44eee5b..3a2b0d4a6 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -/// An implementation of the Boyer-Moore algorithm, for string-specific +/// An implementation of the Boyer-Moore-Horspool algorithm, for string-specific /// searching. @usableFromInline struct SubstringSearcher: Sequence, IteratorProtocol { @@ -22,60 +22,152 @@ struct SubstringSearcher: Sequence, IteratorProtocol { @usableFromInline let patternCount: Int @usableFromInline - var endOfSearch: String.Index? + var endOfNextPotentialMatch: String.Index? @usableFromInline init(text: Substring, pattern: Substring) { self.text = text self.pattern = pattern self.patternCount = pattern.count - self.endOfSearch = text.index( + self.endOfNextPotentialMatch = text.index( text.startIndex, offsetBy: patternCount, limitedBy: text.endIndex) self.badCharacterOffsets = Dictionary( zip(pattern, 0...), uniquingKeysWith: { _, last in last }) } @inlinable - mutating func next() -> Range? { - while let end = endOfSearch { - // Empty pattern matches at every position. - if patternCount == 0 { - endOfSearch = end == text.endIndex ? nil : text.index(after: end) - return end.. (result: Range?, nextEnd: String.Index?) + { + // Empty pattern matches at every position. + if patternCount == 0 { + return ( + end..= pattern.startIndex + && pattern[patternCursor] == text[textCursor] + { + patternOffset -= 1 - // Search backwards from `end` to the start of the pattern - while patternCursor >= pattern.startIndex - && pattern[patternCursor] == text[textCursor] - { - patternOffset -= 1 - - // Success! - if patternCursor == pattern.startIndex { - // Calculate the offset for the next search. - endOfSearch = text.index(end, offsetBy: patternCount, limitedBy: text.endIndex) - return textCursor.. text.startIndex) - text.formIndex(before: &textCursor) - pattern.formIndex(before: &patternCursor) + // Success! + if patternCursor == pattern.startIndex { + // Calculate the offset for the next search. + return ( + textCursor.. text.startIndex) + text.formIndex(before: &textCursor) + pattern.formIndex(before: &patternCursor) } - return nil + + // Match failed - calculate the end index of the next possible + // candidate, based on the `badCharacterOffsets` table and the + // current position in the pattern. + let shiftOffset = Swift.max( + 1, + patternOffset - (badCharacterOffsets[text[textCursor]] ?? 0)) + let nextEnd = text.index( + end, offsetBy: shiftOffset, limitedBy: text.endIndex) + guard let nextEnd else { return (nil, nil) } + return nextRange(searchFromEnd: nextEnd) + } + + @inlinable + mutating func next() -> Range? { + guard let end = endOfNextPotentialMatch else { return nil } + let (result, nextEnd) = nextRange(searchFromEnd: end) + endOfNextPotentialMatch = nextEnd + return result +// while let end = endOfSearch { +// // Empty pattern matches at every position. +// if patternCount == 0 { +// endOfSearch = end == text.endIndex ? nil : text.index(after: end) +// return end..= pattern.startIndex +// && pattern[patternCursor] == text[textCursor] +// { +// patternOffset -= 1 +// +// // Success! +// if patternCursor == pattern.startIndex { +// // Calculate the offset for the next search. +// endOfSearch = text.index(end, offsetBy: patternCount, limitedBy: text.endIndex) +// return textCursor.. text.startIndex) +// text.formIndex(before: &textCursor) +// pattern.formIndex(before: &patternCursor) +// } +// +// // Match failed - calculate the end index of the next possible +// // candidate, based on the `badCharacterOffsets` table and the +// // current position in the pattern. +// let shiftOffset = Swift.max( +// 1, +// patternOffset - (badCharacterOffsets[text[textCursor]] ?? 0)) +// endOfSearch = text.index( +// end, offsetBy: shiftOffset, limitedBy: text.endIndex) +// } +// return nil } } +extension SubstringSearcher { + struct Coll: Collection { + var iterator: SubstringSearcher + var startIndex: Index + + var endIndex: Index { Index() } + + init(iterator: SubstringSearcher) { + var iterator = iterator + self.startIndex = Index(range: iterator.next()) + self.iterator = iterator + } + + struct Index: Comparable { + var range: Range? + var endOfNextPotentialMatch: String.Index? + + static func < (lhs: Index, rhs: Index) -> Bool { + switch (lhs.range, rhs.range) { + case (nil, _): false + case (_, nil): true + case let (lhs?, rhs?): + lhs.lowerBound < rhs.lowerBound + } + } + } + + subscript(index: Index) -> Range { + index.range! + } + + func index(after index: Index) -> Index { + let (range, next) = iterator.nextRange( + searchFromEnd: index.endOfNextPotentialMatch!) + return Index(range: range, endOfNextPotentialMatch: next) + } + } + + var collection: Coll { + .init(iterator: self) + } +} From ded493c4403eae042fd091a63744e09accd1a1d4 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 15 Feb 2024 18:40:27 -0600 Subject: [PATCH 03/13] Add string-specific implementation for replacing --- .../Algorithms/Algorithms/Replace.swift | 79 +++++++++++++++---- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift index 5e6c97973..d5f8e6ea4 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift @@ -11,6 +11,34 @@ // MARK: `CollectionSearcher` algorithms +extension Substring { + func _replacingSubstring( + _ other: Substring, + with replacement: Substring, + maxReplacements: Int = .max + ) -> String { + precondition(maxReplacements >= 0) + + var result = String() + var index = startIndex + + // `maxRanges` is a workaround for https://github.com/apple/swift/issues/59522 + var rangeIterator = SubstringSearcher(text: self, pattern: other) + var replacementCount = 0 + while let range = rangeIterator.next() { + result.append(contentsOf: self[index..( _ ranges: Ranges, @@ -35,19 +63,6 @@ extension RangeReplaceableCollection { result.append(contentsOf: self[index...]) return result } - - mutating func _replace< - Ranges: Collection, Replacement: Collection - >( - _ ranges: Ranges, - with replacement: Replacement, - maxReplacements: Int = .max - ) where Ranges.Element == Range, Replacement.Element == Element { - self = _replacing( - ranges, - with: replacement, - maxReplacements: maxReplacements) - } } // MARK: Fixed pattern algorithms @@ -70,10 +85,40 @@ extension RangeReplaceableCollection where Element: Equatable { subrange: Range, maxReplacements: Int = .max ) -> Self where C.Element == Element, Replacement.Element == Element { - _replacing( - self[subrange]._ranges(of: other), - with: replacement, - maxReplacements: maxReplacements) + switch (self, other, replacement) { + case (let str as String, let other as String, let repl as String): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements) as! Self + case (let str as Substring, let other as Substring, let repl as Substring): + return str._replacingSubstring(other, with: repl, + maxReplacements: maxReplacements) as! Self + + case (let str as Substring, let other as String, let repl as String): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements) as! Self + case (let str as String, let other as Substring, let repl as String): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements) as! Self + case (let str as String, let other as String, let repl as Substring): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements) as! Self + + case (let str as String, let other as Substring, let repl as Substring): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements) as! Self + case (let str as Substring, let other as String, let repl as Substring): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements) as! Self + case (let str as Substring, let other as Substring, let repl as String): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements) as! Self + + default: + return _replacing( + self[subrange]._ranges(of: other), + with: replacement, + maxReplacements: maxReplacements) + } } /// Returns a new collection in which all occurrences of a target sequence From c06263aa2c6cb4768e5107ab2da7a7a039d3bad7 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 28 Feb 2024 11:37:27 -0600 Subject: [PATCH 04/13] Add CollectionSearcher conformance to new search --- .../Algorithms/Algorithms/Split.swift | 4 +- .../Algorithms/SubstringSearcher.swift | 149 ++++++------------ 2 files changed, 52 insertions(+), 101 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift index 5e7ba37b0..a85e6c247 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift @@ -175,7 +175,7 @@ extension StringProtocol where SubSequence == Substring { omittingEmptySubsequences: Bool = true ) -> [Substring] { Array(split( - by: ZSearcher(pattern: Array(separator), by: ==), + by: SubstringSearcher(text: "" as Self, pattern: separator[...]), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences)) } @@ -188,7 +188,7 @@ extension StringProtocol where SubSequence == Substring { omittingEmptySubsequences: Bool = true ) -> [Substring] { Array(split( - by: ZSearcher(pattern: Array(separator), by: ==), + by: SubstringSearcher(text: "" as Self, pattern: separator[...]), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences)) } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift index 3a2b0d4a6..026043fae 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift @@ -12,41 +12,61 @@ /// An implementation of the Boyer-Moore-Horspool algorithm, for string-specific /// searching. @usableFromInline -struct SubstringSearcher: Sequence, IteratorProtocol { +struct SubstringSearcher: Sequence, IteratorProtocol + where Searched.SubSequence == Substring +{ @usableFromInline - let text: Substring - @usableFromInline - let pattern: Substring + struct State { + @usableFromInline + let badCharacterOffsets: [Character: Int] + @usableFromInline + let patternCount: Int + @usableFromInline + var endOfNextPotentialMatch: String.Index? + + init(text: Substring? = nil, pattern: Substring) { + var offsets: [Character: Int] = [:] + var count = 0 + for (offset, ch) in pattern.enumerated() { + offsets[ch] = offset + count += 1 + } + + self.badCharacterOffsets = offsets + self.patternCount = count + if let text { + self.endOfNextPotentialMatch = text.index( + text.startIndex, offsetBy: patternCount, limitedBy: text.endIndex) + } + } + } + @usableFromInline - let badCharacterOffsets: [Character: Int] + let text: Searched @usableFromInline - let patternCount: Int + let pattern: Substring @usableFromInline - var endOfNextPotentialMatch: String.Index? + var state: State @usableFromInline - init(text: Substring, pattern: Substring) { + init(text: Searched, pattern: Substring) { self.text = text self.pattern = pattern - self.patternCount = pattern.count - self.endOfNextPotentialMatch = text.index( - text.startIndex, offsetBy: patternCount, limitedBy: text.endIndex) - self.badCharacterOffsets = Dictionary( - zip(pattern, 0...), uniquingKeysWith: { _, last in last }) + self.state = .init(text: text[...], pattern: pattern) } @inlinable - func nextRange(searchFromEnd end: String.Index) + func nextRange(in text: Searched, searchFromEnd end: String.Index) -> (result: Range?, nextEnd: String.Index?) { // Empty pattern matches at every position. - if patternCount == 0 { + if state.patternCount == 0 { return ( end.. text.startIndex) @@ -74,100 +94,31 @@ struct SubstringSearcher: Sequence, IteratorProtocol { // current position in the pattern. let shiftOffset = Swift.max( 1, - patternOffset - (badCharacterOffsets[text[textCursor]] ?? 0)) + patternOffset - (state.badCharacterOffsets[text[textCursor]] ?? 0)) let nextEnd = text.index( end, offsetBy: shiftOffset, limitedBy: text.endIndex) guard let nextEnd else { return (nil, nil) } - return nextRange(searchFromEnd: nextEnd) + return nextRange(in: text, searchFromEnd: nextEnd) } @inlinable mutating func next() -> Range? { - guard let end = endOfNextPotentialMatch else { return nil } - let (result, nextEnd) = nextRange(searchFromEnd: end) - endOfNextPotentialMatch = nextEnd + guard let end = state.endOfNextPotentialMatch else { return nil } + let (result, nextEnd) = nextRange(in: text, searchFromEnd: end) + state.endOfNextPotentialMatch = nextEnd return result -// while let end = endOfSearch { -// // Empty pattern matches at every position. -// if patternCount == 0 { -// endOfSearch = end == text.endIndex ? nil : text.index(after: end) -// return end..= pattern.startIndex -// && pattern[patternCursor] == text[textCursor] -// { -// patternOffset -= 1 -// -// // Success! -// if patternCursor == pattern.startIndex { -// // Calculate the offset for the next search. -// endOfSearch = text.index(end, offsetBy: patternCount, limitedBy: text.endIndex) -// return textCursor.. text.startIndex) -// text.formIndex(before: &textCursor) -// pattern.formIndex(before: &patternCursor) -// } -// -// // Match failed - calculate the end index of the next possible -// // candidate, based on the `badCharacterOffsets` table and the -// // current position in the pattern. -// let shiftOffset = Swift.max( -// 1, -// patternOffset - (badCharacterOffsets[text[textCursor]] ?? 0)) -// endOfSearch = text.index( -// end, offsetBy: shiftOffset, limitedBy: text.endIndex) -// } -// return nil } } -extension SubstringSearcher { - struct Coll: Collection { - var iterator: SubstringSearcher - var startIndex: Index - - var endIndex: Index { Index() } - - init(iterator: SubstringSearcher) { - var iterator = iterator - self.startIndex = Index(range: iterator.next()) - self.iterator = iterator - } - - struct Index: Comparable { - var range: Range? - var endOfNextPotentialMatch: String.Index? - - static func < (lhs: Index, rhs: Index) -> Bool { - switch (lhs.range, rhs.range) { - case (nil, _): false - case (_, nil): true - case let (lhs?, rhs?): - lhs.lowerBound < rhs.lowerBound - } - } - } - - subscript(index: Index) -> Range { - index.range! - } - - func index(after index: Index) -> Index { - let (range, next) = iterator.nextRange( - searchFromEnd: index.endOfNextPotentialMatch!) - return Index(range: range, endOfNextPotentialMatch: next) - } +extension SubstringSearcher: CollectionSearcher { + func state(for text: Searched, in range: Range) -> State { + State(text: text[range], pattern: pattern) } - var collection: Coll { - .init(iterator: self) + func search(_ text: Searched, _ state: inout State) -> Range? { + guard let end = state.endOfNextPotentialMatch else { return nil } + let (result, nextEnd) = nextRange(in: text, searchFromEnd: end) + state.endOfNextPotentialMatch = nextEnd + return result } } From aa38dea9f5f049f171e1738e8b61a6a85dd13711 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 28 Feb 2024 15:10:57 -0600 Subject: [PATCH 05/13] Substring search: iterative rather than recursive For large strings, a recursive search can run out of stack space. This eliminates the issue by looping within the `nextRange` function. --- .../Algorithms/SubstringSearcher.swift | 65 ++++++++++--------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift index 026043fae..428c8db7f 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift @@ -66,39 +66,44 @@ struct SubstringSearcher: Sequence, IteratorProtocol end == text.endIndex ? nil : text.index(after: end)) } - var patternOffset = state.patternCount - 1 - var patternCursor = pattern.index(before: pattern.endIndex) - var textCursor = text.index(before: end) - - // Search backwards from `end` to the start of the pattern - while patternCursor >= pattern.startIndex - && pattern[patternCursor] == text[textCursor] - { - patternOffset -= 1 - - // Success! - if patternCursor == pattern.startIndex { - // Calculate the offset for the next search. - return ( - textCursor..= pattern.startIndex + && pattern[patternCursor] == text[textCursor] + { + patternOffset -= 1 + + // Success! + if patternCursor == pattern.startIndex { + // Calculate the offset for the next search. + return ( + textCursor.. text.startIndex) + text.formIndex(before: &textCursor) + pattern.formIndex(before: &patternCursor) } - precondition(textCursor > text.startIndex) - text.formIndex(before: &textCursor) - pattern.formIndex(before: &patternCursor) + // Match failed - calculate the end index of the next possible + // candidate, based on the `badCharacterOffsets` table and the + // current position in the pattern. + let shiftOffset = Swift.max( + 1, + patternOffset - (state.badCharacterOffsets[text[textCursor]] ?? 0)) + if let nextEnd = text.index( + currentEnd, offsetBy: shiftOffset, limitedBy: text.endIndex) { + currentEnd = nextEnd + } else { + return (nil, nil) + } } - - // Match failed - calculate the end index of the next possible - // candidate, based on the `badCharacterOffsets` table and the - // current position in the pattern. - let shiftOffset = Swift.max( - 1, - patternOffset - (state.badCharacterOffsets[text[textCursor]] ?? 0)) - let nextEnd = text.index( - end, offsetBy: shiftOffset, limitedBy: text.endIndex) - guard let nextEnd else { return (nil, nil) } - return nextRange(in: text, searchFromEnd: nextEnd) } @inlinable From 649f4c52290895796a018b18c85ad50f5678c757 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 29 Feb 2024 14:48:32 -0600 Subject: [PATCH 06/13] Dispatch string splitting to new searcher --- .../Algorithms/Algorithms/Split.swift | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift index a85e6c247..3713019ac 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift @@ -148,10 +148,22 @@ extension Collection where Element: Equatable { maxSplits: Int = .max, omittingEmptySubsequences: Bool = true ) -> [SubSequence] where C.Element == Element { - Array(split( - by: ZSearcher(pattern: Array(separator), by: ==), - maxSplits: maxSplits, - omittingEmptySubsequences: omittingEmptySubsequences)) + switch (self, separator) { + case (let str as String, let sep as String): + return str._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] + case (let str as String, let sep as Substring): + return str._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] + case (let str as Substring, let sep as String): + return str._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] + case (let str as Substring, let sep as Substring): + return str._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] + + default: + return Array(split( + by: ZSearcher(pattern: Array(separator), by: ==), + maxSplits: maxSplits, + omittingEmptySubsequences: omittingEmptySubsequences)) + } } } From f57b5e4f006118745b3c0e1344aea400c15bfd63 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 29 Mar 2024 16:37:06 -0500 Subject: [PATCH 07/13] Remove generic on SubstringSearcher --- .../Algorithms/Algorithms/Split.swift | 12 ++++++------ .../Algorithms/Algorithms/SubstringSearcher.swift | 7 ++++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift index 3713019ac..da4eecc60 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift @@ -150,9 +150,9 @@ extension Collection where Element: Equatable { ) -> [SubSequence] where C.Element == Element { switch (self, separator) { case (let str as String, let sep as String): - return str._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] + return str[...]._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] case (let str as String, let sep as Substring): - return str._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] + return str[...]._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] case (let str as Substring, let sep as String): return str._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] case (let str as Substring, let sep as Substring): @@ -186,8 +186,8 @@ extension StringProtocol where SubSequence == Substring { maxSplits: Int = .max, omittingEmptySubsequences: Bool = true ) -> [Substring] { - Array(split( - by: SubstringSearcher(text: "" as Self, pattern: separator[...]), + Array(self[...].split( + by: SubstringSearcher(text: "" as Substring, pattern: separator[...]), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences)) } @@ -199,8 +199,8 @@ extension StringProtocol where SubSequence == Substring { maxSplits: Int = .max, omittingEmptySubsequences: Bool = true ) -> [Substring] { - Array(split( - by: SubstringSearcher(text: "" as Self, pattern: separator[...]), + Array(self[...].split( + by: SubstringSearcher(text: "" as Substring, pattern: separator[...]), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences)) } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift index 428c8db7f..01586fd7c 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift @@ -12,9 +12,7 @@ /// An implementation of the Boyer-Moore-Horspool algorithm, for string-specific /// searching. @usableFromInline -struct SubstringSearcher: Sequence, IteratorProtocol - where Searched.SubSequence == Substring -{ +struct SubstringSearcher: Sequence, IteratorProtocol { @usableFromInline struct State { @usableFromInline @@ -41,6 +39,9 @@ struct SubstringSearcher: Sequence, IteratorProtocol } } + @usableFromInline + typealias Searched = Substring + @usableFromInline let text: Searched @usableFromInline From 474601fd0a8915dcb959e45c0c3f663020fdb833 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 19 Apr 2024 13:26:36 -0500 Subject: [PATCH 08/13] Remove unnecessary inlining annotations --- .../Algorithms/Algorithms/FirstRange.swift | 4 ---- .../Algorithms/Algorithms/Ranges.swift | 10 +--------- .../Algorithms/Algorithms/SubstringSearcher.swift | 12 ------------ 3 files changed, 1 insertion(+), 25 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift index f2209a8fc..2ff63fea3 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift @@ -22,7 +22,6 @@ extension Collection { // MARK: Fixed pattern algorithms extension Substring { - @usableFromInline func _firstRangeSubstring( of other: Substring ) -> Range? { @@ -32,7 +31,6 @@ extension Substring { } extension Collection where Element: Equatable { - @usableFromInline func _firstRangeGeneric( of other: C ) -> Range? where C.Element == Element { @@ -47,7 +45,6 @@ extension Collection where Element: Equatable { /// - Returns: A range in the collection of the first occurrence of `sequence`. /// Returns nil if `sequence` is not found. @available(SwiftStdlib 5.7, *) - @inline(__always) public func firstRange( of other: C ) -> Range? where C.Element == Element { @@ -75,7 +72,6 @@ extension BidirectionalCollection where Element: Comparable { /// - Returns: A range in the collection of the first occurrence of `sequence`. /// Returns `nil` if `sequence` is not found. @available(SwiftStdlib 5.7, *) - @inline(__always) public func firstRange( of other: C ) -> Range? where C.Element == Element { diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift index e2ab4bbee..3f9b8d49a 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift @@ -136,13 +136,6 @@ extension Collection where Element: Equatable { _ranges(of: ZSearcher(pattern: Array(other), by: ==)) } - @usableFromInline - func _rangesGeneric( - of other: C - ) -> [Range] where C.Element == Element { - Array(_ranges(of: other)) - } - // FIXME: Return `some Collection>` for SE-0346 /// Finds and returns the ranges of the all occurrences of a given sequence /// within the collection. @@ -150,7 +143,6 @@ extension Collection where Element: Equatable { /// - Returns: A collection of ranges of all occurrences of `other`. Returns /// an empty collection if `other` is not found. @available(SwiftStdlib 5.7, *) - @inline(__always) public func ranges( of other: C ) -> [Range] where C.Element == Element { @@ -165,7 +157,7 @@ extension Collection where Element: Equatable { return Array(SubstringSearcher(text: str, pattern: other)) as! [Range] default: - return _rangesGeneric(of: other) + return Array(_ranges(of: other)) } } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift index 01586fd7c..d7c43c33b 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift @@ -11,15 +11,10 @@ /// An implementation of the Boyer-Moore-Horspool algorithm, for string-specific /// searching. -@usableFromInline struct SubstringSearcher: Sequence, IteratorProtocol { - @usableFromInline struct State { - @usableFromInline let badCharacterOffsets: [Character: Int] - @usableFromInline let patternCount: Int - @usableFromInline var endOfNextPotentialMatch: String.Index? init(text: Substring? = nil, pattern: Substring) { @@ -39,24 +34,18 @@ struct SubstringSearcher: Sequence, IteratorProtocol { } } - @usableFromInline typealias Searched = Substring - @usableFromInline let text: Searched - @usableFromInline let pattern: Substring - @usableFromInline var state: State - @usableFromInline init(text: Searched, pattern: Substring) { self.text = text self.pattern = pattern self.state = .init(text: text[...], pattern: pattern) } - @inlinable func nextRange(in text: Searched, searchFromEnd end: String.Index) -> (result: Range?, nextEnd: String.Index?) { @@ -107,7 +96,6 @@ struct SubstringSearcher: Sequence, IteratorProtocol { } } - @inlinable mutating func next() -> Range? { guard let end = state.endOfNextPotentialMatch else { return nil } let (result, nextEnd) = nextRange(in: text, searchFromEnd: end) From b1ebf7455831b0771b3a72f3183c0db8cf68ea0e Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 19 Apr 2024 13:26:45 -0500 Subject: [PATCH 09/13] Update string algorithms tests --- Tests/RegexTests/AlgorithmsTests.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index 60548a0a2..6122275ea 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -64,6 +64,8 @@ class AlgorithmTests: XCTestCase { XCTAssertTrue("abcde".contains("bcde")) XCTAssertTrue("abcde".contains("bcd")) XCTAssertTrue("ababacabababa".contains("abababa")) + XCTAssertTrue("bbababacabababa".contains("abababa")) + XCTAssertFalse("bbababacbbababa".contains("abababa")) XCTAssertFalse("".contains("abcd")) @@ -340,7 +342,7 @@ class AlgorithmTests: XCTestCase { XCTAssertEqual(stringActual, expected, """ Mismatch in string split of '\(str)', maxSplits: \(maxSplits), omitEmpty: \(omitEmpty) expected: \(expected.map(String.init)) - actual: \(regexActual.map(String.init)) + actual: \(stringActual.map(String.init)) """) } } From 8ff1237ab59498698cafeddf47d2e28e3865973e Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 19 Apr 2024 13:43:15 -0500 Subject: [PATCH 10/13] Verify string/substring dispatch in algorithms --- .../Algorithms/Algorithms/Replace.swift | 8 ++--- Tests/RegexTests/AlgorithmsTests.swift | 36 +++++++++++++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift index d5f8e6ea4..0a7064dcd 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift @@ -91,11 +91,11 @@ extension RangeReplaceableCollection where Element: Equatable { maxReplacements: maxReplacements) as! Self case (let str as Substring, let other as Substring, let repl as Substring): return str._replacingSubstring(other, with: repl, - maxReplacements: maxReplacements) as! Self + maxReplacements: maxReplacements)[...] as! Self case (let str as Substring, let other as String, let repl as String): return str[...]._replacingSubstring(other[...], with: repl[...], - maxReplacements: maxReplacements) as! Self + maxReplacements: maxReplacements)[...] as! Self case (let str as String, let other as Substring, let repl as String): return str[...]._replacingSubstring(other[...], with: repl[...], maxReplacements: maxReplacements) as! Self @@ -108,10 +108,10 @@ extension RangeReplaceableCollection where Element: Equatable { maxReplacements: maxReplacements) as! Self case (let str as Substring, let other as String, let repl as Substring): return str[...]._replacingSubstring(other[...], with: repl[...], - maxReplacements: maxReplacements) as! Self + maxReplacements: maxReplacements)[...] as! Self case (let str as Substring, let other as Substring, let repl as String): return str[...]._replacingSubstring(other[...], with: repl[...], - maxReplacements: maxReplacements) as! Self + maxReplacements: maxReplacements)[...] as! Self default: return _replacing( diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index 6122275ea..234e8ae13 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -67,6 +67,12 @@ class AlgorithmTests: XCTestCase { XCTAssertTrue("bbababacabababa".contains("abababa")) XCTAssertFalse("bbababacbbababa".contains("abababa")) + let str = "abcde" + let pattern = "bcde" + XCTAssertTrue(str[...].contains(pattern)) + XCTAssertTrue(str.contains(pattern[...])) + XCTAssertTrue(str[...].contains(pattern[...])) + XCTAssertFalse("".contains("abcd")) for start in 0..<9 { @@ -163,6 +169,13 @@ class AlgorithmTests: XCTestCase { let actualSeq: [Range] = input.ranges(of: pattern).map(input.offsets(of:)) XCTAssertEqual(actualSeq, expected, file: file, line: line) + let a1: [Range] = input[...].ranges(of: pattern).map(input.offsets(of:)) + let a2: [Range] = input.ranges(of: pattern[...]).map(input.offsets(of:)) + let a3: [Range] = input[...].ranges(of: pattern[...]).map(input.offsets(of:)) + XCTAssertEqual(a1, expected, file: file, line: line) + XCTAssertEqual(a2, expected, file: file, line: line) + XCTAssertEqual(a3, expected, file: file, line: line) + // `IndexingIterator` tests the collection conformance let actualCol: [Range] = input.ranges(of: pattern)[...].map(input.offsets(of:)) XCTAssertEqual(actualCol, expected, file: file, line: line) @@ -173,6 +186,13 @@ class AlgorithmTests: XCTestCase { let secondRange = input[upperBound...].firstRange(of: pattern).map(input.offsets(of:)) XCTAssertEqual(secondRange, expected.dropFirst().first, file: file, line: line) } + + let r1 = input[...].firstRange(of: pattern) + let r2 = input.firstRange(of: pattern[...]) + let r3 = input[...].firstRange(of: pattern[...]) + XCTAssertEqual(r1.map(input.offsets(of:)), expected.first, file: file, line: line) + XCTAssertEqual(r2.map(input.offsets(of:)), expected.first, file: file, line: line) + XCTAssertEqual(r3.map(input.offsets(of:)), expected.first, file: file, line: line) } expectRanges("", "", [0..<0]) @@ -227,6 +247,13 @@ class AlgorithmTests: XCTestCase { ) { let actual = Array(string.split(separator: separator, omittingEmptySubsequences: false)) XCTAssertEqual(actual, expected, file: file, line: line) + + let a1 = Array(string[...].split(separator: separator, omittingEmptySubsequences: false)) + let a2 = Array(string.split(separator: separator[...], omittingEmptySubsequences: false)) + let a3 = Array(string[...].split(separator: separator[...], omittingEmptySubsequences: false)) + XCTAssertEqual(a1, expected, file: file, line: line) + XCTAssertEqual(a2, expected, file: file, line: line) + XCTAssertEqual(a3, expected, file: file, line: line) } expectSplit("", "", [""]) @@ -507,6 +534,15 @@ class AlgorithmTests: XCTestCase { XCTAssertEqual(s1.replacing(regex, with: ""), " | ") XCTAssertEqual(s2.replacing(regex, with: ""), "") + XCTAssertEqual(s.replacing("aa", with: "Z"), "Za | ZZZ | ZZZZZ") + XCTAssertEqual(s.replacing("aa" as Substring, with: "Z"), "Za | ZZZ | ZZZZZ") + XCTAssertEqual(s.replacing("aa", with: "Z" as Substring), "Za | ZZZ | ZZZZZ") + XCTAssertEqual(s.replacing("aa" as Substring, with: "Z" as Substring), "Za | ZZZ | ZZZZZ") + XCTAssertEqual(s1.replacing("aa", with: "Z"), "ZZZ | ZZZZZ") + XCTAssertEqual(s1.replacing("aa", with: "Z" as Substring), "ZZZ | ZZZZZ") + XCTAssertEqual(s1.replacing("aa" as Substring, with: "Z"), "ZZZ | ZZZZZ") + XCTAssertEqual(s1.replacing("aa" as Substring, with: "Z" as Substring), "ZZZ | ZZZZZ") + XCTAssertEqual( s.matches(of: regex).map(\.0), ["aaa", "aaaaaa", "aaaaaaaaaa"]) From 0e01cf42ad4dbc46ec6dd75bc6bff8361a6019a2 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 19 Apr 2024 14:44:43 -0500 Subject: [PATCH 11/13] Add tests for string.replacing maxReplacements --- .../Algorithms/Algorithms/Replace.swift | 4 +--- Tests/RegexTests/AlgorithmsTests.swift | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift index 0a7064dcd..54dd0f864 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift @@ -22,16 +22,14 @@ extension Substring { var result = String() var index = startIndex - // `maxRanges` is a workaround for https://github.com/apple/swift/issues/59522 var rangeIterator = SubstringSearcher(text: self, pattern: other) var replacementCount = 0 - while let range = rangeIterator.next() { + while replacementCount < maxReplacements, let range = rangeIterator.next() { result.append(contentsOf: self[index.. Date: Mon, 22 Apr 2024 12:50:31 -0500 Subject: [PATCH 12/13] Add fallback to naive search for small patterns --- .../Algorithms/SubstringSearcher.swift | 61 +++++++++++++++++-- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift index d7c43c33b..127192635 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift @@ -17,16 +17,28 @@ struct SubstringSearcher: Sequence, IteratorProtocol { let patternCount: Int var endOfNextPotentialMatch: String.Index? + /// Require this length to calculate and use bad character offsets instead of just + // using a simple naive search. + static let patternCountMinimum = 4 + init(text: Substring? = nil, pattern: Substring) { var offsets: [Character: Int] = [:] var count = 0 - for (offset, ch) in pattern.enumerated() { - offsets[ch] = offset - count += 1 + let useBadCharacterOffsets = + pattern.index(pattern.startIndex, offsetBy: Self.patternCountMinimum, limitedBy: pattern.endIndex) + != nil + if useBadCharacterOffsets { + for (offset, ch) in pattern.enumerated() { + offsets[ch] = offset + count += 1 + } + self.badCharacterOffsets = offsets + self.patternCount = count + } else { + self.badCharacterOffsets = [:] + self.patternCount = pattern.count } - self.badCharacterOffsets = offsets - self.patternCount = count if let text { self.endOfNextPotentialMatch = text.index( text.startIndex, offsetBy: patternCount, limitedBy: text.endIndex) @@ -46,15 +58,52 @@ struct SubstringSearcher: Sequence, IteratorProtocol { self.state = .init(text: text[...], pattern: pattern) } - func nextRange(in text: Searched, searchFromEnd end: String.Index) + func nextRangeNaive(in text: Searched, searchFromEnd end: String.Index) -> (result: Range?, nextEnd: String.Index?) { + precondition(state.patternCount > 0) + let patternLastIndex = pattern.index(before: pattern.endIndex) + var textLastIndex = text.index(before: end) + + FindLastCharacterMatch: + while let potentialMatchEnd = text[textLastIndex...].firstIndex(of: pattern[patternLastIndex]) { + var textCursor = potentialMatchEnd + var patternCursor = patternLastIndex + precondition(textCursor >= text.startIndex) + while patternCursor > pattern.startIndex { + pattern.formIndex(before: &patternCursor) + text.formIndex(before: &textCursor) + + guard pattern[patternCursor] == text[textCursor] else { + textLastIndex = text.index(after: potentialMatchEnd) + continue FindLastCharacterMatch + } + } + + // It's a match! + let currentEnd = text.index(after: potentialMatchEnd) + return ( + textCursor.. (result: Range?, nextEnd: String.Index?) + { // Empty pattern matches at every position. if state.patternCount == 0 { return ( end.. Date: Wed, 24 Apr 2024 09:38:57 -0500 Subject: [PATCH 13/13] Improve some comments/formatting in the string search --- .../Algorithms/SubstringSearcher.swift | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift index 127192635..bcdf20812 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift @@ -17,21 +17,22 @@ struct SubstringSearcher: Sequence, IteratorProtocol { let patternCount: Int var endOfNextPotentialMatch: String.Index? - /// Require this length to calculate and use bad character offsets instead of just - // using a simple naive search. + /// The minimum pattern length for using bad character offsets + /// (aka Boyer-Moore) instead of just a simple naive search. static let patternCountMinimum = 4 init(text: Substring? = nil, pattern: Substring) { - var offsets: [Character: Int] = [:] - var count = 0 - let useBadCharacterOffsets = - pattern.index(pattern.startIndex, offsetBy: Self.patternCountMinimum, limitedBy: pattern.endIndex) - != nil - if useBadCharacterOffsets { - for (offset, ch) in pattern.enumerated() { - offsets[ch] = offset - count += 1 - } + let useBadCharacterOffsets = + pattern.prefix(Self.patternCountMinimum).count == Self.patternCountMinimum + + if useBadCharacterOffsets { + var offsets: [Character: Int] = [:] + var count = 0 + for (offset, ch) in pattern.enumerated() { + offsets[ch] = offset + count += 1 + } + self.badCharacterOffsets = offsets self.patternCount = count } else { @@ -44,10 +45,12 @@ struct SubstringSearcher: Sequence, IteratorProtocol { text.startIndex, offsetBy: patternCount, limitedBy: text.endIndex) } } + + var shouldUseNaiveSearch: Bool { + badCharacterOffsets.isEmpty + } } - typealias Searched = Substring - let text: Searched let pattern: Substring var state: State @@ -58,6 +61,8 @@ struct SubstringSearcher: Sequence, IteratorProtocol { self.state = .init(text: text[...], pattern: pattern) } + /// Finds and returns the range of the next matching substring, along + /// with the end index of the next possible match, using a naive approach. func nextRangeNaive(in text: Searched, searchFromEnd end: String.Index) -> (result: Range?, nextEnd: String.Index?) { @@ -69,6 +74,7 @@ struct SubstringSearcher: Sequence, IteratorProtocol { while let potentialMatchEnd = text[textLastIndex...].firstIndex(of: pattern[patternLastIndex]) { var textCursor = potentialMatchEnd var patternCursor = patternLastIndex + precondition(textCursor >= text.startIndex) while patternCursor > pattern.startIndex { pattern.formIndex(before: &patternCursor) @@ -90,6 +96,8 @@ struct SubstringSearcher: Sequence, IteratorProtocol { return (nil, nil) } + /// Finds and returns the range of the next matching substring, along + /// with the end index of the next possible match. func nextRange(in text: Searched, searchFromEnd end: String.Index) -> (result: Range?, nextEnd: String.Index?) { @@ -100,8 +108,8 @@ struct SubstringSearcher: Sequence, IteratorProtocol { end == text.endIndex ? nil : text.index(after: end)) } - // Fall back to naive search if we didn't calculate bad character offsets - if state.badCharacterOffsets.isEmpty { + // We fall back to the naive search if `pattern` is small. + if state.shouldUseNaiveSearch { return nextRangeNaive(in: text, searchFromEnd: end) } @@ -154,6 +162,8 @@ struct SubstringSearcher: Sequence, IteratorProtocol { } extension SubstringSearcher: CollectionSearcher { + typealias Searched = Substring + func state(for text: Searched, in range: Range) -> State { State(text: text[range], pattern: pattern) }