diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift index d0fb8673d..2ff63fea3 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift @@ -21,8 +21,23 @@ extension Collection { } // MARK: Fixed pattern algorithms +extension Substring { + func _firstRangeSubstring( + of other: Substring + ) -> Range? { + var searcher = SubstringSearcher(text: self, pattern: other) + return searcher.next() + } +} extension Collection where Element: Equatable { + func _firstRangeGeneric( + of other: C + ) -> Range? where C.Element == Element { + let searcher = ZSearcher(pattern: Array(other), by: ==) + return searcher.search(self[...], in: startIndex..( of other: C ) -> Range? where C.Element == Element { - // TODO: Use a more efficient search algorithm - let searcher = ZSearcher(pattern: Array(other), by: ==) - return searcher.search(self[...], in: startIndex..? + case (let str as Substring, let other as String): + return str._firstRangeSubstring(of: other[...]) as! Range? + case (let str as String, let other as Substring): + return str[...]._firstRangeSubstring(of: other) as! Range? + case (let str as Substring, let other as Substring): + return str._firstRangeSubstring(of: other) as! Range? + + default: + return _firstRangeGeneric(of: other) + } } } @@ -50,8 +75,19 @@ extension BidirectionalCollection where Element: Comparable { public func firstRange( of other: C ) -> Range? where C.Element == Element { - let searcher = ZSearcher(pattern: Array(other), by: ==) - return searcher.search(self[...], in: startIndex..? + case (let str as Substring, let other as String): + return str._firstRangeSubstring(of: other[...]) as! Range? + case (let str as String, let other as Substring): + return str[...]._firstRangeSubstring(of: other) as! Range? + case (let str as Substring, let other as Substring): + return str._firstRangeSubstring(of: other) as! Range? + + default: + return _firstRangeGeneric(of: other) + } } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift index 030ce1f64..3f9b8d49a 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift @@ -135,7 +135,7 @@ extension Collection where Element: Equatable { ) -> RangesCollection> where C.Element == Element { _ranges(of: ZSearcher(pattern: Array(other), by: ==)) } - + // FIXME: Return `some Collection>` for SE-0346 /// Finds and returns the ranges of the all occurrences of a given sequence /// within the collection. @@ -146,7 +146,19 @@ extension Collection where Element: Equatable { public func ranges( of other: C ) -> [Range] where C.Element == Element { - Array(_ranges(of: other)) + switch (self, other) { + case (let str as String, let other as String): + return Array(SubstringSearcher(text: str[...], pattern: other[...])) as! [Range] + case (let str as Substring, let other as String): + return Array(SubstringSearcher(text: str, pattern: other[...])) as! [Range] + case (let str as String, let other as Substring): + return Array(SubstringSearcher(text: str[...], pattern: other)) as! [Range] + case (let str as Substring, let other as Substring): + return Array(SubstringSearcher(text: str, pattern: other)) as! [Range] + + default: + return Array(_ranges(of: other)) + } } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift index 5e6c97973..54dd0f864 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift @@ -11,6 +11,32 @@ // MARK: `CollectionSearcher` algorithms +extension Substring { + func _replacingSubstring( + _ other: Substring, + with replacement: Substring, + maxReplacements: Int = .max + ) -> String { + precondition(maxReplacements >= 0) + + var result = String() + var index = startIndex + + var rangeIterator = SubstringSearcher(text: self, pattern: other) + var replacementCount = 0 + while replacementCount < maxReplacements, let range = rangeIterator.next() { + result.append(contentsOf: self[index..( _ ranges: Ranges, @@ -35,19 +61,6 @@ extension RangeReplaceableCollection { result.append(contentsOf: self[index...]) return result } - - mutating func _replace< - Ranges: Collection, Replacement: Collection - >( - _ ranges: Ranges, - with replacement: Replacement, - maxReplacements: Int = .max - ) where Ranges.Element == Range, Replacement.Element == Element { - self = _replacing( - ranges, - with: replacement, - maxReplacements: maxReplacements) - } } // MARK: Fixed pattern algorithms @@ -70,10 +83,40 @@ extension RangeReplaceableCollection where Element: Equatable { subrange: Range, maxReplacements: Int = .max ) -> Self where C.Element == Element, Replacement.Element == Element { - _replacing( - self[subrange]._ranges(of: other), - with: replacement, - maxReplacements: maxReplacements) + switch (self, other, replacement) { + case (let str as String, let other as String, let repl as String): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements) as! Self + case (let str as Substring, let other as Substring, let repl as Substring): + return str._replacingSubstring(other, with: repl, + maxReplacements: maxReplacements)[...] as! Self + + case (let str as Substring, let other as String, let repl as String): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements)[...] as! Self + case (let str as String, let other as Substring, let repl as String): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements) as! Self + case (let str as String, let other as String, let repl as Substring): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements) as! Self + + case (let str as String, let other as Substring, let repl as Substring): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements) as! Self + case (let str as Substring, let other as String, let repl as Substring): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements)[...] as! Self + case (let str as Substring, let other as Substring, let repl as String): + return str[...]._replacingSubstring(other[...], with: repl[...], + maxReplacements: maxReplacements)[...] as! Self + + default: + return _replacing( + self[subrange]._ranges(of: other), + with: replacement, + maxReplacements: maxReplacements) + } } /// Returns a new collection in which all occurrences of a target sequence diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift index 5e7ba37b0..da4eecc60 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift @@ -148,10 +148,22 @@ extension Collection where Element: Equatable { maxSplits: Int = .max, omittingEmptySubsequences: Bool = true ) -> [SubSequence] where C.Element == Element { - Array(split( - by: ZSearcher(pattern: Array(separator), by: ==), - maxSplits: maxSplits, - omittingEmptySubsequences: omittingEmptySubsequences)) + switch (self, separator) { + case (let str as String, let sep as String): + return str[...]._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] + case (let str as String, let sep as Substring): + return str[...]._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] + case (let str as Substring, let sep as String): + return str._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] + case (let str as Substring, let sep as Substring): + return str._split(separator: sep, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) as! [SubSequence] + + default: + return Array(split( + by: ZSearcher(pattern: Array(separator), by: ==), + maxSplits: maxSplits, + omittingEmptySubsequences: omittingEmptySubsequences)) + } } } @@ -174,8 +186,8 @@ extension StringProtocol where SubSequence == Substring { maxSplits: Int = .max, omittingEmptySubsequences: Bool = true ) -> [Substring] { - Array(split( - by: ZSearcher(pattern: Array(separator), by: ==), + Array(self[...].split( + by: SubstringSearcher(text: "" as Substring, pattern: separator[...]), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences)) } @@ -187,8 +199,8 @@ extension StringProtocol where SubSequence == Substring { maxSplits: Int = .max, omittingEmptySubsequences: Bool = true ) -> [Substring] { - Array(split( - by: ZSearcher(pattern: Array(separator), by: ==), + Array(self[...].split( + by: SubstringSearcher(text: "" as Substring, pattern: separator[...]), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences)) } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift new file mode 100644 index 000000000..bcdf20812 --- /dev/null +++ b/Sources/_StringProcessing/Algorithms/Algorithms/SubstringSearcher.swift @@ -0,0 +1,177 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +/// An implementation of the Boyer-Moore-Horspool algorithm, for string-specific +/// searching. +struct SubstringSearcher: Sequence, IteratorProtocol { + struct State { + let badCharacterOffsets: [Character: Int] + let patternCount: Int + var endOfNextPotentialMatch: String.Index? + + /// The minimum pattern length for using bad character offsets + /// (aka Boyer-Moore) instead of just a simple naive search. + static let patternCountMinimum = 4 + + init(text: Substring? = nil, pattern: Substring) { + let useBadCharacterOffsets = + pattern.prefix(Self.patternCountMinimum).count == Self.patternCountMinimum + + if useBadCharacterOffsets { + var offsets: [Character: Int] = [:] + var count = 0 + for (offset, ch) in pattern.enumerated() { + offsets[ch] = offset + count += 1 + } + + self.badCharacterOffsets = offsets + self.patternCount = count + } else { + self.badCharacterOffsets = [:] + self.patternCount = pattern.count + } + + if let text { + self.endOfNextPotentialMatch = text.index( + text.startIndex, offsetBy: patternCount, limitedBy: text.endIndex) + } + } + + var shouldUseNaiveSearch: Bool { + badCharacterOffsets.isEmpty + } + } + + let text: Searched + let pattern: Substring + var state: State + + init(text: Searched, pattern: Substring) { + self.text = text + self.pattern = pattern + self.state = .init(text: text[...], pattern: pattern) + } + + /// Finds and returns the range of the next matching substring, along + /// with the end index of the next possible match, using a naive approach. + func nextRangeNaive(in text: Searched, searchFromEnd end: String.Index) + -> (result: Range?, nextEnd: String.Index?) + { + precondition(state.patternCount > 0) + let patternLastIndex = pattern.index(before: pattern.endIndex) + var textLastIndex = text.index(before: end) + + FindLastCharacterMatch: + while let potentialMatchEnd = text[textLastIndex...].firstIndex(of: pattern[patternLastIndex]) { + var textCursor = potentialMatchEnd + var patternCursor = patternLastIndex + + precondition(textCursor >= text.startIndex) + while patternCursor > pattern.startIndex { + pattern.formIndex(before: &patternCursor) + text.formIndex(before: &textCursor) + + guard pattern[patternCursor] == text[textCursor] else { + textLastIndex = text.index(after: potentialMatchEnd) + continue FindLastCharacterMatch + } + } + + // It's a match! + let currentEnd = text.index(after: potentialMatchEnd) + return ( + textCursor.. (result: Range?, nextEnd: String.Index?) + { + // Empty pattern matches at every position. + if state.patternCount == 0 { + return ( + end..= pattern.startIndex + && pattern[patternCursor] == text[textCursor] + { + patternOffset -= 1 + + // Success! + if patternCursor == pattern.startIndex { + // Calculate the offset for the next search. + return ( + textCursor.. text.startIndex) + text.formIndex(before: &textCursor) + pattern.formIndex(before: &patternCursor) + } + + // Match failed - calculate the end index of the next possible + // candidate, based on the `badCharacterOffsets` table and the + // current position in the pattern. + let shiftOffset = Swift.max( + 1, + patternOffset - (state.badCharacterOffsets[text[textCursor]] ?? 0)) + if let nextEnd = text.index( + currentEnd, offsetBy: shiftOffset, limitedBy: text.endIndex) { + currentEnd = nextEnd + } else { + return (nil, nil) + } + } + } + + mutating func next() -> Range? { + guard let end = state.endOfNextPotentialMatch else { return nil } + let (result, nextEnd) = nextRange(in: text, searchFromEnd: end) + state.endOfNextPotentialMatch = nextEnd + return result + } +} + +extension SubstringSearcher: CollectionSearcher { + typealias Searched = Substring + + func state(for text: Searched, in range: Range) -> State { + State(text: text[range], pattern: pattern) + } + + func search(_ text: Searched, _ state: inout State) -> Range? { + guard let end = state.endOfNextPotentialMatch else { return nil } + let (result, nextEnd) = nextRange(in: text, searchFromEnd: end) + state.endOfNextPotentialMatch = nextEnd + return result + } +} diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index 60548a0a2..dca2923f3 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -64,7 +64,15 @@ class AlgorithmTests: XCTestCase { XCTAssertTrue("abcde".contains("bcde")) XCTAssertTrue("abcde".contains("bcd")) XCTAssertTrue("ababacabababa".contains("abababa")) + XCTAssertTrue("bbababacabababa".contains("abababa")) + XCTAssertFalse("bbababacbbababa".contains("abababa")) + let str = "abcde" + let pattern = "bcde" + XCTAssertTrue(str[...].contains(pattern)) + XCTAssertTrue(str.contains(pattern[...])) + XCTAssertTrue(str[...].contains(pattern[...])) + XCTAssertFalse("".contains("abcd")) for start in 0..<9 { @@ -161,6 +169,13 @@ class AlgorithmTests: XCTestCase { let actualSeq: [Range] = input.ranges(of: pattern).map(input.offsets(of:)) XCTAssertEqual(actualSeq, expected, file: file, line: line) + let a1: [Range] = input[...].ranges(of: pattern).map(input.offsets(of:)) + let a2: [Range] = input.ranges(of: pattern[...]).map(input.offsets(of:)) + let a3: [Range] = input[...].ranges(of: pattern[...]).map(input.offsets(of:)) + XCTAssertEqual(a1, expected, file: file, line: line) + XCTAssertEqual(a2, expected, file: file, line: line) + XCTAssertEqual(a3, expected, file: file, line: line) + // `IndexingIterator` tests the collection conformance let actualCol: [Range] = input.ranges(of: pattern)[...].map(input.offsets(of:)) XCTAssertEqual(actualCol, expected, file: file, line: line) @@ -171,6 +186,13 @@ class AlgorithmTests: XCTestCase { let secondRange = input[upperBound...].firstRange(of: pattern).map(input.offsets(of:)) XCTAssertEqual(secondRange, expected.dropFirst().first, file: file, line: line) } + + let r1 = input[...].firstRange(of: pattern) + let r2 = input.firstRange(of: pattern[...]) + let r3 = input[...].firstRange(of: pattern[...]) + XCTAssertEqual(r1.map(input.offsets(of:)), expected.first, file: file, line: line) + XCTAssertEqual(r2.map(input.offsets(of:)), expected.first, file: file, line: line) + XCTAssertEqual(r3.map(input.offsets(of:)), expected.first, file: file, line: line) } expectRanges("", "", [0..<0]) @@ -225,6 +247,13 @@ class AlgorithmTests: XCTestCase { ) { let actual = Array(string.split(separator: separator, omittingEmptySubsequences: false)) XCTAssertEqual(actual, expected, file: file, line: line) + + let a1 = Array(string[...].split(separator: separator, omittingEmptySubsequences: false)) + let a2 = Array(string.split(separator: separator[...], omittingEmptySubsequences: false)) + let a3 = Array(string[...].split(separator: separator[...], omittingEmptySubsequences: false)) + XCTAssertEqual(a1, expected, file: file, line: line) + XCTAssertEqual(a2, expected, file: file, line: line) + XCTAssertEqual(a3, expected, file: file, line: line) } expectSplit("", "", [""]) @@ -340,7 +369,7 @@ class AlgorithmTests: XCTestCase { XCTAssertEqual(stringActual, expected, """ Mismatch in string split of '\(str)', maxSplits: \(maxSplits), omitEmpty: \(omitEmpty) expected: \(expected.map(String.init)) - actual: \(regexActual.map(String.init)) + actual: \(stringActual.map(String.init)) """) } } @@ -478,7 +507,22 @@ class AlgorithmTests: XCTestCase { expectReplace("a", "a", "X", "X") expectReplace("aab", "a", "X", "XXb") - // FIXME: Test maxReplacements + let str = "aabaaabaab" + XCTAssertEqual( + str.replacing("aab", with: "Z", maxReplacements: 1000), + "ZaZZ") + XCTAssertEqual( + str.replacing("aab", with: "Z", maxReplacements: 3), + "ZaZZ") + XCTAssertEqual( + str.replacing("aab", with: "Z", maxReplacements: 2), + "ZaZaab") + XCTAssertEqual( + str.replacing("aab", with: "Z", maxReplacements: 1), + "Zaaabaab") + XCTAssertEqual( + str.replacing("aab", with: "Z", maxReplacements: 0), + str) } func testSubstring() throws { @@ -505,6 +549,15 @@ class AlgorithmTests: XCTestCase { XCTAssertEqual(s1.replacing(regex, with: ""), " | ") XCTAssertEqual(s2.replacing(regex, with: ""), "") + XCTAssertEqual(s.replacing("aa", with: "Z"), "Za | ZZZ | ZZZZZ") + XCTAssertEqual(s.replacing("aa" as Substring, with: "Z"), "Za | ZZZ | ZZZZZ") + XCTAssertEqual(s.replacing("aa", with: "Z" as Substring), "Za | ZZZ | ZZZZZ") + XCTAssertEqual(s.replacing("aa" as Substring, with: "Z" as Substring), "Za | ZZZ | ZZZZZ") + XCTAssertEqual(s1.replacing("aa", with: "Z"), "ZZZ | ZZZZZ") + XCTAssertEqual(s1.replacing("aa", with: "Z" as Substring), "ZZZ | ZZZZZ") + XCTAssertEqual(s1.replacing("aa" as Substring, with: "Z"), "ZZZ | ZZZZZ") + XCTAssertEqual(s1.replacing("aa" as Substring, with: "Z" as Substring), "ZZZ | ZZZZZ") + XCTAssertEqual( s.matches(of: regex).map(\.0), ["aaa", "aaaaaa", "aaaaaaaaaa"])