|
1 |
| -// In computer science, the Boyer–Moore string-search algorithm is an efficient string-searching algorithm, |
2 |
| -// that is the standard benchmark for practical string-search literature. Source: https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm |
| 1 | +//! This module implements the Boyer-Moore string search algorithm, an efficient method |
| 2 | +//! for finding all occurrences of a pattern within a given text. The algorithm skips |
| 3 | +//! sections of the text by leveraging two key rules: the bad character rule and the |
| 4 | +//! good suffix rule (only the bad character rule is implemented here for simplicity). |
3 | 5 |
|
4 | 6 | use std::collections::HashMap;
|
5 | 7 |
|
6 |
| -pub fn boyer_moore_search(text: &str, pattern: &str) -> Vec<usize> { |
/// Maps every character of the pattern to the index of its rightmost occurrence.
///
/// # Arguments
/// * `pat` - The pattern as a slice of characters.
///
/// # Returns
/// A `HashMap` keyed by the distinct characters of `pat`; each value is the largest
/// index at which that character appears (a later occurrence overwrites an earlier one).
fn build_bad_char_table(pat: &[char]) -> HashMap<char, isize> {
    pat.iter()
        .enumerate()
        .map(|(idx, &symbol)| (symbol, idx as isize))
        .collect()
}
| 24 | + |
/// Computes how far to advance the pattern after a full match.
///
/// The character of `text` immediately to the right of the current window is looked up
/// in the bad character table; the pattern is moved so that its last occurrence of that
/// character lines up with it, or past it entirely when the pattern does not contain it.
///
/// # Arguments
/// * `shift` - The current offset of the pattern within the text.
/// * `pat_len` - The length of the pattern.
/// * `text_len` - The length of the text.
/// * `bad_char_table` - The bad character table built for the pattern.
/// * `text` - The text as a slice of characters.
///
/// # Returns
/// The number of positions to advance. This is `1` when the window already ends at or
/// beyond `text_len`, so there is no following character to consult.
fn calc_match_shift(
    shift: isize,
    pat_len: isize,
    text_len: isize,
    bad_char_table: &HashMap<char, isize>,
    text: &[char],
) -> isize {
    let after_window = shift + pat_len;
    if after_window < text_len {
        // -1 stands for "character absent from the pattern", which yields a shift of
        // `pat_len + 1` and moves the whole pattern past that character.
        let last_seen = bad_char_table
            .get(&text[after_window as usize])
            .copied()
            .unwrap_or(-1);
        pat_len - last_seen
    } else {
        1
    }
}
| 50 | + |
/// Computes how far to advance the pattern after a mismatch, using the bad character rule.
///
/// # Arguments
/// * `mis_idx` - The index within the pattern at which the comparison failed.
/// * `shift` - The current offset of the pattern within the text.
/// * `text` - The text as a slice of characters.
/// * `bad_char_table` - The bad character table built for the pattern.
///
/// # Returns
/// The number of positions to advance; always at least `1`, so the search makes progress
/// even when the offending character's last occurrence lies to the right of `mis_idx`.
fn calc_mismatch_shift(
    mis_idx: isize,
    shift: isize,
    text: &[char],
    bad_char_table: &HashMap<char, isize>,
) -> isize {
    let offending = text[(shift + mis_idx) as usize];
    // -1 stands for "character absent from the pattern".
    let last_seen = match bad_char_table.get(&offending) {
        Some(&pos) => pos,
        None => -1,
    };
    (mis_idx - last_seen).max(1)
}
| 72 | + |
| 73 | +/// Performs the Boyer-Moore string search algorithm, which searches for all |
| 74 | +/// occurrences of a pattern within a text. |
| 75 | +/// |
| 76 | +/// The Boyer-Moore algorithm is efficient for large texts and patterns, as it |
| 77 | +/// skips sections of the text based on the bad character rule and other optimizations. |
| 78 | +/// |
| 79 | +/// # Arguments |
| 80 | +/// * `text` - The text to search within as a string slice. |
| 81 | +/// * `pat` - The pattern to search for as a string slice. |
| 82 | +/// |
| 83 | +/// # Returns |
| 84 | +/// A vector of starting indices where the pattern occurs in the text. |
| 85 | +pub fn boyer_moore_search(text: &str, pat: &str) -> Vec<usize> { |
7 | 86 | let mut positions = Vec::new();
|
8 |
| - let n = text.len() as i32; |
9 |
| - let m = pattern.len() as i32; |
10 |
| - let pattern: Vec<char> = pattern.chars().collect(); |
11 |
| - let text: Vec<char> = text.chars().collect(); |
12 |
| - if n == 0 || m == 0 { |
| 87 | + |
| 88 | + let text_len = text.len() as isize; |
| 89 | + let pat_len = pat.len() as isize; |
| 90 | + |
| 91 | + // Handle edge cases where the text or pattern is empty, or the pattern is longer than the text |
| 92 | + if text_len == 0 || pat_len == 0 || pat_len > text_len { |
13 | 93 | return positions;
|
14 | 94 | }
|
15 |
| - let mut collection = HashMap::new(); |
16 |
| - for (i, c) in pattern.iter().enumerate() { |
17 |
| - collection.insert(c, i as i32); |
18 |
| - } |
19 |
| - let mut shift: i32 = 0; |
20 |
| - while shift <= (n - m) { |
21 |
| - let mut j = m - 1; |
22 |
| - while j >= 0 && pattern[j as usize] == text[(shift + j) as usize] { |
| 95 | + |
| 96 | + // Convert text and pattern to character vectors for easier indexing |
| 97 | + let pat: Vec<char> = pat.chars().collect(); |
| 98 | + let text: Vec<char> = text.chars().collect(); |
| 99 | + |
| 100 | + // Build the bad character table for the pattern |
| 101 | + let bad_char_table = build_bad_char_table(&pat); |
| 102 | + |
| 103 | + let mut shift = 0; |
| 104 | + |
| 105 | + // Main loop: shift the pattern over the text |
| 106 | + while shift <= text_len - pat_len { |
| 107 | + let mut j = pat_len - 1; |
| 108 | + |
| 109 | + // Compare pattern from right to left |
| 110 | + while j >= 0 && pat[j as usize] == text[(shift + j) as usize] { |
23 | 111 | j -= 1;
|
24 | 112 | }
|
| 113 | + |
| 114 | + // If we found a match (j < 0), record the position |
25 | 115 | if j < 0 {
|
26 | 116 | positions.push(shift as usize);
|
27 |
| - let add_to_shift = { |
28 |
| - if (shift + m) < n { |
29 |
| - let c = text[(shift + m) as usize]; |
30 |
| - let index = collection.get(&c).unwrap_or(&-1); |
31 |
| - m - index |
32 |
| - } else { |
33 |
| - 1 |
34 |
| - } |
35 |
| - }; |
36 |
| - shift += add_to_shift; |
| 117 | + shift += calc_match_shift(shift, pat_len, text_len, &bad_char_table, &text); |
37 | 118 | } else {
|
38 |
| - let c = text[(shift + j) as usize]; |
39 |
| - let index = collection.get(&c).unwrap_or(&-1); |
40 |
| - let add_to_shift = std::cmp::max(1, j - index); |
41 |
| - shift += add_to_shift; |
| 119 | + // If mismatch, calculate how far to shift based on the bad character rule |
| 120 | + shift += calc_mismatch_shift(j, shift, &text, &bad_char_table); |
42 | 121 | }
|
43 | 122 | }
|
| 123 | + |
44 | 124 | positions
|
45 | 125 | }
|
46 | 126 |
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Expands each `name: (text, pattern, expected)` entry into its own `#[test]`
    /// function asserting that searching `text` for `pattern` yields `expected`.
    macro_rules! boyer_moore_tests {
        ($($name:ident: $case:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let (haystack, needle, want): (&str, &str, Vec<usize>) = $case;
                    let got = boyer_moore_search(haystack, needle);
                    assert_eq!(got, want);
                }
            )*
        };
    }

    boyer_moore_tests! {
        test_simple_match: ("AABCAB12AFAABCABFFEGABCAB", "ABCAB", vec![1, 11, 20]),
        test_no_match: ("AABCAB12AFAABCABFFEGABCAB", "FFF", vec![]),
        test_partial_match: ("AABCAB12AFAABCABFFEGABCAB", "CAB", vec![3, 13, 22]),
        test_empty_text: ("", "A", vec![]),
        test_empty_pattern: ("ABC", "", vec![]),
        test_both_empty: ("", "", vec![]),
        test_pattern_longer_than_text: ("ABC", "ABCDEFG", vec![]),
        test_single_character_text: ("A", "A", vec![0]),
        test_single_character_pattern: ("AAAA", "A", vec![0, 1, 2, 3]),
        test_case_sensitivity: ("ABCabcABC", "abc", vec![3]),
        test_overlapping_patterns: ("AAAAA", "AAA", vec![0, 1, 2]),
        test_special_characters: ("@!#$$%^&*", "$$", vec![3]),
        test_numerical_pattern: ("123456789123456", "456", vec![3, 12]),
        test_partial_overlap_no_match: ("ABCD", "ABCDE", vec![]),
        test_single_occurrence: ("XXXXXXXXXXXXXXXXXXPATTERNXXXXXXXXXXXXXXXXXX", "PATTERN", vec![18]),
        test_single_occurrence_with_noise: ("PATPATPATPATTERNPAT", "PATTERN", vec![9]),
    }
}
|
0 commit comments