Skip to content

Commit aad5192

Browse files
authored
Refactor Boyer-Moore Search Algorithm (TheAlgorithms#794)
ref: refactor Boyer-Moore search algorithm - Decompose the function into helper functions - Write docstrings for explanation - Rewrite tests using a macro and add some edge-case tests
1 parent 418bf15 commit aad5192

File tree

1 file changed

+139
-38
lines changed

1 file changed

+139
-38
lines changed

src/string/boyer_moore_search.rs

Lines changed: 139 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,60 +1,161 @@
1-
// In computer science, the Boyer–Moore string-search algorithm is an efficient string-searching algorithm,
2-
// that is the standard benchmark for practical string-search literature. Source: https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm
1+
//! This module implements the Boyer-Moore string search algorithm, an efficient method
2+
//! for finding all occurrences of a pattern within a given text. The algorithm skips
3+
//! sections of the text by leveraging two key rules: the bad character rule and the
4+
//! good suffix rule (only the bad character rule is implemented here for simplicity).
35
46
use std::collections::HashMap;
57

6-
pub fn boyer_moore_search(text: &str, pattern: &str) -> Vec<usize> {
8+
/// Creates the lookup table used by the bad character rule.
///
/// Each distinct character of the pattern is mapped to the index of its
/// right-most occurrence in the pattern.
///
/// # Arguments
/// * `pat` - The pattern as a slice of characters.
///
/// # Returns
/// A `HashMap` from pattern character to the index of its last occurrence.
/// An empty pattern yields an empty map.
fn build_bad_char_table(pat: &[char]) -> HashMap<char, isize> {
    // Collecting `(key, value)` pairs inserts them in order, so a later
    // occurrence of a character overwrites the index of an earlier one.
    pat.iter()
        .enumerate()
        .map(|(idx, &symbol)| (symbol, idx as isize))
        .collect()
}
24+
25+
/// Determines how far to move the pattern after a complete match.
///
/// The character of the text immediately to the right of the current window
/// is looked up in the bad character table, and the pattern is moved so that
/// its last occurrence of that character lines up with it. A character that
/// does not occur in the pattern moves the window completely past it.
///
/// # Arguments
/// * `shift` - The current shift of the pattern on the text.
/// * `pat_len` - The length of the pattern.
/// * `text_len` - The length of the text.
/// * `bad_char_table` - The bad character table built for the pattern.
/// * `text` - The text as a slice of characters.
///
/// # Returns
/// The number of positions to advance the pattern; `1` when no character
/// follows the current window.
fn calc_match_shift(
    shift: isize,
    pat_len: isize,
    text_len: isize,
    bad_char_table: &HashMap<char, isize>,
    text: &[char],
) -> isize {
    let after_window = shift + pat_len;
    if after_window < text_len {
        // -1 stands for "not in the pattern", giving a shift of `pat_len + 1`.
        let last_seen = bad_char_table
            .get(&text[after_window as usize])
            .copied()
            .unwrap_or(-1);
        pat_len - last_seen
    } else {
        // The window already touches the end of the text.
        1
    }
}
50+
51+
/// Determines how far to move the pattern after a mismatch, using the bad
/// character rule.
///
/// The text character that caused the mismatch is aligned with its last
/// occurrence in the pattern. If that would move the pattern backwards or
/// not at all, the pattern advances by a single position instead.
///
/// # Arguments
/// * `mis_idx` - The mismatch index in the pattern.
/// * `shift` - The current shift of the pattern on the text.
/// * `text` - The text as a slice of characters.
/// * `bad_char_table` - The bad character table built for the pattern.
///
/// # Returns
/// The number of positions to advance the pattern; always at least `1`.
fn calc_mismatch_shift(
    mis_idx: isize,
    shift: isize,
    text: &[char],
    bad_char_table: &HashMap<char, isize>,
) -> isize {
    let offending = text[(shift + mis_idx) as usize];
    // -1 stands for "character does not occur in the pattern".
    let last_seen = bad_char_table.get(&offending).map_or(-1, |&pos| pos);
    (mis_idx - last_seen).max(1)
}
72+
73+
/// Performs the Boyer-Moore string search algorithm, which searches for all
74+
/// occurrences of a pattern within a text.
75+
///
76+
/// The Boyer-Moore algorithm is efficient for large texts and patterns, as it
77+
/// skips sections of the text based on the bad character rule and other optimizations.
78+
///
79+
/// # Arguments
80+
/// * `text` - The text to search within as a string slice.
81+
/// * `pat` - The pattern to search for as a string slice.
82+
///
83+
/// # Returns
84+
/// A vector of starting indices where the pattern occurs in the text.
85+
pub fn boyer_moore_search(text: &str, pat: &str) -> Vec<usize> {
786
let mut positions = Vec::new();
8-
let n = text.len() as i32;
9-
let m = pattern.len() as i32;
10-
let pattern: Vec<char> = pattern.chars().collect();
11-
let text: Vec<char> = text.chars().collect();
12-
if n == 0 || m == 0 {
87+
88+
let text_len = text.len() as isize;
89+
let pat_len = pat.len() as isize;
90+
91+
// Handle edge cases where the text or pattern is empty, or the pattern is longer than the text
92+
if text_len == 0 || pat_len == 0 || pat_len > text_len {
1393
return positions;
1494
}
15-
let mut collection = HashMap::new();
16-
for (i, c) in pattern.iter().enumerate() {
17-
collection.insert(c, i as i32);
18-
}
19-
let mut shift: i32 = 0;
20-
while shift <= (n - m) {
21-
let mut j = m - 1;
22-
while j >= 0 && pattern[j as usize] == text[(shift + j) as usize] {
95+
96+
// Convert text and pattern to character vectors for easier indexing
97+
let pat: Vec<char> = pat.chars().collect();
98+
let text: Vec<char> = text.chars().collect();
99+
100+
// Build the bad character table for the pattern
101+
let bad_char_table = build_bad_char_table(&pat);
102+
103+
let mut shift = 0;
104+
105+
// Main loop: shift the pattern over the text
106+
while shift <= text_len - pat_len {
107+
let mut j = pat_len - 1;
108+
109+
// Compare pattern from right to left
110+
while j >= 0 && pat[j as usize] == text[(shift + j) as usize] {
23111
j -= 1;
24112
}
113+
114+
// If we found a match (j < 0), record the position
25115
if j < 0 {
26116
positions.push(shift as usize);
27-
let add_to_shift = {
28-
if (shift + m) < n {
29-
let c = text[(shift + m) as usize];
30-
let index = collection.get(&c).unwrap_or(&-1);
31-
m - index
32-
} else {
33-
1
34-
}
35-
};
36-
shift += add_to_shift;
117+
shift += calc_match_shift(shift, pat_len, text_len, &bad_char_table, &text);
37118
} else {
38-
let c = text[(shift + j) as usize];
39-
let index = collection.get(&c).unwrap_or(&-1);
40-
let add_to_shift = std::cmp::max(1, j - index);
41-
shift += add_to_shift;
119+
// If mismatch, calculate how far to shift based on the bad character rule
120+
shift += calc_mismatch_shift(j, shift, &text, &bad_char_table);
42121
}
43122
}
123+
44124
positions
45125
}
46126

47127
#[cfg(test)]
mod tests {
    use super::*;

    // Expands every `name: (text, pattern, expected)` entry into its own
    // `#[test]` function that runs the search and compares the result.
    macro_rules! boyer_moore_tests {
        ($($test_name:ident: $case:expr,)*) => {
            $(
                #[test]
                fn $test_name() {
                    let (haystack, needle, want) = $case;
                    let found = boyer_moore_search(haystack, needle);
                    assert_eq!(found, want);
                }
            )*
        };
    }

    boyer_moore_tests! {
        test_simple_match: ("AABCAB12AFAABCABFFEGABCAB", "ABCAB", vec![1, 11, 20]),
        test_no_match: ("AABCAB12AFAABCABFFEGABCAB", "FFF", vec![]),
        test_partial_match: ("AABCAB12AFAABCABFFEGABCAB", "CAB", vec![3, 13, 22]),
        test_empty_text: ("", "A", vec![]),
        test_empty_pattern: ("ABC", "", vec![]),
        test_both_empty: ("", "", vec![]),
        test_pattern_longer_than_text: ("ABC", "ABCDEFG", vec![]),
        test_single_character_text: ("A", "A", vec![0]),
        test_single_character_pattern: ("AAAA", "A", vec![0, 1, 2, 3]),
        test_case_sensitivity: ("ABCabcABC", "abc", vec![3]),
        test_overlapping_patterns: ("AAAAA", "AAA", vec![0, 1, 2]),
        test_special_characters: ("@!#$$%^&*", "$$", vec![3]),
        test_numerical_pattern: ("123456789123456", "456", vec![3, 12]),
        test_partial_overlap_no_match: ("ABCD", "ABCDE", vec![]),
        test_single_occurrence: ("XXXXXXXXXXXXXXXXXXPATTERNXXXXXXXXXXXXXXXXXX", "PATTERN", vec![18]),
        test_single_occurrence_with_noise: ("PATPATPATPATTERNPAT", "PATTERN", vec![9]),
    }
}

0 commit comments

Comments
 (0)