diff --git a/README.md b/README.md index 7c0e1d6f50a..23aca25f767 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ These are for demonstration purposes only. - [x] [Reverse](./src/string/reverse.rs) - [x] [Run Length Encoding](.src/string/run_length_encoding.rs) - [x] [Hamming Distance](./src/string/hamming_distance.rs) +- [x] [Suffix Tree](./src/string/suffix_tree.rs) ## [General](./src/general) diff --git a/src/string/aho_corasick.rs b/src/string/aho_corasick.rs index e1d5759c491..901d822ddd7 100644 --- a/src/string/aho_corasick.rs +++ b/src/string/aho_corasick.rs @@ -64,7 +64,8 @@ impl AhoCorasick { pub fn search<'a>(&self, s: &'a str) -> Vec<&'a str> { let mut ans = vec![]; let mut cur = Rc::clone(&self.root); - for (i, c) in s.chars().enumerate() { + let mut position: usize = 0; + for (_, c) in s.chars().enumerate() { loop { if let Some(child) = Rc::clone(&cur).borrow().trans.get(&c) { cur = Rc::clone(child); @@ -76,8 +77,9 @@ impl AhoCorasick { None => break, } } + position += c.len_utf8(); for &len in &cur.borrow().lengths { - ans.push(&s[i + 1 - len..=i]); + ans.push(&s[position - len..position]); } } ans @@ -95,4 +97,37 @@ mod tests { let res = ac.search("ababcxyzacxy12678acxy6543"); assert_eq!(res, ["abc", "xyz", "acxy", "678", "acxy", "6543",]); } + + #[test] + fn test_aho_corasick_with_utf8() { + let dict = [ + "abc", + "中文", + "abc中", + "abcd", + "xyz", + "acxy", + "efg", + "123", + "678", + "6543", + "ハンバーガー", + ]; + let ac = AhoCorasick::new(&dict); + let res = ac.search("ababc中xyzacxy12678acxyハンバーガー6543中文"); + assert_eq!( + res, + [ + "abc", + "abc中", + "xyz", + "acxy", + "678", + "acxy", + "ハンバーガー", + "6543", + "中文" + ] + ); + } } diff --git a/src/string/mod.rs b/src/string/mod.rs index 0e0d3e9db48..931999822ca 100644 --- a/src/string/mod.rs +++ b/src/string/mod.rs @@ -6,6 +6,7 @@ mod manacher; mod rabin_karp; mod reverse; mod run_length_encoding; +mod suffix_tree; mod z_algorithm; pub use self::aho_corasick::AhoCorasick; @@ -18,5 +19,6 
// In computer science, a suffix tree (also called PAT tree or, in an earlier form, position tree)
// is a compressed trie containing all the suffixes of the given text as their keys and positions
// in the text as their values. Suffix trees allow particularly fast implementations of many
// important string operations. Source: https://en.wikipedia.org/wiki/Suffix_tree
//
// `Node` and `SuffixTree` are re-exported from `src/string/mod.rs`.
//
// The tree is built by inserting every suffix one after another (the naive construction, not
// Ukkonen's algorithm), so the worst-case build time grows at least quadratically with the
// length of the text.

/// A single node of the suffix tree, stored in the flat `SuffixTree::nodes` arena.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Node {
    /// Edge label leading into this node: a substring of the input string.
    pub sub: String,
    /// Indices (into `SuffixTree::nodes`) of the child nodes, in insertion order.
    pub ch: Vec<usize>,
}

impl Node {
    /// Creates a node with the given edge label and child indices.
    fn new(sub: String, children: Vec<usize>) -> Self {
        Node { sub, ch: children }
    }

    /// Creates a node with an empty label and no children; used as the root.
    pub fn empty() -> Self {
        Node {
            sub: String::new(),
            ch: Vec::new(),
        }
    }
}

/// A suffix tree stored as a flat vector of nodes; index `0` is always the root.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct SuffixTree {
    /// All nodes of the tree. Children are referenced by their index in this vector.
    pub nodes: Vec<Node>,
}

impl SuffixTree {
    /// Builds the suffix tree of `s`.
    ///
    /// Suffixes start at every character boundary, so arbitrary UTF-8 text is accepted.
    /// An empty input yields a tree holding only the root node.
    ///
    /// If `s` does not end with a character that occurs nowhere else (such as `$`), a suffix
    /// that is a prefix of another suffix ends at an internal node instead of a leaf of its own.
    pub fn new(s: String) -> Self {
        let mut suf_tree = SuffixTree {
            nodes: vec![Node::empty()],
        };
        // `char_indices` yields byte offsets that are valid slice boundaries, unlike `0..s.len()`.
        for (start, _) in s.char_indices() {
            suf_tree.add_suffix(&s[start..]);
        }
        suf_tree
    }

    /// Inserts one suffix, walking down from the root and splitting an edge where the suffix
    /// diverges from an existing label.
    fn add_suffix(&mut self, suf: &str) {
        let mut n = 0; // index of the node we are currently at
        let mut rest = suf; // part of the suffix that has not been matched yet

        while let Some(first) = rest.chars().next() {
            // Look for the child whose label starts with the next unmatched character.
            let slot = self.nodes[n]
                .ch
                .iter()
                .position(|&c| self.nodes[c].sub.starts_with(first));

            let slot = match slot {
                Some(slot) => slot,
                None => {
                    // No such child: the whole remainder becomes a new leaf below `n`.
                    let leaf = self.nodes.len();
                    self.nodes.push(Node::new(rest.to_string(), Vec::new()));
                    self.nodes[n].ch.push(leaf);
                    return;
                }
            };
            let child = self.nodes[n].ch[slot];

            // Byte length of the shared prefix; at least one character because `first` matched.
            let common = common_prefix_len(rest, &self.nodes[child].sub);

            n = if common < self.nodes[child].sub.len() {
                // The suffix leaves the edge part-way: insert an intermediate node carrying the
                // shared prefix and keep the old node (with the rest of its label) beneath it.
                let tail = self.nodes[child].sub.split_off(common);
                let head = std::mem::replace(&mut self.nodes[child].sub, tail);
                let mid = self.nodes.len();
                self.nodes.push(Node::new(head, vec![child]));
                self.nodes[n].ch[slot] = mid;
                mid
            } else {
                // The whole edge label matched: descend into the child.
                child
            };
            rest = &rest[common..];
        }
    }
}

/// Returns the length in bytes of the longest common prefix of `a` and `b`, compared character
/// by character so that the result is a valid slice boundary in both strings.
fn common_prefix_len(a: &str, b: &str) -> usize {
    a.chars()
        .zip(b.chars())
        .take_while(|(x, y)| x == y)
        .map(|(x, _)| x.len_utf8())
        .sum()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_suffix_tree() {
        let suf_tree = SuffixTree::new("banana$".to_string());
        assert_eq!(
            suf_tree.nodes,
            vec![
                Node {
                    sub: "".to_string(),
                    ch: vec![1, 8, 6, 10]
                },
                Node {
                    sub: "banana$".to_string(),
                    ch: vec![]
                },
                Node {
                    sub: "na$".to_string(),
                    ch: vec![]
                },
                Node {
                    sub: "na$".to_string(),
                    ch: vec![]
                },
                Node {
                    sub: "na".to_string(),
                    ch: vec![2, 5]
                },
                Node {
                    sub: "$".to_string(),
                    ch: vec![]
                },
                Node {
                    sub: "na".to_string(),
                    ch: vec![3, 7]
                },
                Node {
                    sub: "$".to_string(),
                    ch: vec![]
                },
                Node {
                    sub: "a".to_string(),
                    ch: vec![4, 9]
                },
                Node {
                    sub: "$".to_string(),
                    ch: vec![]
                },
                Node {
                    sub: "$".to_string(),
                    ch: vec![]
                }
            ]
        );
    }

    #[test]
    fn test_suffix_tree_with_utf8() {
        let suf_tree = SuffixTree::new("ハンハ".to_string());
        assert_eq!(
            suf_tree.nodes,
            vec![
                Node {
                    sub: "".to_string(),
                    ch: vec![3, 2]
                },
                Node {
                    sub: "ンハ".to_string(),
                    ch: vec![]
                },
                Node {
                    sub: "ンハ".to_string(),
                    ch: vec![]
                },
                Node {
                    sub: "ハ".to_string(),
                    ch: vec![1]
                }
            ]
        );
    }

    #[test]
    fn test_suffix_tree_empty_input() {
        let suf_tree = SuffixTree::new(String::new());
        assert_eq!(suf_tree.nodes, vec![Node::empty()]);
    }
}