from collections import deque


class Automaton:
    """Aho-Corasick automaton for finding many keywords in one pass over a text.

    The automaton is stored in ``self.adlist``, a list of state records.  Each
    record is a dict with the keys:

    * ``"value"``       - the character on the edge leading into the state
      (``""`` for the root, which is always state ``0``),
    * ``"next_states"`` - indices of the child states,
    * ``"fail_state"``  - index of the state to fall back to on a mismatch,
    * ``"output"``      - keywords that end at this state (after
      ``set_fail_transitions`` this also includes keywords inherited through
      the fail link).

    >>> Automaton(["he", "she", "his", "hers"]).search_in("ushers")
    {'she': [1], 'he': [2], 'hers': [2]}
    >>> Automaton([]).search_in("anything")
    {}
    """

    def __init__(self, keywords):
        """Build the trie for ``keywords`` and compute its fail transitions.

        ``keywords`` is any iterable of strings.  Empty strings are ignored
        and repeated keywords are stored only once.
        """
        self.adlist = [
            {"value": "", "next_states": [], "fail_state": 0, "output": []}
        ]

        for keyword in keywords:
            self.add_keyword(keyword)
        self.set_fail_transitions()

    def find_next_state(self, current_state, char):
        """Return the child of ``current_state`` reached via ``char``.

        Returns the index of the child state, or ``None`` when
        ``current_state`` has no outgoing edge labelled ``char``.

        >>> A = Automaton(["ab"])
        >>> A.find_next_state(0, "a")
        1
        >>> A.find_next_state(0, "b") is None
        True
        """
        for state in self.adlist[current_state]["next_states"]:
            if self.adlist[state]["value"] == char:
                return state
        return None

    def add_keyword(self, keyword):
        """Insert ``keyword`` into the trie, creating states as needed.

        An empty keyword is ignored: it has no characters to match and would
        otherwise be reported at meaningless positions.  A keyword that is
        already present is not recorded a second time.

        Fail transitions are not updated here; call
        ``set_fail_transitions()`` after adding keywords to an automaton that
        has already been built.
        """
        if not keyword:
            return

        current_state = 0
        for character in keyword:
            next_state = self.find_next_state(current_state, character)
            # Compare against None explicitly rather than relying on the
            # truthiness of a state index.
            if next_state is None:
                self.adlist.append(
                    {
                        "value": character,
                        "next_states": [],
                        "fail_state": 0,
                        "output": [],
                    }
                )
                next_state = len(self.adlist) - 1
                self.adlist[current_state]["next_states"].append(next_state)
            current_state = next_state

        output = self.adlist[current_state]["output"]
        if keyword not in output:
            output.append(keyword)

    def set_fail_transitions(self):
        """Compute the fail link and the complete output list of every state.

        States are visited breadth-first so that the fail link (and output)
        of every shallower state is final before it is used by a deeper one.
        Inherited keywords are merged without duplication, so calling this
        method again (for example after ``add_keyword``) is safe.

        >>> A = Automaton(["he"])
        >>> A.add_keyword("she")
        >>> A.set_fail_transitions()
        >>> A.set_fail_transitions()
        >>> A.search_in("she")
        {'she': [0], 'he': [1]}
        """
        queue = deque()
        # Depth-1 states always fall back to the root.
        for node in self.adlist[0]["next_states"]:
            queue.append(node)
            self.adlist[node]["fail_state"] = 0

        while queue:
            parent = queue.popleft()
            for child in self.adlist[parent]["next_states"]:
                queue.append(child)
                value = self.adlist[child]["value"]

                # Follow the parent's fail chain until some state has an
                # outgoing edge labelled ``value`` or the root is reached.
                state = self.adlist[parent]["fail_state"]
                fail_state = self.find_next_state(state, value)
                while fail_state is None and state != 0:
                    state = self.adlist[state]["fail_state"]
                    fail_state = self.find_next_state(state, value)
                if fail_state is None:
                    fail_state = 0
                self.adlist[child]["fail_state"] = fail_state

                # Every keyword ending at the fail state is a proper suffix
                # of the path to ``child`` and therefore ends here as well.
                child_output = self.adlist[child]["output"]
                for keyword in self.adlist[fail_state]["output"]:
                    if keyword not in child_output:
                        child_output.append(keyword)

    def search_in(self, string):
        """Return every occurrence of every keyword in ``string``.

        The result maps each keyword that occurs at least once to the list of
        its start indices in ``string``, in increasing order.  Keywords that
        do not occur are absent from the result.

        >>> A = Automaton(["what", "hat", "ver", "er"])
        >>> A.search_in("whatever, err ... , wherever")
        {'what': [0], 'hat': [1], 'ver': [5, 25], 'er': [6, 10, 22, 26]}
        >>> A.search_in("")
        {}
        """
        result = {}  # keyword -> list of start indices of its occurrences
        current_state = 0
        for index, char in enumerate(string):
            # On a mismatch fall back along the fail links until a state
            # accepts ``char`` or the root is reached.
            next_state = self.find_next_state(current_state, char)
            while next_state is None and current_state != 0:
                current_state = self.adlist[current_state]["fail_state"]
                next_state = self.find_next_state(current_state, char)

            if next_state is None:
                # Not even the root accepts ``char``; restart from the root.
                current_state = 0
                continue

            current_state = next_state
            for key in self.adlist[current_state]["output"]:
                # ``index`` is the position of the keyword's last character.
                result.setdefault(key, []).append(index - len(key) + 1)
        return result


if __name__ == "__main__":
    import doctest

    doctest.testmod()