add data process package

supercoderhawk · supercoderhawk · commit 9edbaf09baa2 · 2017-10-11T12:26:10.000+08:00
diff --git a/python/dnlp/data_process/__init__.py b/python/dnlp/data_process/__init__.py
@@ -0,0 +1 @@
+#-*- coding: UTF-8 -*-
diff --git a/python/dnlp/data_process/process_cws.py b/python/dnlp/data_process/process_cws.py
@@ -0,0 +1,123 @@
+# -*- coding: UTF-8 -*-
+import os
+import re
+import pickle
+from dnlp.data_process.processor import Preprocessor
+from dnlp.utils.constant import TAG_BEGIN, TAG_INSIDE, TAG_END, TAG_SINGLE,CWS_TAGS
+
+
+class ProcessCWS(Preprocessor):
+  """Turn space-segmented text files into character-index and tag sequences.
+
+  Construction runs the whole pipeline: the input files are read and cut into
+  sentences, every character is mapped to its dictionary index, every word is
+  labelled with the segmentation tags, and the result is pickled to disk.
+  """
+
+  def __init__(self, *, files: tuple = (), dict_path: str = '', base_folder: str = 'dnlp/data', name: str = '',
+               delimiter: tuple = ('。',)):
+    """Run preprocessing for ``files`` and save the result.
+
+    Args:
+      files: file names, relative to ``base_folder``, holding one or more
+        space-segmented sentences per line.
+      dict_path: path of an existing dictionary file to load. When empty, a
+        dictionary is built from ``files`` and written to
+        ``<base_folder>/<name>_dict.utf8``.
+      base_folder: directory containing the input files; outputs go here too.
+      name: base name of the generated dictionary and pickle files.
+      delimiter: strings after which a line is cut into separate sentences.
+        A plain string is treated as a sequence of one-character delimiters.
+
+    Raises:
+      ValueError: if ``base_folder`` is empty, or if both ``dict_path`` and
+        ``name`` are empty.
+    """
+    self.SPLIT_CHAR = '  '
+    if base_folder == '':
+      raise ValueError('base folder is empty')
+    if dict_path != '':
+      Preprocessor.__init__(self, base_folder=base_folder, dict_path=dict_path)
+    else:
+      if name == '':
+        raise ValueError('name is required when dict_path is not given')
+      # os.path.join supplies the separator that plain concatenation dropped
+      # whenever base_folder (including its default) had no trailing slash.
+      Preprocessor.__init__(self, base_folder=base_folder, files=files,
+                            dict_path=os.path.join(base_folder, name + '_dict.utf8'))
+    self.files = files
+    self.name = name
+    # The old default ('。') was a bare string, not a tuple; normalising here
+    # keeps string arguments working while the rest of the class sees a tuple.
+    self.delimiter = tuple(d for d in delimiter if d)
+    self.sentences = self.preprocess()
+    self.tags = CWS_TAGS
+    self.characters, self.labels = self.map_to_indices()
+    self.save_data()
+
+  def preprocess(self):
+    """Read every input file and return its non-blank sentences as a list."""
+    sentences = []
+    for file in self.files:
+      with open(os.path.join(self.base_folder, file), encoding='utf8') as f:
+        lines = [l.strip() for l in f.read().splitlines()]
+      lines = [l for l in lines if l]
+      for d in self.delimiter:
+        lines = self._split_after(lines, d)
+      sentences += lines
+    return sentences
+
+  @staticmethod
+  def _split_after(lines, delimiter):
+    """Cut each line after every ``delimiter``, keeping the delimiter attached."""
+    pieces = []
+    for line in lines:
+      *heads, tail = line.split(delimiter)
+      pieces.extend(head + delimiter for head in heads)
+      # The tail was not followed by a delimiter in the text, so none is added;
+      # it is empty when the line ended with the delimiter.
+      if tail.strip():
+        pieces.append(tail)
+    return pieces
+
+  def map_to_indices(self):
+    """Convert sentences to index and tag sequences.
+
+    Returns:
+      ``(characters, labels)``: two parallel lists with one entry per
+      sentence, holding the dictionary index and the tag of each character.
+
+    Raises:
+      KeyError: if a character is missing from the dictionary.
+    """
+    characters = []
+    labels = []
+    for sentence in self.sentences:
+      sentence = re.sub('[ ]+', self.SPLIT_CHAR, sentence).strip()
+      if not sentence:
+        # nothing to label; an empty sentence used to abort the whole run
+        continue
+      chs = []
+      lls = []
+      for word in sentence.split(self.SPLIT_CHAR):
+        chs.extend(self.dictionary[ch] for ch in word)
+        if len(word) == 1:
+          lls.append(TAG_SINGLE)
+        else:
+          lls.append(TAG_BEGIN)
+          lls.extend([TAG_INSIDE] * (len(word) - 2))
+          lls.append(TAG_END)
+      characters.append(chs)
+      labels.append(lls)
+    return characters, labels
+
+  def save_data(self):
+    """Pickle the sequences, dictionary and tags to ``<base_folder>/<name>.pickle``."""
+    data = {
+      'characters': self.characters,
+      'labels': self.labels,
+      'dictionary': self.dictionary,
+      'tags': self.tags,
+    }
+    with open(os.path.join(self.base_folder, self.name + '.pickle'), 'wb') as f:
+      pickle.dump(data, f)
diff --git a/python/dnlp/data_process/process_emr.py b/python/dnlp/data_process/process_emr.py
@@ -0,0 +1 @@
+#-*- coding: UTF-8 -*-
diff --git a/python/dnlp/data_process/process_ner.py b/python/dnlp/data_process/process_ner.py
@@ -0,0 +1 @@
+#-*- coding: UTF-8 -*-
diff --git a/python/dnlp/data_process/process_pos.py b/python/dnlp/data_process/process_pos.py
@@ -0,0 +1 @@
+#-*- coding: UTF-8 -*-
diff --git a/python/dnlp/data_process/processor.py b/python/dnlp/data_process/processor.py
@@ -0,0 +1,106 @@
+# -*- coding: UTF-8 -*-
+import os
+from dnlp.utils.constant import BATCH_PAD, BATCH_PAD_VAL, UNK, UNK_VAL, STRT, STRT_VAL, END, END_VAL
+
+
+class Preprocessor(object):
+  """Base class for corpus preprocessors; owns the character dictionary.
+
+  Subclasses implement ``preprocess``, ``map_to_indices`` and ``save_data``.
+  """
+
+  def __init__(self, *, base_folder: str, files: tuple = (), dict_path: str = ''):
+    """Build the dictionary from ``files``, or load it from ``dict_path``.
+
+    Args:
+      base_folder: directory that the names in ``files`` are relative to.
+      files: corpus files to build the dictionary from. When empty, the
+        dictionary is read from ``dict_path`` instead.
+      dict_path: file to read the dictionary from, or, when ``files`` is
+        given, to write the newly built dictionary to.
+    """
+    self.base_folder = base_folder
+    if files:
+      self.dictionary = self.build_dictionary(files=files, output_dict_path=dict_path)
+    else:
+      self.dictionary = self.read_dictionary(dict_path)
+
+  def read_dictionary(self, dict_path: str, reverse=False):
+    """Load a dictionary file made of ``<entry> <index>`` lines.
+
+    Args:
+      dict_path: path of the UTF-8 dictionary file.
+      reverse: when true, also return the inverse (index to entry) mapping.
+
+    Returns:
+      The entry-to-index dict, or an ``(entry_to_index, index_to_entry)``
+      tuple when ``reverse`` is true.
+    """
+    dictionary = {}
+    with open(dict_path, encoding='utf8') as d:
+      for item in d:
+        if not item.strip():
+          # tolerate blank lines, e.g. a trailing newline added by an editor
+          continue
+        pair = item.split(' ')
+        dictionary[pair[0]] = int(pair[1])
+    if reverse:
+      return dictionary, {idx: ch for ch, idx in dictionary.items()}
+    else:
+      return dictionary
+
+  def build_dictionary(self, *, content: str = '', files: tuple = (), output_dict_path: str = '',
+                       reverse: bool = False):
+    """Create a character-to-index dictionary from text and/or files.
+
+    The special entries are inserted first; every other character is numbered
+    in sorted order, so the same input always yields the same indices.
+
+    Args:
+      content: raw text whose characters are added to the dictionary.
+      files: names of files under ``base_folder`` whose characters are added.
+      output_dict_path: when non-empty, the dictionary is also written there
+        as ``<entry> <index>`` lines.
+      reverse: when true, also return the inverse (index to entry) mapping.
+
+    Returns:
+      The entry-to-index dict, or an ``(entry_to_index, index_to_entry)``
+      tuple when ``reverse`` is true.
+
+    Raises:
+      ValueError: if neither ``content`` nor ``files`` is given.
+    """
+    if not content and not files:
+      raise ValueError('input is none')
+
+    chs = set(content)
+    for file in files:
+      with open(os.path.join(self.base_folder, file), encoding='utf8') as f:
+        chs.update(f.read())
+    chs.difference_update(('\r', '\n', ' '))
+    dictionary = {BATCH_PAD: BATCH_PAD_VAL, UNK: UNK_VAL, STRT: STRT_VAL, END: END_VAL}
+    # Iterating the bare set gave a different numbering on every run because
+    # string hashing is randomised per process; sorting makes it reproducible.
+    for idx, ch in enumerate(sorted(chs), start=len(dictionary)):
+      dictionary[ch] = idx
+
+    if output_dict_path != '':
+      with open(output_dict_path, 'w', encoding='utf8') as o_f:
+        o_f.writelines(ch + ' ' + str(idx) + '\n' for ch, idx in dictionary.items())
+
+    if reverse:
+      return dictionary, {idx: ch for ch, idx in dictionary.items()}
+    else:
+      return dictionary
+
+  def preprocess(self):
+    """Hook for subclasses; the base implementation always raises NotImplementedError."""
+    raise NotImplementedError('not implement method')
+
+  def map_to_indices(self):
+    """Hook for subclasses; the base implementation always raises NotImplementedError."""
+    raise NotImplementedError('not implement method')
+
+  def save_data(self):
+    """Hook for subclasses; the base implementation always raises NotImplementedError."""
+    raise NotImplementedError('not implement method')