Skip to content

Commit 9edbaf0

Browse files
add data process package
1 parent 5e1f77e commit 9edbaf0

File tree

6 files changed

+139
-0
lines changed

6 files changed

+139
-0
lines changed

python/dnlp/data_process/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
#-*- coding: UTF-8 -*-
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# -*- coding: UTF-8 -*-
2+
import re
3+
import pickle
4+
from dnlp.data_process.processor import Preprocessor
5+
from dnlp.utils.constant import TAG_BEGIN, TAG_INSIDE, TAG_END, TAG_SINGLE,CWS_TAGS
6+
7+
8+
class ProcessCWS(Preprocessor):
    """Turn space-segmented text files into character-index and tag sequences.

    Construction runs the whole pipeline: read and split the corpus files
    (``preprocess``), map every character to its dictionary index and every
    character position to a word-boundary tag (``map_to_indices``), then
    pickle the result (``save_data``).
    """

    def __init__(self, *, files: tuple = (), dict_path: str = '', base_folder: str = 'dnlp/data', name: str = '',
                 delimiter: tuple = ('。',)):
        """Load or build the dictionary, then process and save the corpus.

        Args:
            files: Corpus file names; each is opened as ``base_folder + file``.
            dict_path: Path of an existing dictionary file. When given, the
                dictionary is read from it; otherwise the dictionary path
                defaults to ``base_folder + name + '_dict.utf8'``.
            base_folder: Prefix concatenated (without a separator) in front of
                file names and the output pickle name.
            name: Data-set name used for the default dictionary path and the
                output pickle. Required when ``dict_path`` is empty.
            delimiter: Sentence delimiters; each line is additionally split
                after every occurrence of each delimiter.

        Raises:
            Exception: If ``base_folder`` is empty, or if both ``dict_path``
                and ``name`` are empty.
        """
        self.SPLIT_CHAR = ' '
        if base_folder == '':
            raise Exception('base folder is empty')
        if dict_path != '':
            Preprocessor.__init__(self, base_folder=base_folder, dict_path=dict_path)
        else:
            if name == '':
                raise Exception('name is empty')
            Preprocessor.__init__(self, base_folder=base_folder, files=files,
                                  dict_path=base_folder + name + '_dict.utf8')
        self.files = files
        self.name = name
        self.delimiter = delimiter
        self.sentences = self.preprocess()
        self.tags = CWS_TAGS
        self.characters, self.labels = self.map_to_indices()
        self.save_data()

    @staticmethod
    def _split_keep_delimiter(lines, delimiter):
        """Split every line after each ``delimiter``, keeping the delimiter on its piece."""
        pieces_out = []
        for line in lines:
            pieces = line.split(delimiter)
            # Every piece except the last one was followed by the delimiter in the text.
            pieces_out.extend(piece + delimiter for piece in pieces[:-1])
            # The trailing piece had no delimiter after it: keep it unchanged, and
            # drop it when empty (line ended with the delimiter, or was blank).
            if pieces[-1]:
                pieces_out.append(pieces[-1])
        return pieces_out

    def preprocess(self):
        """Read all corpus files and return the list of sentence strings.

        Each file line is stripped, then split after every configured
        delimiter. No delimiter is appended to text that did not contain one.
        """
        sentences = []
        for file in self.files:
            with open(self.base_folder + file, encoding='utf8') as f:
                lines = [line.strip() for line in f.read().splitlines()]
            for d in self.delimiter:
                lines = self._split_keep_delimiter(lines, d)
            sentences += lines

        return sentences

    def map_to_indices(self):
        """Convert sentences to parallel lists of character indices and tags.

        Words are the space-separated tokens of a sentence. A one-character
        word is tagged ``TAG_SINGLE``; a longer word is tagged ``TAG_BEGIN``,
        then ``TAG_INSIDE`` for each middle character, then ``TAG_END``.
        Sentences that are empty after whitespace normalisation are skipped.

        Returns:
            A ``(characters, labels)`` pair of lists with one inner list per
            non-empty sentence.

        Raises:
            KeyError: If a character is missing from ``self.dictionary``.
            Exception: If a zero-length word is encountered.
        """
        characters = []
        labels = []
        for sentence in self.sentences:
            sentence = re.sub('[ ]+', self.SPLIT_CHAR, sentence).strip()
            if not sentence:
                # A blank sentence would otherwise split into [''] and abort the run.
                continue
            chs = []
            lls = []
            for word in sentence.split(self.SPLIT_CHAR):
                if len(word) == 1:
                    chs.append(self.dictionary[word])
                    lls.append(TAG_SINGLE)
                elif len(word) == 0:
                    raise Exception('word length is zero')
                else:
                    chs.extend(self.dictionary[ch] for ch in word)
                    lls.append(TAG_BEGIN)
                    lls.extend([TAG_INSIDE] * (len(word) - 2))
                    lls.append(TAG_END)
            characters.append(chs)
            labels.append(lls)
        return characters, labels

    def save_data(self):
        """Pickle characters, labels, dictionary and tags to ``base_folder + name + '.pickle'``."""
        data = {
            'characters': self.characters,
            'labels': self.labels,
            'dictionary': self.dictionary,
            'tags': self.tags,
        }
        with open(self.base_folder + self.name + '.pickle', 'wb') as f:
            pickle.dump(data, f)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
#-*- coding: UTF-8 -*-
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
#-*- coding: UTF-8 -*-
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
#-*- coding: UTF-8 -*-

python/dnlp/data_process/processor.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# -*- coding: UTF-8 -*-
2+
from dnlp.utils.constant import BATCH_PAD, BATCH_PAD_VAL, UNK, UNK_VAL, STRT, STRT_VAL, END, END_VAL
3+
4+
5+
class Preprocessor(object):
    """Base class holding a character-to-index dictionary for a corpus.

    The dictionary is either built from corpus files or read from an existing
    dictionary file. Subclasses must implement ``preprocess``,
    ``map_to_indices`` and ``save_data``.
    """

    def __init__(self, *, base_folder: str, files: tuple = (), dict_path: str = ''):
        """Build the dictionary from ``files`` or, when none are given, read it.

        Args:
            base_folder: Prefix concatenated (without a separator) in front of
                every name in ``files``.
            files: Corpus file names. When not ``()``, the dictionary is built
                from them and written to ``dict_path`` (if non-empty).
            dict_path: Dictionary file to write (when building) or read.
        """
        self.base_folder = base_folder
        if files != ():
            self.dictionary = self.build_dictionary(files=files, output_dict_path=dict_path)
        else:
            self.dictionary = self.read_dictionary(dict_path)

    def read_dictionary(self, dict_path: str, reverse=False):
        """Read a dictionary file with one ``<token> <index>`` pair per line.

        Blank lines are ignored.

        Args:
            dict_path: Path of the UTF-8 dictionary file.
            reverse: When true, also return the index-to-token mapping.

        Returns:
            The token-to-index dict, or a ``(forward, reverse)`` tuple when
            ``reverse`` is true.
        """
        dictionary = {}
        with open(dict_path, encoding='utf8') as d:
            for item in d:
                if not item.strip():
                    # Tolerate blank lines (e.g. a trailing empty line) instead of
                    # failing with IndexError on the missing index field.
                    continue
                pair = item.split(' ')
                dictionary[pair[0]] = int(pair[1])
        if reverse:
            return dictionary, {idx: token for token, idx in dictionary.items()}
        return dictionary

    def build_dictionary(self, *, content: str = '', files: tuple = (), output_dict_path: str = '',
                         reverse: bool = False):
        """Build a character-to-index dictionary from text and/or files.

        The four special tokens come first with their predefined values; the
        remaining characters are numbered from ``len(special tokens)`` upward
        in sorted order, so repeated runs on the same input yield the same
        mapping. Carriage returns, newlines and spaces are not included.

        Args:
            content: Text whose characters are added to the dictionary.
            files: File names (opened as ``self.base_folder + file``) whose
                characters are added to the dictionary.
            output_dict_path: When non-empty, the dictionary is written there
                as ``<token> <index>`` lines.
            reverse: When true, also return the index-to-token mapping.

        Returns:
            The token-to-index dict, or a ``(forward, reverse)`` tuple when
            ``reverse`` is true.

        Raises:
            Exception: If both ``content`` and ``files`` are empty.
        """
        if content == '' and files == ():
            raise Exception('input is none')

        chs = set(content)
        for file in files:
            with open(self.base_folder + file, encoding='utf8') as f:
                chs.update(f.read())
        chs.difference_update(['\r', '\n', ' ', ''])
        dictionary = {BATCH_PAD: BATCH_PAD_VAL, UNK: UNK_VAL, STRT: STRT_VAL, END: END_VAL}
        # Sort before numbering: plain set iteration order of strings varies
        # between interpreter runs, which would make the indices irreproducible.
        for idx, ch in enumerate(sorted(chs), start=len(dictionary)):
            dictionary[ch] = idx

        if output_dict_path != '':
            with open(output_dict_path, 'w', encoding='utf8') as o_f:
                for token, idx in dictionary.items():
                    o_f.write(token + ' ' + str(idx) + '\n')
        if reverse:
            return dictionary, {idx: token for token, idx in dictionary.items()}
        return dictionary

    def preprocess(self):
        """Read and split the raw corpus; must be provided by subclasses."""
        raise NotImplementedError('not implement method')

    def map_to_indices(self):
        """Convert processed text to index sequences; must be provided by subclasses."""
        raise NotImplementedError('not implement method')

    def save_data(self):
        """Persist the processed data; must be provided by subclasses."""
        raise NotImplementedError('not implement method')

0 commit comments

Comments
 (0)