diff --git a/dnn.py b/dnn.py
index a1c3b39..f31afc6 100644
--- a/dnn.py
+++ b/dnn.py
@@ -14,7 +14,7 @@ def __init__(self, type='mlp', batch_size=10, batch_length=224, mode=TrainMode.B
     DNNBase.__init__(self)
     # parameter initialization
     self.dtype = tf.float32
-    self.skip_window_left = 0
+    self.skip_window_left = 1
     self.skip_window_right = 1
     self.window_size = self.skip_window_left + self.skip_window_right + 1
     # self.vocab_size = 4000
@@ -153,7 +153,7 @@ def __init__(self, type='mlp', batch_size=10, batch_length=224, mode=TrainMode.B
   def train_exe(self):
     tf.global_variables_initializer().run(session=self.sess)
     self.sess.graph.finalize()
-    epoches = 50
+    epoches = 100
     last_time = time.time()
     if self.mode == TrainMode.Sentence:
       for i in range(epoches):
diff --git a/pipeline.py b/pipeline.py
index 7fa7421..508b5a1 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -57,21 +57,23 @@ def evaluate_re():
     re_two = RECNN(2, window_size=w, train=False)
     # re_multi = RECNN(29, window_size=w, train=False)
    name = 'cnn_emr_model100_{0}.ckpt'.format('_'.join(map(str, w)))
-    re_two.evaluate(name)
-    # re_multi.evaluate(name)
+    prec, recall, f1 = re_two.evaluate(name)
+    # prec, recall, f1 = re_multi.evaluate(name)
+    with open('corpus/re.txt', 'a', encoding='utf8') as f:
+      f.write(name + '\t{:.2f}\t{:.2f}\t{:.2f}\n'.format(prec * 100, recall * 100, f1 * 100))
 
 
 if __name__ == '__main__':
   # named entity recognition
-  # print('mlp')
-  # evaluate_ner('tmp/mlp/mlp-ner-model20.ckpt')
-  # print('mlp+embed')
-  # evaluate_ner('tmp/mlp/mlp-ner-embed-model50.ckpt')
-  # print('lstm')
-  # evaluate_ner('tmp/lstm/lstm-ner-model50.ckpt')
-  # print('lstm+embed')
-  # evaluate_ner('tmp/lstm/lstm-ner-embed-model50.ckpt')
+  print('mlp')
+  evaluate_ner('tmp/mlp/mlp-ner-model50.ckpt')
+  print('mlp+embed')
+  evaluate_ner('tmp/mlp/mlp-ner-embed-model50.ckpt')
+  print('lstm')
+  evaluate_ner('tmp/lstm/lstm-ner-model50.ckpt')
+  print('lstm+embed')
+  evaluate_ner('tmp/lstm/lstm-ner-embed-model50.ckpt')
   # relation extraction
-  evaluate_re()
+  # evaluate_re()
   # re_two = RECNN(2, window_size=(4,), train=False)
   # re_two.evaluate('cnn_emr_model60_4.ckpt')
diff --git a/preprocess_data.py b/preprocess_data.py
index 6948eba..6c5518a 100644
--- a/preprocess_data.py
+++ b/preprocess_data.py
@@ -6,7 +6,7 @@
 
 class PreprocessData:
   def __init__(self, corpus, mode, type=CorpusType.Train, force_generate=True):
-    self.skip_window_left = 0
+    self.skip_window_left = 1
     self.skip_window_right = 1
     self.window_size = self.skip_window_left + self.skip_window_right + 1
     self.dict_path = 'corpus/' + corpus + '_dict.utf8'
@@ -58,9 +58,12 @@ def generate_sentences(self):
       extend_words = [2] * self.skip_window_left
       extend_words.extend(sentence_words)
       extend_words.extend([3] * self.skip_window_right)
+      if self.skip_window_right == 0:
+        et = enumerate(extend_words[self.skip_window_left:], self.skip_window_left)
+      else:
+        et = enumerate(extend_words[self.skip_window_left:-self.skip_window_right], self.skip_window_left)
       word_batch = list(
-        map(lambda item: extend_words[item[0] - self.skip_window_left:item[0] + self.skip_window_right + 1],
-            enumerate(extend_words[self.skip_window_left:-self.skip_window_right], self.skip_window_left)))
+        map(lambda item: extend_words[item[0] - self.skip_window_left:item[0] + self.skip_window_right + 1], et))
       characters_batch.append(np.array(word_batch, dtype=np.int32))
       labels_batch.append(np.array(self.labels[i], dtype=np.int32))
       # print(characters_batch)
diff --git a/re_cnn.py b/re_cnn.py
index 6aebd52..4558d79 100644
--- a/re_cnn.py
+++ b/re_cnn.py
@@ -5,7 +5,7 @@
 
 
 class RECNN():
-  def __init__(self, relation_count=2, window_size=(3,), batch_size=50, batch_length=85, train=True):
+  def __init__(self, relation_count=2, window_size=(3,), batch_size=50, batch_length=85, train=True):
    tf.reset_default_graph()
     self.dtype = tf.float32
     self.window_size = window_size
@@ -42,7 +42,7 @@ def __init__(self, relation_count=2, window_size=(3,), batch_size=50, batch_leng
     self.character_embedding = self.weight_variable([self.words_size, self.character_embed_size])
     self.conv_kernel = self.get_conv_kernel()
     self.bias = [self.weight_variable([self.filter_size])] * len(self.window_size)
-    self.full_connected_weight = self.weight_variable([self.filter_size*len(self.window_size), self.relation_count])
+    self.full_connected_weight = self.weight_variable([self.filter_size * len(self.window_size), self.relation_count])
     self.full_connected_bias = self.weight_variable([self.relation_count])
     self.position_lookup = tf.nn.embedding_lookup(self.position_embedding, self.input_position)
     self.character_lookup = tf.nn.embedding_lookup(self.character_embedding, self.input_characters)
@@ -57,7 +57,7 @@ def __init__(self, relation_count=2, window_size=(3,), batch_size=50, batch_leng
     if train:
       self.hidden_layer = tf.layers.dropout(self.get_hidden(), self.dropout_rate)
     else:
-      self.hidden_layer = tf.expand_dims(tf.layers.dropout(self.get_hidden(), self.dropout_rate),0)
+      self.hidden_layer = tf.expand_dims(tf.layers.dropout(self.get_hidden(), self.dropout_rate), 0)
     self.output_no_softmax = tf.matmul(self.hidden_layer, self.full_connected_weight) + self.full_connected_bias
     self.output = tf.nn.softmax(tf.matmul(self.hidden_layer, self.full_connected_weight) + self.full_connected_bias)
     self.params = [self.position_embedding, self.character_embedding, self.full_connected_weight,
@@ -99,7 +99,7 @@ def get_hidden(self):
       if self.is_train:
         h = tf.concat([h, hh], 1)
       else:
-        h = tf.concat([h,hh], 0)
+        h = tf.concat([h, hh], 0)
     return h
 
   def conv(self, conv_kernel):
@@ -159,10 +159,10 @@ def predict(self, sentences, primary_indies, secondary_indices):
     output = sess.run(self.output, feed_dict={self.input: input})
     return np.argmax(output, 1)
 
-  def evaluate(self, model_file):
-    #tf.reset_default_graph()
+  def evaluate(self, model_file, debug=False):
+    # tf.reset_default_graph()
     with tf.Session() as sess:
-      #tf.global_variables_initializer().run()
+      # tf.global_variables_initializer().run()
       self.saver.restore(sess=sess, save_path=self.output_folder + model_file)
 
       items = self.load_batches(self.test_batch_path)
@@ -184,11 +189,16 @@
           current = np.argmax(output)
           if target == current:
             corr_count[target] += 1
+          elif debug:
+            print()
           prec_count[current] += 1
           recall_count[target] += 1
-
-      precs = [c / p for c, p in zip(corr_count, prec_count) if p != 0 and c != 0]
-      recalls = [c / r for c, r in zip(corr_count, recall_count) if r != 0 and c != 0]
+      if self.relation_count == 2:
+        precs = [c / p if p != 0 else 0 for c, p in zip(corr_count, prec_count)]
+        recalls = [c / r if r != 0 else 0 for c, r in zip(corr_count, recall_count)]
+      else:
+        precs = [c / p for c, p in zip(corr_count, prec_count) if p != 0 and c != 0]
+        recalls = [c / r for c, r in zip(corr_count, recall_count) if r != 0 and c != 0]
       print(corr_count)
       print(recall_count)
       print(corr_count)
@@ -196,10 +201,9 @@
       print(recalls)
       prec = sum(precs) / len(precs)
       recall = sum(recalls) / len(recalls)
-      f1 = 2*prec*recall/(prec+recall)
-      print('precision:', prec)
-      print('recall:', recall)
-      print('f1',f1)
+      f1 = 2 * prec * recall / (prec + recall)
+      return prec, recall, f1
+
 
 def train_two():
   re_2 = RECNN(window_size=(2,))
@@ -215,20 +219,22 @@
   re_2_3_4 = RECNN(window_size=(2, 3, 4))
   re_2_3_4.train()
 
+
 def train_multi():
-  re_2 = RECNN(window_size=(2,),relation_count=29)
+  re_2 = RECNN(window_size=(2,), relation_count=29)
   re_2.train()
-  re_3 = RECNN(window_size=(3,),relation_count=29)
+  re_3 = RECNN(window_size=(3,), relation_count=29)
   re_3.train()
-  re_4 = RECNN(window_size=(4,),relation_count=29)
+  re_4 = RECNN(window_size=(4,), relation_count=29)
   re_4.train()
-  re_2_3 = RECNN(window_size=(2, 3),relation_count=29)
+  re_2_3 = RECNN(window_size=(2, 3), relation_count=29)
   re_2_3.train()
-  re_3_4 = RECNN(window_size=(3, 4),relation_count=29)
+  re_3_4 = RECNN(window_size=(3, 4), relation_count=29)
   re_3_4.train()
-  re_2_3_4 = RECNN(window_size=(2, 3, 4),relation_count=29)
+  re_2_3_4 = RECNN(window_size=(2, 3, 4), relation_count=29)
   re_2_3_4.train()
+
 
 if __name__ == '__main__':
   train_two()
   train_multi()