# skipgram.py
import math
from time import time

import tensorflow as tf

from classify import perform_classification

class skipgram(object):
    '''
    Skipgram model with negative sampling; see Mikolov et al. (2013),
    "Distributed Representations of Words and Phrases and their Compositionality".
    '''
    def __init__(self, corpus_dir, extn, class_labels_fname,
                 num_graphs, num_subgraphs, learning_rate, embedding_size,
                 num_negsample, num_steps, corpus):
        self.corpus_dir = corpus_dir
        self.extn = extn
        self.class_labels_fname = class_labels_fname
        self.num_graphs = num_graphs
        self.num_subgraphs = num_subgraphs
        self.embedding_size = embedding_size
        self.num_negsample = num_negsample
        self.learning_rate = learning_rate
        self.num_steps = num_steps
        self.corpus = corpus
        self.graph, self.batch_inputs, self.batch_labels, self.normalized_embeddings, \
            self.loss, self.optimizer = self.trainer_initial()

    def trainer_initial(self):
        graph = tf.Graph()
        with graph.as_default():
            batch_inputs = tf.placeholder(tf.int32, shape=[None])     # target graph ids
            batch_labels = tf.placeholder(tf.int64, shape=[None, 1])  # context subgraph ids
            graph_embeddings = tf.Variable(
                tf.random_uniform([self.num_graphs, self.embedding_size],
                                  -0.5 / self.embedding_size, 0.5 / self.embedding_size))
            batch_graph_embeddings = tf.nn.embedding_lookup(graph_embeddings, batch_inputs)  # hidden layer
            weights = tf.Variable(tf.truncated_normal([self.num_subgraphs, self.embedding_size],
                                                      stddev=1.0 / math.sqrt(self.embedding_size)))  # output layer weights
            biases = tf.Variable(tf.zeros(self.num_subgraphs))  # output layer biases
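
            # The NCE loss below approximates the full softmax over all subgraphs:
            # each true (graph, subgraph) pair is contrasted against num_negsample
            # negatives drawn from the empirical subgraph frequencies raised to the
            # 0.75 power, as in Mikolov et al. (2013).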
            # negative sampling
            loss = tf.reduce_mean(
                tf.nn.nce_loss(weights=weights,
                               biases=biases,
                               labels=batch_labels,
                               inputs=batch_graph_embeddings,
                               num_sampled=self.num_negsample,
                               num_classes=self.num_subgraphs,
                               sampled_values=tf.nn.fixed_unigram_candidate_sampler(
                                   true_classes=batch_labels,
                                   num_true=1,
                                   num_sampled=self.num_negsample,
                                   unique=True,
                                   range_max=self.num_subgraphs,
                                   distortion=0.75,
                                   # frequency of each subgraph in the vocabulary
                                   unigrams=self.corpus.subgraph_id_freq_map_as_list)))
            global_step = tf.Variable(0, trainable=False)
            learning_rate = tf.train.exponential_decay(self.learning_rate, global_step,
                                                       100000, 0.96, staircase=True)  # exponential decay over time
            learning_rate = tf.maximum(learning_rate, 0.001)  # floor at 0.001 so learning never stalls completely
            optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
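
            # Note: dividing each row by its root-mean-square (below) differs from
            # unit L2 normalisation only by the constant factor sqrt(embedding_size),
            # so the direction of each embedding, and hence downstream cosine
            # similarities, are unaffected.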
            norm = tf.sqrt(tf.reduce_mean(tf.square(graph_embeddings), 1, keep_dims=True))
            normalized_embeddings = graph_embeddings / norm
        return graph, batch_inputs, batch_labels, normalized_embeddings, loss, optimizer

    def train(self, corpus, batch_size):
        with tf.Session(graph=self.graph,
                        config=tf.ConfigProto(log_device_placement=True,
                                              allow_soft_placement=False)) as sess:
            init = tf.global_variables_initializer()
            sess.run(init)

            loss = 0
            results = []
            for i in range(self.num_steps + 1):
                t0 = time()
                step = 0
                while not corpus.epoch_flag:
                    # get (target graph id, context subgraph id) pairs
                    batch_data, batch_labels = corpus.generate_batch_from_file(batch_size)
                    feed_dict = {self.batch_inputs: batch_data, self.batch_labels: batch_labels}
                    _, loss_val = sess.run([self.optimizer, self.loss], feed_dict=feed_dict)
                    loss += loss_val
                    step += 1
                corpus.epoch_flag = False
                epoch_time = time() - t0  # wall-clock time per epoch (currently unreported)
                if i == 500 or i == 900 or i == 1000:
                    print('epochs', i, 'loss', loss / step)
                    embeddings = self.normalized_embeddings.eval()
                    results.append(str(perform_classification(self.corpus_dir, self.extn,
                                                              embeddings, self.class_labels_fname)))
                loss = 0

            # done with training
            final_embeddings = self.normalized_embeddings.eval()

        with open('log', 'a+') as f:
            f.write(self.corpus_dir + ',' + str(self.learning_rate) + ',' + ','.join(results) + '\n')
        print('Done')
        return final_embeddings
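
# Example usage (a minimal sketch; the Corpus class, file paths and hyperparameter
# values below are illustrative assumptions, not part of this module). The corpus
# object is assumed to expose epoch_flag, generate_batch_from_file(batch_size) and
# subgraph_id_freq_map_as_list, exactly as used in train() and trainer_initial().
#
#   from corpus_parser import Corpus                  # hypothetical corpus loader
#   corpus = Corpus('data/graphs', extn='WL2')        # hypothetical
#   model = skipgram(corpus_dir='data/graphs', extn='WL2',
#                    class_labels_fname='data/labels.txt',
#                    num_graphs=corpus.num_graphs,          # assumed attribute
#                    num_subgraphs=corpus.num_subgraphs,    # assumed attribute
#                    learning_rate=0.5, embedding_size=128,
#                    num_negsample=10, num_steps=1000, corpus=corpus)
#   final_embeddings = model.train(corpus, batch_size=128)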