-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathclassify.py
88 lines (70 loc) · 3 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import time,json
from utils import get_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC,LinearSVC
from random import randint
import numpy as np
import logging
from sklearn.model_selection import GridSearchCV
from utils import get_class_labels
import os
# Configure the root logger so INFO-level (and more severe) records are processed.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def subgraph2vec_tokenizer(s):
    '''
    Extract the target token from every line of a subgraph2vec sentence string.

    Each line has the form <target> <context1> <context2> ...; only the text before the
    first space is kept and the context strings are discarded.

    :param s: content of a subgraph2vec sentence file (newline-separated lines).
    :return: list with one target string per line, in order. A line that is empty or starts
             with a space contributes an empty string.
    '''
    targets = []
    for sentence in s.split('\n'):
        # partition() yields everything before the first space, or the whole line if there is none.
        target, _, _ = sentence.partition(' ')
        targets.append(target)
    return targets
def linear_svm_classify(X_train, X_test, Y_train, Y_test):
    '''
    Fit a linear SVM on the training split and score it on the test split.

    The regularisation parameter C is chosen by a 5-fold cross-validated grid search
    (scored by accuracy) that only sees the training data.

    :param X_train: training feature vectors
    :param X_test: testing feature vectors
    :param Y_train: training set labels
    :param Y_test: test set labels
    :return: accuracy of the fitted grid search's predictions on the test set
    '''
    c_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    search = GridSearchCV(
        estimator=LinearSVC(),
        param_grid=c_grid,
        scoring='accuracy',
        cv=5,
        verbose=0,
    )
    search.fit(X_train, Y_train)
    return accuracy_score(Y_test, search.predict(X_test))
def perform_classification(corpus_dir, extn, embeddings, class_labels_fname):
    '''
    Evaluate graph embeddings with 10-fold stratified cross-validation of a linear SVM.

    :param corpus_dir: folder containing subgraph2vec sentence files
    :param extn: extension of subgraph2vec sentence files
    :param embeddings: feature matrix with one row per graph. Rows are assumed to be in the
                       same order as the files returned by get_files(corpus_dir, extn)
                       -- TODO confirm against callers. Lists/tuples are converted to an
                       ndarray; any other container is used as given.
    :param class_labels_fname: file containing the label of each graph
    :return: mean test accuracy over the 10 folds (mean and standard deviation are also printed)
    '''
    from sklearn.model_selection import StratifiedKFold

    wlk_files = get_files(corpus_dir, extn)
    Y = np.array(get_class_labels(wlk_files, class_labels_fname))

    # A plain Python sequence cannot be indexed with the index arrays produced by the
    # splitter, so convert it. Other inputs (ndarray, sparse matrix) are left untouched.
    if isinstance(embeddings, (list, tuple)):
        X = np.asarray(embeddings)
    else:
        X = embeddings

    # random_state=None: the shuffle is not seeded, so folds (and scores) vary between runs.
    kf = StratifiedKFold(10, shuffle=True, random_state=None)
    accs = [
        linear_svm_classify(X[train_index], X[test_index], Y[train_index], Y[test_index])
        for train_index, test_index in kf.split(X, Y)
    ]

    mean_acc = np.mean(accs)
    print(mean_acc, np.std(accs))
    return mean_acc
# Script entry point: running this file directly currently does nothing.
if __name__ == '__main__':
    pass