from sklearn.cross_validation import train_test_split
from gensim.models.doc2vec import Doc2Vec
import networkutils as net
from random import shuffle
import Evaluation
class TriDNR:
    """
    Tri-Party Deep Network Representation, IJCAI-2016

    Read the data from a `directory` which contains text, label, and structure information, initialize TriDNR from
    the Doc2Vec and DeepWalk models, then iteratively update the model with the text, label, and structure
    information.

    The `directory` should contain three files:
        docs.txt     -- text document for each node, one line per node
        labels.txt   -- class label for each node, one line per node
        adjedges.txt -- edge list of each node, one line per node

    `train_size`: fraction of training data in the range 0.0-1.0; if train_size == 0, the model reduces to purely
    unsupervised network representation learning

    `textweight`: weight of the text information, in the range 0.0-1.0

    `size`: dimensionality of the feature vectors

    `dm`: defines the doc2vec training algorithm; if `dm=1`, 'distributed memory' (PV-DM) is used,
    otherwise 'distributed bag of words' (PV-DBOW) is employed

    `min_count`: minimum number of occurrences for a word to be kept in the vocabulary

    A minimal usage sketch is given at the bottom of this file.
    """
    def __init__(self, directory=None, train_size=0.3, textweight=0.8, size=300, seed=1, workers=1, passes=10, dm=0, min_count=3):
        # Read the data
        alldocs, docindex, classlabels = net.readNetworkData(directory)
        print('%d documents, %d classes, training ratio=%f' % (len(alldocs), len(classlabels), train_size))
        print('%d classes' % len(classlabels))

        # Initialize Doc2Vec
        if train_size > 0:  # label information is available for learning
            print('Adding Label Information')
            train, test = train_test_split(alldocs, train_size=train_size, random_state=seed)
            """
            Add supervised information to the training data, i.e., use the label information for learning.
            Specifically, the doc2vec algorithm treats the tags as document IDs and learns a vector representation
            for each tag (ID). We add the class label into the tags, so each class label acts as an ID and is used
            to learn a latent representation.
            """
            alldata = train[:]
            for x in alldata:
                x.tags.append('Label_' + x.labels)
            alldata.extend(test)
        else:  # no label information is available, pure unsupervised learning
            alldata = alldocs[:]

        d2v = net.trainDoc2Vec(alldata, workers=workers, size=size, dm=dm, passes=passes, min_count=min_count)

        raw_walks, netwalks = net.getdeepwalks(directory, number_walks=20, walk_length=8)
        w2v = net.trainWord2Vec(raw_walks, buildvoc=1, passes=passes, size=size, workers=workers)

        if train_size > 0:  # print out the initial results
            print('Initialize Doc2Vec Model With Supervised Information...')
            Evaluation.evaluationEmbedModelFromTrainTest(d2v, train, test, classifierStr='SVM')
            print('Initialize Deep Walk Model')
            Evaluation.evaluationEmbedModelFromTrainTest(w2v, train, test, classifierStr='SVM')

        self.d2v = d2v
        self.w2v = w2v

        self.train(d2v, w2v, directory, alldata, passes=passes, weight=textweight)

        # Keep the model that carries the final representation: the Doc2Vec model when the text
        # information dominates, otherwise the DeepWalk (Word2Vec) model.
        if textweight > 0.5:
            self.model = d2v
        else:
            self.model = w2v
    def setWeights(self, originalModel, destModel, weight=1):
        """Copy the node vectors from one model into the other, linearly mixed according to `weight`."""
        if isinstance(originalModel, Doc2Vec):
            print('Copy Weights from Doc2Vec to Word2Vec')
            # destModel.reset_weights()
            doctags = originalModel.docvecs.doctags
            keys = destModel.vocab.keys()
            for key in keys:
                if key not in doctags:
                    continue
                index = doctags[key].index       # Doc2Vec index
                id = destModel.vocab[key].index  # Word2Vec index
                destModel.syn0[id] = (1 - weight) * destModel.syn0[id] + weight * originalModel.docvecs.doctag_syn0[index]
                destModel.syn0_lockf[id] = originalModel.docvecs.doctag_syn0_lockf[index]
        else:  # originalModel is a Word2Vec instance
            print('Copy Weights from Word2Vec to Doc2Vec')
            assert isinstance(destModel, Doc2Vec)
            doctags = destModel.docvecs.doctags
            keys = originalModel.vocab.keys()
            for key in keys:
                if key not in doctags:
                    continue
                index = doctags[key].index           # Doc2Vec index
                id = originalModel.vocab[key].index  # Word2Vec index
                destModel.docvecs.doctag_syn0[index] = (1 - weight) * destModel.docvecs.doctag_syn0[index] + weight * originalModel.syn0[id]
                destModel.docvecs.doctag_syn0_lockf[index] = originalModel.syn0_lockf[id]
    def train(self, d2v, w2v, directory, alldata, passes=10, weight=0.9):
        """Iteratively update the Doc2Vec (text/label) and Word2Vec (network) models, coupling them via setWeights."""
        raw_walks, walks = net.getdeepwalks(directory, number_walks=20, walk_length=10)
        for i in xrange(passes):
            print('Iterative Run %d' % i)

            # Train Word2Vec on the random walks, initialized from the current Doc2Vec vectors
            self.setWeights(d2v, w2v, weight=weight)
            shuffle(raw_walks)
            print("Update W2V...")
            w2v.train(raw_walks)

            # Train Doc2Vec on the documents, initialized from the current Word2Vec vectors
            self.setWeights(w2v, d2v, weight=(1 - weight))
            print("Update D2V...")
            shuffle(alldata)  # shuffling gets best results
            d2v.train(alldata)
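

# Minimal usage sketch. It assumes a data directory such as 'data/M10' (the name is
# illustrative) laid out as the class docstring describes, with the companion modules
# `networkutils` and `Evaluation` from this repository importable; the parameter values
# below are examples only, not recommended settings.
if __name__ == '__main__':
    tridnr = TriDNR(directory='data/M10', train_size=0.3, textweight=0.8,
                    size=100, passes=10, dm=0)
    # With textweight > 0.5 the combined representation is kept in the Doc2Vec model, so
    # node vectors are looked up by their document tags (assumed here to be string node IDs).
    node_vector = tridnr.model.docvecs['1']
    print(node_vector)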