nce-word2vec.py
# coding=utf8
import os
import sys

import tensorflow as tf

import word2vec

flags = word2vec.flags
flags.DEFINE_string("nce_vector_path", sys.argv[4], "File name for NCE vector sample source")
flags.DEFINE_string("nce_theta_w_path", sys.argv[5], "File name for NCE theta_w sample source")
flags.DEFINE_string("nce_theta_b_path", sys.argv[6], "File name for NCE theta_b sample source")

FLAGS = flags.FLAGS
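
# File formats implied by the parsing in NCE_Word2Vec.read_nce() below:
#   nce_vector_path:  one "<word>\t<v_1 v_2 ... v_emb_dim>" line per vocabulary word
#   nce_theta_w_path: one space-separated weight row per vocabulary word
#   nce_theta_b_path: one bias value per line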


class NCE_Options(word2vec.Options):

  def __init__(self):
    word2vec.Options.__init__(self)
    # Non-empty sentinel: read_nce() only loads the NCE files when nce_path is non-empty.
    self.nce_path = '1'
    self.nce_vector_path = FLAGS.nce_vector_path
    self.nce_theta_w_path = FLAGS.nce_theta_w_path
    self.nce_theta_b_path = FLAGS.nce_theta_b_path


class NCE_Word2Vec(word2vec.Word2Vec):
  """Noise-Contrastive Estimation Word2Vec model (Skipgram)."""

  def read_nce(self):
    opt = self._options
    nce_vector = []
    nce_theta_w = []
    nce_theta_b = []
    if opt.nce_path != '':
      with open(opt.nce_vector_path, 'r') as f_vector, \
           open(opt.nce_theta_w_path, 'r') as f_theta_w, \
           open(opt.nce_theta_b_path, 'r') as f_theta_b:
        for _line in f_vector:
          _content = _line.strip().split('\t')
          if len(_content) != 2:
            continue
          _word, _vector = _content
          nce_vector.append(map(float, _vector.split(' ')))
        for _line in f_theta_w:
          nce_theta_w.append(map(float, _line.split(' ')))
        for _line in f_theta_b:
          nce_theta_b.append(float(_line))
      # All constants (used as the sampling distribution).
      # Embedding (trained beforehand on another training set): [vocab_size, emb_dim]
      self._nce_vector = tf.constant(nce_vector)
      # Theta_w (trained beforehand ...): [vocab_size, emb_dim]
      self._nce_theta_w = tf.constant(nce_theta_w)
      # Theta_b (trained beforehand ...): [vocab_size]
      self._nce_theta_b = tf.constant(nce_theta_b)
    else:
      print('Error: no NCE path specified!')

  def forward(self, examples, labels):
    """Build the graph for the forward pass (adding the NCE part)."""
    opts = self._options

    # Declare all variables we need.
    # Embedding: [vocab_size, emb_dim]
    init_width = 0.5 / opts.emb_dim
    emb = tf.Variable(
        tf.random_uniform(
            [opts.vocab_size, opts.emb_dim], -init_width, init_width),
        name="emb")
    self._emb = emb

    # Softmax weight: [vocab_size, emb_dim]. Transposed.
    sm_w_t = tf.Variable(
        tf.zeros([opts.vocab_size, opts.emb_dim]),
        name="sm_w_t")

    # Softmax bias: [vocab_size].
    sm_b = tf.Variable(tf.zeros([opts.vocab_size]), name="sm_b")
    self._theta_w = sm_w_t
    self._theta_b = sm_b

    # Global step: scalar, i.e., shape [].
    self.global_step = tf.Variable(0, name="global_step")

    # Nodes to compute the NCE loss w/ candidate sampling.
    labels_matrix = tf.reshape(
        tf.cast(labels, dtype=tf.int64),
        [opts.batch_size, 1])

    # Embeddings for examples: [batch_size, emb_dim]
    example_emb = tf.nn.embedding_lookup(emb, examples)

    # Negative sampling.
    sampled_ids = self.nce_sample(labels_matrix, example_emb)

    # Weights for labels: [batch_size, emb_dim]
    true_w = tf.nn.embedding_lookup(sm_w_t, labels)
    # Biases for labels: [batch_size, 1]
    true_b = tf.nn.embedding_lookup(sm_b, labels)

    # Weights for sampled ids: [num_sampled, emb_dim]
    sampled_w = tf.nn.embedding_lookup(sm_w_t, sampled_ids)
    # Biases for sampled ids: [num_sampled, 1]
    sampled_b = tf.nn.embedding_lookup(sm_b, sampled_ids)

    # True logits: [batch_size, 1]
    true_logits = tf.reduce_sum(tf.multiply(example_emb, true_w), 1) + true_b

    # Sampled logits: [batch_size, num_sampled]
    # We replicate sampled noise labels for all examples in the batch
    # using the matmul.
    sampled_b_vec = tf.reshape(sampled_b, [opts.num_samples])
    sampled_logits = tf.matmul(example_emb,
                               sampled_w,
                               transpose_b=True) + sampled_b_vec
    return true_logits, sampled_logits
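
  # The (true_logits, sampled_logits) pair returned by forward() feeds the
  # inherited nce_loss(); in the upstream TensorFlow word2vec tutorial code that
  # method applies sigmoid cross-entropy with targets 1 for the true pairs and 0
  # for the sampled noise, summing both terms and averaging over the batch.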

  def build_graph(self):
    """Build the graph for the full model."""
    opts = self._options
    # The training data. A text file.
    (words, counts, words_per_epoch, self._epoch, self._words, examples,
     labels) = word2vec.skipgram(filename=opts.train_data,
                                 batch_size=opts.batch_size,
                                 window_size=opts.window_size,
                                 min_count=opts.min_count,
                                 subsample=opts.subsample)
    (opts.vocab_words, opts.vocab_counts,
     opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
    opts.vocab_size = len(opts.vocab_words)

    print("Load NCE sample...")
    self.read_nce()
    print("NCE file: ", opts.nce_path)

    self._examples = examples
    self._labels = labels
    self._id2word = opts.vocab_words
    for i, w in enumerate(self._id2word):
      self._word2id[w] = i
    true_logits, sampled_logits = self.forward(examples, labels)
    loss = self.nce_loss(true_logits, sampled_logits)
    # scalar_summary("NCE loss", loss)
    self._loss = loss
    self.optimize(loss)

    # Properly initialize all variables.
    tf.global_variables_initializer().run()
    # tf.initialize_all_variables().run()

    self.saver = tf.train.Saver({"emb": self._emb})
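
  # Note: the Saver built above checkpoints only the "emb" embedding variable;
  # the softmax parameters sm_w_t and sm_b are not written to the checkpoint.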

  def nce_sample(self, labels, batch_emb):
    """Return NCE negative sample ids."""
    opts = self._options
    # media_matrix = tf.matmul(batch_emb, self._nce_theta_w, transpose_b=True) + self._nce_theta_b
    # nce_distribution = tf.reduce_sum(media_matrix, 0) / self._options.batch_size
    unigrams = [1 for _ in xrange(opts.vocab_size)]
    rough_sampled_ids, _, _ = (tf.nn.fixed_unigram_candidate_sampler(
        true_classes=labels,
        num_true=1,
        num_sampled=opts.num_samples * 5,  # oversample so we can keep the top k
        unique=True,
        range_max=opts.vocab_size,
        distortion=0.75,
        unigrams=unigrams))
    # Score the rough candidates with the pre-trained parameters loaded in read_nce().
    real_nce_theta_w = tf.nn.embedding_lookup(self._nce_theta_w, rough_sampled_ids)
    real_nce_theta_b = tf.nn.embedding_lookup(self._nce_theta_b, rough_sampled_ids)
    media_matrix = tf.matmul(batch_emb, real_nce_theta_w, transpose_b=True) + real_nce_theta_b
    rough_sampled_logits = tf.reduce_sum(media_matrix, 0) / self._options.batch_size
    # rough_sampled_logits = tf.nn.embedding_lookup(nce_distribution, rough_sampled_ids)
    _, top_k_sampled_logits_id = tf.nn.top_k(rough_sampled_logits, opts.num_samples)
    sampled_ids = tf.nn.embedding_lookup(rough_sampled_ids, top_k_sampled_logits_id)
    return sampled_ids
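
# Two-stage negative sampling used by NCE_Word2Vec.nce_sample(): a uniform
# fixed_unigram_candidate_sampler first draws num_samples * 5 candidate ids,
# the candidates are scored against the current batch embeddings using the
# pre-trained (theta_w, theta_b) loaded in read_nce(), and only the
# num_samples highest-scoring ids are kept as negatives.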


def _start_shell(local_ns=None):
  # An interactive shell is useful for debugging/development.
  import IPython
  user_ns = {}
  if local_ns:
    user_ns.update(local_ns)
  user_ns.update(globals())
  IPython.start_ipython(argv=[], user_ns=user_ns)


def main(_):
  """Train a word2vec model with NCE sampling."""
  if not FLAGS.train_data or not FLAGS.eval_data or not FLAGS.save_path:
    print("--train_data, --eval_data and --save_path must be specified.")
    sys.exit(1)
  opts = NCE_Options()
  mConfig = tf.ConfigProto(allow_soft_placement=True)
  mConfig.gpu_options.allocator_type = 'BFC'
  mConfig.gpu_options.per_process_gpu_memory_fraction = 0.8
  # with tf.Graph().as_default(), tf.Session() as session:
  with tf.Graph().as_default(), tf.Session(config=mConfig) as session:
    with tf.device("/cpu:0"):
      # Use the NCE variant defined above; the plain word2vec.Word2Vec would
      # never call read_nce()/nce_sample().
      model = NCE_Word2Vec(opts, session)
    for epoch in xrange(opts.epochs_to_train):
      model.train()  # Process one epoch
      model.save_vec(FLAGS.vector_path, epoch)  # Save embeddings for this epoch
    # Perform a final save.
    model.saver.save(session,
                     os.path.join(opts.save_path, "model.ckpt"),
                     global_step=model.global_step)
    model.save_vec(FLAGS.vector_path, -1)  # Final save
    if FLAGS.interactive:
      # E.g.,
      # [0]: model.analogy(b'france', b'paris', b'russia')
      # [1]: model.nearby([b'proton', b'elephant', b'maxwell'])
      _start_shell(locals())


if __name__ == "__main__":
  tf.app.run()
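
# Usage note: besides the flags defined in word2vec.py (--train_data, --eval_data,
# --save_path, plus the vector_path used by save_vec and the interactive flag),
# this script reads three extra file paths from fixed positions sys.argv[4:7] at
# import time, so the NCE vector/theta_w/theta_b files must occupy exactly those
# argv slots.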