# util.py
from collections import Counter, OrderedDict

import six
import torch
from torch.autograd import Variable
from torch.nn import Module
from torchtext.data.dataset import Dataset
from torchtext.data.field import RawField, Field
from torchtext.data.pipeline import Pipeline


class MultiMarginHierarchyLoss(Module):
r"""Creates a criterion that optimizes a multi-class classification hinge
loss (margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and
output `y` (which is a 1D tensor of target class indices,
    `0` <= `y` <= `x.size(1) - 1`):
For each mini-batch sample::
loss(x, y) = sum_i(max(0, (margin - x[y] + x[i]))^p) / x.size(0)
where `i == 0` to `x.size(0)` and `i != y`.
Optionally, you can give non-equal weighting on the classes by passing
a 1D `weight` tensor into the constructor.
The loss function then becomes:
        loss(x, y) = sum_i(max(0, w[y] * (margin - x[y] + x[i]))^p) / x.size(0)
By default, the losses are averaged over observations for each minibatch.
However, if the field `size_average` is set to ``False``,
    the losses are instead summed.
    Unlike `nn.MultiMarginLoss` (whose description the formulas above follow),
    `forward` here expects `target` to be a multi-hot `(N, C)` tensor, and
    `neighbor_maps` is expected to map each label index (as a string key) to a
    list of neighboring class indices. Neighbors of a target label are meant to
    be penalized with the smaller `neighbor_margin` rather than `margin`; note
    that `forward` below is currently an unfinished stub and returns no loss.
    """
def __init__(self, neighbor_maps, class_size=2, p=1, neighbor_margin=0.5,
margin=1, weight=None, size_average=True):
super(MultiMarginHierarchyLoss, self).__init__()
if p != 1 and p != 2:
raise ValueError("only p == 1 and p == 2 supported")
assert weight is None or weight.dim() == 1
self.class_size = class_size
self.neighbor_maps = neighbor_maps
self.neighbor_margin = neighbor_margin
self.p = p
self.margin = margin
self.size_average = size_average
self.weight = weight
    def forward(self, input, target):
        # return multi_margin_loss(input, target, self.p, self.margin,
        #                          self.weight, self.size_average)
        batch_size = input.size(0)
        for b in range(batch_size):
            # indices of the positive labels for this example (target is multi-hot)
            tgt_labels = target[b].nonzero().view(-1)
            # flatten the neighbor lists of every positive label into one group
            neighbor_lists = [self.neighbor_maps[str(int(tgt_label))]
                              for tgt_label in tgt_labels]
            neighbor_inds = [ind for sublist in neighbor_lists for ind in sublist]
            # TODO: compute the hinge loss over the `class_size` classes, using
            # `self.neighbor_margin` for indices in `neighbor_inds` and
            # `self.margin` for everything else; this is currently a stub.
            for i in range(self.class_size):
                pass
        return
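

# What follows is a minimal, self-contained sketch (not part of the original file)
# of the plain multi-class hinge loss described in the docstring above, ignoring
# the neighbor hierarchy. It assumes `input` is an (N, C) score matrix and
# `target` is an (N,) LongTensor of class indices; the helper name is hypothetical.
def _multi_margin_loss_sketch(input, target, margin=1.0, p=1, size_average=True):
    num_classes = input.size(1)
    # score of the true class, expanded to (N, C) so it can be compared per class
    correct = input.gather(1, target.view(-1, 1)).expand_as(input)
    hinge = (margin - correct + input).clamp(min=0) ** p    # (N, C) margin violations
    # the i == y term always contributes margin**p (for margin >= 0), so subtract it out
    per_sample = (hinge.sum(1) - margin ** p) / num_classes
    return per_sample.mean() if size_average else per_sample.sum()
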
# We also implement the newer BCE-with-logits loss, which supports turning off reduction (reduce=False).
def binary_cross_entropy_with_logits(input, target, weight=None, size_average=True, reduce=True):
r"""Function that measures Binary Cross Entropy between target and output
logits.
See :class:`~torch.nn.BCEWithLogitsLoss` for details.
Args:
input: Variable of arbitrary shape
target: Variable of the same shape as input
weight (Variable, optional): a manual rescaling weight
if provided it's repeated to match input tensor shape
size_average (bool, optional): By default, the losses are averaged
over observations for each minibatch. However, if the field
            size_average is set to ``False``, the losses are instead summed
for each minibatch. Default: ``True``
reduce (bool, optional): By default, the losses are averaged or summed over
observations for each minibatch depending on size_average. When reduce
is False, returns a loss per input/target element instead and ignores
size_average. Default: True
Examples::
>>> input = torch.randn(3, requires_grad=True)
>>> target = torch.FloatTensor(3).random_(2)
        >>> loss = binary_cross_entropy_with_logits(input, target)
>>> loss.backward()
"""
if not (target.size() == input.size()):
raise ValueError("Target size ({}) must be the same as input size ({})".format(target.size(), input.size()))
    # Numerically stable evaluation of -[t*log(sigmoid(x)) + (1-t)*log(1-sigmoid(x))]:
    # max_val keeps every exponent non-positive (the log-sum-exp trick), so
    # large-magnitude logits cannot overflow.
    max_val = (-input).clamp(min=0)
    loss = input - input * target + max_val + ((-max_val).exp() + (-input - max_val).exp()).log()
if weight is not None:
loss = loss * weight
if not reduce:
return loss
elif size_average:
return loss.mean()
else:
return loss.sum()
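

# Assumed sanity check (not part of the original file): for moderate logits the
# stable formulation above should agree with naive sigmoid + binary cross entropy.
# The helper name is hypothetical.
def _check_bce_with_logits(tol=1e-5):
    x = Variable(torch.randn(4, 3))
    t = Variable(torch.rand(4, 3))
    stable = binary_cross_entropy_with_logits(x, t)
    naive = -(t * torch.sigmoid(x).log()
              + (1 - t) * (1 - torch.sigmoid(x)).log()).mean()
    return bool((stable - naive).abs().data.max() < tol)
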
class BCEWithLogitsLoss(Module):
r"""This loss combines a `Sigmoid` layer and the `BCELoss` in one single
class. This version is more numerically stable than using a plain `Sigmoid`
followed by a `BCELoss` as, by combining the operations into one layer,
we take advantage of the log-sum-exp trick for numerical stability.
The loss can be described as:
.. math::
\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
l_n = - w_n \left[ t_n \cdot \log \sigma(x_n)
+ (1 - t_n) \cdot \log (1 - \sigma(x_n)) \right],
where :math:`N` is the batch size. If reduce is ``True``, then
.. math::
\ell(x, y) = \begin{cases}
        \operatorname{mean}(L), & \text{if size\_average} = \text{True},\\
        \operatorname{sum}(L),  & \text{if size\_average} = \text{False}.
\end{cases}
    This is used for measuring the error of a reconstruction, for example in
    an auto-encoder. Note that the targets `t[i]` should be numbers
    between 0 and 1.
Args:
weight (Tensor, optional): a manual rescaling weight given to the loss
of each batch element. If given, has to be a Tensor of size
"nbatch".
size_average (bool, optional): By default, the losses are averaged
over observations for each minibatch. However, if the field
size_average is set to ``False``, the losses are instead summed for
each minibatch. Default: ``True``
reduce (bool, optional): By default, the losses are averaged or summed over
observations for each minibatch depending on size_average. When reduce
is False, returns a loss per input/target element instead and ignores
size_average. Default: True
Shape:
- Input: :math:`(N, *)` where `*` means, any number of additional
dimensions
- Target: :math:`(N, *)`, same shape as the input
Examples::
>>> loss = nn.BCEWithLogitsLoss()
>>> input = torch.randn(3, requires_grad=True)
>>> target = torch.FloatTensor(3).random_(2)
>>> output = loss(input, target)
>>> output.backward()
"""
def __init__(self, weight=None, size_average=True, reduce=True):
super(BCEWithLogitsLoss, self).__init__()
self.size_average = size_average
self.reduce = reduce
self.register_buffer('weight', weight)
def forward(self, input, target):
if self.weight is not None:
var = Variable(self.weight) if not isinstance(self.weight, Variable) else self.weight
return binary_cross_entropy_with_logits(input, target,
var,
self.size_average,
reduce=self.reduce)
else:
return binary_cross_entropy_with_logits(input, target,
size_average=self.size_average,
reduce=self.reduce)
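

# Assumed usage sketch (not part of the original file): the main reason for this
# local copy is the `reduce` flag, which the installed torch.nn.BCEWithLogitsLoss
# may lack. With reduce=False the criterion returns one loss value per element:
# >>> criterion = BCEWithLogitsLoss(reduce=False)
# >>> scores = Variable(torch.randn(2, 5))
# >>> labels = Variable(torch.rand(2, 5))
# >>> criterion(scores, labels).size()   # torch.Size([2, 5])
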
class ReversibleField(Field):
    """A Field whose numericalized batches can be converted back into readable
    text via `reverse`, optionally using revtok for detokenization."""
    def __init__(self, **kwargs):
if kwargs.get('tokenize') is list:
self.use_revtok = False
else:
self.use_revtok = True
if kwargs.get('tokenize') not in ('revtok', 'subword', list):
kwargs['tokenize'] = 'revtok'
if 'unk_token' not in kwargs:
kwargs['unk_token'] = ' UNK '
super(ReversibleField, self).__init__(**kwargs)
def reverse(self, batch):
if self.use_revtok:
try:
import revtok
except ImportError:
print("Please install revtok.")
raise
if not self.batch_first:
batch = batch.t()
with torch.cuda.device_of(batch):
batch = batch.tolist()
batch = [[self.vocab.itos[ind] for ind in ex] for ex in batch] # denumericalize
def trim(s, t):
sentence = []
for w in s:
if w == t:
break
sentence.append(w)
return sentence
        batch = [trim(ex, self.eos_token) for ex in batch]  # trim past first eos
def filter_special(tok):
return tok not in (self.init_token, self.pad_token)
batch = [filter(filter_special, ex) for ex in batch]
if self.use_revtok:
return [revtok.detokenize(ex) for ex in batch]
return [' '.join(ex) for ex in batch]
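

# Assumed usage sketch (not part of the original file): `reverse` maps a batch of
# vocabulary indices back to strings, e.g.
# >>> field = ReversibleField(batch_first=True)
# >>> # ... build the vocab and numericalize a batch into `batch_tensor` ...
# >>> decoded = field.reverse(batch_tensor)   # list of detokenized strings
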
class MultiLabelField(RawField):
    """Defines a multi-label datatype together with instructions for converting
    it to a (multi-hot) Tensor.
Field class models common text processing datatypes that can be represented
by tensors. It holds a Vocab object that defines the set of possible values
for elements of the field and their corresponding numerical representations.
The Field object also holds other parameters relating to how a datatype
should be numericalized, such as a tokenization method and the kind of
Tensor that should be produced.
If a Field is shared between two columns in a dataset (e.g., question and
answer in a QA dataset), then they will have a shared vocabulary.
Attributes:
sequential: Whether the datatype represents sequential data. If False,
no tokenization is applied. Default: True.
use_vocab: Whether to use a Vocab object. If False, the data in this
field should already be numerical. Default: True.
init_token: A token that will be prepended to every example using this
field, or None for no initial token. Default: None.
eos_token: A token that will be appended to every example using this
field, or None for no end-of-sentence token. Default: None.
fix_length: A fixed length that all examples using this field will be
padded to, or None for flexible sequence lengths. Default: None.
tensor_type: The torch.Tensor class that represents a batch of examples
of this kind of data. Default: torch.LongTensor.
preprocessing: The Pipeline that will be applied to examples
using this field after tokenizing but before numericalizing. Many
Datasets replace this attribute with a custom preprocessor.
Default: None.
postprocessing: A Pipeline that will be applied to examples using
this field after numericalizing but before the numbers are turned
into a Tensor. The pipeline function takes the batch as a list,
the field's Vocab, and train (a bool).
Default: None.
lower: Whether to lowercase the text in this field. Default: False.
tokenize: The function used to tokenize strings using this field into
sequential examples. If "spacy", the SpaCy English tokenizer is
used. Default: str.split.
        include_lengths: Whether to return a tuple of a padded minibatch and
            a list containing the lengths of each example, or just a padded
            minibatch. Default: False.
batch_first: Whether to produce tensors with the batch dimension first.
Default: False.
pad_token: The string token used as padding. Default: "<pad>".
unk_token: The string token used to represent OOV words. Default: "<unk>".
"""
vocab_cls = Vocab
# Dictionary mapping PyTorch tensor types to the appropriate Python
# numeric type.
tensor_types = {
torch.FloatTensor: float,
torch.cuda.FloatTensor: float,
torch.DoubleTensor: float,
torch.cuda.DoubleTensor: float,
torch.HalfTensor: float,
torch.cuda.HalfTensor: float,
torch.ByteTensor: int,
torch.cuda.ByteTensor: int,
torch.CharTensor: int,
torch.cuda.CharTensor: int,
torch.ShortTensor: int,
torch.cuda.ShortTensor: int,
torch.IntTensor: int,
torch.cuda.IntTensor: int,
torch.LongTensor: int,
torch.cuda.LongTensor: int
}
def __init__(
self, label_size, sequential=True, use_vocab=True, init_token=None,
eos_token=None, fix_length=None, tensor_type=torch.LongTensor,
preprocessing=None, postprocessing=None, lower=False,
tokenize=(lambda s: s.split()), include_lengths=False,
batch_first=False, pad_token="<pad>", unk_token="<unk>"):
self.sequential = sequential
self.use_vocab = use_vocab
self.init_token = init_token
self.eos_token = eos_token
self.unk_token = unk_token
self.fix_length = fix_length
self.tensor_type = tensor_type
self.preprocessing = preprocessing
self.postprocessing = postprocessing
self.lower = lower
self.tokenize = tokenize
self.include_lengths = include_lengths
self.batch_first = batch_first
self.pad_token = pad_token if self.sequential else None
self.label_size = label_size
def preprocess(self, x):
"""Load a single example using this field, tokenizing if necessary.
If the input is a Python 2 `str`, it will be converted to Unicode
first. If `sequential=True`, it will be tokenized. Then the input
will be optionally lowercased and passed to the user-provided
`preprocessing` Pipeline."""
if (six.PY2 and isinstance(x, six.string_types) and not
isinstance(x, six.text_type)):
x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
        # strip the trailing newline, then tokenize
if self.sequential and isinstance(x, six.text_type):
x = self.tokenize(x.rstrip('\n'))
if self.lower:
x = Pipeline(six.text_type.lower)(x)
if self.preprocessing is not None:
return self.preprocessing(x)
else:
return x
def process(self, batch, device, train):
""" Process a list of examples to create a torch.Tensor.
Pad, numericalize, and postprocess a batch and create a tensor.
Args:
batch (list(object)): A list of object from a batch of examples.
Returns:
            data (torch.autograd.Variable): Processed object given the input
and custom postprocessing Pipeline.
"""
padded = self.pad(batch)
tensor = self.numericalize(padded, device=device, train=train)
return tensor
    def pad(self, minibatch):
        """Return the minibatch unchanged, as a list.
        Unlike `Field.pad`, no padding is applied here: each example is a list
        of label indices, and "padding" is effectively handled in
        `numericalize`, which turns every example into a fixed-size multi-hot
        vector of length `self.label_size`.
        """
        minibatch = list(minibatch)
        # we handle "padding" at numericalization
        return minibatch
#
# if self.fix_length is None:
# max_len = max(len(x) for x in minibatch)
# else:
# max_len = self.fix_length + (
# self.init_token, self.eos_token).count(None) - 2
# padded, lengths = [], []
# for x in minibatch:
# padded.append(
# ([] if self.init_token is None else [self.init_token]) +
# list(x[:max_len]) +
# ([] if self.eos_token is None else [self.eos_token]) +
# [self.pad_token] * max(0, max_len - len(x)))
# lengths.append(len(padded[-1]) - max(0, max_len - len(x)))
# if self.include_lengths:
# return (padded, lengths)
# return padded
def build_vocab(self, *args, **kwargs):
"""Construct the Vocab object for this field from one or more datasets.
Arguments:
Positional arguments: Dataset objects or other iterable data
sources from which to construct the Vocab object that
represents the set of possible values for this field. If
a Dataset object is provided, all columns corresponding
to this field are used; individual columns can also be
provided directly.
Remaining keyword arguments: Passed to the constructor of Vocab.
"""
counter = Counter()
sources = []
for arg in args:
if isinstance(arg, Dataset):
sources += [getattr(arg, name) for name, field in
arg.fields.items() if field is self]
else:
sources.append(arg)
for data in sources:
for x in data:
if not self.sequential:
x = [x]
counter.update(x)
specials = list(OrderedDict.fromkeys(
tok for tok in [self.unk_token, self.pad_token, self.init_token,
self.eos_token]
if tok is not None))
self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)
def numericalize(self, arr, device=None, train=True):
"""Turn a batch of examples that use this field into a Variable.
If the field has include_lengths=True, a tensor of lengths will be
included in the return value.
Arguments:
arr (List[List[str]], or tuple of (List[List[str]], List[int])):
List of tokenized and padded examples, or tuple of List of
tokenized and padded examples and List of lengths of each
example if self.include_lengths is True.
device (-1 or None): Device to create the Variable's Tensor on.
Use -1 for CPU and None for the currently active GPU device.
Default: None.
train (boolean): Whether the batch is for a training set.
If False, the Variable will be created with volatile=True.
Default: True.
"""
if self.include_lengths and not isinstance(arr, tuple):
raise ValueError("Field has include_lengths set to True, but "
"input data is not a tuple of "
"(data batch, batch lengths).")
if isinstance(arr, tuple):
arr, lengths = arr
lengths = torch.LongTensor(lengths)
if self.use_vocab:
if self.sequential:
arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
else:
arr = [self.vocab.stoi[x] for x in arr]
if self.postprocessing is not None:
arr = self.postprocessing(arr, self.vocab, train)
else:
if self.tensor_type not in self.tensor_types:
raise ValueError(
"Specified Field tensor_type {} can not be used with "
"use_vocab=False because we do not know how to numericalize it. "
"Please raise an issue at "
"https://github.com/pytorch/text/issues".format(self.tensor_type))
numericalization_func = self.tensor_types[self.tensor_type]
            # It doesn't make sense to explicitly coerce to a numeric type if
            # the data is sequential, since it's unclear how to coerce padding tokens
            # to a numeric type.
if not self.sequential:
arr = [numericalization_func(x) for x in arr]
        if self.sequential:
            # encode each example (a list of label indices) as a multi-hot
            # vector of length `self.label_size`
            batches = []
            for x in arr:
                zeros = [0.] * self.label_size
                for l in x:
                    zeros[int(l)] = 1.
                batches.append(zeros)
            arr = batches
if self.postprocessing is not None:
arr = self.postprocessing(arr, None, train)
arr = self.tensor_type(arr)
if device == -1:
if self.sequential:
arr = arr.contiguous()
else:
arr = arr.cuda(device)
return Variable(arr, volatile=not train)
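

# Assumed usage sketch (not part of the original file): with use_vocab=False the
# field turns lists of label indices into multi-hot rows. The `LABEL` name and
# the exact arguments are illustrative.
# >>> LABEL = MultiLabelField(label_size=5, use_vocab=False,
# ...                         tensor_type=torch.FloatTensor, batch_first=True)
# >>> LABEL.process([[0, 2], [4]], device=-1, train=True)
# -> Variable wrapping a 2x5 FloatTensor: [[1, 0, 1, 0, 0], [0, 0, 0, 0, 1]]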