Source code for gluonnlp.model.sampled_block
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Blocks for sampled losses."""
__all__ = ['ISDense', 'NCEDense', 'SparseISDense', 'SparseNCEDense']
from mxnet import nd
from mxnet.gluon import Block, HybridBlock
class _SampledDenseHelper(HybridBlock):
"""A helper Block for calculating sampled pred.
Parameters
----------
num_classes: int
Number of possible classes.
num_sampled: int
Number of classes randomly sampled for each batch.
in_unit: int
Dimensionality of the input space.
remove_accidental_hits: bool
Whether to remove "accidental hits" when a sampled candidate is equal to
one of the true classes.
sparse_label: bool
Whether to output label as an integer array instead of probability distribution.
"""
def __init__(self, num_classes, num_sampled, in_unit, remove_accidental_hits,
sparse_label, prefix=None, params=None):
super(_SampledDenseHelper, self).__init__(prefix=prefix, params=params)
self._num_classes = num_classes
self._num_sampled = num_sampled
self._in_unit = in_unit
self._remove_accidental_hits = remove_accidental_hits
self._sparse_label = sparse_label
# pylint: disable=arguments-differ
def hybrid_forward(self, F, x, sampled_values, label, w_all, b_all):
"""Forward computation."""
sampled_candidates, expected_count_sampled, expected_count_true = sampled_values
# (num_sampled, in_unit)
w_sampled = w_all.slice(begin=(0, 0), end=(self._num_sampled, None))
w_true = w_all.slice(begin=(self._num_sampled, 0), end=(None, None))
b_sampled = b_all.slice(begin=(0,), end=(self._num_sampled,))
b_true = b_all.slice(begin=(self._num_sampled,), end=(None,))
# true pred
# (batch_size, 1)
x = x.reshape((-1, self._in_unit))
pred_true = (w_true * x).sum(axis=1) + b_true
        # sampled pred
# (batch_size, num_sampled)
b_sampled = F.reshape(b_sampled, (-1,))
pred_sampled = F.FullyConnected(x, weight=w_sampled, bias=b_sampled,
num_hidden=self._num_sampled)
# remove accidental hits
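        # If a sampled candidate equals the true label (an "accidental hit"), its
        # logit is pushed to a very large negative value so that the downstream
        # softmax/sigmoid loss effectively ignores that duplicate class.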
if self._remove_accidental_hits:
label_vec = F.reshape(label, (-1, 1)).astype('int32')
sample_vec = F.reshape(sampled_candidates, (1, -1)).astype('int32')
mask = F.broadcast_equal(label_vec, sample_vec).astype('float32') * -1e37
pred_sampled = pred_sampled + mask
# subtract log(q)
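        # Subtracting log(expected_count), i.e. log q(class) under the candidate
        # sampler, corrects the logits for the sampling proposal so that the
        # objective computed over the sampled subset approximates the objective
        # over the full vocabulary.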
expected_count_sampled = expected_count_sampled.astype('float32')
expected_count_sampled = expected_count_sampled.reshape(shape=(1, self._num_sampled))
expected_count_true = expected_count_true.astype('float32').reshape((-1,))
pred_true = pred_true - F.log(expected_count_true)
pred_true = pred_true.reshape((-1, 1))
pred_sampled = F.broadcast_sub(pred_sampled, F.log(expected_count_sampled))
# pred and new_labels
# (batch_size, 1+num_sampled)
pred = F.concat(pred_true, pred_sampled, dim=1)
if self._sparse_label:
new_label = F.zeros_like(label)
else:
label_vec = F.reshape(label, (-1, 1))
new_label_true = F.ones_like(label_vec)
new_label_sampled = F.zeros_like(pred_sampled)
            new_label = F.concat(new_label_true, new_label_sampled, dim=1)
return pred, new_label
def __repr__(self):
s = '{name}({mapping})'
mapping = '{0} -> {1}, with {2} samples'.format(self._in_unit, self._num_classes,
self._num_sampled)
return s.format(name=self.__class__.__name__,
mapping=mapping,
**self.__dict__)
class _SampledDense(HybridBlock):
"""Block that computes sampled output training pred and labels suitable for
sampled softmax loss or noise contrastive estimation loss.
Please use `loss.SoftmaxCrossEntropyLoss` for sampled softmax loss, and
`loss.SigmoidBinaryCrossEntropyLoss` for nce loss.
Parameters
----------
num_classes: int
Number of possible classes.
num_sampled: int
Number of classes randomly sampled for each batch.
in_unit: int
Dimensionality of the input space.
remove_accidental_hits: bool
Whether to remove "accidental hits" when a sampled candidate is equal to
one of the true classes.
dtype : str or np.dtype, default 'float32'
Data type of output embeddings.
weight_initializer : str or `Initializer`, optional
Initializer for the `kernel` weights matrix.
bias_initializer: str or `Initializer`, optional
Initializer for the bias vector.
sparse_grad: bool, default True.
Whether to use sparse gradient.
Inputs:
- **x**: A tensor of shape `(batch_size, in_unit)`. The forward activation of
the input network.
- **sampled_values** : A list of three tensors for
`sampled_classes` with shape `(num_samples,)`,
`expected_count_sampled` with shape `(num_samples,)`, and
`expected_count_true` with shape `(sequence_length, batch_size)`.
- **label**: A tensor of shape `(batch_size,1)`.
The target classes.
Outputs:
- **out**: A tensor of shape `(batch_size, 1+num_sampled)`.
The output probability for the true class and sampled classes
- **new_targets**: A tensor.
          The new target classes. The shape is `(batch_size,)` if `sparse_label` is `True`,
`(batch_size, 1+num_sampled)` otherwise.
"""
def __init__(self, num_classes, num_sampled, in_unit, remove_accidental_hits,
sparse_label, dtype='float32', weight_initializer=None,
bias_initializer='zeros', sparse_grad=True, prefix=None, params=None):
super(_SampledDense, self).__init__(prefix=prefix, params=params)
with self.name_scope():
grad_stype = 'row_sparse' if sparse_grad else 'default'
self.weight = self.params.get('weight', shape=(num_classes, in_unit),
init=weight_initializer,
dtype=dtype, grad_stype=grad_stype)
self.bias = self.params.get('bias', shape=(num_classes,), init=bias_initializer,
dtype=dtype)
self._dense = _SampledDenseHelper(num_classes, num_sampled, in_unit,
remove_accidental_hits, sparse_label)
self._num_classes = num_classes
self._num_sampled = num_sampled
self._in_unit = in_unit
self._remove_accidental_hits = remove_accidental_hits
self._sparse_grad = sparse_grad
# pylint: disable=arguments-differ
def hybrid_forward(self, F, x, sampled_values, label, weight, bias):
"""Forward computation."""
sampled_candidates, _, _ = sampled_values
# (batch_size,)
label = F.reshape(label, shape=(-1,))
# (num_sampled+batch_size,)
ids = F.concat(sampled_candidates.astype('int32'), label.astype('int32'), dim=0)
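        # Note: sampled candidates come first and the true labels last; the
        # helper relies on this ordering when slicing w_all/b_all back apart.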
# lookup weights and biases
# (num_sampled+batch_size, dim)
w_all = F.Embedding(data=ids, weight=weight,
input_dim=self._num_classes, output_dim=self._in_unit,
sparse_grad=self._sparse_grad)
        # (num_sampled+batch_size,)
b_all = F.take(bias, indices=ids)
return self._dense(x, sampled_values, label, w_all, b_all)
def __repr__(self):
s = '{name}({mapping})'
mapping = '{0} -> {1}, with {2} samples'.format(self._in_unit, self._num_classes,
self._num_sampled)
return s.format(name=self.__class__.__name__,
mapping=mapping,
**self.__dict__)
class NCEDense(_SampledDense):
"""Noise contrastive estimated Dense block, which computes sampled pred
output and labels for noise contrastive estimation loss during training.
Reference:
Exploring the Limits of Language Modeling
Jozefowicz, Rafal and Vinyals, Oriol and Schuster, Mike and Shazeer, Noam and Wu, Yonghui
https://arxiv.org/pdf/1602.02410
Please use `loss.SigmoidBinaryCrossEntropyLoss` for noise contrastive estimation loss
during training.
.. note::
        If `sparse_grad` is set to True, the gradient w.r.t. the input and output
        embeddings will be sparse. Only a subset of optimizers supports
sparse gradients, including SGD, AdaGrad and Adam.
By default `lazy_update` is turned on for these optimizers,
which may perform differently from standard updates.
For more details, please check the Optimization API at:
https://mxnet.incubator.apache.org/api/python/optimization/optimization.html
Example::
# network with sampling for training
encoder = Encoder(..)
decoder = NCEDense(..)
train_net.add(encoder)
train_net.add(decoder)
loss_train = SigmoidBinaryCrossEntropyLoss()
# training
for x, y, sampled_values in train_batches:
pred, new_targets = train_net(x, sampled_values, y)
l = loss_train(pred, new_targets)
# network for testing
test_net.add(encoder)
test_net.add(Dense(..., params=decoder.params))
loss_test = SoftmaxCrossEntropyLoss()
# testing
for x, y in test_batches:
pred = test_net(x)
l = loss_test(pred, y)
Parameters
----------
num_classes: int
Number of possible classes.
num_sampled: int
Number of classes randomly sampled for each batch.
in_unit: int
Dimensionality of the input space.
remove_accidental_hits: bool, default False
Whether to remove "accidental hits" when a sampled candidate is equal to
one of the true classes.
dtype : str or np.dtype, default 'float32'
Data type of output embeddings.
weight_initializer : str or `Initializer`, optional
Initializer for the `kernel` weights matrix.
bias_initializer: str or `Initializer`, optional
Initializer for the bias vector.
sparse_grad: bool, default True.
Whether to use sparse gradient.
Inputs:
- **x**: A tensor of shape `(batch_size, in_unit)`. The forward activation of
the input network.
- **sampled_values** : A list of three tensors for
`sampled_classes` with shape `(num_samples,)`,
`expected_count_sampled` with shape `(num_samples,)`, and
`expected_count_true` with shape `(sequence_length, batch_size)`.
- **label**: A tensor of shape `(batch_size,1)`.
The target classes.
Outputs:
- **out**: A tensor of shape `(batch_size, 1+num_sampled)`.
The output probability for the true class and sampled classes
- **new_targets**: A tensor of shape `(batch_size, 1+num_sampled)`.
The new target classes.
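
    The following is a minimal, hedged sketch (not part of the upstream example
    above) of one way to construct `sampled_values`. It assumes
    `mx.nd.contrib.rand_zipfian` as the candidate sampler, which returns
    `(samples, expected_count_true, expected_count_sampled)`, so the two counts
    are reordered before calling the block; the sizes are illustrative::

        from mxnet import gluon, nd
        import gluonnlp as nlp

        num_classes, num_sampled, in_unit, batch_size = 1000, 16, 32, 4
        decoder = nlp.model.NCEDense(num_classes, num_sampled, in_unit)
        decoder.initialize()
        loss_fn = gluon.loss.SigmoidBinaryCrossEntropyLoss()

        x = nd.random.normal(shape=(batch_size, in_unit))    # stand-in encoder output
        y = nd.array([[4], [7], [1], [2]])                    # true classes, (batch_size, 1)
        samples, cnt_true, cnt_sampled = nd.contrib.rand_zipfian(
            y.reshape((-1,)), num_sampled, num_classes)
        pred, new_targets = decoder(x, [samples, cnt_sampled, cnt_true], y)
        l = loss_fn(pred, new_targets)                        # pred: (batch_size, 1 + num_sampled)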
"""
def __init__(self, num_classes, num_sampled, in_unit, remove_accidental_hits=False,
dtype='float32', weight_initializer=None, bias_initializer='zeros',
sparse_grad=True, prefix=None, params=None):
super(NCEDense, self).__init__(num_classes, num_sampled, in_unit, remove_accidental_hits,
False, dtype=dtype, weight_initializer=weight_initializer,
bias_initializer=bias_initializer, sparse_grad=sparse_grad,
prefix=prefix, params=params)
class ISDense(_SampledDense):
"""Importance sampled Dense block, which computes sampled pred output and labels
for importance sampled softmax loss during training.
Reference:
Exploring the Limits of Language Modeling
Jozefowicz, Rafal and Vinyals, Oriol and Schuster, Mike and Shazeer, Noam and Wu, Yonghui
https://arxiv.org/pdf/1602.02410
Please use `loss.SoftmaxCrossEntropyLoss` for sampled softmax loss.
.. note::
        If `sparse_grad` is set to True, the gradient w.r.t. the input and output
        embeddings will be sparse. Only a subset of optimizers supports
sparse gradients, including SGD, AdaGrad and Adam.
By default `lazy_update` is turned on for these optimizers,
which may perform differently from standard updates.
For more details, please check the Optimization API at
https://mxnet.incubator.apache.org/api/python/optimization/optimization.html
Example::
# network with importance sampling for training
encoder = Encoder(..)
decoder = ISDense(..)
train_net.add(encoder)
train_net.add(decoder)
loss = SoftmaxCrossEntropyLoss()
# training
for x, y, sampled_values in train_batches:
pred, new_targets = train_net(x, sampled_values, y)
l = loss(pred, new_targets)
# network for testing
test_net.add(encoder)
test_net.add(Dense(..., params=decoder.params))
# testing
for x, y in test_batches:
pred = test_net(x)
l = loss(pred, y)
Parameters
----------
num_classes: int
Number of possible classes.
num_sampled: int
Number of classes randomly sampled for each batch.
in_unit: int
Dimensionality of the input space.
remove_accidental_hits: bool, default True
Whether to remove "accidental hits" when a sampled candidate is equal to
one of the true classes.
dtype : str or np.dtype, default 'float32'
Data type of output embeddings.
weight_initializer : str or `Initializer`, optional
Initializer for the `kernel` weights matrix.
bias_initializer: str or `Initializer`, optional
Initializer for the bias vector.
sparse_grad: bool, default True.
Whether to use sparse gradient.
Inputs:
- **x**: A tensor of shape `(batch_size, in_unit)`. The forward activation of
the input network.
- **sampled_values** : A list of three tensors for
`sampled_classes` with shape `(num_samples,)`,
`expected_count_sampled` with shape `(num_samples,)`, and
`expected_count_true` with shape `(sequence_length, batch_size)`.
- **label**: A tensor of shape `(batch_size,1)`.
The target classes.
Outputs:
- **out**: A tensor of shape `(batch_size, 1+num_sampled)`.
The output probability for the true class and sampled classes
- **new_targets**: A tensor of shape `(batch_size,)`.
The new target classes.
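
    Because `sparse_label` is True for this block, `new_targets` is simply an
    array of zeros: the true class always occupies column 0 of `pred`. A minimal,
    hedged sketch (the `rand_zipfian` candidate sampler and the sizes below are
    illustrative assumptions, mirroring the `NCEDense` sketch)::

        from mxnet import gluon, nd
        import gluonnlp as nlp

        decoder = nlp.model.ISDense(num_classes=1000, num_sampled=16, in_unit=32)
        decoder.initialize()
        x = nd.random.normal(shape=(4, 32))                   # stand-in encoder output
        y = nd.array([[1], [7], [3], [42]])                    # true classes
        samples, cnt_true, cnt_sampled = nd.contrib.rand_zipfian(y.reshape((-1,)), 16, 1000)
        pred, new_targets = decoder(x, [samples, cnt_sampled, cnt_true], y)
        l = gluon.loss.SoftmaxCrossEntropyLoss()(pred, new_targets)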
"""
def __init__(self, num_classes, num_sampled, in_unit, remove_accidental_hits=True,
dtype='float32', weight_initializer=None, bias_initializer='zeros',
sparse_grad=True, prefix=None, params=None):
super(ISDense, self).__init__(num_classes, num_sampled, in_unit, remove_accidental_hits,
True, dtype=dtype, weight_initializer=weight_initializer,
bias_initializer=bias_initializer, sparse_grad=sparse_grad,
prefix=prefix, params=params)
class _SparseSampledDense(Block):
"""Block that computes sampled output training pred and labels suitable for
sampled softmax loss or noise contrastive estimation loss.
Please use `loss.SoftmaxCrossEntropyLoss` for sampled softmax loss, and
`loss.SigmoidBinaryCrossEntropyLoss` for nce loss.
The block is designed for distributed training with extremely large
number of classes to reduce communication overhead and memory consumption.
Both weight and gradient w.r.t. weight are `RowSparseNDArray`.
Different from SampledDense block, the parameters have to be saved before they
are used for testing.
Example::
# network with sampled_softmax_loss for training
encoder = Encoder(..)
train_net.add(encoder)
        train_net.add(SparseISDense(.., prefix='decoder'))
loss = SoftmaxCrossEntropyLoss()
# training
for x, y, sampled_values in train_batches:
pred, new_targets = train_net(x, sampled_values, y)
l = loss(pred, new_targets)
# save params
train_net.save_parameters('net.params')
# network for testing
test_net.add(encoder)
test_net.add(Dense(..., prefix='decoder'))
# load params
test_net.load_parameters('net.params')
# testing
for x, y in test_batches:
pred = test_net(x)
l = loss(pred, y)
Parameters
----------
num_classes: int
Number of possible classes.
num_sampled: int
Number of classes randomly sampled for each batch.
in_unit: int
Dimensionality of the input space.
remove_accidental_hits: bool
Whether to remove "accidental hits" when a sampled candidate is equal to
one of the true classes.
sparse_label: bool
Whether to output label as an integer array instead of probability distribution.
dtype : str or np.dtype, default 'float32'
Data type of output embeddings.
weight_initializer : str or `Initializer`, optional
Initializer for the `kernel` weights matrix.
bias_initializer: str or `Initializer`, optional
Initializer for the bias vector.
Inputs:
- **x**: A tensor of shape `(batch_size, in_unit)`. The forward activation of
the input network.
- **sampled_values** : A list of three tensors for
`sampled_classes` with shape `(num_samples,)`,
`expected_count_sampled` with shape `(num_samples,)`, and
`expected_count_true` with shape `(sequence_length, batch_size)`.
- **label**: A tensor of shape `(batch_size,1)`.
The target classes.
Outputs:
- **out**: A tensor of shape `(batch_size, 1+num_sampled)`.
The output probability for the true class and sampled classes
- **new_targets**: A tensor.
          The new target classes. The shape is `(batch_size,)` if `sparse_label` is `True`,
`(batch_size, 1+num_sampled)` otherwise.
"""
def __init__(self, num_classes, num_sampled, in_unit, remove_accidental_hits,
sparse_label, dtype='float32', weight_initializer=None,
bias_initializer='zeros', prefix=None, params=None):
super(_SparseSampledDense, self).__init__(prefix=prefix, params=params)
with self.name_scope():
self.weight = self.params.get('weight', shape=(num_classes, in_unit),
init=weight_initializer, dtype=dtype,
grad_stype='row_sparse', stype='row_sparse')
self.bias = self.params.get('bias', shape=(num_classes,), init=bias_initializer,
dtype=dtype)
self._dense = _SampledDenseHelper(num_classes, num_sampled, in_unit,
remove_accidental_hits, sparse_label)
self._num_classes = num_classes
self._num_sampled = num_sampled
self._in_unit = in_unit
self._remove_accidental_hits = remove_accidental_hits
self._kwargs = {'input_dim': self._num_classes, 'output_dim': self._in_unit,
'sparse_grad': True}
def forward(self, x, sampled_values, label): # pylint: disable=arguments-differ
"""Forward computation."""
sampled_candidates, _, _ = sampled_values
# (batch_size,)
label = label.reshape(shape=(-1,))
# (num_sampled+batch_size,)
ids = nd.concat(sampled_candidates.astype('int32'), label.astype('int32'), dim=0)
# lookup weights and biases
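        # row_sparse_data(ids) fetches only the weight rows indexed by `ids`,
        # i.e. (num_sampled + batch_size) rows out of (num_classes, in_unit);
        # this is what keeps memory and communication low for large vocabularies.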
weight = self.weight.row_sparse_data(ids)
bias = self.bias.data(ids.context)
# (num_sampled+batch_size, dim)
w_all = nd.Embedding(data=ids, weight=weight, **self._kwargs)
# (num_sampled+batch_size,)
b_all = nd.take(bias, indices=ids)
out, new_targets = self._dense(x, sampled_values, label, w_all, b_all)
return out, new_targets
def __repr__(self):
s = '{name}({mapping})'
mapping = '{0} -> {1}, num_sampled = {2}, remove_accidental_hits = {3}'
mapping = mapping.format(self._in_unit, self._num_classes, self._num_sampled,
str(self._remove_accidental_hits))
return s.format(name=self.__class__.__name__,
mapping=mapping, **self.__dict__)
class SparseISDense(_SparseSampledDense):
"""Importance sampled Dense block with sparse weights, which computes sampled pred output
and labels for importance sampled softmax loss during training.
Reference:
Exploring the Limits of Language Modeling
Jozefowicz, Rafal and Vinyals, Oriol and Schuster, Mike and Shazeer, Noam and Wu, Yonghui
https://arxiv.org/pdf/1602.02410
Please use `loss.SoftmaxCrossEntropyLoss` for sampled softmax loss.
    The block is designed for distributed training with an extremely large
    number of classes, in order to reduce communication overhead and memory consumption.
Both weight and gradient w.r.t. weight are `RowSparseNDArray`.
.. note::
        Different from the `ISDense` block, the weight parameter is stored in
        row_sparse format, which helps reduce memory consumption and
        communication overhead during multi-GPU training. However, sparse
        parameters cannot be shared with other blocks, nor can a block
        containing sparse parameters be hybridized. Therefore, the parameters
        have to be saved before they are used for testing.
Example::
# network with importance sampled softmax for training
encoder = Encoder(..)
train_net.add(encoder)
        train_net.add(SparseISDense(.., prefix='decoder'))
loss = SoftmaxCrossEntropyLoss()
# training
for x, y, sampled_values in train_batches:
pred, new_targets = train_net(x, sampled_values, y)
l = loss(pred, new_targets)
# save params
train_net.save_parameters('net.params')
# network for testing
test_net.add(encoder)
test_net.add(Dense(..., prefix='decoder'))
# load params
test_net.load_parameters('net.params')
# testing
for x, y in test_batches:
pred = test_net(x)
l = loss(pred, y)
Parameters
----------
num_classes: int
Number of possible classes.
num_sampled: int
Number of classes randomly sampled for each batch.
in_unit: int
Dimensionality of the input space.
remove_accidental_hits: bool, default True
Whether to remove "accidental hits" when a sampled candidate is equal to
one of the true classes.
dtype : str or np.dtype, default 'float32'
Data type of output embeddings.
weight_initializer : str or `Initializer`, optional
Initializer for the `kernel` weights matrix.
bias_initializer: str or `Initializer`, optional
Initializer for the bias vector.
Inputs:
- **x**: A tensor of shape `(batch_size, in_unit)`. The forward activation of
the input network.
- **sampled_values** : A list of three tensors for
`sampled_classes` with shape `(num_samples,)`,
`expected_count_sampled` with shape `(num_samples,)`, and
`expected_count_true` with shape `(sequence_length, batch_size)`.
- **label**: A tensor of shape `(batch_size,1)`.
The target classes.
Outputs:
- **out**: A tensor of shape `(batch_size, 1+num_sampled)`.
The output probability for the true class and sampled classes
- **new_targets**: A tensor of shape `(batch_size,)`.
The new target classes.
"""
def __init__(self, num_classes, num_sampled, in_unit, remove_accidental_hits=True,
dtype='float32', weight_initializer=None, bias_initializer='zeros',
prefix=None, params=None):
super(SparseISDense, self).__init__(num_classes, num_sampled, in_unit,
remove_accidental_hits, True, dtype,
weight_initializer, bias_initializer,
prefix=prefix, params=params)
class SparseNCEDense(_SparseSampledDense):
"""Noise contrastive estimated Dense block with sparse weights, which computes sampled
pred output and labels for noise contrastive estimation loss during training.
Reference:
Exploring the Limits of Language Modeling
Jozefowicz, Rafal and Vinyals, Oriol and Schuster, Mike and Shazeer, Noam and Wu, Yonghui
https://arxiv.org/pdf/1602.02410
Please use `loss.SigmoidBinaryCrossEntropyLoss` for noise contrastive estimation loss
during training.
    The block is designed for distributed training with an extremely large
    number of classes, in order to reduce communication overhead and memory consumption.
Both weight and gradient w.r.t. weight are `RowSparseNDArray`.
.. note::
        Different from the `NCEDense` block, the weight parameter is stored in
        row_sparse format, which helps reduce memory consumption and
        communication overhead during multi-GPU training. However, sparse
        parameters cannot be shared with other blocks, nor can a block
        containing sparse parameters be hybridized. Therefore, the parameters
        have to be saved before they are used for testing.
Example::
        # network with noise contrastive estimation for training
encoder = Encoder(..)
train_net.add(encoder)
        train_net.add(SparseNCEDense(.., prefix='decoder'))
train_loss = SigmoidBinaryCrossEntropyLoss()
# training
for x, y, sampled_values in train_batches:
pred, new_targets = train_net(x, sampled_values, y)
l = train_loss(pred, new_targets)
# save params
train_net.save_parameters('net.params')
# network for testing
test_net.add(encoder)
test_net.add(Dense(..., prefix='decoder'))
# load params
test_net.load_parameters('net.params')
test_loss = SoftmaxCrossEntropyLoss()
# testing
for x, y in test_batches:
pred = test_net(x)
l = test_loss(pred, y)
Parameters
----------
num_classes: int
Number of possible classes.
num_sampled: int
Number of classes randomly sampled for each batch.
in_unit: int
Dimensionality of the input space.
remove_accidental_hits: bool, default True
Whether to remove "accidental hits" when a sampled candidate is equal to
one of the true classes.
dtype : str or np.dtype, default 'float32'
Data type of output embeddings.
weight_initializer : str or `Initializer`, optional
Initializer for the `kernel` weights matrix.
bias_initializer: str or `Initializer`, optional
Initializer for the bias vector.
Inputs:
- **x**: A tensor of shape `(batch_size, in_unit)`. The forward activation of
the input network.
- **sampled_values** : A list of three tensors for
`sampled_classes` with shape `(num_samples,)`,
`expected_count_sampled` with shape `(num_samples,)`, and
`expected_count_true` with shape `(sequence_length, batch_size)`.
        - **label**: A tensor of shape `(batch_size, 1)`.
The target classes.
Outputs:
- **out**: A tensor of shape `(batch_size, 1+num_sampled)`.
The output probability for the true class and sampled classes
- **new_targets**: A tensor of shape `(batch_size, 1+num_sampled)`.
The new target classes.
"""
def __init__(self, num_classes, num_sampled, in_unit, remove_accidental_hits=True,
dtype='float32', weight_initializer=None, bias_initializer='zeros',
prefix=None, params=None):
super(SparseNCEDense, self).__init__(num_classes, num_sampled, in_unit,
remove_accidental_hits, False,
dtype, weight_initializer, bias_initializer,
prefix=prefix, params=params)