Source code for gluonnlp.model.train.language_model
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Language models for training."""
__all__ = ['AWDRNN', 'StandardRNN', 'BigRNN']
from mxnet import init, nd, autograd
from mxnet.gluon import nn, Block, HybridBlock, contrib, rnn, ParameterDict
from mxnet import sym
from ..utils import _get_rnn_layer, apply_weight_drop
from ..sampled_block import ISDense, SparseISDense
from ...utils import Parallelizable
class AWDRNN(HybridBlock):
"""AWD language model by salesforce.
Reference: https://github.com/salesforce/awd-lstm-lm
License: BSD 3-Clause
Parameters
----------
mode : str
The type of RNN to use. Options are 'lstm', 'gru', 'rnn_tanh', 'rnn_relu'.
vocab_size : int
Size of the input vocabulary.
embed_size : int
Dimension of embedding vectors.
hidden_size : int
Number of hidden units for RNN.
num_layers : int
Number of RNN layers.
    tie_weights : bool, default True
        Whether to tie the weight matrices of the output dense layer and the input embedding layer.
dropout : float
Dropout rate to use for encoder output.
weight_drop : float
Dropout rate to use on encoder h2h weights.
    drop_h : float
        Dropout rate to use on the output of intermediate layers of the encoder.
    drop_i : float
        Dropout rate to use on the output of the embedding.
drop_e : float
Dropout rate to use on the embedding layer.
"""
def __init__(self, mode, vocab_size, embed_size=400, hidden_size=1150, num_layers=3,
tie_weights=True, dropout=0.4, weight_drop=0.5, drop_h=0.2,
drop_i=0.65, drop_e=0.1, **kwargs):
super(AWDRNN, self).__init__(**kwargs)
self._mode = mode
self._vocab_size = vocab_size
self._embed_size = embed_size
self._hidden_size = hidden_size
self._num_layers = num_layers
self._dropout = dropout
self._drop_h = drop_h
self._drop_i = drop_i
self._drop_e = drop_e
self._weight_drop = weight_drop
self._tie_weights = tie_weights
self._shared_params = None
if 'params' in kwargs:
self._shared_params = kwargs['params']
with self.name_scope():
self.embedding = self._get_embedding()
self.encoder = self._get_encoder()
self.decoder = self._get_decoder()
def _get_embedding(self):
embedding = nn.HybridSequential()
with embedding.name_scope():
embedding_block = nn.Embedding(self._vocab_size, self._embed_size,
weight_initializer=init.Uniform(0.1))
if self._drop_e:
apply_weight_drop(embedding_block, 'weight', self._drop_e, axes=(1,))
embedding.add(embedding_block)
if self._drop_i:
embedding.add(nn.Dropout(self._drop_i, axes=(0,)))
return embedding
def _get_encoder(self):
encoder = nn.HybridSequential()
with encoder.name_scope():
for l in range(self._num_layers):
encoder.add(_get_rnn_layer(self._mode, 1, self._embed_size if l == 0 else
self._hidden_size, self._hidden_size if
l != self._num_layers - 1 or not self._tie_weights
else self._embed_size, 0, self._weight_drop))
return encoder
def _get_decoder(self):
output = nn.HybridSequential()
with output.name_scope():
if self._tie_weights:
if self._shared_params is not None:
                    # self.embedding[0].params does not contain the bias, which
                    # may leave the decoder bias uninitialized. We resolve this
                    # by creating a new ParameterDict and adding all shared
                    # parameters to it.
shared_params = self.embedding[0].params
shared_params = ParameterDict(shared_params.prefix)
shared_params.update(self._shared_params)
output.add(nn.Dense(self._vocab_size, flatten=False,
params=shared_params))
else:
output.add(nn.Dense(self._vocab_size, flatten=False,
params=self.embedding[0].params))
else:
output.add(nn.Dense(self._vocab_size, flatten=False))
return output
def begin_state(self, *args, **kwargs):
return [c.begin_state(*args, **kwargs) for c in self.encoder]
def state_info(self, *args, **kwargs):
return [c.state_info(*args, **kwargs) for c in self.encoder]
def __call__(self, inputs, begin_state=None):
#pylint: disable=arguments-differ, dangerous-default-value
"""Encode the inputs given the states and valid sequence length.
Parameters
-----------
inputs : NDArray or Symbol
input tensor with shape `(sequence_length, batch_size)`
when `layout` is "TNC".
        begin_state : list
            initial recurrent states, with length equal to num_layers.
            Each layer's initial state has shape `(1, batch_size, num_hidden)`.

        Returns
        -------
        out : NDArray
            output tensor with shape `(sequence_length, batch_size, vocab_size)`
            when `layout` is "TNC".
        out_states : list
            output recurrent states, with length equal to num_layers.
            Each layer's state has shape `(1, batch_size, num_hidden)`.
        encoded_raw : list
            The outputs of the model's encoder, with length equal to num_layers.
            Each encoder output has shape `(sequence_length, batch_size, num_hidden)`.
        encoded_dropped : list
            The dropout-applied outputs of the model's encoder, with length equal
            to num_layers. Each dropped encoder output has shape
            `(sequence_length, batch_size, num_hidden)`.
"""
return super(AWDRNN, self).__call__(inputs, begin_state)
    def hybrid_forward(self, F, inputs, begin_state=None): # pylint: disable=arguments-differ
        """Implement the forward computation used by the AWD language model and the cache model.
Parameters
-----------
inputs : NDArray or Symbol
input tensor with shape `(sequence_length, batch_size)`
when `layout` is "TNC".
        begin_state : list
            initial recurrent states, with length equal to num_layers.
            Each layer's initial state has shape `(1, batch_size, num_hidden)`.

        Returns
        -------
        out : NDArray or Symbol
            output tensor with shape `(sequence_length, batch_size, vocab_size)`
            when `layout` is "TNC".
        out_states : list
            output recurrent states, with length equal to num_layers.
            Each layer's state has shape `(1, batch_size, num_hidden)`.
        encoded_raw : list
            The outputs of the model's encoder, with length equal to num_layers.
            Each encoder output has shape `(sequence_length, batch_size, num_hidden)`.
        encoded_dropped : list
            The dropout-applied outputs of the model's encoder, with length equal
            to num_layers. Each dropped encoder output has shape
            `(sequence_length, batch_size, num_hidden)`.
"""
encoded = self.embedding(inputs)
if not begin_state:
if F == nd:
begin_state = self.begin_state(batch_size=inputs.shape[1])
else:
begin_state = self.begin_state(batch_size=0, func=sym.zeros)
out_states = []
encoded_raw = []
encoded_dropped = []
for i, (e, s) in enumerate(zip(self.encoder, begin_state)):
encoded, state = e(encoded, s)
encoded_raw.append(encoded)
out_states.append(state)
if self._drop_h and i != len(self.encoder) - 1:
encoded = F.Dropout(encoded, p=self._drop_h, axes=(0,))
encoded_dropped.append(encoded)
if self._dropout:
encoded = F.Dropout(encoded, p=self._dropout, axes=(0,))
encoded_dropped.append(encoded)
with autograd.predict_mode():
out = self.decoder(encoded)
return out, out_states, encoded_raw, encoded_dropped
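
# The following is an illustrative usage sketch added for documentation purposes; it is
# not part of the original module. It assumes a toy vocabulary size and input shape, and
# shows a single forward pass through AWDRNN with inputs laid out as
# (sequence_length, batch_size).
def _example_awdrnn_forward():
    import mxnet as mx
    model = AWDRNN('lstm', vocab_size=1000)
    model.initialize(mx.init.Xavier())
    inputs = mx.nd.ones((35, 4))              # token ids, layout (T=35, N=4)
    hidden = model.begin_state(batch_size=4)  # one state list per encoder layer
    # out: (35, 4, 1000); the remaining return values hold one entry per layer
    out, out_states, encoded_raw, encoded_dropped = model(inputs, hidden)
    return out, out_states
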
class StandardRNN(HybridBlock):
"""Standard RNN language model.
Parameters
----------
mode : str
The type of RNN to use. Options are 'lstm', 'gru', 'rnn_tanh', 'rnn_relu'.
vocab_size : int
Size of the input vocabulary.
embed_size : int
Dimension of embedding vectors.
hidden_size : int
Number of hidden units for RNN.
num_layers : int
Number of RNN layers.
dropout : float
Dropout rate to use for encoder output.
tie_weights : bool, default False
Whether to tie the weight matrices of output dense layer and input embedding layer.
"""
def __init__(self, mode, vocab_size, embed_size, hidden_size,
num_layers, dropout=0.5, tie_weights=False, **kwargs):
if tie_weights:
assert embed_size == hidden_size, 'Embedding dimension must be equal to ' \
'hidden dimension in order to tie weights. ' \
'Got: emb: {}, hid: {}.'.format(embed_size,
hidden_size)
super(StandardRNN, self).__init__(**kwargs)
self._mode = mode
self._embed_size = embed_size
self._hidden_size = hidden_size
self._num_layers = num_layers
self._dropout = dropout
self._tie_weights = tie_weights
self._vocab_size = vocab_size
self._shared_params = None
if 'params' in kwargs:
self._shared_params = kwargs['params']
with self.name_scope():
self.embedding = self._get_embedding()
self.encoder = self._get_encoder()
self.decoder = self._get_decoder()
def _get_embedding(self):
embedding = nn.HybridSequential()
with embedding.name_scope():
embedding.add(nn.Embedding(self._vocab_size, self._embed_size,
weight_initializer=init.Uniform(0.1)))
if self._dropout:
embedding.add(nn.Dropout(self._dropout))
return embedding
def _get_encoder(self):
return _get_rnn_layer(self._mode, self._num_layers, self._embed_size,
self._hidden_size, self._dropout, 0)
def _get_decoder(self):
output = nn.HybridSequential()
with output.name_scope():
if self._tie_weights:
if self._shared_params is not None:
                    # self.embedding[0].params does not contain the bias, which
                    # may leave the decoder bias uninitialized. We resolve this
                    # by creating a new ParameterDict and adding all shared
                    # parameters to it.
shared_params = self.embedding[0].params
shared_params = ParameterDict(shared_params.prefix)
shared_params.update(self._shared_params)
output.add(nn.Dense(self._vocab_size, flatten=False,
params=shared_params))
else:
output.add(nn.Dense(self._vocab_size, flatten=False,
params=self.embedding[0].params))
else:
output.add(nn.Dense(self._vocab_size, flatten=False))
return output
def begin_state(self, *args, **kwargs):
return self.encoder.begin_state(*args, **kwargs)
def state_info(self, *args, **kwargs):
return self.encoder.state_info(*args, **kwargs)
def __call__(self, inputs, begin_state=None): # pylint: disable=arguments-differ
"""Defines the forward computation. Arguments can be either
:py:class:`NDArray` or :py:class:`Symbol`.
Parameters
-----------
inputs : NDArray or Symbol
input tensor with shape `(sequence_length, batch_size)`
when `layout` is "TNC".
        begin_state : list
            initial recurrent states of the encoder (e.g. `[h, c]` for an LSTM),
            each with shape `(num_layers, batch_size, num_hidden)`.

        Returns
        -------
        out : NDArray or Symbol
            output tensor with shape `(sequence_length, batch_size, vocab_size)`
            when `layout` is "TNC".
        out_states : list
            output recurrent states of the encoder (e.g. `[h, c]` for an LSTM),
            each with shape `(num_layers, batch_size, num_hidden)`.
        encoded_raw : list
            The list containing the last output of the model's encoder, with shape
            `(sequence_length, batch_size, num_hidden)`.
        encoded_dropped : list
            The list containing the last dropout-applied output of the model's encoder,
            with shape `(sequence_length, batch_size, num_hidden)`.
"""
return super(StandardRNN, self).__call__(inputs, begin_state)
    def hybrid_forward(self, F, inputs, begin_state=None): # pylint: disable=arguments-differ
"""Defines the forward computation. Arguments can be either
:py:class:`NDArray` or :py:class:`Symbol`.
Parameters
-----------
inputs : NDArray or Symbol
input tensor with shape `(sequence_length, batch_size)`
when `layout` is "TNC".
        begin_state : list
            initial recurrent states of the encoder (e.g. `[h, c]` for an LSTM),
            each with shape `(num_layers, batch_size, num_hidden)`.

        Returns
        -------
        out : NDArray or Symbol
            output tensor with shape `(sequence_length, batch_size, vocab_size)`
            when `layout` is "TNC".
        out_states : list
            output recurrent states of the encoder (e.g. `[h, c]` for an LSTM),
            each with shape `(num_layers, batch_size, num_hidden)`.
        encoded_raw : list
            The list containing the last output of the model's encoder, with shape
            `(sequence_length, batch_size, num_hidden)`.
        encoded_dropped : list
            The list containing the last dropout-applied output of the model's encoder,
            with shape `(sequence_length, batch_size, num_hidden)`.
"""
encoded = self.embedding(inputs)
if not begin_state:
if F == nd:
begin_state = self.begin_state(batch_size=inputs.shape[1])
else:
begin_state = self.begin_state(batch_size=0, func=sym.zeros)
encoded_raw = []
encoded_dropped = []
encoded, state = self.encoder(encoded, begin_state)
encoded_raw.append(encoded)
if self._dropout:
encoded = F.Dropout(encoded, p=self._dropout, axes=(0,))
out = self.decoder(encoded)
return out, state, encoded_raw, encoded_dropped
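
# Illustrative usage sketch added for documentation purposes; it is not part of the
# original module. Note that tie_weights=True requires embed_size == hidden_size,
# which the constructor asserts. The sizes below are arbitrary toy values.
def _example_standard_rnn_forward():
    import mxnet as mx
    model = StandardRNN('lstm', vocab_size=1000, embed_size=200, hidden_size=200,
                        num_layers=2, tie_weights=True)
    model.initialize(mx.init.Xavier())
    inputs = mx.nd.ones((35, 4))              # token ids, layout (T=35, N=4)
    hidden = model.begin_state(batch_size=4)  # [h, c] of the stacked LSTM
    out, state, encoded_raw, encoded_dropped = model(inputs, hidden)
    return out, state                         # out: (35, 4, 1000)
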
class BigRNN(Block):
"""Big language model with LSTMP and importance sampling.
Reference: https://github.com/rafaljozefowicz/lm
License: MIT
Parameters
----------
vocab_size : int
Size of the input vocabulary.
embed_size : int
Dimension of embedding vectors.
hidden_size : int
Number of hidden units for LSTMP.
num_layers : int
Number of LSTMP layers.
projection_size : int
Number of projection units for LSTMP.
num_sampled : int
Number of sampled classes for the decoder.
embed_dropout : float
Dropout rate to use for embedding output.
    encode_dropout : float
        Dropout rate to use for encoder output.
    sparse_weight : bool
        Whether to use RowSparseNDArray for the weights of the input and output embeddings.
sparse_grad : bool
Whether to use RowSparseNDArray for the gradients w.r.t.
weights of input and output embeddings.

    .. note:: If `sparse_grad` is set to True, the gradients w.r.t. the input and output
        embeddings will be sparse. Only a subset of optimizers support
        sparse gradients, including SGD, AdaGrad and Adam.
        By default `lazy_update` is turned on for these optimizers,
        which may perform differently from standard updates.
        For more details, please check the Optimization API at:
        https://mxnet.incubator.apache.org/api/python/optimization/optimization.html

    .. note:: If `sparse_weight` is set to True, the parameters in the embedding block and
        decoder block will be stored in row_sparse format, which helps reduce memory
        consumption and communication overhead during multi-GPU training. However,
        sparse parameters cannot be shared with other blocks, nor can a block
        containing sparse parameters be hybridized.
"""
def __init__(self, vocab_size, embed_size, hidden_size, num_layers,
projection_size, num_sampled, embed_dropout=0.0, encode_dropout=0.0,
sparse_weight=True, sparse_grad=True, **kwargs):
super(BigRNN, self).__init__(**kwargs)
self._embed_size = embed_size
self._hidden_size = hidden_size
self._projection_size = projection_size
self._num_layers = num_layers
self._embed_dropout = embed_dropout
self._encode_dropout = encode_dropout
self._vocab_size = vocab_size
self._num_sampled = num_sampled
self._sparse_weight = sparse_weight
self._sparse_grad = sparse_grad
if self._sparse_weight:
assert self._sparse_grad, 'Dense grad with sparse weight is not supported.'
with self.name_scope():
self.embedding = self._get_embedding()
self.encoder = self._get_encoder()
self.decoder = self._get_decoder()
def _get_embedding(self):
prefix = 'embedding0_'
if self._sparse_weight:
embedding = nn.Sequential(prefix=prefix)
else:
embedding = nn.HybridSequential(prefix=prefix)
with embedding.name_scope():
if self._sparse_weight:
# sparse embedding has both sparse weight and sparse grad
embed = contrib.nn.SparseEmbedding(self._vocab_size, self._embed_size,
prefix=prefix)
else:
embed = nn.Embedding(self._vocab_size, self._embed_size, prefix=prefix,
sparse_grad=self._sparse_grad)
embedding.add(embed)
if self._embed_dropout:
embedding.add(nn.Dropout(self._embed_dropout))
return embedding
def _get_encoder(self):
block = rnn.HybridSequentialRNNCell()
with block.name_scope():
for _ in range(self._num_layers):
block.add(contrib.rnn.LSTMPCell(self._hidden_size, self._projection_size))
if self._encode_dropout:
block.add(rnn.DropoutCell(self._encode_dropout))
return block
def _get_decoder(self):
prefix = 'decoder0_'
if self._sparse_weight:
# sparse IS Dense has both sparse weight and sparse grad
block = SparseISDense(self._vocab_size, self._num_sampled,
self._projection_size, remove_accidental_hits=True,
prefix=prefix)
else:
block = ISDense(self._vocab_size, self._num_sampled,
self._projection_size, remove_accidental_hits=True,
prefix=prefix, sparse_grad=self._sparse_grad)
return block
def begin_state(self, **kwargs):
return self.encoder.begin_state(**kwargs)
    def forward(self, inputs, label, begin_state, sampled_values): # pylint: disable=arguments-differ
"""Defines the forward computation.
Parameters
-----------
inputs : NDArray
input tensor with shape `(sequence_length, batch_size)`
when `layout` is "TNC".
        label : NDArray
            target tensor with shape `(sequence_length, batch_size)`
            when `layout` is "TNC".
        begin_state : list
            initial recurrent state tensors, with length equal to num_layers*2.
            For each layer the two initial states have shape `(batch_size, num_hidden)`
            and `(batch_size, num_projection)`.
sampled_values : list
a list of three tensors for `sampled_classes` with shape `(num_samples,)`,
`expected_count_sampled` with shape `(num_samples,)`, and
`expected_count_true` with shape `(sequence_length, batch_size)`.
Returns
--------
out : NDArray
output tensor with shape `(sequence_length, batch_size, 1+num_samples)`
when `layout` is "TNC".
        out_states : list
            output recurrent state tensors, with length equal to num_layers*2.
            For each layer the two output states have shape `(batch_size, num_hidden)`
            and `(batch_size, num_projection)`.
new_target : NDArray
output tensor with shape `(sequence_length, batch_size)`
when `layout` is "TNC".
"""
encoded = self.embedding(inputs)
length = inputs.shape[0]
batch_size = inputs.shape[1]
encoded, out_states = self.encoder.unroll(length, encoded, begin_state,
layout='TNC', merge_outputs=True)
out, new_target = self.decoder(encoded, sampled_values, label)
out = out.reshape((length, batch_size, -1))
new_target = new_target.reshape((length, batch_size))
return out, out_states, new_target
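
# Illustrative usage sketch added for documentation purposes; it is not part of the
# original module. It uses dense weights (sparse_weight=False) so that no Trainer is
# needed, and fills `sampled_values` with dummy tensors of the documented shapes; in
# real training these would come from a candidate sampler (e.g. a log-uniform sampler).
# All sizes below are arbitrary toy values.
def _example_big_rnn_forward():
    import mxnet as mx
    seq_len, batch_size, vocab, num_sampled = 20, 4, 1000, 50
    model = BigRNN(vocab, embed_size=128, hidden_size=256, num_layers=2,
                   projection_size=64, num_sampled=num_sampled, sparse_weight=False)
    model.initialize(mx.init.Xavier())
    inputs = mx.nd.ones((seq_len, batch_size))            # token ids
    label = mx.nd.ones((seq_len, batch_size))             # next-token targets
    hidden = model.begin_state(batch_size=batch_size, func=mx.nd.zeros)
    sampled_values = [mx.nd.arange(num_sampled),          # sampled_classes, (num_samples,)
                      mx.nd.ones((num_sampled,)),         # expected_count_sampled, (num_samples,)
                      mx.nd.ones((seq_len, batch_size))]  # expected_count_true, (T, N)
    # out: (20, 4, 1 + num_sampled); new_target: (20, 4)
    out, out_states, new_target = model(inputs, label, hidden, sampled_values)
    return out, out_states, new_target
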
class ParallelBigRNN(Parallelizable):
"""Data parallel BigRNN model for training.
Parameters
----------
    model : Block
        The RNN model to be parallelized.
    loss_fn : function
        A function that computes the loss for the given predictions.
    batch_size : int
        Defines the batch size at each iteration.
"""
def __init__(self, model, loss_fn, batch_size):
self._model = model
self._loss = loss_fn
self._batch_size = batch_size
def forward_backward(self, x):
"""Defines the forward computation.
Parameters
----------
x : tuple
It contains the input, target, masked, sampled and hidden states
Returns
----------
hidden : NDArray
Next hidden states computed by the parallel model
ls : NDArray
Loss computed with provided loss function
"""
X, y, m, s, h = x
with autograd.record():
output, hidden, new_target = self._model(X, y, h, s)
output = output.reshape((-3, -1))
new_target = new_target.reshape((-1,))
ls = self._loss(output, new_target) * m.reshape((-1,))
ls = ls / self._batch_size
ls.backward()
return hidden, ls
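
# Illustrative usage sketch added for documentation purposes; it is not part of the
# original module. It drives a single forward_backward step directly on one device with
# dummy data; in multi-GPU training, one ParallelBigRNN per device would be wrapped by a
# parallelization helper that calls forward_backward on each shard of the batch.
# All sizes and tensors below are arbitrary toy values.
def _example_parallel_big_rnn_step():
    import mxnet as mx
    seq_len, batch_size, vocab, num_sampled = 20, 4, 1000, 50
    model = BigRNN(vocab, embed_size=128, hidden_size=256, num_layers=2,
                   projection_size=64, num_sampled=num_sampled, sparse_weight=False)
    model.initialize(mx.init.Xavier())
    loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    step = ParallelBigRNN(model, loss_fn, batch_size)
    X = mx.nd.ones((seq_len, batch_size))               # inputs
    y = mx.nd.ones((seq_len, batch_size))               # targets
    m = mx.nd.ones((seq_len, batch_size))               # mask over valid positions
    s = [mx.nd.arange(num_sampled), mx.nd.ones((num_sampled,)),
         mx.nd.ones((seq_len, batch_size))]             # dummy sampled values
    h = model.begin_state(batch_size=batch_size, func=mx.nd.zeros)
    hidden, ls = step.forward_backward((X, y, m, s, h))  # gradients are now populated
    return hidden, ls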