# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""ELMo."""
__all__ = ['ELMoBiLM', 'ELMoCharacterEncoder',
'elmo_2x1024_128_2048cnn_1xhighway', 'elmo_2x2048_256_2048cnn_1xhighway',
'elmo_2x4096_512_2048cnn_2xhighway']
import os
import mxnet as mx
from mxnet import gluon
from mxnet.gluon.model_zoo import model_store
from mxnet.gluon.model_zoo.model_store import get_model_file
from .convolutional_encoder import ConvolutionalEncoder
from .bilm_encoder import BiLMEncoder
from ..initializer.initializer import HighwayBias
from ..vocab.elmo import ELMoCharVocab
from ..base import get_home_dir


class ELMoCharacterEncoder(gluon.HybridBlock):
r"""ELMo character encoder
Compute a context-insensitive, character-based token representation with character-level convolutions.
This encoder takes character ids of shape
(batch_size, sequence_length, max_chars_per_token)
and returns a representation of shape (batch_size, sequence_length, embedding_size).
Parameters
----------
output_size : int
The output dimension after applying the convolutions, max pooling,
highway layers, and the final linear projection.
filters : list of tuple
List of tuples representing the settings for convolution layers.
Each element is (ngram_filter_size, num_filters).
char_embed_size : int
The input dimension to the encoder.
num_highway : int
The number of layers of the Highway layer.
conv_layer_activation : str
Activation function to be used after convolutional layer.
max_chars_per_token : int
The maximum number of characters of a token.
char_vocab_size : int
Size of character-level vocabulary.
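Examples
--------
A minimal usage sketch. The filter and size settings below are illustrative
rather than a pre-trained configuration, and 262 assumes the default
ELMoCharVocab size:

>>> encoder = ELMoCharacterEncoder(output_size=64, filters=[[1, 16], [2, 16]],
...                                char_embed_size=8, num_highway=1,
...                                conv_layer_activation='relu',
...                                max_chars_per_token=50, char_vocab_size=262)
>>> encoder.initialize()
>>> char_ids = mx.nd.ones((2, 10, 50))
>>> encoder(char_ids).shape
(2, 10, 64)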
"""
def __init__(self,
output_size,
filters,
char_embed_size,
num_highway,
conv_layer_activation,
max_chars_per_token,
char_vocab_size,
**kwargs):
super(ELMoCharacterEncoder, self).__init__(**kwargs)
self._output_size = output_size
self._char_embed_size = char_embed_size
self._filters = filters
ngram_filter_sizes = []
num_filters = []
for width, num in filters:
ngram_filter_sizes.append(width)
num_filters.append(num)
self._num_highway = num_highway
self._conv_layer_activation = conv_layer_activation
self._max_chars_per_token = max_chars_per_token
with self.name_scope():
self._char_embedding = gluon.nn.Embedding(char_vocab_size,
self._char_embed_size)
self._convolutions = ConvolutionalEncoder(embed_size=self._char_embed_size,
num_filters=tuple(num_filters),
ngram_filter_sizes=tuple(ngram_filter_sizes),
conv_layer_activation=conv_layer_activation,
num_highway=self._num_highway,
highway_bias=HighwayBias(
nonlinear_transform_bias=0.0,
transform_gate_bias=1.0),
output_size=self._output_size)

def hybrid_forward(self, F, inputs):
# pylint: disable=arguments-differ
"""
Compute context-insensitive token embeddings for ELMo representations.
Parameters
----------
inputs : NDArray
Shape (batch_size, sequence_length, max_chars_per_token)
of character ids representing the current batch.
Returns
-------
token_embedding : NDArray
Shape (batch_size, sequence_length, embedding_size) with context-insensitive
token representations.
"""
# the character id embedding
# (batch_size * sequence_length, max_chars_per_token, embed_dim)
character_embedding = self._char_embedding(inputs.reshape((-1, self._max_chars_per_token)))
character_embedding = F.transpose(character_embedding, axes=(1, 0, 2))
token_embedding = self._convolutions(character_embedding)
# reshape the (batch_size * sequence_length, output_size) output back to
# (batch_size, sequence_length, output_size), using the input ids as a shape reference
out_shape_ref = inputs.slice_axis(axis=-1, begin=0, end=1)
out_shape_ref = out_shape_ref.broadcast_axes(axis=(2,),
size=(self._output_size))
return token_embedding.reshape_like(out_shape_ref)


class ELMoBiLM(gluon.HybridBlock):
r"""ELMo Bidirectional language model
Run a pre-trained bidirectional language model, outputting the weighted
ELMo representation.
We implement the ELMo bidirectional language model (BiLM) proposed in the following work::
@inproceedings{Peters:2018,
author={Peters, Matthew E. and Neumann, Mark and Iyyer, Mohit and Gardner,
Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
title={Deep contextualized word representations},
booktitle={Proc. of NAACL},
year={2018}
}
Parameters
----------
rnn_type : str
The type of RNN cell to use.
The option for pre-trained models is 'lstmpc'.
output_size : int
The output dimension after applying the convolutions, max pooling,
highway layers, and the final linear projection.
filters : list of tuple
List of tuples representing the settings for convolution layers.
Each element is (ngram_filter_size, num_filters).
char_embed_size : int
The input dimension to the encoder.
char_vocab_size : int
Size of character-level vocabulary.
num_highway : int
The number of layers of the Highway layer.
conv_layer_activation : str
Activation function to be used after convolutional layer.
max_chars_per_token : int
The maximum number of characters of a token.
input_size : int
The initial input size of the RNN cell.
hidden_size : int
The hidden size of the RNN cell.
proj_size : int
The projection size of each LSTMPCellWithClip cell.
num_layers : int
The number of stacked RNN layers.
cell_clip : float
Clip the cell state between [-cell_clip, cell_clip] in the LSTMPCellWithClip cell.
proj_clip : float
Clip the projection between [-proj_clip, proj_clip] in the LSTMPCellWithClip cell.
skip_connection : bool
Whether to add skip connections (add the RNN cell input to its output).
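Examples
--------
A minimal usage sketch with small, illustrative sizes (not a pre-trained
configuration); note that input_size must equal the character encoder's output_size:

>>> model = ELMoBiLM(rnn_type='lstmpc', output_size=64,
...                  filters=[[1, 16], [2, 16]], char_embed_size=8,
...                  char_vocab_size=262, num_highway=1,
...                  conv_layer_activation='relu', max_chars_per_token=50,
...                  input_size=64, hidden_size=256, proj_size=64,
...                  num_layers=2, cell_clip=3, proj_clip=3)
>>> model.initialize()
>>> char_ids = mx.nd.ones((2, 10, 50))
>>> states = model.begin_state(mx.nd.zeros, batch_size=2)
>>> outputs, states = model(char_ids, states)
>>> len(outputs), outputs[0].shape
(3, (2, 10, 128))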
"""
def __init__(self,
rnn_type,
output_size,
filters,
char_embed_size,
char_vocab_size,
num_highway,
conv_layer_activation,
max_chars_per_token,
input_size,
hidden_size,
proj_size,
num_layers,
cell_clip,
proj_clip,
skip_connection=True,
**kwargs):
super(ELMoBiLM, self).__init__(**kwargs)
self._rnn_type = rnn_type
self._output_size = output_size
self._filters = filters
self._char_embed_size = char_embed_size
self._char_vocab_size = char_vocab_size
self._num_highway = num_highway
self._conv_layer_activation = conv_layer_activation
self._max_chars_per_token = max_chars_per_token
self._input_size = input_size
self._hidden_size = hidden_size
self._proj_size = proj_size
self._num_layers = num_layers
self._cell_clip = cell_clip
self._proj_clip = proj_clip
self._skip_connection = skip_connection
if not self._skip_connection:
raise NotImplementedError
with self.name_scope():
self._elmo_char_encoder = ELMoCharacterEncoder(self._output_size,
self._filters,
self._char_embed_size,
self._num_highway,
self._conv_layer_activation,
self._max_chars_per_token,
self._char_vocab_size)
self._elmo_lstm = BiLMEncoder(mode=self._rnn_type,
input_size=self._input_size,
hidden_size=self._hidden_size,
proj_size=self._proj_size,
num_layers=self._num_layers,
cell_clip=self._cell_clip,
proj_clip=self._proj_clip)
def begin_state(self, func, **kwargs):
return self._elmo_lstm.begin_state(func, **kwargs)

def hybrid_forward(self, F, inputs, states=None, mask=None):
# pylint: disable=arguments-differ
"""
Parameters
----------
inputs : NDArray
Shape (batch_size, sequence_length, max_chars_per_token)
of character ids representing the current batch.
states : (list of list of NDArray, list of list of NDArray)
The states. First tuple element is the forward layer states, while the second is
the states from backward layer. Each is a list of states for each layer.
The state of each layer is a list of two initial tensors with
shape (batch_size, proj_size) and (batch_size, hidden_size).
mask : NDArray
Shape (batch_size, sequence_length) with sequence mask.
Returns
-------
output : list of NDArray
A list of activations at each layer of the network, each of shape
(batch_size, sequence_length, embedding_size)
states : (list of list of NDArray, list of list of NDArray)
The states. First tuple element is the forward layer states, while the second is
the states from backward layer. Each is a list of states for each layer.
The state of each layer is a list of two tensors with
shape (batch_size, proj_size) and (batch_size, hidden_size).
"""
type_representation = self._elmo_char_encoder(inputs)
# to (sequence_length, batch_size, output_size) for the bidirectional LSTM
type_representation = type_representation.transpose(axes=(1, 0, 2))
lstm_outputs, states = self._elmo_lstm(type_representation, states, mask)
# to (num_layers, batch_size, sequence_length, 2 * proj_size)
lstm_outputs = lstm_outputs.transpose(axes=(0, 2, 1, 3))
# back to (batch_size, sequence_length, output_size)
type_representation = type_representation.transpose(axes=(1, 0, 2))
# Prepare the output. The context-free (layer 0) representation is duplicated
# so that its width matches the bidirectional LSTM layer outputs.
output = F.concat(*[type_representation, type_representation], dim=-1)
if mask is not None:
output = output * mask.expand_dims(axis=-1)
output = [output]
output.extend([layer_activations.squeeze(axis=0) for layer_activations
in F.split(lstm_outputs, self._num_layers, axis=0)])
return output, states
model_store._model_sha1.update(
{name: checksum for checksum, name in [
('8c9257d9153436e9eb692f9ec48d8ee07e2120f8', 'elmo_2x1024_128_2048cnn_1xhighway_gbw'),
('85eab56a3c90c6866dd8d13b50449934be58a2e6', 'elmo_2x2048_256_2048cnn_1xhighway_gbw'),
('79af623840c13b10cb891d20c207afc483ab27b9', 'elmo_2x4096_512_2048cnn_2xhighway_5bw'),
('5608a09f33c52e5ab3f043b1793481ab448a0347', 'elmo_2x4096_512_2048cnn_2xhighway_gbw')
]})
def _get_elmo_model(model_cls, model_name, dataset_name, pretrained, ctx, root, **kwargs):
vocab = ELMoCharVocab()
if 'char_vocab_size' not in kwargs:
kwargs['char_vocab_size'] = len(vocab)
net = model_cls(**kwargs)
if pretrained:
model_file = get_model_file('_'.join([model_name, dataset_name]), root=root)
net.load_parameters(model_file, ctx=ctx)
return net, vocab


def elmo_2x1024_128_2048cnn_1xhighway(dataset_name=None, pretrained=False, ctx=mx.cpu(),
root=os.path.join(get_home_dir(), 'models'), **kwargs):
r"""ELMo 2-layer BiLSTM with 1024 hidden units, 128 projection size, 1 highway layer.
Parameters
----------
dataset_name : str or None, default None
The dataset name on which the pre-trained model is trained.
The only available option is 'gbw'.
pretrained : bool, default False
Whether to load the pre-trained weights for model.
ctx : Context, default CPU
The context in which to load the pre-trained weights.
root : str, default '$MXNET_HOME/models'
Location for keeping the model parameters.
MXNET_HOME defaults to '~/.mxnet'.
Returns
-------
gluon.Block
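Examples
--------
A minimal usage sketch; pretrained=True downloads the weights on first use:

>>> elmo, vocab = elmo_2x1024_128_2048cnn_1xhighway(dataset_name='gbw', pretrained=True)
>>> char_ids = mx.nd.array([vocab[['Hello', 'world', '.']]])  # (1, 3, 50)
>>> states = elmo.begin_state(mx.nd.zeros, batch_size=1)
>>> outputs, states = elmo(char_ids, states)
>>> len(outputs), outputs[0].shape
(3, (1, 3, 256))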
"""
predefined_args = {'rnn_type': 'lstmpc',
'output_size': 128,
'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
[5, 256], [6, 512], [7, 1024]],
'char_embed_size': 16,
'num_highway': 1,
'conv_layer_activation': 'relu',
'max_chars_per_token': 50,
'input_size': 128,
'hidden_size': 1024,
'proj_size': 128,
'num_layers': 2,
'cell_clip': 3,
'proj_clip': 3,
'skip_connection': True}
assert all((k not in kwargs) for k in predefined_args), \
'Cannot override predefined model settings.'
predefined_args.update(kwargs)
return _get_elmo_model(ELMoBiLM, 'elmo_2x1024_128_2048cnn_1xhighway', dataset_name, pretrained,
ctx, root, **predefined_args)


def elmo_2x2048_256_2048cnn_1xhighway(dataset_name=None, pretrained=False, ctx=mx.cpu(),
root=os.path.join(get_home_dir(), 'models'), **kwargs):
r"""ELMo 2-layer BiLSTM with 2048 hidden units, 256 projection size, 1 highway layer.
Parameters
----------
dataset_name : str or None, default None
The dataset name on which the pre-trained model is trained.
The only available option is 'gbw'.
pretrained : bool, default False
Whether to load the pre-trained weights for model.
ctx : Context, default CPU
The context in which to load the pre-trained weights.
root : str, default '$MXNET_HOME/models'
Location for keeping the model parameters.
MXNET_HOME defaults to '~/.mxnet'.
Returns
-------
gluon.Block
"""
predefined_args = {'rnn_type': 'lstmpc',
'output_size': 256,
'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
[5, 256], [6, 512], [7, 1024]],
'char_embed_size': 16,
'num_highway': 1,
'conv_layer_activation': 'relu',
'max_chars_per_token': 50,
'input_size': 256,
'hidden_size': 2048,
'proj_size': 256,
'num_layers': 2,
'cell_clip': 3,
'proj_clip': 3,
'skip_connection': True}
assert all((k not in kwargs) for k in predefined_args), \
'Cannot override predefined model settings.'
predefined_args.update(kwargs)
return _get_elmo_model(ELMoBiLM, 'elmo_2x2048_256_2048cnn_1xhighway', dataset_name, pretrained,
ctx, root, **predefined_args)


def elmo_2x4096_512_2048cnn_2xhighway(dataset_name=None, pretrained=False, ctx=mx.cpu(),
root=os.path.join(get_home_dir(), 'models'), **kwargs):
r"""ELMo 2-layer BiLSTM with 4096 hidden units, 512 projection size, 2 highway layer.
Parameters
----------
dataset_name : str or None, default None
The dataset name on which the pre-trained model is trained.
Options are 'gbw' and '5bw'.
pretrained : bool, default False
Whether to load the pre-trained weights for model.
ctx : Context, default CPU
The context in which to load the pre-trained weights.
root : str, default '$MXNET_HOME/models'
Location for keeping the model parameters.
MXNET_HOME defaults to '~/.mxnet'.
Returns
-------
gluon.Block
"""
predefined_args = {'rnn_type': 'lstmpc',
'output_size': 512,
'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
[5, 256], [6, 512], [7, 1024]],
'char_embed_size': 16,
'num_highway': 2,
'conv_layer_activation': 'relu',
'max_chars_per_token': 50,
'input_size': 512,
'hidden_size': 4096,
'proj_size': 512,
'num_layers': 2,
'cell_clip': 3,
'proj_clip': 3,
'skip_connection': True}
assert all((k not in kwargs) for k in predefined_args), \
'Cannot override predefined model settings.'
predefined_args.update(kwargs)
return _get_elmo_model(ELMoBiLM, 'elmo_2x4096_512_2048cnn_2xhighway', dataset_name, pretrained,
ctx, root, **predefined_args)