# Source code for gluonnlp.vocab.elmo

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Vocabulary class used in the original pre-trained ELMo models."""

# pylint: disable=consider-iterating-dictionary

# Public names of this module: only the ELMo character vocabulary class
# is exported by a wildcard import.
__all__ = ['ELMoCharVocab']

class ELMoCharVocab:
    r"""ELMo special character vocabulary

    The vocab maps individual tokens to fixed-length sequences of character ids, compatible
    with ELMo. To be consistent with previously trained models, we include it here.

    Specifically, char ids 0-255 come from utf-8 encoding bytes.
    Ids from 256 upwards are reserved for special characters.

    Every token is encoded as ``[bow_id, <char ids...>, eow_id, pad_id, ...]`` with a total
    length of `max_word_length`.

    Parameters
    ----------
    bos_token : hashable object or None, default '<bos>'
        The representation for the special token of beginning-of-sequence token.
        If None, no beginning-of-sequence token is registered.
    eos_token : hashable object or None, default '<eos>'
        The representation for the special token of end-of-sequence token.
        If None, no end-of-sequence token is registered.

    Attributes
    ----------
    max_word_length : 50
        The maximum number of character a word contains is 50 in ELMo.
    max_word_chars : 48
        The maximum number of utf-8 bytes kept per word, i.e. `max_word_length` without the
        begin-of-word and end-of-word markers.
    bos_id : 256
        The index of beginning of the sentence character is 256 in ELMo.
    eos_id : 257
        The index of end of the sentence character is 257 in ELMo.
    bow_id : 258
        The index of beginning of the word character is 258 in ELMo.
    eow_id : 259
        The index of end of the word character is 259 in ELMo.
    pad_id : 260
        The index of padding character is 260 in ELMo.
    """
    max_word_length = 50
    max_word_chars = 48  # excluding bow and eow
    # char ids 0-255 come from utf-8 encoding bytes
    bos_id = 256
    eos_id = 257
    bow_id = 258
    eow_id = 259
    pad_id = 260

    def __init__(self, bos_token='<bos>', eos_token='<eos>'):
        self._bos_token = bos_token
        self._eos_token = eos_token
        # Special tokens map to a single reserved char id instead of their utf-8 bytes.
        # A token given as None means "no such special token" and is not registered.
        # Insertion order is kept (bos first, eos second) so that eos wins if both
        # tokens are equal.
        self._id_dict = {}
        for token, char_id in ((bos_token, ELMoCharVocab.bos_id),
                               (eos_token, ELMoCharVocab.eos_id)):
            if token is not None:
                self._id_dict[token] = [char_id]

    def __getitem__(self, tokens):
        """Looks up indices of text tokens according to the vocabulary.

        Parameters
        ----------
        tokens : str or list of strs
            A source token or tokens to be converted. A list or tuple is treated as a
            sequence of tokens; anything else is treated as a single token.

        Returns
        -------
        list of ints or list of list of ints
            A list of char indices (single token) or a list of list of char indices
            (sequence of tokens) according to the vocabulary. Every list of char indices
            has length `max_word_length`.
        """
        if isinstance(tokens, (list, tuple)):
            return [self._token_to_char_indices(token) for token in tokens]
        return self._token_to_char_indices(tokens)

    def _token_to_char_indices(self, token):
        """Converts a single token to a padded list of `max_word_length` char ids."""
        # Look up special tokens first: they may be arbitrary hashable objects that
        # cannot be utf-8 encoded, so encoding must only happen for regular tokens.
        word_ids = self._id_dict.get(token)
        if word_ids is None:
            # Regular token: use its utf-8 bytes, truncated so that the begin-of-word and
            # end-of-word markers still fit into max_word_length.
            word_ids = bytearray(token, 'utf-8', 'ignore')[:ELMoCharVocab.max_word_chars]

        ids = [ELMoCharVocab.pad_id] * ELMoCharVocab.max_word_length
        ids[0] = ELMoCharVocab.bow_id
        end = 1 + len(word_ids)
        ids[1:end] = word_ids
        ids[end] = ELMoCharVocab.eow_id
        return ids

    def __call__(self, tokens):
        """Looks up indices of text tokens according to the vocabulary.

        Equivalent to indexing the vocabulary with `tokens`.

        Parameters
        ----------
        tokens : str or list of strs
            A source token or tokens to be converted.

        Returns
        -------
        list of ints or list of list of ints
            A list of char indices or a list of list of char indices according to the vocabulary.
        """
        return self[tokens]

    def __len__(self):
        # NOTE(review): the largest id defined in this class is pad_id = 260; the value 262
        # presumably matches the character embedding size of the pre-trained ELMo models
        # -- confirm before changing.
        return 262