# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=consider-iterating-dictionary
"""Vocabulary."""
__all__ = ['Vocab']

import collections
import json
import uuid
import warnings
import sys
from typing import Dict, Hashable, List, Optional

from mxnet import nd

from .. import _constants as C
from .. import embedding as emb
from ..data.utils import Counter, DefaultLookupDict, count_tokens

UNK_IDX = 0
_DEPR_PAD = object()
_DEPR_BOS = object()
_DEPR_EOS = object()


def _is_py35():
    return sys.version_info[0] == 3 and sys.version_info[1] == 5

class Vocab:
    """Indexing and embedding attachment for text tokens.

    Parameters
    ----------
    counter
        Counts text token frequencies in the text data. Its keys will be indexed according to
        frequency thresholds such as `max_size` and `min_freq`. Keys of `counter`,
        `unknown_token`, and values of `reserved_tokens` must be of the same hashable type.
        Examples: str, int, and tuple.
    max_size
        The maximum possible number of the most frequent tokens in the keys of `counter` that can
        be indexed. Note that this argument does not count any token from `reserved_tokens`. If
        several keys of `counter` have the same frequency and indexing all of them would exceed
        this value, they are indexed one by one according to their __cmp__() order until the
        limit is reached. If this argument is None or larger than its largest possible value
        restricted by `counter` and `reserved_tokens`, this argument has no effect.
    min_freq
        The minimum frequency required for a token in the keys of `counter` to be indexed.
    unknown_token
        The representation for any unknown token. If `unknown_token` is not `None`, looking up
        any token that is not part of the vocabulary and thus considered unknown will return the
        index of `unknown_token`. If None, looking up an unknown token will result in `KeyError`.
    reserved_tokens
        A list specifying additional tokens to be added to the vocabulary. `reserved_tokens` must
        not contain the value of `unknown_token` or duplicate tokens. It must also not contain
        special tokens specified via keyword arguments.
    token_to_idx
        If not `None`, specifies the indices of tokens to be used by the vocabulary. Each token
        in `token_to_idx` must be part of the Vocab and each index can only be associated with a
        single token. `token_to_idx` is not required to contain a mapping for all tokens. For
        example, it is valid to only set the `unknown_token` index to 10 (instead of the default
        of 0) with `token_to_idx = {'<unk>': 10}`, assuming that there are at least 10 tokens in
        the vocabulary.
    `**kwargs`
        Keyword arguments of the format `xxx_token` can be used to specify further special tokens
        that will be exposed as attributes of the vocabulary and associated with an index. For
        example, specifying `mask_token='<mask>'` as an additional keyword argument when
        constructing a vocabulary `v` leads to `v.mask_token` exposing the value of the special
        token: `<mask>`. If the specified token is not part of the vocabulary, it will be added,
        just as if it was listed in the `reserved_tokens` argument. The specified tokens are
        listed together with reserved tokens in the `reserved_tokens` attribute of the vocabulary
        object.
    deprecated_padding_token
        The representation for the special padding token. Default: '<pad>'. Specifying
        padding_token as a positional argument is deprecated and support will be removed. Specify
        it as a keyword argument instead (see documentation of `**kwargs` above).
    deprecated_bos_token
        The representation for the special beginning-of-sequence token. Default: '<bos>'.
        Specifying bos_token as a positional argument is deprecated and support will be removed.
        Specify it as a keyword argument instead (see documentation of `**kwargs` above).
    deprecated_eos_token
        The representation for the special end-of-sequence token. Default: '<eos>'. Specifying
        eos_token as a positional argument is deprecated and support will be removed. Specify it
        as a keyword argument instead (see documentation of `**kwargs` above).

    Attributes
    ----------
    embedding : instance of :class:`gluonnlp.embedding.TokenEmbedding`
        The embedding of the indexed tokens.
    idx_to_token : list of strs
        A list of indexed tokens where the list indices and the token indices are aligned.
    reserved_tokens : list of strs or None
        A list of reserved tokens that will always be indexed.
    token_to_idx : dict mapping str to int
        A dict mapping each token to its index integer.
    unknown_token : hashable object or None
        The representation for any unknown token. In other words, any unknown token will be
        indexed as the same representation.
    padding_token : hashable object or None
        The representation for the padding token.
    bos_token : hashable object or None
        The representation for the beginning-of-sentence token.
    eos_token : hashable object or None
        The representation for the end-of-sentence token.

    Examples
    --------

    >>> text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world']
    >>> counter = gluonnlp.data.count_tokens(text_data)
    >>> my_vocab = gluonnlp.Vocab(counter)
    >>> fasttext = gluonnlp.embedding.create('fasttext', source='wiki.simple')
    -etc-
    >>> my_vocab.set_embedding(fasttext)
    >>> my_vocab.embedding[['hello', 'world']][:, :5]
    <BLANKLINE>
    [[ 0.39567   0.21454  -0.035389 -0.24299  -0.095645]
     [ 0.10444  -0.10858   0.27212   0.13299  -0.33165 ]]
    <NDArray 2x5 @cpu(0)>
    >>> my_vocab[['hello', 'world']]
    [5, 4]
    >>> input_dim, output_dim = my_vocab.embedding.idx_to_vec.shape
    >>> layer = gluon.nn.Embedding(input_dim, output_dim)
    >>> layer.initialize()
    >>> layer.weight.set_data(my_vocab.embedding.idx_to_vec)
    >>> layer(mx.nd.array([5, 4]))[:, :5]
    <BLANKLINE>
    [[ 0.39567   0.21454  -0.035389 -0.24299  -0.095645]
     [ 0.10444  -0.10858   0.27212   0.13299  -0.33165 ]]
    <NDArray 2x5 @cpu(0)>
    >>> glove = gluonnlp.embedding.create('glove', source='glove.6B.50d')
    -etc-
    >>> my_vocab.set_embedding(glove)
    >>> my_vocab.embedding[['hello', 'world']][:, :5]
    <BLANKLINE>
    [[-0.38497   0.80092   0.064106 -0.28355  -0.026759]
     [-0.41486   0.71848  -0.3045    0.87445   0.22441 ]]
    <NDArray 2x5 @cpu(0)>

    Extra keyword arguments of the format `xxx_token` are used to expose specified tokens as
    attributes.

    >>> my_vocab2 = gluonnlp.Vocab(counter, special_token='hi')
    >>> my_vocab2.special_token
    'hi'

    With the `token_to_idx` argument the order of the `Vocab`'s index can be adapted. For
    example, `Vocab` assigns the index `0` to the `unknown_token` by default. With the
    `token_to_idx` argument, the default can be overwritten. Here we assign index `3` to the
    unknown token representation `<unk>`.

    >>> tok2idx = {'<unk>': 3}
    >>> my_vocab3 = gluonnlp.Vocab(counter, token_to_idx=tok2idx)
    >>> my_vocab3.unknown_token
    '<unk>'
    >>> my_vocab3[my_vocab3.unknown_token]
    3
    >>> my_vocab[my_vocab.unknown_token]
    0

    """

    def __init__(self, counter: Optional[Counter] = None, max_size: Optional[int] = None,
                 min_freq: int = 1, unknown_token: Optional[Hashable] = C.UNK_TOKEN,
                 deprecated_padding_token: Optional[Hashable] = _DEPR_PAD,
                 deprecated_bos_token: Optional[Hashable] = _DEPR_BOS,
                 deprecated_eos_token: Optional[Hashable] = _DEPR_EOS,
                 reserved_tokens: Optional[List[Hashable]] = None,
                 token_to_idx: Optional[Dict[Hashable, int]] = None, *,
                 padding_token: Optional[Hashable] = C.PAD_TOKEN,
                 bos_token: Optional[Hashable] = C.BOS_TOKEN,
                 eos_token: Optional[Hashable] = C.EOS_TOKEN, **kwargs):
        # Sanity checks.
        assert min_freq > 0, '`min_freq` must be set to a positive value.'

        # Deprecation checks and warnings
        combs = ((deprecated_padding_token, 'padding_token', _DEPR_PAD, padding_token),
                 (deprecated_bos_token, 'bos_token', _DEPR_BOS, bos_token),
                 (deprecated_eos_token, 'eos_token', _DEPR_EOS, eos_token))
        for depr_pos_arg, name, indicator, value in combs:
            if depr_pos_arg != indicator:
                warnings.warn(
                    'Specifying `{n}` as positional argument is deprecated and '
                    'support will be removed. Please specify `{n}` as keyword argument instead, '
                    'for example `Vocab(counter, {n}={v})`'.format(n=name, v=depr_pos_arg),
                    DeprecationWarning)
                # Store positional argument value in kwargs
                kwargs[name] = depr_pos_arg
            elif name not in kwargs:
                # Store keyword argument value in kwargs
                kwargs[name] = value

        # Set up idx_to_token and token_to_idx based on presence of unknown token
        self._unknown_token = unknown_token
        self._idx_to_token = [unknown_token] if unknown_token else []
        if unknown_token:
            self._token_to_idx = DefaultLookupDict(UNK_IDX)
        else:
            self._token_to_idx = {}

        # Handle special tokens
        special_tokens = []
        special_iter = kwargs.items()
        if _is_py35():
            special_iter = sorted(special_iter)
        for special_token_name, special_token in special_iter:
            # Test if kwarg specifies a special token
            if not special_token_name.endswith('_token'):
                raise ValueError('{} is invalid. Only keyword arguments '
                                 'that end in \'_token\' are supported '
                                 'to declare special tokens.'.format(special_token_name))
            if special_token is not None and special_token not in special_tokens:
                special_tokens.append(special_token)

        if reserved_tokens is not None:
            special_tokens.extend(reserved_tokens)
            special_token_set = set(special_tokens)
            if unknown_token:
                assert unknown_token not in special_token_set, \
                    '`reserved_tokens` cannot contain `unknown_token`.'
            assert len(special_token_set) == len(special_tokens), \
                '`reserved_tokens` cannot contain duplicate reserved tokens or ' \
                'other special tokens.'

        if not special_tokens:
            self._reserved_tokens = None
        else:
            self._reserved_tokens = special_tokens
            self._idx_to_token.extend(special_tokens)

        self._token_to_idx.update((token, idx) for idx, token in enumerate(self._idx_to_token))
        self._embedding = None

        if counter:
            self._index_counter_keys(counter, unknown_token, special_tokens, max_size, min_freq)

        self._identifiers_to_tokens = kwargs
        if kwargs:
            self._expose_tokens_as_attributes(kwargs)

        if token_to_idx:
            self._sort_index_according_to_user_specification(token_to_idx)
            if unknown_token:
                self._token_to_idx._default = \
                    self._token_to_idx[unknown_token]  # pytype: disable=not-writable

    def _index_counter_keys(self, counter, unknown_token, special_tokens, max_size, min_freq):
        """Indexes keys of `counter`.

        Indexes keys of `counter` according to frequency thresholds such as `max_size` and
        `min_freq`.
        """
        unknown_and_special_tokens = set(special_tokens) if special_tokens else set()
        if unknown_token:
            unknown_and_special_tokens.add(unknown_token)

        token_freqs = sorted(counter.items(), key=lambda x: x[0])
        token_freqs.sort(key=lambda x: x[1], reverse=True)

        token_cap = len(unknown_and_special_tokens) + (
            len(counter) if not max_size else max_size)

        for token, freq in token_freqs:
            if freq < min_freq or len(self._idx_to_token) == token_cap:
                break
            if token not in unknown_and_special_tokens:
                self._idx_to_token.append(token)
                self._token_to_idx[token] = len(self._idx_to_token) - 1

    def _expose_tokens_as_attributes(self, identifiers_to_tokens):
        # This method must not be called before internal attributes accessed by
        # @property getters are set. Otherwise the @property may raise
        # during the hasattr(self, identifier) check.
        for identifier, token in identifiers_to_tokens.items():
            # Special tokens are automatically added to the vocab; assert, just to be sure
            assert token is None or token in self
            if identifier.startswith('_'):
                raise ValueError('It is not allowed to use identifiers starting with '
                                 'underscore. In Python identifier names beginning with '
                                 'underscore are internal.')
            if hasattr(self, identifier):
                raise ValueError('vocab.{} already exists. '
                                 'Please choose a different identifier for token {}'
                                 .format(identifier, token))
            setattr(self, identifier, token)

    def _sort_index_according_to_user_specification(self, token_to_idx):
        # Sanity checks
        if not set(token_to_idx.keys()).issubset(self.token_to_idx.keys()):
            raise ValueError('User-specified token_to_idx mapping can only contain '
                             'tokens that will be part of the vocabulary.')
        if len(set(token_to_idx.values())) != len(token_to_idx):
            raise ValueError('User-specified indices must not contain duplicates.')
        if min(token_to_idx.values()) < 0 or max(token_to_idx.values()) >= len(self.token_to_idx):
            raise ValueError('User-specified indices must not be < 0 or >= the number of tokens '
                             'that will be in the vocabulary. The current vocab contains {} '
                             'tokens.'.format(len(self.token_to_idx)))

        # Update index ordering
        for token, new_idx in token_to_idx.items():
            old_idx = self.token_to_idx[token]
            ousted_token = self.idx_to_token[new_idx]

            self.token_to_idx[token] = new_idx
            self.token_to_idx[ousted_token] = old_idx
            self.idx_to_token[old_idx] = ousted_token
            self.idx_to_token[new_idx] = token

    @property
    def embedding(self):
        return self._embedding

    @property
    def idx_to_token(self):
        return self._idx_to_token

    @property
    def reserved_tokens(self):
        return self._reserved_tokens

    @property
    def token_to_idx(self):
        return self._token_to_idx

    @property
    def unknown_token(self):
        return self._unknown_token

    def __contains__(self, token):
        """Checks whether a text token exists in the vocabulary.

        Parameters
        ----------
        token : str
            A text token.

        Returns
        -------
        bool
            Whether the text token exists in the vocabulary (including `unknown_token`).
        """
        return token in self._token_to_idx

    def __getitem__(self, tokens):
        """Looks up indices of text tokens according to the vocabulary.

        If `unknown_token` of the vocabulary is None, looking up unknown tokens results in
        KeyError.

        Parameters
        ----------
        tokens : str or list of strs
            A source token or tokens to be converted.

        Returns
        -------
        int or list of ints
            A token index or a list of token indices according to the vocabulary.
        """
        if not isinstance(tokens, (list, tuple)):
            return self._token_to_idx[tokens]
        else:
            return [self._token_to_idx[token] for token in tokens]

    def __len__(self):
        return len(self._idx_to_token)
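
    # Usage sketch (illustrative comment only, not executed here): how the constructor
    # arguments above interact. The token strings and counts are made-up examples.
    #
    #   import gluonnlp
    #   counter = gluonnlp.data.count_tokens(['a', 'a', 'a', 'b', 'b', 'c'])
    #   # Keep at most 2 counter keys, and only those seen at least twice.
    #   v = gluonnlp.Vocab(counter, max_size=2, min_freq=2)
    #   v.idx_to_token     # e.g. ['<unk>', '<pad>', '<bos>', '<eos>', 'a', 'b']; 'c' is dropped
    #   v['a'], v['c']     # e.g. (4, 0) -- unknown tokens map to the <unk> index
    #   # Extra `xxx_token` keyword arguments become attributes and reserved tokens.
    #   v2 = gluonnlp.Vocab(counter, mask_token='<mask>')
    #   v2.mask_token      # '<mask>'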

    def set_embedding(self, *embeddings):
        """Attaches one or more embeddings to the indexed text tokens.

        Parameters
        ----------
        embeddings : None or tuple of :class:`gluonnlp.embedding.TokenEmbedding` instances
            The embedding to be attached to the indexed tokens. If a tuple of multiple embeddings
            are provided, their embedding vectors will be concatenated for the same token.
        """
        if len(embeddings) == 1 and embeddings[0] is None:
            self._embedding = None
            return

        for embs in embeddings:
            assert isinstance(embs, emb.TokenEmbedding), \
                'The argument `embeddings` must be an instance or a list of instances of ' \
                '`gluonnlp.embedding.TokenEmbedding`.'
            assert embs.idx_to_vec is not None, \
                'For all specified `embeddings`, `embeddings.idx_to_vec` must be initialized. ' \
                'Use e.g. `emb[emb.unknown_token] = nd.zeros(emsize)` to initialize, ' \
                'where `emsize` is the desired embedding dimensionality.'

        assert all([embs.unknown_token for embs in embeddings]) or \
            all([not embs.unknown_token for embs in embeddings]), \
            'Either all or none of the TokenEmbeddings must have an ' \
            'unknown_token set.'

        new_vec_len = sum(embs.idx_to_vec.shape[1] for embs in embeddings)
        # TODO(leezu): Remove once np shape is used by default
        assert len(self), 'Empty vocab not yet supported'
        new_idx_to_vec = nd.zeros(shape=(len(self), new_vec_len))

        col_start = 0
        # Concatenate all the embedding vectors in embedding.
        for embs in embeddings:
            if embs and embs.idx_to_vec is not None:
                col_end = col_start + embs.idx_to_vec.shape[1]
                # Concatenate vectors of the unknown token.
                new_idx_to_vec[0, col_start:col_end] = embs.idx_to_vec[0]
                new_idx_to_vec[1:, col_start:col_end] = embs[self._idx_to_token[1:]]
                col_start = col_end

        self._embedding = emb.TokenEmbedding(self.unknown_token, init_unknown_vec=None,
                                             allow_extend=False,
                                             idx_to_token=self.idx_to_token,
                                             idx_to_vec=new_idx_to_vec)
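
    # Usage sketch (illustrative comment only, not executed here): attaching two pretrained
    # embeddings at once concatenates their vectors per token. The embedding sources named
    # below are examples and are downloaded on first use; the dimensionalities (50 and 300)
    # follow the named sources.
    #
    #   import gluonnlp
    #   v = gluonnlp.Vocab(gluonnlp.data.count_tokens(['hello', 'world']))
    #   glove = gluonnlp.embedding.create('glove', source='glove.6B.50d')
    #   fasttext = gluonnlp.embedding.create('fasttext', source='wiki.simple')
    #   v.set_embedding(glove, fasttext)
    #   v.embedding.idx_to_vec.shape   # (len(v), 50 + 300): columns are concatenated
    #   v.set_embedding(None)          # detaches the embedding again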

    def to_tokens(self, indices):
        """Converts token indices to tokens according to the vocabulary.

        Parameters
        ----------
        indices : int or list of ints
            A source token index or token indices to be converted.

        Returns
        -------
        str or list of strs
            A token or a list of tokens according to the vocabulary.
        """
        to_reduce = False
        if not isinstance(indices, (list, tuple)):
            indices = [indices]
            to_reduce = True

        max_idx = len(self._idx_to_token) - 1

        tokens = []
        for idx in indices:
            if not isinstance(idx, int) or idx > max_idx:
                raise ValueError('Token index {} in the provided `indices` is invalid.'
                                 .format(idx))
            tokens.append(self._idx_to_token[idx])

        return tokens[0] if to_reduce else tokens
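
    # Usage sketch (illustrative comment only, not executed here): `to_tokens` is the inverse
    # of the token-to-index lookup.
    #
    #   v = gluonnlp.Vocab(gluonnlp.data.count_tokens(['hello', 'world', 'world']))
    #   v.to_tokens(v['world'])             # 'world' -- a single int returns a single token
    #   v.to_tokens(v[['hello', 'world']])  # ['hello', 'world'] -- a list maps to a list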

    def to_indices(self, tokens):
        """Looks up indices of text tokens according to the vocabulary.

        Parameters
        ----------
        tokens : str or list of strs
            A source token or tokens to be converted.

        Returns
        -------
        int or list of ints
            A token index or a list of token indices according to the vocabulary.
        """
        return self[tokens]

    def __call__(self, tokens):
        """Looks up indices of text tokens according to the vocabulary.

        Parameters
        ----------
        tokens : str or list of strs
            A source token or tokens to be converted.

        Returns
        -------
        int or list of ints
            A token index or a list of token indices according to the vocabulary.
        """
        return self[tokens]
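
    # Usage sketch (illustrative comment only, not executed here): `to_indices`, calling the
    # vocabulary, and indexing it all perform the same token-to-index lookup.
    #
    #   v = gluonnlp.Vocab(gluonnlp.data.count_tokens(['hello', 'world', 'world']))
    #   v.to_indices('world') == v('world') == v['world']   # True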

    def __repr__(self):
        unk = '"{}"'.format(self._unknown_token) if self._unknown_token else 'None'
        reserved = '"{}"'.format(self._reserved_tokens) if self._reserved_tokens else 'None'
        return 'Vocab(size={}, unk={}, reserved={})'.format(len(self), unk, reserved)

    def to_json(self):
        """Serialize Vocab object to json string.

        This method does not serialize the underlying embedding.
        """
        if self._embedding:
            warnings.warn('Serialization of attached embedding '
                          'to json is not supported. '
                          'You may serialize the embedding to a binary format '
                          'separately using vocab.embedding.serialize')
        vocab_dict = {}
        vocab_dict['idx_to_token'] = self._idx_to_token
        vocab_dict['token_to_idx'] = dict(self._token_to_idx)
        vocab_dict['reserved_tokens'] = self._reserved_tokens
        vocab_dict['unknown_token'] = self._unknown_token
        vocab_dict['identifiers_to_tokens'] = self._identifiers_to_tokens
        return json.dumps(vocab_dict)

    @classmethod
    def from_json(cls, json_str):
        """Deserialize Vocab object from json string.

        Parameters
        ----------
        json_str : str
            Serialized json string of a Vocab object.

        Returns
        -------
        Vocab
        """
        vocab_dict = json.loads(json_str)
        token_to_idx = vocab_dict.get('token_to_idx')
        unknown_token = vocab_dict.get('unknown_token')
        reserved_tokens = vocab_dict.get('reserved_tokens')
        identifiers_to_tokens = vocab_dict.get('identifiers_to_tokens', dict())
        special_tokens = {unknown_token}

        # Backward compatibility for explicit serialization of padding_token,
        # bos_token, eos_token handling in the json string as done in older
        # versions of GluonNLP.
        deprecated_arguments = ['padding_token', 'bos_token', 'eos_token']
        for token_name in deprecated_arguments:
            if token_name in vocab_dict:
                token = vocab_dict[token_name]
                assert token_name not in identifiers_to_tokens, 'Invalid json string. ' \
                    '{} was serialized twice.'.format(token_name)
                identifiers_to_tokens[token_name] = token

        # Separate reserved from special tokens
        special_tokens.update(identifiers_to_tokens.values())
        if reserved_tokens is not None:
            reserved_tokens = [
                t for t in reserved_tokens if t not in special_tokens
            ]

        # Backward compatibility code to deserialize corrupted vocabularies
        # created without bugfix https://github.com/dmlc/gluon-nlp/pull/749
        corrected_token_to_idx = collections.defaultdict(list)
        idx_to_token = vocab_dict.get('idx_to_token')
        if len(idx_to_token) > len(token_to_idx):  # Index is corrupted
            warnings.warn(
                'Detected a corrupted index in the deserialized vocabulary. '
                'For versions before GluonNLP v0.7 the index is corrupted '
                'by specifying the same token for different special purposes, '
                'for example eos_token == padding_token. '
                'Deserializing the vocabulary nevertheless.')
            for token, count in collections.Counter(idx_to_token).items():
                if count == 1:
                    continue
                # Introduce new tokens to avoid invalid duplicates
                idx = -1
                while count > 0:
                    count -= 1
                    idx = idx_to_token.index(token, idx + 1)
                    if idx == token_to_idx[token]:  # Valid idx
                        continue
                    # Introduce temporary token
                    token_to_idx.update({str(uuid.uuid4()): idx})
                    corrected_token_to_idx[token].append(idx)

        vocab = cls(
            counter=count_tokens(token_to_idx.keys()),
            unknown_token=unknown_token,
            reserved_tokens=reserved_tokens,
            token_to_idx=token_to_idx,
            **identifiers_to_tokens)

        # Backward compatibility code to deserialize corrupted vocabularies
        # created without bugfix https://github.com/dmlc/gluon-nlp/pull/749
        for token, corrected_idxs in corrected_token_to_idx.items():
            for idx in corrected_idxs:
                # delete temporary tokens
                del vocab._token_to_idx[vocab._idx_to_token[idx]]
                vocab._idx_to_token[idx] = token

        return vocab
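

# Round-trip sketch for to_json/from_json. It is guarded so it never runs on import and only
# executes when this module is run directly (e.g. `python -m gluonnlp.vocab.vocab`); the token
# data below is illustrative and the `_demo_*` names are not part of the public API.
if __name__ == '__main__':
    _demo_vocab = Vocab(count_tokens(['hello', 'world', 'hello']))
    _demo_restored = Vocab.from_json(_demo_vocab.to_json())
    # The restored vocabulary preserves the index order and the special-token attributes.
    assert _demo_restored.idx_to_token == _demo_vocab.idx_to_token
    assert dict(_demo_restored.token_to_idx) == dict(_demo_vocab.token_to_idx)
    assert _demo_restored.padding_token == _demo_vocab.padding_token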