Source code for gluonnlp.vocab.vocab
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=consider-iterating-dictionary
"""Vocabulary."""
__all__ = ['Vocab']
import collections
import json
import uuid
import warnings
import sys
from typing import Dict, Hashable, List, Optional
from mxnet import nd
from .. import _constants as C
from .. import embedding as emb
from ..data.utils import Counter, DefaultLookupDict, count_tokens
UNK_IDX = 0
_DEPR_PAD = object()
_DEPR_BOS = object()
_DEPR_EOS = object()
def _is_py35():
return sys.version_info[0] == 3 and sys.version_info[1] == 5
class Vocab:
"""Indexing and embedding attachment for text tokens.
Parameters
----------
counter
Counts text token frequencies in the text data. Its keys will be indexed according to
frequency thresholds such as `max_size` and `min_freq`. Keys of `counter`,
`unknown_token`, and values of `reserved_tokens` must be of the same hashable type.
Examples: str, int, and tuple.
max_size
The maximum possible number of the most frequent tokens in the keys of `counter` that can be
indexed. Note that this argument does not count any token from `reserved_tokens`. If
several keys of `counter` have the same frequency and indexing all of them would exceed
this limit, they are indexed one by one in the sorted order of the tokens until the
limit is reached. If this argument is None or larger than its largest possible value
restricted by `counter` and `reserved_tokens`, this argument has no effect.
min_freq
The minimum frequency required for a token in the keys of `counter` to be indexed.
unknown_token
The representation for any unknown token. If `unknown_token` is not
`None`, looking up any token that is not part of the vocabulary and
thus considered unknown will return the index of `unknown_token`. If
None, looking up an unknown token will result in `KeyError`.
reserved_tokens
A list specifying additional tokens to be added to the vocabulary.
`reserved_tokens` must not contain the value of `unknown_token`, any
duplicate tokens, or any of the special tokens specified via keyword
arguments.
token_to_idx
If not `None`, specifies the indices of tokens to be used by the
vocabulary. Each token in `token_to_idx` must be part of the Vocab
and each index can only be associated with a single token.
`token_to_idx` is not required to contain a mapping for all tokens. For
example, it is valid to only set the `unknown_token` index to 10
(instead of the default of 0) with `token_to_idx = {'<unk>': 10}`,
assuming that there are at least 10 tokens in the vocabulary.
`**kwargs`
Keyword arguments of the format `xxx_token` can be used to specify
further special tokens that will be exposed as attributes of the
vocabulary and associated with an index.
For example, specifying `mask_token='<mask>'` as an additional keyword
argument when constructing a vocabulary `v` leads to `v.mask_token`
exposing the value of the special token: `<mask>`.
If the specified token is not part of the vocabulary, it will be added,
just as if it had been listed in the `reserved_tokens` argument. The
specified tokens are listed together with reserved tokens in the
`reserved_tokens` attribute of the vocabulary object.
deprecated_padding_token
The representation for the special padding token. Default: '<pad>'.
Specifying `padding_token` as a positional argument is deprecated and
support will be removed. Specify it as a keyword argument instead (see
the documentation of `**kwargs` above).
deprecated_bos_token
The representation for the special beginning-of-sequence token. Default:
'<bos>'. Specifying `bos_token` as a positional argument is deprecated
and support will be removed. Specify it as a keyword argument instead
(see the documentation of `**kwargs` above).
deprecated_eos_token
The representation for the special end-of-sequence token. Default:
'<eos>'. Specifying `eos_token` as a positional argument is deprecated
and support will be removed. Specify it as a keyword argument instead
(see the documentation of `**kwargs` above).
Attributes
----------
embedding : instance of :class:`gluonnlp.embedding.TokenEmbedding`
The embedding of the indexed tokens.
idx_to_token : list of strs
A list of indexed tokens where the list indices and the token indices are aligned.
reserved_tokens : list of strs or None
A list of reserved tokens that will always be indexed.
token_to_idx : dict mapping str to int
A dict mapping each token to its index integer.
unknown_token : hashable object or None
The representation for any unknown token. In other words, any unknown token will be indexed
as the same representation.
padding_token : hashable object or None
The representation for padding token.
bos_token : hashable object or None
The representation for beginning-of-sentence token.
eos_token : hashable object or None
The representation for end-of-sentence token.
Examples
--------
>>> text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world']
>>> counter = gluonnlp.data.count_tokens(text_data)
>>> my_vocab = gluonnlp.Vocab(counter)
>>> fasttext = gluonnlp.embedding.create('fasttext', source='wiki.simple')
-etc-
>>> my_vocab.set_embedding(fasttext)
>>> my_vocab.embedding[['hello', 'world']][:, :5]
<BLANKLINE>
[[ 0.39567 0.21454 -0.035389 -0.24299 -0.095645]
[ 0.10444 -0.10858 0.27212 0.13299 -0.33165 ]]
<NDArray 2x5 @cpu(0)>
>>> my_vocab[['hello', 'world']]
[5, 4]
>>> input_dim, output_dim = my_vocab.embedding.idx_to_vec.shape
>>> layer = gluon.nn.Embedding(input_dim, output_dim)
>>> layer.initialize()
>>> layer.weight.set_data(my_vocab.embedding.idx_to_vec)
>>> layer(mx.nd.array([5, 4]))[:, :5]
<BLANKLINE>
[[ 0.39567 0.21454 -0.035389 -0.24299 -0.095645]
[ 0.10444 -0.10858 0.27212 0.13299 -0.33165 ]]
<NDArray 2x5 @cpu(0)>
>>> glove = gluonnlp.embedding.create('glove', source='glove.6B.50d')
-etc-
>>> my_vocab.set_embedding(glove)
>>> my_vocab.embedding[['hello', 'world']][:, :5]
<BLANKLINE>
[[-0.38497 0.80092 0.064106 -0.28355 -0.026759]
[-0.41486 0.71848 -0.3045 0.87445 0.22441 ]]
<NDArray 2x5 @cpu(0)>
Extra keyword arguments of the format `xxx_token` are used to expose
specified tokens as attributes.
>>> my_vocab2 = gluonnlp.Vocab(counter, special_token='hi')
>>> my_vocab2.special_token
'hi'
With the `token_to_idx` argument the order of the `Vocab`'s index can be
adapted. For example, `Vocab` assigns the index `0` to the `unknown_token`
by default. With the `token_to_idx` argument, the default can be
overwritten. Here we assign index `3` to the unknown token representation
`<unk>`.
>>> tok2idx = {'<unk>': 3}
>>> my_vocab3 = gluonnlp.Vocab(counter, token_to_idx=tok2idx)
>>> my_vocab3.unknown_token
'<unk>'
>>> my_vocab3[my_vocab3.unknown_token]
3
>>> my_vocab[my_vocab.unknown_token]
0
"""
def __init__(self, counter: Optional[Counter] = None, max_size: Optional[int] = None,
min_freq: int = 1, unknown_token: Optional[Hashable] = C.UNK_TOKEN,
deprecated_padding_token: Optional[Hashable] = _DEPR_PAD,
deprecated_bos_token: Optional[Hashable] = _DEPR_BOS,
deprecated_eos_token: Optional[Hashable] = _DEPR_EOS,
reserved_tokens: Optional[List[Hashable]] = None,
token_to_idx: Optional[Dict[Hashable, int]] = None, *,
padding_token: Optional[Hashable] = C.PAD_TOKEN,
bos_token: Optional[Hashable] = C.BOS_TOKEN,
eos_token: Optional[Hashable] = C.EOS_TOKEN, **kwargs):
# Sanity checks.
assert min_freq > 0, '`min_freq` must be set to a positive value.'
# Deprecation checks and warnings
combs = ((deprecated_padding_token, 'padding_token', _DEPR_PAD, padding_token),
(deprecated_bos_token, 'bos_token', _DEPR_BOS, bos_token),
(deprecated_eos_token, 'eos_token', _DEPR_EOS, eos_token))
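# A deprecated positional value that differs from its sentinel object means the
# caller actually passed it; fold it into kwargs so it is handled like any other
# `xxx_token` keyword argument below.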
for depr_pos_arg, name, indicator, value in combs:
if depr_pos_arg != indicator:
warnings.warn(
'Specifying `{n}` as positional argument is deprecated and '
'support will be removed. Please specify `{n}` as keyword argument instead, '
'for example `Vocab(counter, {n}={v})`'.format(n=name, v=depr_pos_arg),
DeprecationWarning)
# Store positional argument value in kwargs
kwargs[name] = depr_pos_arg
elif name not in kwargs: # Store keyword argument value in kwargs
kwargs[name] = value
# Set up idx_to_token and token_to_idx based on presence of unknown token
self._unknown_token = unknown_token
self._idx_to_token = [unknown_token] if unknown_token else []
if unknown_token:
self._token_to_idx = DefaultLookupDict(UNK_IDX)
else:
self._token_to_idx = {}
# Handle special tokens
special_tokens = []
special_iter = kwargs.items()
if _is_py35():
special_iter = sorted(special_iter)
for special_token_name, special_token in special_iter:
# Test if kwarg specifies a special token
if not special_token_name.endswith('_token'):
raise ValueError('{} is invalid. Only keyword arguments '
'that end in \'_token\' are supported '
'to declare special tokens.'.format(special_token_name))
if special_token is not None and special_token not in special_tokens:
special_tokens.append(special_token)
if reserved_tokens is not None:
special_tokens.extend(reserved_tokens)
special_token_set = set(special_tokens)
if unknown_token:
assert unknown_token not in special_token_set, \
'`reserved_tokens` cannot contain `unknown_token`.'
assert len(special_token_set) == len(special_tokens), \
'`reserved_tokens` cannot contain duplicate reserved tokens or ' \
'other special tokens.'
if not special_tokens:
self._reserved_tokens = None
else:
self._reserved_tokens = special_tokens
self._idx_to_token.extend(special_tokens)
self._token_to_idx.update((token, idx) for idx, token in enumerate(self._idx_to_token))
self._embedding = None
if counter:
self._index_counter_keys(counter, unknown_token, special_tokens, max_size, min_freq)
self._identifiers_to_tokens = kwargs
if kwargs:
self._expose_tokens_as_attributes(kwargs)
if token_to_idx:
self._sort_index_according_to_user_specification(token_to_idx)
if unknown_token:
self._token_to_idx._default = \
self._token_to_idx[unknown_token] # pytype: disable=not-writable
def _index_counter_keys(self, counter, unknown_token, special_tokens, max_size,
min_freq):
"""Indexes keys of `counter`.
Indexes keys of `counter` according to frequency thresholds such as `max_size` and
`min_freq`.
"""
unknown_and_special_tokens = set(special_tokens) if special_tokens else set()
if unknown_token:
unknown_and_special_tokens.add(unknown_token)
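# Sort by token first so that frequency ties are broken deterministically by the
# tokens' natural ordering; the second, stable sort then orders primarily by
# descending frequency.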
token_freqs = sorted(counter.items(), key=lambda x: x[0])
token_freqs.sort(key=lambda x: x[1], reverse=True)
token_cap = len(unknown_and_special_tokens) + (
len(counter) if not max_size else max_size)
for token, freq in token_freqs:
if freq < min_freq or len(self._idx_to_token) == token_cap:
break
if token not in unknown_and_special_tokens:
self._idx_to_token.append(token)
self._token_to_idx[token] = len(self._idx_to_token) - 1
def _expose_tokens_as_attributes(self, identifiers_to_tokens):
# This method must not be called before internal attributes accessed by
# @properties getters are set. Otherwise the @properties may raise
# during the hasattr(self, identifier) check
for identifier, token in identifiers_to_tokens.items():
# Special tokens are automatically added to the vocab; assert, just to be sure
assert token is None or token in self
if identifier.startswith('_'):
raise ValueError('It is not allowed to use identifiers starting with '
'underscore. In Python identifier names beginning with '
'underscore are internal.')
if hasattr(self, identifier):
raise ValueError('vocab.{} already exists. '
'Please choose a different identifier for token {}'
.format(identifier, token))
setattr(self, identifier, token)
def _sort_index_according_to_user_specification(self, token_to_idx):
# Sanity checks
if not set(token_to_idx.keys()).issubset(self.token_to_idx.keys()):
raise ValueError('User-specified token_to_idx mapping can only contain '
'tokens that will be part of the vocabulary.')
if len(set(token_to_idx.values())) != len(token_to_idx):
raise ValueError('User-specified indices must not contain duplicates.')
if min(token_to_idx.values()) < 0 or max(token_to_idx.values()) >= len(self.token_to_idx):
raise ValueError('User-specified indices must not be < 0 or >= the number of tokens '
'that will be in the vocabulary. The current vocab contains {} '
'tokens.'.format(len(self.token_to_idx)))
# Update index ordering
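# Swap each user-specified token with whichever token currently occupies the
# requested index, keeping token_to_idx and idx_to_token consistent after every
# swap.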
for token, new_idx in token_to_idx.items():
old_idx = self.token_to_idx[token]
ousted_token = self.idx_to_token[new_idx]
self.token_to_idx[token] = new_idx
self.token_to_idx[ousted_token] = old_idx
self.idx_to_token[old_idx] = ousted_token
self.idx_to_token[new_idx] = token
@property
def embedding(self):
return self._embedding
@property
def idx_to_token(self):
return self._idx_to_token
@property
def reserved_tokens(self):
return self._reserved_tokens
@property
def token_to_idx(self):
return self._token_to_idx
@property
def unknown_token(self):
return self._unknown_token
def __contains__(self, token):
"""Checks whether a text token exists in the vocabulary.
Parameters
----------
token : str
A text token.
Returns
-------
bool
Whether the text token exists in the vocabulary (including `unknown_token`).
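Examples
--------
A minimal sketch, assuming the default special tokens; the example tokens
are arbitrary:
>>> v = gluonnlp.Vocab(gluonnlp.data.count_tokens(['a', 'b', 'b']))
>>> 'b' in v
True
>>> 'c' in v
False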
"""
return token in self._token_to_idx
def __getitem__(self, tokens):
"""Looks up indices of text tokens according to the vocabulary.
If `unknown_token` of the vocabulary is None, looking up unknown tokens results in KeyError.
Parameters
----------
tokens : str or list of strs
A source token or tokens to be converted.
Returns
-------
int or list of ints
A token index or a list of token indices according to the vocabulary.
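Examples
--------
A minimal sketch, assuming the default `unknown_token` at index 0; the
example tokens are arbitrary:
>>> v = gluonnlp.Vocab(gluonnlp.data.count_tokens(['a', 'b', 'b']))
>>> v['b']
4
>>> v[['a', 'b']]
[5, 4]
>>> v['never-seen']
0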
"""
if not isinstance(tokens, (list, tuple)):
return self._token_to_idx[tokens]
else:
return [self._token_to_idx[token] for token in tokens]
def __len__(self):
return len(self._idx_to_token)
def set_embedding(self, *embeddings):
"""Attaches one or more embeddings to the indexed text tokens.
Parameters
----------
embeddings : None or tuple of :class:`gluonnlp.embedding.TokenEmbedding` instances
The embedding(s) to be attached to the indexed tokens. If a tuple of multiple
embeddings is provided, their embedding vectors will be concatenated for the same token.
"""
if len(embeddings) == 1 and embeddings[0] is None:
self._embedding = None
return
for embs in embeddings:
assert isinstance(embs, emb.TokenEmbedding), \
'The argument `embeddings` must be an instance or a list of instances of ' \
'`gluonnlp.embedding.TokenEmbedding`.'
assert embs.idx_to_vec is not None, \
'For all specified `embeddings`, `embeddings.idx_to_vec` must be initialized. ' \
'Use e.g. `emb[emb.unknown_token] = nd.zeros(emsize)` to initialize, ' \
'where `emsize` is the desired embedding dimensionality.'
assert all([embs.unknown_token for embs in embeddings]) or \
all([not embs.unknown_token for embs in embeddings]), \
'Either all or none of the TokenEmbeddings must have an ' \
'unknown_token set.'
new_vec_len = sum(embs.idx_to_vec.shape[1] for embs in embeddings)
# TODO(leezu): Remove once np shape is used by default
assert len(self), 'Empty vocab not yet supported'
new_idx_to_vec = nd.zeros(shape=(len(self), new_vec_len))
col_start = 0
# Concatenate all the embedding vectors in embedding.
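# Row 0 of the new matrix copies each source embedding's vector at index 0
# (conventionally its unknown-token vector); every other row is looked up by
# token, so the source embeddings need not share this vocabulary's index order.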
for embs in embeddings:
if embs and embs.idx_to_vec is not None:
col_end = col_start + embs.idx_to_vec.shape[1]
# Concatenate vectors of the unknown token.
new_idx_to_vec[0, col_start:col_end] = embs.idx_to_vec[0]
new_idx_to_vec[1:, col_start:col_end] = embs[self._idx_to_token[1:]]
col_start = col_end
self._embedding = emb.TokenEmbedding(self.unknown_token,
init_unknown_vec=None,
allow_extend=False,
idx_to_token=self.idx_to_token,
idx_to_vec=new_idx_to_vec)
def to_tokens(self, indices):
"""Converts token indices to tokens according to the vocabulary.
Parameters
----------
indices : int or list of ints
A source token index or token indices to be converted.
Returns
-------
str or list of strs
A token or a list of tokens according to the vocabulary.
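Examples
--------
A minimal sketch, assuming a vocabulary built with the default special
tokens; the example tokens are arbitrary:
>>> v = gluonnlp.Vocab(gluonnlp.data.count_tokens(['a', 'b', 'b']))
>>> v.to_tokens(4)
'b'
>>> v.to_tokens([5, 4])
['a', 'b']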
"""
to_reduce = False
if not isinstance(indices, (list, tuple)):
indices = [indices]
to_reduce = True
max_idx = len(self._idx_to_token) - 1
tokens = []
for idx in indices:
if not isinstance(idx, int) or idx > max_idx:
raise ValueError('Token index {} in the provided `indices` is invalid.'.format(idx))
tokens.append(self._idx_to_token[idx])
return tokens[0] if to_reduce else tokens
def to_indices(self, tokens):
"""Looks up indices of text tokens according to the vocabulary.
Parameters
----------
tokens : str or list of strs
A source token or tokens to be converted.
Returns
-------
int or list of ints
A token index or a list of token indices according to the vocabulary.
"""
return self[tokens]
def __call__(self, tokens):
"""Looks up indices of text tokens according to the vocabulary.
Parameters
----------
tokens : str or list of strs
A source token or tokens to be converted.
Returns
-------
int or list of ints
A token index or a list of token indices according to the vocabulary.
"""
return self[tokens]
def __repr__(self):
unk = '"{}"'.format(self._unknown_token) if self._unknown_token else 'None'
reserved = '"{}"'.format(self._reserved_tokens) if self._reserved_tokens else 'None'
return 'Vocab(size={}, unk={}, reserved={})'.format(len(self), unk, reserved)
def to_json(self):
"""Serialize Vocab object to json string.
This method does not serialize the underlying embedding.
"""
if self._embedding:
warnings.warn('Serialization of attached embedding '
'to json is not supported. '
'You may serialize the embedding to a binary format '
'separately using vocab.embedding.serialize')
vocab_dict = {}
vocab_dict['idx_to_token'] = self._idx_to_token
vocab_dict['token_to_idx'] = dict(self._token_to_idx)
vocab_dict['reserved_tokens'] = self._reserved_tokens
vocab_dict['unknown_token'] = self._unknown_token
vocab_dict['identifiers_to_tokens'] = self._identifiers_to_tokens
return json.dumps(vocab_dict)
@classmethod
def from_json(cls, json_str):
"""Deserialize Vocab object from json string.
Parameters
----------
json_str : str
Serialized json string of a Vocab object.
Returns
-------
Vocab
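Examples
--------
A minimal round-trip sketch; the vocabulary contents are arbitrary:
>>> v = gluonnlp.Vocab(gluonnlp.data.count_tokens(['a', 'b', 'b']))
>>> v2 = gluonnlp.Vocab.from_json(v.to_json())
>>> v2.idx_to_token == v.idx_to_token
True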
"""
vocab_dict = json.loads(json_str)
token_to_idx = vocab_dict.get('token_to_idx')
unknown_token = vocab_dict.get('unknown_token')
reserved_tokens = vocab_dict.get('reserved_tokens')
identifiers_to_tokens = vocab_dict.get('identifiers_to_tokens', dict())
special_tokens = {unknown_token}
# Backward compatibility for explicit serialization of padding_token,
# bos_token, eos_token handling in the json string as done in older
# versions of GluonNLP.
deprecated_arguments = ['padding_token', 'bos_token', 'eos_token']
for token_name in deprecated_arguments:
if token_name in vocab_dict:
token = vocab_dict[token_name]
assert token_name not in identifiers_to_tokens, 'Invalid json string. ' \
'{} was serialized twice.'.format(token_name)
identifiers_to_tokens[token_name] = token
# Separate reserved from special tokens
special_tokens.update(identifiers_to_tokens.values())
if reserved_tokens is not None:
reserved_tokens = [
t for t in reserved_tokens if t not in special_tokens
]
# Backward compatibility code to deserialize corrupted vocabularies
# created without bugfix https://github.com/dmlc/gluon-nlp/pull/749
corrected_token_to_idx = collections.defaultdict(list)
idx_to_token = vocab_dict.get('idx_to_token')
if len(idx_to_token) > len(token_to_idx): # Index is corrupt
warnings.warn(
'Detected a corrupted index in the deserialized vocabulary. '
'For versions before GluonNLP v0.7 the index is corrupted '
'by specifying the same token for different special purposes, '
'for example eos_token == padding_token. '
'Deserializing the vocabulary nevertheless.'
)
for token, count in collections.Counter(idx_to_token).items():
if count == 1:
continue
# Introduce new tokens to avoid invalid duplicates
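# The occurrence recorded in token_to_idx keeps the real token; every other
# occurrence is temporarily mapped to a unique placeholder so that the
# constructor below receives a one-to-one token/index mapping. The placeholders
# are swapped back for the real token after construction.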
idx = -1
while count > 0:
count -= 1
idx = idx_to_token.index(token, idx + 1)
if idx == token_to_idx[token]:
# Valid idx
continue
# Introduce temporary token
token_to_idx.update({str(uuid.uuid4()): idx})
corrected_token_to_idx[token].append(idx)
vocab = cls(
counter=count_tokens(token_to_idx.keys()),
unknown_token=unknown_token,
reserved_tokens=reserved_tokens,
token_to_idx=token_to_idx,
**identifiers_to_tokens)
# Backward compatibility code to deserialize corrupted vocabularies
# created without bugfix https://github.com/dmlc/gluon-nlp/pull/749
for token, corrected_idxs in corrected_token_to_idx.items():
for idx in corrected_idxs:
# delete temporary tokens
del vocab._token_to_idx[vocab._idx_to_token[idx]]
vocab._idx_to_token[idx] = token
return vocab