Source code for gluonnlp.model.train

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=wildcard-import
"""NLP training model."""

import mxnet as mx
import gluonnlp as nlp

from . import cache, embedding, language_model
from .cache import *
from .embedding import *
from .language_model import *

__all__ = language_model.__all__ + cache.__all__ + embedding.__all__ + ['get_cache_model']

[docs]def get_cache_model(name, dataset_name='wikitext-2', window=2000, theta=0.6, lambdas=0.2, ctx=mx.cpu(), **kwargs): r"""Returns a cache model using a pre-trained language model. We implement the neural cache language model proposed in the following work:: @article{grave2016improving, title={Improving neural language models with a continuous cache}, author={Grave, Edouard and Joulin, Armand and Usunier, Nicolas}, journal={ICLR}, year={2017} } Parameters ---------- name : str Name of the cache language model. dataset_name : str or None, default 'wikitext-2'. The dataset name on which the pre-trained model is trained. Options are 'wikitext-2'. If specified, then the returned vocabulary is extracted from the training set of the dataset. If None, then vocab is required, for specifying embedding weight size, and is directly returned. window : int Size of cache window theta : float The scala controls the flatness of the cache distribution that predict the next word as shown below: .. math:: p_{cache} \propto \sum_{i=1}^{t-1} \mathbb{1}_{w=x_{i+1}} exp(\theta {h_t}^T h_i) where :math:`p_{cache}` is the cache distribution, :math:`\mathbb{1}` is the identity function, and :math:`h_i` is the output of timestep i. lambdas : float Linear scalar between only cache and vocab distribution, the formulation is as below: .. math:: p = (1 - \lambda) p_{vocab} + \lambda p_{cache} where :math:`p_{vocab}` is the vocabulary distribution and :math:`p_{cache}` is the cache distribution. vocab : gluonnlp.Vocab or None, default None Vocabulary object to be used with the language model. Required when dataset_name is not specified. pretrained : bool, default False Whether to load the pre-trained weights for model. ctx : Context, default CPU The context in which to load the pre-trained weights. root : str, default '~/.mxnet/models' Location for keeping the pre-trained model parameters. Returns ------- Block The model. """ lm_model, vocab = nlp.model.get_model(name, dataset_name=dataset_name, pretrained=True, ctx=ctx, **kwargs) cache_cell = CacheCell(lm_model, len(vocab), window, theta, lambdas) return cache_cell