# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=wildcard-import
"""This module includes common utilities such as data readers and counter."""
import os
from . import (batchify, candidate_sampler, conll, corpora, dataloader,
dataset, question_answering, registry, sampler, sentiment,
stream, super_glue, transforms, translation, utils,
word_embedding_evaluation, intent_slot, glue, datasetloader,
classification, baidu_ernie_data, bert, xlnet)
from .candidate_sampler import *
from .conll import *
from .glue import *
from .super_glue import *
from .corpora import *
from .dataloader import *
from .dataset import *
from .question_answering import *
from .registry import *
from .sampler import *
from .sentiment import *
from .stream import *
from .transforms import *
from .translation import *
from .utils import *
from .utils import _load_pretrained_sentencepiece_tokenizer
from .word_embedding_evaluation import *
from .intent_slot import *
from .datasetloader import *
from .classification import *
from .baidu_ernie_data import *
from .bert import *
from .xlnet import *
from ..base import get_home_dir
__all__ = (['batchify', 'get_tokenizer'] + utils.__all__ + transforms.__all__
+ sampler.__all__ + dataset.__all__ + corpora.__all__ + sentiment.__all__
+ word_embedding_evaluation.__all__ + stream.__all__ + conll.__all__
+ translation.__all__ + registry.__all__ + question_answering.__all__
+ dataloader.__all__ + candidate_sampler.__all__ + intent_slot.__all__
+ glue.__all__ + super_glue.__all__ + classification.__all__
+ baidu_ernie_data.__all__ + datasetloader.__all__
+ bert.__all__ + xlnet.__all__) # pytype: disable=attribute-error
def get_tokenizer(model_name, dataset_name,
vocab=None, root=os.path.join(get_home_dir(), 'data'),
**kwargs):
"""Returns a pre-defined tokenizer by name.
Parameters
----------
model_name : str
Options include 'bert_24_1024_16', 'bert_12_768_12', 'roberta_12_768_12',
'roberta_24_1024_16' and 'ernie_12_768_12'
dataset_name : str
The supported datasets for model_name of either bert_24_1024_16 or
bert_12_768_12 are 'book_corpus_wiki_en_cased' and
'book_corpus_wiki_en_uncased'.
For model_name bert_12_768_12, 'openwebtext_book_corpus_wiki_en_uncased',
'wiki_cn_cased', 'wiki_multilingual_uncased', 'wiki_multilingual_cased',
'scibert_scivocab_uncased', 'scibert_scivocab_cased',
'scibert_basevocab_uncased', 'scibert_basevocab_cased',
'biobert_v1.0_pmc_cased', 'biobert_v1.0_pubmed_cased',
'biobert_v1.0_pubmed_pmc_cased', 'biobert_v1.1_pubmed_cased',
'clinicalbert_uncased' and 'kobert_news_wiki_ko_cased'
are additionally supported.
For model_name roberta_12_768_12 or roberta_24_1024_16,
'openwebtext_ccnews_stories_books_cased' is supported.
For model_name ernie_12_768_12, 'baidu_ernie_uncased' is supported.
vocab : gluonnlp.vocab.BERTVocab or None, default None
Vocabulary for the dataset. Must be provided if tokenizer is based on
vocab.
root : str, default '$MXNET_HOME/data' where MXNET_HOME defaults to '~/.mxnet'
Location for keeping the model parameters.

Returns
-------
gluonnlp.data.BERTTokenizer or gluonnlp.data.GPT2BPETokenizer or
gluonnlp.data.SentencepieceTokenizer

Examples
--------
>>> model_name = 'bert_12_768_12'
>>> dataset_name = 'book_corpus_wiki_en_uncased'
>>> _, vocab = gluonnlp.model.get_model(model_name,
... dataset_name=dataset_name,
... pretrained=False, root='./model')
-etc-
>>> tokenizer = gluonnlp.data.get_tokenizer(model_name, dataset_name, vocab)
>>> tokenizer('Habit is second nature.')
['habit', 'is', 'second', 'nature', '.']
"""
model_name, dataset_name = model_name.lower(), dataset_name.lower()
model_dataset_name = '_'.join([model_name, dataset_name])
model_dataset_names = {'roberta_12_768_12_openwebtext_ccnews_stories_books_cased':
[GPT2BPETokenizer, {'lower': False}],
'roberta_24_1024_16_openwebtext_ccnews_stories_books_cased':
[GPT2BPETokenizer, {'lower': False}],
'bert_12_768_12_book_corpus_wiki_en_cased':
[BERTTokenizer, {'lower': False}],
'bert_12_768_12_book_corpus_wiki_en_uncased':
[BERTTokenizer, {'lower': True}],
'bert_12_768_12_openwebtext_book_corpus_wiki_en_uncased':
[BERTTokenizer, {'lower': True}],
'bert_12_768_12_wiki_multilingual_uncased':
[BERTTokenizer, {'lower': True}],
'bert_12_768_12_wiki_multilingual_cased':
[BERTTokenizer, {'lower': False}],
'bert_12_768_12_wiki_cn_cased':
[BERTTokenizer, {'lower': False}],
'bert_24_1024_16_book_corpus_wiki_en_cased':
[BERTTokenizer, {'lower': False}],
'bert_24_1024_16_book_corpus_wiki_en_uncased':
[BERTTokenizer, {'lower': True}],
'bert_12_768_12_scibert_scivocab_uncased':
[BERTTokenizer, {'lower': True}],
'bert_12_768_12_scibert_scivocab_cased':
[BERTTokenizer, {'lower': False}],
'bert_12_768_12_scibert_basevocab_uncased':
[BERTTokenizer, {'lower': True}],
'bert_12_768_12_scibert_basevocab_cased':
[BERTTokenizer, {'lower': False}],
'bert_12_768_12_biobert_v1.0_pmc_cased':
[BERTTokenizer, {'lower': False}],
'bert_12_768_12_biobert_v1.0_pubmed_cased':
[BERTTokenizer, {'lower': False}],
'bert_12_768_12_biobert_v1.0_pubmed_pmc_cased':
[BERTTokenizer, {'lower': False}],
'bert_12_768_12_biobert_v1.1_pubmed_cased':
[BERTTokenizer, {'lower': False}],
'bert_12_768_12_clinicalbert_uncased':
[BERTTokenizer, {'lower': True}],
'bert_12_768_12_kobert_news_wiki_ko_cased':
[_load_pretrained_sentencepiece_tokenizer, {'num_best': 0, 'alpha': 1.0}],
'ernie_12_768_12_baidu_ernie_uncased':
[BERTTokenizer, {'lower': True}]}
if model_dataset_name not in model_dataset_names:
raise ValueError(
'Model name %s is not supported. Available options are\n\t%s'%(
model_dataset_name, '\n\t'.join(sorted(model_dataset_names.keys()))))
tokenizer_cls, extra_args = model_dataset_names[model_dataset_name]
kwargs = {**extra_args, **kwargs}
if tokenizer_cls is BERTTokenizer:
assert vocab is not None, 'Must specify vocab if loading BERTTokenizer'
return tokenizer_cls(vocab, **kwargs)
elif tokenizer_cls is GPT2BPETokenizer:
return tokenizer_cls(root=root)
elif tokenizer_cls is _load_pretrained_sentencepiece_tokenizer:
return tokenizer_cls(dataset_name, root, **kwargs)
else:
raise ValueError('Could not get any matched tokenizer interface.')
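# Usage sketch (illustrative, not part of the module): RoBERTa model names map to
# GPT2BPETokenizer in `model_dataset_names`, and that branch is constructed as
# `tokenizer_cls(root=root)` above, so unlike the BERTTokenizer branch no `vocab`
# argument is required. Mirroring the docstring example:
#
#     >>> tokenizer = get_tokenizer('roberta_12_768_12',
#     ...                           'openwebtext_ccnews_stories_books_cased')
#     >>> tokenizer('Habit is second nature.')  # returns a list of BPE sub-word tokens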