Source code for gluonnlp.data.corpora.wikitext
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=
"""WikiText corpora."""
__all__ = ['WikiText2', 'WikiText103', 'WikiText2Raw', 'WikiText103Raw']

import os
import shutil
import zipfile

from mxnet.gluon.utils import _get_repo_file_url, check_sha1, download

from ... import _constants as C
from ..dataset import CorpusDataset
from ..registry import register
from ...base import get_home_dir


class _WikiText(CorpusDataset):
def __init__(self, namespace, segment, bos, eos, flatten, skip_empty, root,
**kwargs):
root = os.path.expanduser(root)
os.makedirs(root, exist_ok=True)
self._root = root
self._namespace = 'gluon/dataset/{}'.format(namespace)
self._segment = segment
super(_WikiText, self).__init__(
self._get_data(),
bos=bos,
eos=eos,
flatten=flatten,
skip_empty=skip_empty,
**kwargs)
def _get_data(self):
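        """Return the path to this segment's extracted data file, downloading it if needed.

        The archive is fetched from the Gluon dataset repo only when the extracted file
        for the requested segment is missing or fails its SHA-1 check; all members of the
        zip archive are then extracted into the root directory.
        """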
archive_file_name, archive_hash = self._archive_file
data_file_name, data_hash = self._data_file[self._segment]
root = self._root
path = os.path.join(root, data_file_name)
if not os.path.exists(path) or not check_sha1(path, data_hash):
downloaded_file_path = download(_get_repo_file_url(self._namespace, archive_file_name),
path=root,
sha1_hash=archive_hash)
with zipfile.ZipFile(downloaded_file_path, 'r') as zf:
for member in zf.namelist():
filename = os.path.basename(member)
if filename:
dest = os.path.join(root, filename)
with zf.open(member) as source, \
open(dest, 'wb') as target:
shutil.copyfileobj(source, target)
return path


@register(segment=['train', 'val', 'test'])
class WikiText2(_WikiText):
"""WikiText-2 word-level dataset for language modeling, from Salesforce research.
WikiText2 is implemented as CorpusDataset with the default flatten=True.
From
https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
License: Creative Commons Attribution-ShareAlike
Parameters
----------
segment : {'train', 'val', 'test'}, default 'train'
Dataset segment.
flatten : bool, default True
Whether to return all samples as flattened tokens. If True, each sample is a token.
skip_empty : bool, default True
        Whether to skip the empty samples produced by the sample splitter. If False,
        `bos` and `eos` will be added to empty samples.
tokenizer : function, default str.split
        A function that splits each sample string into a list of tokens.
bos : str or None, default None
The token to add at the beginning of each sentence. If None, nothing is added.
eos : str or None, default '<eos>'
The token to add at the end of each sentence. If None, nothing is added.
root : str, default '$MXNET_HOME/datasets/wikitext-2'
Path to temp folder for storing data.
MXNET_HOME defaults to '~/.mxnet'.
Examples
--------
>>> wikitext2 = gluonnlp.data.WikiText2('val', root='./datasets/wikitext2')
-etc-
>>> len(wikitext2)
216347
>>> wikitext2[0]
'='
>>> wikitext2 = gluonnlp.data.WikiText2('val', flatten=False,
... root='./datasets/wikitext2')
>>> len(wikitext2)
2461
>>> wikitext2[0]
['=', 'Homarus', 'gammarus', '=', '<eos>']
>>> wikitext2 = gluonnlp.data.WikiText2('val', flatten=False, bos='<bos>', eos=None,
... root='./datasets/wikitext2')
>>> wikitext2[0]
['<bos>', '=', 'Homarus', 'gammarus', '=']
>>> wikitext2 = gluonnlp.data.WikiText2('val', flatten=False, bos='<bos>', eos=None,
... skip_empty=False, root='./datasets/wikitext2')
>>> len(wikitext2)
3760
>>> wikitext2[0]
['<bos>']
"""
def __init__(self,
segment='train',
flatten=True,
skip_empty=True,
tokenizer=lambda s: s.split(),
bos=None,
eos=C.EOS_TOKEN,
root=os.path.join(get_home_dir(), 'datasets', 'wikitext-2'),
**kwargs):
self._archive_file = ('wikitext-2-v1.zip',
'3c914d17d80b1459be871a5039ac23e752a53cbe')
self._data_file = {
'train': ('wiki.train.tokens',
'863f29c46ef9d167fff4940ec821195882fe29d1'),
'val': ('wiki.valid.tokens',
'0418625c8b4da6e4b5c7a0b9e78d4ae8f7ee5422'),
'test': ('wiki.test.tokens',
'c7b8ce0aa086fb34dab808c5c49224211eb2b172')
}
super(WikiText2, self).__init__(
'wikitext-2',
segment=segment,
bos=bos,
eos=eos,
flatten=flatten,
skip_empty=skip_empty,
root=root,
tokenizer=tokenizer,
**kwargs)
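

# --- Illustrative usage sketch (not part of the original module) -----------------
# A minimal example of the typical word-level workflow on top of WikiText2. It
# assumes the public GluonNLP API exposes `gluonnlp.data.count_tokens` and
# `gluonnlp.Vocab`; with the default flatten=True the dataset is a single stream of
# tokens, so a vocabulary can be counted directly over it.
def _example_wikitext2_vocab():
    import gluonnlp as nlp
    val = WikiText2(segment='val')                 # flattened stream of word tokens
    counter = nlp.data.count_tokens(list(val))     # token -> frequency
    vocab = nlp.Vocab(counter)                     # adds special tokens such as '<unk>'
    indices = vocab[list(val)]                     # map every token to its index
    return vocab, indices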


@register(segment=['train', 'val', 'test'])
class WikiText103(_WikiText):
"""WikiText-103 word-level dataset for language modeling, from Salesforce research.
WikiText103 is implemented as CorpusDataset with the default flatten=True.
From
https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
License: Creative Commons Attribution-ShareAlike
Parameters
----------
segment : {'train', 'val', 'test'}, default 'train'
Dataset segment.
flatten : bool, default True
Whether to return all samples as flattened tokens. If True, each sample is a token.
skip_empty : bool, default True
        Whether to skip the empty samples produced by the sample splitter. If False,
        `bos` and `eos` will be added to empty samples.
tokenizer : function, default str.split
        A function that splits each sample string into a list of tokens.
bos : str or None, default None
The token to add at the beginning of each sentence. If None, nothing is added.
eos : str or None, default '<eos>'
The token to add at the end of each sentence. If None, nothing is added.
root : str, default '$MXNET_HOME/datasets/wikitext-103'
Path to temp folder for storing data.
MXNET_HOME defaults to '~/.mxnet'.
Examples
--------
>>> wikitext103 = gluonnlp.data.WikiText103('val', root='./datasets/wikitext103')
-etc-
>>> len(wikitext103)
216347
>>> wikitext103[0]
'='
>>> wikitext103 = gluonnlp.data.WikiText103('val', flatten=False,
... root='./datasets/wikitext103')
>>> len(wikitext103)
2461
>>> wikitext103[0]
['=', 'Homarus', 'gammarus', '=', '<eos>']
>>> wikitext103 = gluonnlp.data.WikiText103('val', flatten=False, bos='<bos>', eos=None,
... root='./datasets/wikitext103')
>>> wikitext103[0]
['<bos>', '=', 'Homarus', 'gammarus', '=']
>>> wikitext103 = gluonnlp.data.WikiText103('val', flatten=False, bos='<bos>', eos=None,
... skip_empty=False, root='./datasets/wikitext103')
>>> len(wikitext103)
3760
>>> wikitext103[0]
['<bos>']
"""
def __init__(self,
segment='train',
flatten=True,
skip_empty=True,
tokenizer=lambda s: s.split(),
bos=None,
eos=C.EOS_TOKEN,
root=os.path.join(get_home_dir(), 'datasets',
'wikitext-103'),
**kwargs):
self._archive_file = ('wikitext-103-v1.zip',
'0aec09a7537b58d4bb65362fee27650eeaba625a')
self._data_file = {
'train': ('wiki.train.tokens',
'b7497e2dfe77e72cfef5e3dbc61b7b53712ac211'),
'val': ('wiki.valid.tokens',
'c326ac59dc587676d58c422eb8a03e119582f92b'),
'test': ('wiki.test.tokens',
'8a5befc548865cec54ed4273cf87dbbad60d1e47')
}
super(WikiText103, self).__init__(
'wikitext-103',
segment=segment,
bos=bos,
eos=eos,
flatten=flatten,
skip_empty=skip_empty,
root=root,
tokenizer=tokenizer,
**kwargs)
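

# --- Illustrative usage sketch (not part of the original module) -----------------
# A rough sketch of preparing WikiText103 for truncated-BPTT language-model training.
# It assumes `gluonnlp.data.batchify.CorpusBPTTBatchify` is available with the
# (vocab, seq_len, batch_size, last_batch) signature used in the GluonNLP training
# scripts; adapt if your version differs.
def _example_wikitext103_bptt(seq_len=35, batch_size=20):
    import gluonnlp as nlp
    train = WikiText103(segment='train')           # flattened token stream
    vocab = nlp.Vocab(nlp.data.count_tokens(list(train)))
    bptt_batchify = nlp.data.batchify.CorpusBPTTBatchify(
        vocab, seq_len, batch_size, last_batch='discard')
    train_batches = bptt_batchify(train)           # dataset of (data, target) pairs
    return vocab, train_batches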


@register(segment=['train', 'val', 'test'])
class WikiText2Raw(_WikiText):
"""WikiText-2 character-level dataset for language modeling
WikiText2Raw is implemented as CorpusDataset with the default flatten=True.
From Salesforce research:
https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
License: Creative Commons Attribution-ShareAlike
Parameters
----------
segment : {'train', 'val', 'test'}, default 'train'
Dataset segment.
flatten : bool, default True
Whether to return all samples as flattened tokens. If True, each sample is a token.
skip_empty : bool, default True
        Whether to skip the empty samples produced by the sample splitter. If False,
        `bos` and `eos` will be added to empty samples.
    tokenizer : function or None, default lambda s: s.encode('utf-8')
        A function that splits each sample string into a list of tokens.
        The tokenizer can also be used to convert everything to lowercase,
        e.g. with tokenizer=lambda s: s.lower().encode('utf-8').
bos : str or None, default None
The token to add at the beginning of each sentence. If None, nothing is added.
    eos : str or None, default None
The token to add at the end of each sentence. If None, nothing is added.
root : str, default '$MXNET_HOME/datasets/wikitext-2'
Path to temp folder for storing data.
MXNET_HOME defaults to '~/.mxnet'.
Examples
--------
>>> wikitext2 = gluonnlp.data.WikiText2Raw('val', root='./datasets/wikitext2')
-etc-
>>> len(wikitext2)
1136862
>>> wikitext2[0]
61
>>> type(wikitext2[0])
<class 'int'>
>>> wikitext2 = gluonnlp.data.WikiText2Raw('val', flatten=False,
... tokenizer=None, root='./datasets/wikitext2')
>>> len(wikitext2)
2461
>>> wikitext2[0]
'= Homarus gammarus ='
>>> wikitext2 = gluonnlp.data.WikiText2Raw('val', flatten=False, bos='<bos>', eos=None,
... tokenizer=lambda s: s.split(),
... root='./datasets/wikitext2')
>>> wikitext2[0]
['<bos>', '=', 'Homarus', 'gammarus', '=']
"""
def __init__(self,
segment='train',
flatten=True,
skip_empty=True,
bos=None,
eos=None,
tokenizer=lambda s: s.encode('utf-8'),
root=os.path.join(get_home_dir(), 'datasets', 'wikitext-2'),
**kwargs):
self._archive_file = ('wikitext-2-raw-v1.zip',
'3b6993c138fc61c95f7fffd900fef68f8411371d')
self._data_file = {
'train': ('wiki.train.raw',
'd33faf256327882db0edc7c67cd098d1051a2112'),
'val': ('wiki.valid.raw',
'db78d4db83700cba1b1bf4a9381087043db2876d'),
'test': ('wiki.test.raw',
'6f1fe2054a940eebfc76b284b09680763b37f5ea')
}
super(WikiText2Raw, self).__init__(
'wikitext-2',
segment=segment,
bos=bos,
eos=eos,
flatten=flatten,
skip_empty=skip_empty,
root=root,
tokenizer=tokenizer,
**kwargs)
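

# --- Illustrative usage sketch (not part of the original module) -----------------
# With the default tokenizer `lambda s: s.encode('utf-8')` and flatten=True, each
# sample of WikiText2Raw is a single utf-8 byte value (an int, as the doctest above
# shows), so byte-level modelling needs at most 256 symbols. This sketch round-trips
# a few bytes back to text to make that concrete.
def _example_wikitext2_raw_bytes(n=20):
    val = WikiText2Raw(segment='val')
    first_bytes = [val[i] for i in range(n)]       # e.g. [61, 32, 72, ...]
    text = bytes(first_bytes).decode('utf-8', errors='replace')
    return first_bytes, text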


@register(segment=['train', 'val', 'test'])
class WikiText103Raw(_WikiText):
"""WikiText-103 character-level dataset for language modeling
WikiText103Raw is implemented as CorpusDataset with the default flatten=True.
From Salesforce research:
https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
License: Creative Commons Attribution-ShareAlike
Parameters
----------
segment : {'train', 'val', 'test'}, default 'train'
Dataset segment.
flatten : bool, default True
Whether to return all samples as flattened tokens. If True, each sample is a token.
skip_empty : bool, default True
        Whether to skip the empty samples produced by the sample splitter. If False,
        `bos` and `eos` will be added to empty samples.
    tokenizer : function or None, default lambda s: s.encode('utf-8')
        A function that splits each sample string into a list of tokens.
        The tokenizer can also be used to convert everything to lowercase,
        e.g. with tokenizer=lambda s: s.lower().encode('utf-8').
bos : str or None, default None
The token to add at the beginning of each sentence. If None, nothing is added.
    eos : str or None, default None
The token to add at the end of each sentence. If None, nothing is added.
root : str, default '$MXNET_HOME/datasets/wikitext-103'
Path to temp folder for storing data.
MXNET_HOME defaults to '~/.mxnet'.
Examples
--------
>>> wikitext103 = gluonnlp.data.WikiText103Raw('val', root='./datasets/wikitext103')
-etc-
>>> len(wikitext103)
1136862
>>> wikitext103[0]
61
>>> wikitext103 = gluonnlp.data.WikiText103Raw('val', flatten=False,
... root='./datasets/wikitext103')
>>> len(wikitext103)
2461
>>> wikitext103[0]
[61, 32, 72, 111, 109, 97, 114, 117, 115, 32, 103, 97, 109, 109, 97, 114, 117, 115, 32, 61]
>>> wikitext103 = gluonnlp.data.WikiText103Raw('val', flatten=False, tokenizer=None,
... root='./datasets/wikitext103')
>>> wikitext103[0]
'= Homarus gammarus ='
"""
def __init__(self,
segment='train',
flatten=True,
skip_empty=True,
tokenizer=lambda s: s.encode('utf-8'),
bos=None,
eos=None,
root=os.path.join(get_home_dir(), 'datasets',
'wikitext-103'),
**kwargs):
self._archive_file = ('wikitext-103-raw-v1.zip',
'86f2375181b9247049d9c9205fad2b71b274b568')
self._data_file = {
'train': ('wiki.train.raw',
'3d06627c15e834408cfee91293f862c11c1cc9ef'),
'val': ('wiki.valid.raw',
'db78d4db83700cba1b1bf4a9381087043db2876d'),
'test': ('wiki.test.raw',
'6f1fe2054a940eebfc76b284b09680763b37f5ea')
}
super(WikiText103Raw, self).__init__(
'wikitext-103',
segment=segment,
bos=bos,
eos=eos,
flatten=flatten,
skip_empty=skip_empty,
root=root,
tokenizer=tokenizer,
**kwargs)
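

# --- Illustrative usage sketch (not part of the original module) -----------------
# The Raw datasets accept any callable as `tokenizer`, so the lower-casing variant
# mentioned in the docstrings above is just a different lambda, while tokenizer=None
# keeps each line as a plain string (as in the doctests).
def _example_wikitext103_raw_tokenizers():
    lowercased = WikiText103Raw(segment='val',
                                tokenizer=lambda s: s.lower().encode('utf-8'))
    lines = WikiText103Raw(segment='val', flatten=False, tokenizer=None)
    return lowercased[0], lines[0]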