# Source code for gluonnlp.data.sentiment
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=
"""Sentiment analysis datasets."""
__all__ = ['IMDB', 'MR', 'TREC', 'SUBJ', 'SST_1', 'SST_2', 'CR', 'MPQA']
import json
import os
import shutil
import zipfile
from mxnet.gluon.data import SimpleDataset
from mxnet.gluon.utils import download, check_sha1, _get_repo_file_url
from .registry import register
from ..base import get_home_dir
class SentimentDataset(SimpleDataset):
    """Base class for sentiment analysis data sets.

    Subclasses provide ``_data_file()``, a mapping from segment name to
    ``((archive_name, archive_sha1), (data_name, data_sha1))``, and
    ``_repo_dir()``, the repository directory the archive is fetched from.

    Parameters
    ----------
    segment : str
        Dataset segment.
    root : str
        Path to temp folder for storing data.
    """

    def __init__(self, segment, root):
        root = os.path.expanduser(root)
        os.makedirs(root, exist_ok=True)
        self._root = root
        self._segment = segment
        # The data file must be on disk before it can be parsed.
        self._get_data()
        super().__init__(self._read_data())

    def _get_data(self):
        """Make sure the data file of the selected segment is present in ``root``.

        Returns immediately when the data file exists and matches its SHA-1
        hash. Otherwise the archive is requested through ``download`` and every
        file it contains is written directly into ``root``.
        """
        (archive_name, archive_hash), (data_name, data_hash) = \
            self._data_file()[self._segment]
        data_path = os.path.join(self._root, data_name)
        if os.path.exists(data_path) and check_sha1(data_path, data_hash):
            return
        archive_path = download(_get_repo_file_url(self._repo_dir(), archive_name),
                                path=self._root, sha1_hash=archive_hash)
        with zipfile.ZipFile(archive_path, 'r') as zf:
            for member in zf.namelist():
                # Keep only the final path component: directory entries give an
                # empty name and are skipped, files are stored flat in ``root``.
                filename = os.path.basename(member)
                if not filename:
                    continue
                dest = os.path.join(self._root, filename)
                with zf.open(member) as source, open(dest, 'wb') as target:
                    shutil.copyfileobj(source, target)

    def _read_data(self):
        """Parse the JSON data file of the selected segment and return its content."""
        _, (data_file_name, _) = self._data_file()[self._segment]
        # JSON text is UTF-8 encoded; do not rely on the platform default encoding.
        with open(os.path.join(self._root, data_file_name), encoding='utf-8') as f:
            return json.load(f)

    def _data_file(self):
        """Return the segment -> ((archive, sha1), (data file, sha1)) mapping."""
        raise NotImplementedError

    def _repo_dir(self):
        """Return the repository directory that holds the archives."""
        raise NotImplementedError
@register(segment=['train', 'test', 'unsup'])
class IMDB(SimpleDataset):
    """IMDB reviews for sentiment analysis.

    From
    http://ai.stanford.edu/~amaas/data/sentiment/

    Positive classes have label values in [7, 10]. Negative classes have label values in [1, 4].
    All samples in unsupervised set have labels with value 0.

    Parameters
    ----------
    segment : str, default 'train'
        Dataset segment. Options are 'train', 'test', and 'unsup' for unsupervised.
    root : str, default '$MXNET_HOME/datasets/imdb'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> imdb = gluonnlp.data.IMDB('test', root='./datasets/imdb')
    -etc-
    >>> len(imdb)
    25000
    >>> len(imdb[0])
    2
    >>> type(imdb[0][0]), type(imdb[0][1])
    (<class 'str'>, <class 'int'>)
    >>> imdb[0][0][:75]
    'I went and saw this movie last night after being coaxed to by a few friends'
    >>> imdb[0][1]
    10
    >>> imdb = gluonnlp.data.IMDB('unsup', root='./datasets/imdb')
    -etc-
    >>> len(imdb)
    50000
    >>> len(imdb[0])
    2
    >>> type(imdb[0][0]), type(imdb[0][1])
    (<class 'str'>, <class 'int'>)
    >>> imdb[0][0][:70]
    'I admit, the great majority of films released before say 1933 are just'
    >>> imdb[0][1]
    0
    """

    def __init__(self, segment='train', root=os.path.join(get_home_dir(), 'datasets', 'imdb')):
        # segment -> (data file name, SHA-1 hash of that file)
        self._data_file = {'train': ('train.json',
                                     '516a0ba06bca4e32ee11da2e129f4f871dff85dc'),
                           'test': ('test.json',
                                    '7d59bd8899841afdc1c75242815260467495b64a'),
                           'unsup': ('unsup.json',
                                     'f908a632b7e7d7ecf113f74c968ef03fadfc3c6c')}
        root = os.path.expanduser(root)
        os.makedirs(root, exist_ok=True)
        self._root = root
        self._segment = segment
        # The data file must be on disk before it can be parsed.
        self._get_data()
        super().__init__(self._read_data())

    def _get_data(self):
        """Request the segment's JSON file unless a copy with a matching hash exists."""
        data_file_name, data_hash = self._data_file[self._segment]
        path = os.path.join(self._root, data_file_name)
        if os.path.exists(path) and check_sha1(path, data_hash):
            return
        download(_get_repo_file_url('gluon/dataset/imdb', data_file_name),
                 path=self._root, sha1_hash=data_hash)

    def _read_data(self):
        """Parse the JSON file of the selected segment and return its content."""
        # Use the same file-name table as ``_get_data`` so both always agree.
        data_file_name, _ = self._data_file[self._segment]
        # JSON text is UTF-8 encoded; do not rely on the platform default encoding.
        with open(os.path.join(self._root, data_file_name), encoding='utf-8') as f:
            return json.load(f)
@register()
class MR(SentimentDataset):
    """Movie reviews for sentiment analysis.

    From
    https://www.cs.cornell.edu/people/pabo/movie-review-data/

    Positive class has label value 1. Negative class has label value 0.

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/mr'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> mr = gluonnlp.data.MR(root='./datasets/mr')
    -etc-
    >>> len(mr)
    10662
    >>> len(mr[3])
    2
    >>> type(mr[3][0]), type(mr[3][1])
    (<class 'str'>, <class 'int'>)
    >>> mr[3][0][:55]
    'if you sometimes like to go to the movies to have fun ,'
    >>> mr[3][1]
    1
    """

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets', 'mr')):
        # The data set ships as a single segment named 'all'.
        super().__init__('all', root)

    def _data_file(self):
        """Return the segment -> ((archive, sha1), (data file, sha1)) mapping."""
        # The hashes must be exact 40-character hex digests: stray whitespace
        # would make the hash comparison fail on every run.
        return {'all': (('all-7606efec.zip', '0fcbaffe0bac94733e6497f700196585f03fa89e'),
                        ('all-7606efec.json', '7606efec578d9613f5c38bf2cef8d3e4e6575b2c'))}

    def _repo_dir(self):
        """Return the repository directory that holds the MR archive."""
        return 'gluon/dataset/mr'
@register(segment=['train', 'test'])
class TREC(SentimentDataset):
    """Question dataset for question classification.

    From
    http://cogcomp.cs.illinois.edu/Data/QA/QC/

    Class labels are (http://cogcomp.org/Data/QA/QC/definition.html):

    - DESCRIPTION: 0
    - ENTITY: 1
    - ABBREVIATION: 2
    - HUMAN: 3
    - LOCATION: 4
    - NUMERIC: 5

    The first space-separated token in the text of each sample is the fine-grain label.

    Parameters
    ----------
    segment : str, default 'train'
        Dataset segment. Options are 'train' and 'test'.
    root : str, default '$MXNET_HOME/datasets/trec'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> trec = gluonnlp.data.TREC('test', root='./datasets/trec')
    -etc-
    >>> len(trec)
    500
    >>> len(trec[0])
    2
    >>> type(trec[0][0]), type(trec[0][1])
    (<class 'str'>, <class 'int'>)
    >>> trec[0][0]
    'How far is it from Denver to Aspen ?'
    >>> (trec[0][1], trec[0][0].split()[0])
    (5, 'How')
    """

    def __init__(self, segment='train', root=os.path.join(get_home_dir(), 'datasets', 'trec')):
        super().__init__(segment, root)

    def _data_file(self):
        """Return the segment -> ((archive, sha1), (data file, sha1)) mapping."""
        train_files = (('train-1776132f.zip', '337d3f43a56ec26f5773c6fc406ef19fb4cd3c92'),
                       ('train-1776132f.json', '1776132fb2fc0ed2dc91b62f7817a4e071a3c7de'))
        test_files = (('test-ff9ad0ce.zip', '57f03aaee2651ca05f1f9fc5731ba7e9ad98e38a'),
                      ('test-ff9ad0ce.json', 'ff9ad0ceb44d8904663fee561804a8dd0edc1b15'))
        return {'train': train_files, 'test': test_files}

    def _repo_dir(self):
        """Return the repository directory that holds the TREC archives."""
        return 'gluon/dataset/trec'
@register()
class SUBJ(SentimentDataset):
    """Subjectivity dataset for sentiment analysis.

    Positive class has label value 1. Negative class has label value 0.

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/subj'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> subj = gluonnlp.data.SUBJ(root='./datasets/subj')
    -etc-
    >>> len(subj)
    10000
    >>> len(subj[0])
    2
    >>> type(subj[0][0]), type(subj[0][1])
    (<class 'str'>, <class 'int'>)
    >>> subj[0][0][:60]
    'its impressive images of crematorium chimney fires and stack'
    >>> subj[0][1]
    1
    """

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets', 'subj')):
        # The data set ships as a single segment named 'all'.
        super().__init__('all', root)

    def _data_file(self):
        """Return the segment -> ((archive, sha1), (data file, sha1)) mapping."""
        archive = ('all-9e7bd1da.zip', '8b0d95c2fc885cc38e4ad776d7429183f3ef632b')
        data = ('all-9e7bd1da.json', '9e7bd1daa359c24abe1fac767d0e0af7bc114045')
        return {'all': (archive, data)}

    def _repo_dir(self):
        """Return the repository directory that holds the SUBJ archive."""
        return 'gluon/dataset/subj'
@register(segment=['train', 'dev', 'test'])
class SST_1(SentimentDataset):
    """Stanford Sentiment Treebank: an extension of the MR data set.

    However, train/dev/test splits are provided and labels are fine-grained
    (very positive, positive, neutral, negative, very negative).

    From
    http://nlp.stanford.edu/sentiment/

    Class labels are:

    - very positive: 4
    - positive: 3
    - neutral: 2
    - negative: 1
    - very negative: 0

    Parameters
    ----------
    segment : str, default 'train'
        Dataset segment. Options are 'train', 'dev', and 'test'.
    root : str, default '$MXNET_HOME/datasets/sst-1'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> sst_1 = gluonnlp.data.SST_1('test', root='./datasets/sst_1')
    -etc-
    >>> len(sst_1)
    2210
    >>> len(sst_1[0])
    2
    >>> type(sst_1[0][0]), type(sst_1[0][1])
    (<class 'str'>, <class 'int'>)
    >>> sst_1[0][0][:73]
    'no movement , no yuks , not much of anything .'
    >>> sst_1[0][1]
    1
    """

    def __init__(self, segment='train', root=os.path.join(get_home_dir(), 'datasets', 'sst-1')):
        super().__init__(segment, root)

    def _data_file(self):
        """Return the segment -> ((archive, sha1), (data file, sha1)) mapping."""
        return {
            'train': (('train-638f9352.zip', '0a039010449772700c0e270c7095362403dc486a'),
                      ('train-638f9352.json', '638f935251c0474e93d4aa50fda0c900faf02bba')),
            'dev': (('dev-820ac954.zip', 'e4b7899ef5d37a6bf01d8ec1115ba20b8419b96f'),
                    ('dev-820ac954.json', '820ac954b14b4f7d947e25f7a99249618d7962ee')),
            'test': (('test-ab593ae9.zip', 'd3736db56cdc7293c38435557697c2407652525d'),
                     ('test-ab593ae9.json', 'ab593ae9628f94af4f698654158ded1488b1de3b')),
        }

    def _repo_dir(self):
        """Return the repository directory that holds the SST-1 archives."""
        return 'gluon/dataset/sst-1'
@register(segment=['train', 'dev', 'test'])
class SST_2(SentimentDataset):
    """Stanford Sentiment Treebank: an extension of the MR data set.

    Same as the SST-1 data set except that neutral reviews are removed
    and labels are binary (positive, negative).

    From
    http://nlp.stanford.edu/sentiment/

    Positive class has label value 1. Negative class has label value 0.

    Parameters
    ----------
    segment : str, default 'train'
        Dataset segment. Options are 'train', 'dev', and 'test'.
    root : str, default '$MXNET_HOME/datasets/sst-2'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> sst_2 = gluonnlp.data.SST_2('test', root='./datasets/sst_2')
    -etc-
    >>> len(sst_2)
    1821
    >>> len(sst_2[0])
    2
    >>> type(sst_2[0][0]), type(sst_2[0][1])
    (<class 'str'>, <class 'int'>)
    >>> sst_2[0][0][:65]
    'no movement , no yuks , not much of anything .'
    >>> sst_2[0][1]
    0
    """

    def __init__(self, segment='train', root=os.path.join(get_home_dir(), 'datasets', 'sst-2')):
        super().__init__(segment, root)

    def _data_file(self):
        """Return the segment -> ((archive, sha1), (data file, sha1)) mapping."""
        return {
            'train': (('train-61f1f238.zip', 'f27a9ac6a7c9208fb7f024b45554da95639786b3'),
                      ('train-61f1f238.json', '61f1f23888652e11fb683ac548ed0be8a87dddb1')),
            'dev': (('dev-65511587.zip', '8c74911f0246bd88dc0ced2619f95f10db09dc98'),
                    ('dev-65511587.json', '655115875d83387b61f9701498143724147a1fc9')),
            'test': (('test-a39c1db6.zip', '4b7f1648207ec5dffb4e4783cf1f48d6f36ba4db'),
                     ('test-a39c1db6.json', 'a39c1db6ecc3be20bf2563bf2440c3c06887a2df')),
        }

    def _repo_dir(self):
        """Return the repository directory that holds the SST-2 archives."""
        return 'gluon/dataset/sst-2'
@register()
class CR(SentimentDataset):
    """
    Customer reviews of various products (cameras, MP3s etc.). The task is to
    predict positive/negative reviews.

    Positive class has label value 1. Negative class has label value 0.

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/cr'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> cr = gluonnlp.data.CR(root='./datasets/cr')
    -etc-
    >>> len(cr)
    3775
    >>> len(cr[3])
    2
    >>> type(cr[3][0]), type(cr[3][1])
    (<class 'str'>, <class 'int'>)
    >>> cr[3][0][:55]
    'i know the saying is " you get what you pay for " but a'
    >>> cr[3][1]
    0
    """

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets', 'cr')):
        # The data set ships as a single segment named 'all'.
        super().__init__('all', root)

    def _data_file(self):
        """Return the segment -> ((archive, sha1), (data file, sha1)) mapping."""
        archive = ('all-0c9633c6.zip', 'c662e2f9115d74e1fcc7c896fa3e2dc5ee7688e7')
        data = ('all-0c9633c6.json', '0c9633c695d29b18730eddff965c850425996edf')
        return {'all': (archive, data)}

    def _repo_dir(self):
        """Return the repository directory that holds the CR archive."""
        return 'gluon/dataset/cr'
@register()
class MPQA(SentimentDataset):
    """
    Opinion polarity detection subtask of the MPQA dataset.

    From
    http://www.cs.pitt.edu/mpqa/

    Positive class has label value 1. Negative class has label value 0.

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/mpqa'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> mpqa = gluonnlp.data.MPQA(root='./datasets/mpqa')
    -etc-
    >>> len(mpqa)
    10606
    >>> len(mpqa[3])
    2
    >>> type(mpqa[3][0]), type(mpqa[3][1])
    (<class 'str'>, <class 'int'>)
    >>> mpqa[3][0][:25]
    'many years of decay'
    >>> mpqa[3][1]
    0
    """

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets', 'mpqa')):
        # The data set ships as a single segment named 'all'.
        super().__init__('all', root)

    def _data_file(self):
        """Return the segment -> ((archive, sha1), (data file, sha1)) mapping."""
        archive = ('all-bcbfeed8.zip', 'e07ae226cfe4713328eeb9660b261b9852ff5865')
        data = ('all-bcbfeed8.json', 'bcbfeed8b8767a564bdc428486ef18c1ba4dc536')
        return {'all': (archive, data)}

    def _repo_dir(self):
        """Return the repository directory that holds the MPQA archive."""
        return 'gluon/dataset/mpqa'