# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=too-many-lines
"""Word embedding evaluation datasets."""
import os
import tarfile
import zipfile
from mxnet.gluon.data.dataset import SimpleDataset
from mxnet.gluon.utils import check_sha1, _get_repo_file_url, download
from .. import _constants as C
from .dataset import CorpusDataset
from .registry import register
from ..base import get_home_dir

base_datasets = [
    'WordSimilarityEvaluationDataset', 'WordAnalogyEvaluationDataset'
]
word_similarity_datasets = [
    'WordSim353', 'MEN', 'RadinskyMTurk', 'RareWords', 'SimLex999',
    'SimVerb3500', 'SemEval17Task2', 'BakerVerb143', 'YangPowersVerb130'
]
word_analogy_datasets = ['GoogleAnalogyTestSet', 'BiggerAnalogyTestSet']
__all__ = base_datasets + word_similarity_datasets + word_analogy_datasets


class _Dataset(SimpleDataset):
    _url = None  # Dataset is retrieved from here if not cached
    _archive_file = (None, None)  # Archive name and checksum
    _checksums = None  # Checksum of archive contents
    _verify_ssl = True  # Verify SSL certificates when downloading from self._url
    _namespace = None  # Contains S3 namespace for self-hosted datasets

    def __init__(self, root):
        self.root = os.path.expanduser(root)
        os.makedirs(self.root, exist_ok=True)
        self._download_data()
        super(_Dataset, self).__init__(self._get_data())
    def _download_data(self):
        _, archive_hash = self._archive_file
        for name, checksum in self._checksums.items():
            name = name.split('/')
            path = os.path.join(self.root, *name)
            if not os.path.exists(path) or not check_sha1(path, checksum):
                # A file is missing or fails its SHA1 check: fetch the
                # archive, either from the S3 mirror (if a namespace is set)
                # or directly from self._url.
                if self._namespace is not None:
                    url = _get_repo_file_url(self._namespace,
                                             self._archive_file[0])
                else:
                    url = self._url
                downloaded_file_path = download(url, path=self.root,
                                                sha1_hash=archive_hash,
                                                verify_ssl=self._verify_ssl)

                # Unpack the archive into self.root based on its extension.
                if downloaded_file_path.lower().endswith('zip'):
                    with zipfile.ZipFile(downloaded_file_path, 'r') as zf:
                        zf.extractall(path=self.root)
                elif downloaded_file_path.lower().endswith('tar.gz'):
                    with tarfile.open(downloaded_file_path, 'r') as tf:
                        tf.extractall(path=self.root)
                elif len(self._checksums) > 1:
                    err = 'Failed retrieving {clsname}.'.format(
                        clsname=self.__class__.__name__)
                    err += (' Expecting multiple files, '
                            'but could not detect archive format.')
                    raise RuntimeError(err)

    def _get_data(self):
        raise NotImplementedError
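

# A hypothetical minimal subclass (illustration only; the class name, URL and
# file names below are made up): point _url at an archive, list the extracted
# files with their SHA1 hashes in _checksums, and implement _get_data to
# parse them.
#
#   class _ToyDataset(_Dataset):
#       _url = 'https://example.com/toy.zip'  # hypothetical URL
#       _archive_file = ('toy.zip', '<sha1 of archive>')
#       _checksums = {'toy/data.txt': '<sha1 of extracted file>'}
#
#       def _get_data(self):
#           path = os.path.join(self.root, 'toy', 'data.txt')
#           return [list(row) for row in CorpusDataset(path)]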
###############################################################################
# Word similarity and relatedness datasets
###############################################################################


class WordSimilarityEvaluationDataset(_Dataset):
    """Base class for word similarity or relatedness task datasets.

    Inheriting classes are assumed to implement datasets of the form
    ['word1', 'word2', score] where score is a numerical similarity or
    relatedness score with respect to 'word1' and 'word2'.

    """
    def __init__(self, root):
        super(WordSimilarityEvaluationDataset, self).__init__(root=root)
        self._cast_score_to_float()

    def _get_data(self):
        raise NotImplementedError

    def _cast_score_to_float(self):
        # CorpusDataset reads scores as strings; convert them to float.
        self._data = [[row[0], row[1], float(row[2])] for row in self._data]
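

# A minimal evaluation sketch (an assumption, not part of this module): since
# every subclass yields [word1, word2, score] rows, a rank-correlation score
# against a model can be computed as below. `model_similarity` is a
# hypothetical callable returning a similarity score for two words.
#
#   from scipy import stats
#   ws = WordSim353('similarity')
#   predicted = [model_similarity(w1, w2) for w1, w2, _ in ws]
#   gold = [score for _, _, score in ws]
#   rho = stats.spearmanr(predicted, gold).correlation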


@register(segment=['all', 'similarity', 'relatedness'])
class WordSim353(WordSimilarityEvaluationDataset):
    """WordSim353 dataset.

    The dataset was collected by Finkelstein et al.
    (http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/). Agirre et
    al. proposed to split the collection into two datasets, one focused on
    measuring similarity, and the other one on relatedness
    (http://alfonseca.org/eng/research/wordsim353.html).

    - Finkelstein, L., Gabrilovich, E., Matias, Y., Rivlin, E., Solan, Z.,
      Wolfman, G., & Ruppin, E. (2002). Placing search in context: the concept
      revisited. ACM Trans. Inf. Syst., 20(1), 116–131.
      https://dl.acm.org/citation.cfm?id=372094
    - Agirre, E., Alfonseca, E., Hall, K. B., Kravalova, J., Pasca, M., &
      Soroa, A. (2009). A study on similarity and relatedness using
      distributional and wordnet-based approaches. In Human Language
      Technologies: Conference of the North American Chapter of the
      Association of Computational Linguistics, Proceedings, May 31 - June 5,
      2009, Boulder, Colorado, USA (pp. 19–27). The Association for
      Computational Linguistics.

    License: Creative Commons Attribution 4.0 International (CC BY 4.0)

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 10 (very much related or identical words).

    Parameters
    ----------
    segment : str
        'relatedness', 'similarity' or 'all'
    root : str, default '$MXNET_HOME/datasets/wordsim353'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> wordsim353 = gluonnlp.data.WordSim353('similarity', root='./datasets/wordsim353')
    -etc-
    >>> len(wordsim353)
    203
    >>> wordsim353[0]
    ['Arafat', 'Jackson', 2.5]

    """
    _url = 'http://alfonseca.org/pubs/ws353simrel.tar.gz'
    _namespace = 'gluon/dataset/ws353'
    _archive_file = ('ws353simrel.tar.gz',
                     '1b9ca7f4d61682dea0004acbd48ce74275d5bfff')
    _checksums = {
        'wordsim353_sim_rel/wordsim353_agreed.txt':
        '1c9f77c9dd42bcc09092bd32adf0a1988d03ca80',
        'wordsim353_sim_rel/wordsim353_annotator1.txt':
        '674d5a9263d099a5128b4bf4beeaaceb80f71f4e',
        'wordsim353_sim_rel/wordsim353_annotator2.txt':
        '9b79a91861a4f1075183b93b89b73e1b470b94c1',
        'wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt':
        'c36c5dc5ebea9964f4f43e2c294cd620471ab1b8',
        'wordsim353_sim_rel/wordsim_similarity_goldstandard.txt':
        '4845df518a83c8f7c527439590ed7e4c71916a99'
    }
    _data_file = {
        'relatedness': ('wordsim_relatedness_goldstandard.txt',
                        'c36c5dc5ebea9964f4f43e2c294cd620471ab1b8'),
        'similarity': ('wordsim_similarity_goldstandard.txt',
                       '4845df518a83c8f7c527439590ed7e4c71916a99')
    }
    min = 0
    max = 10

    def __init__(self, segment='all', root=os.path.join(
            get_home_dir(), 'datasets', 'wordsim353')):
        if segment is not None:
            assert segment in ['all', 'relatedness', 'similarity']
        self.segment = segment
        super(WordSim353, self).__init__(root=root)
    def _get_data(self):
        paths = []
        if self.segment == 'relatedness' or self.segment == 'all':
            paths.append(
                os.path.join(
                    self.root,
                    'wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt'))
        if self.segment == 'similarity' or self.segment == 'all':
            paths.append(
                os.path.join(
                    self.root,
                    'wordsim353_sim_rel/wordsim_similarity_goldstandard.txt'))

        # Deduplicate pairs that occur in both segments and sort the result
        # for a deterministic ordering.
        return sorted(list({tuple(row) for row in CorpusDataset(paths)}))


@register(segment=['full', 'dev', 'test'])
class MEN(WordSimilarityEvaluationDataset):
    """MEN dataset for word-similarity and relatedness.

    The dataset was collected by Bruni et al.
    (https://staff.fnwi.uva.nl/e.bruni/MEN).

    - Bruni, E., Boleda, G., Baroni, M., & Nam-Khanh Tran (2012).
      Distributional semantics in technicolor. In The 50th Annual Meeting of
      the Association for Computational Linguistics, Proceedings of the
      Conference, July 8-14, 2012, Jeju Island, Korea - Volume 1: Long Papers
      (pp. 136–145). The Association for Computational Linguistics.

    License: Creative Commons Attribution 2.0 Generic (CC BY 2.0)

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 50 (very much related or identical words).

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/men'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.
    segment : str, default 'dev'
        Dataset segment. Options are 'full', 'dev', 'test'.

    Examples
    --------
    >>> men = gluonnlp.data.MEN('test', root='./datasets/men')
    -etc-
    >>> len(men)
    1000
    >>> men[0]
    ['display', 'pond', 10.0]

    """
    _url = 'https://staff.fnwi.uva.nl/e.bruni/resources/MEN.tar.gz'
    _namespace = 'gluon/dataset/men'
    _archive_file = ('MEN.tar.gz', '3c4af1b7009c1ad75e03562f7f7bc5f51ff3a31a')
    _checksums = {
        'MEN/MEN_dataset_lemma_form.dev':
        '55d2c9675f84dc661861172fc89db437cab2ed92',
        'MEN/MEN_dataset_lemma_form.test':
        'c003c9fddfe0ce1d38432cdb13863599d7a2d37d',
        'MEN/MEN_dataset_lemma_form_full':
        'e32e0a0fa09ccf95aa898bd42011e84419f7fafb',
        'MEN/MEN_dataset_natural_form_full':
        'af9c2ca0033e2561676872eed98e223ee6366b82',
        'MEN/agreement/agreement-score.txt':
        'bee1fe16ce63a198a12a924ceb50253c49c7b45c',
        'MEN/agreement/elias-men-ratings.txt':
        'd180252df271de96c8fbba6693eaa16793e0f7f0',
        'MEN/agreement/marcos-men-ratings.txt':
        'dbfceb7d88208c2733861f27d3d444c15db18519',
        'MEN/instructions.txt':
        'e6f69c7338246b404bafa6e24257fc4a5ba01baa',
        'MEN/licence.txt':
        'f57c6d61814a0895236ab99c06b61b2611430f92'
    }
    _segment_file = {
        'full': 'MEN/MEN_dataset_lemma_form_full',
        'dev': 'MEN/MEN_dataset_lemma_form.dev',
        'test': 'MEN/MEN_dataset_lemma_form.test',
    }
    min = 0
    max = 50

    def __init__(self, segment='dev', root=os.path.join(
            get_home_dir(), 'datasets', 'men')):
        self.segment = segment
        super(MEN, self).__init__(root=root)

    def _get_data(self):
        datafilepath = os.path.join(
            self.root, *self._segment_file[self.segment].split('/'))
        dataset = CorpusDataset(datafilepath)

        # Remove lemma information, i.e. the two-character part-of-speech
        # suffix (such as '-n' or '-v') attached to each word.
        return [[row[0][:-2], row[1][:-2], row[2]] for row in dataset]


@register
class RadinskyMTurk(WordSimilarityEvaluationDataset):
    """MTurk dataset for word-similarity and relatedness by Radinsky et al.

    - Radinsky, K., Agichtein, E., Gabrilovich, E., & Markovitch, S. (2011).
      A word at a time: computing word relatedness using temporal semantic
      analysis. In S. Srinivasan, K. Ramamritham, A. Kumar, M. P. Ravindra,
      E. Bertino, & R. Kumar, Proceedings of the 20th International Conference
      on World Wide Web, WWW 2011, Hyderabad, India, March 28 - April 1, 2011
      (pp. 337–346). ACM.

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    1 (totally unrelated words) to 5 (very much related or identical words).

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/radinskymturk'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> radinskymturk = gluonnlp.data.RadinskyMTurk(root='./datasets/radinskymturk')
    -etc-
    >>> len(radinskymturk)
    287
    >>> radinskymturk[0]
    ['episcopal', 'russia', 2.75]

    """
    _url = 'http://www.kiraradinsky.com/files/Mtruk.csv'
    _archive_file = ('Mtruk.csv', '14959899c092148abba21401950d6957c787434c')
    _checksums = {'Mtruk.csv': '14959899c092148abba21401950d6957c787434c'}
    min = 1
    max = 5

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets',
                                         'radinskymturk')):
        super(RadinskyMTurk, self).__init__(root=root)

    def _get_data(self):
        datafilepath = os.path.join(self.root, self._archive_file[0])
        # The data file is comma-separated rather than whitespace-separated.
        return list(CorpusDataset(datafilepath, tokenizer=lambda x: x.split(',')))


@register
class RareWords(WordSimilarityEvaluationDataset):
    """Rare words dataset for word-similarity and relatedness.

    - Luong, T., Socher, R., & Manning, C. D. (2013). Better word
      representations with recursive neural networks for morphology. In J.
      Hockenmaier, & S. Riedel, Proceedings of the Seventeenth Conference on
      Computational Natural Language Learning, CoNLL 2013, Sofia, Bulgaria,
      August 8-9, 2013 (pp. 104–113). ACL.

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 10 (very much related or identical words).

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/rarewords'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> rarewords = gluonnlp.data.RareWords(root='./datasets/rarewords')
    -etc-
    >>> len(rarewords)
    2034
    >>> rarewords[0]
    ['squishing', 'squirt', 5.88]

    """
    _url = 'http://www-nlp.stanford.edu/~lmthang/morphoNLM/rw.zip'
    _archive_file = ('rw.zip', 'bf9c5959a0a2d7ed8e51d91433ac5ebf366d4fb9')
    _checksums = {'rw/rw.txt': 'bafc59f099f1798b47f5bed7b0ebbb933f6b309a'}
    min = 0
    max = 10

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets',
                                         'rarewords')):
        super(RareWords, self).__init__(root=root)

    def _get_data(self):
        datafilepath = os.path.join(self.root, 'rw', 'rw.txt')
        dataset = CorpusDataset(datafilepath)
        # Keep only the word pair and the averaged score in column 2; the
        # remaining columns hold the individual annotator ratings.
        return [[row[0], row[1], row[2]] for row in dataset]


@register
class SimLex999(WordSimilarityEvaluationDataset):
    """SimLex999 dataset for word-similarity.

    - Hill, F., Reichart, R., & Korhonen, A. (2015). Simlex-999: evaluating
      semantic models with (genuine) similarity estimation. Computational
      Linguistics, 41(4), 665–695. https://arxiv.org/abs/1408.3456

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 10 (very much related or identical words).

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/simlex999'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> simlex999 = gluonnlp.data.SimLex999(root='./datasets/simlex999')
    -etc-
    >>> len(simlex999)
    999
    >>> simlex999[0]
    ['old', 'new', 1.58]

    """
    _url = 'https://www.cl.cam.ac.uk/~fh295/SimLex-999.zip'
    _archive_file = ('SimLex-999.zip',
                     '0d3afe35b89d60acf11c28324ac7be10253fda39')
    _checksums = {
        'SimLex-999/README.txt': 'f54f4a93213b847eb93cc8952052d6b990df1bd1',
        'SimLex-999/SimLex-999.txt': '0496761e49015bc266908ea6f8e35a5ec77cb2ee'
    }
    min = 0
    max = 10
    score = 'SimLex999'

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets',
                                         'simlex999')):
        super(SimLex999, self).__init__(root=root)

    def _get_data(self):
        dataset = CorpusDataset(
            os.path.join(self.root, 'SimLex-999', 'SimLex-999.txt'))
        # Column 3 holds the SimLex999 score; skip the header row.
        return [[row[0], row[1], row[3]] for i, row in enumerate(dataset)
                if i != 0]


@register
class SimVerb3500(WordSimilarityEvaluationDataset):
    """SimVerb3500 dataset for verb similarity.

    - Gerz, D., Vulić, I., Hill, F., Reichart, R., & Korhonen, A. (2016).
      SimVerb-3500: a large-scale evaluation set of verb similarity. In
      Proceedings of the 2016 Conference on Empirical Methods in Natural
      Language Processing, EMNLP 2016. ACL.
      https://www.aclweb.org/anthology/D16-1235

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 10 (very much related or identical words).

    Parameters
    ----------
    segment : str, default 'full'
        Dataset segment. Options are 'full', 'dev', 'test'.
    root : str, default '$MXNET_HOME/datasets/simverb3500'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> simverb3500 = gluonnlp.data.SimVerb3500(root='./datasets/simverb3500') #doctest:+SKIP
    -etc-
    >>> len(simverb3500) #doctest:+SKIP
    3500
    >>> simverb3500[0] #doctest:+SKIP
    ['take', 'remove', 6.81]

    """
    _url = 'https://www.aclweb.org/anthology/attachments/D16-1235.Attachment.zip'
    _archive_file = ('D16-1235.Attachment.zip',
                     '7bcfff115ca3e4c909b3763a2ba35e83992f2a2f')
    _checksums = {
        'data/README.txt':
        'fc2645b30a291a7486015c3e4b51d8eb599f7c7e',
        'data/SimVerb-3000-test.txt':
        '4cddf11f0fbbb3b94958e69b0614be5d125ec607',
        'data/SimVerb-3500-ratings.txt':
        '133d45daeb0e73b9da26930741455856887ac17b',
        'data/SimVerb-3500-stats.txt':
        '79a0fd7c6e03468742d276b127d70478a6995681',
        'data/SimVerb-3500.txt':
        '0e79af04fd42f44affc93004f2a02b62f155a9ae',
        'data/SimVerb-3520-annotator-ratings.csv':
        '9ff69cec9c93a1abba7be1404fc82d7f20e6633b',
        'data/SimVerb-500-dev.txt':
        '3ae184352ca2d9f855ca7cb099a65635d184f75a'
    }
    _segment_file = {
        'full': 'data/SimVerb-3500.txt',
        'test': 'data/SimVerb-3000-test.txt',
        'dev': 'data/SimVerb-500-dev.txt'
    }
    min = 0
    max = 10

    def __init__(self, segment='full', root=os.path.join(
            get_home_dir(), 'datasets', 'simverb3500')):
        self.segment = segment
        super(SimVerb3500, self).__init__(root=root)

    def _get_data(self):
        dataset = CorpusDataset(
            os.path.join(self.root,
                         *self._segment_file[self.segment].split('/')))
        # Column 3 holds the similarity score.
        return [[row[0], row[1], row[3]] for row in dataset]


@register(segment=['trial', 'test'])
class SemEval17Task2(WordSimilarityEvaluationDataset):
    """SemEval17Task2 dataset for word-similarity.

    The dataset was collected for SemEval-2017 Task 2 (multilingual and
    cross-lingual semantic word similarity) and covers five languages
    (http://alt.qcri.org/semeval2017/task2/).

    - Camacho-Collados, J., Pilehvar, M. T., Collier, N., & Navigli, R.
      (2017). SemEval-2017 Task 2: multilingual and cross-lingual semantic
      word similarity. In Proceedings of the 11th International Workshop on
      Semantic Evaluation (SemEval-2017). Association for Computational
      Linguistics.

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 5 (very much related or identical words).

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/semeval17task2'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.
    segment : str, default 'trial'
        Dataset segment. Options are 'trial', 'test'.
    language : str, default 'en'
        Dataset language. Options are 'en', 'es', 'de', 'it', 'fa'.

    Examples
    --------
    >>> semeval17task2 = gluonnlp.data.SemEval17Task2() # doctest: +SKIP
    -etc-
    >>> len(semeval17task2) # doctest: +SKIP
    18
    >>> semeval17task2[0] # doctest: +SKIP
    ['sunset', 'string', 0.05]

    """
    # TODO: reenable doctest once semeval17task2 is available again
    _url = 'http://alt.qcri.org/semeval2017/task2/data/uploads/semeval2017-task2.zip'
    _archive_file = ('semeval2017-task2.zip',
                     'b29860553f98b057303815817dfb60b9fe79cfba')
    _checksums = C.SEMEVAL17_CHECKSUMS
    _datatemplate = ('SemEval17-Task2/{segment}/subtask1-monolingual/data/'
                     '{language}.{segment}.data.txt')
    _keytemplate = ('SemEval17-Task2/{segment}/subtask1-monolingual/keys/'
                    '{language}.{segment}.gold.txt')
    min = 0
    max = 5
    segments = ('trial', 'test')
    languages = ('en', 'es', 'de', 'it', 'fa')

    def __init__(self, segment='trial', language='en', root=os.path.join(
            get_home_dir(), 'datasets', 'semeval17task2')):
        assert segment in self.segments
        assert language in self.languages
        self.language = language
        self.segment = segment
        super(SemEval17Task2, self).__init__(root=root)

    def _get_data(self):
        data = self._datatemplate.format(segment=self.segment,
                                         language=self.language)
        data = os.path.join(self.root, *data.split('/'))
        keys = self._keytemplate.format(segment=self.segment,
                                        language=self.language)
        keys = os.path.join(self.root, *keys.split('/'))
        data_dataset = CorpusDataset(data)
        keys_dataset = CorpusDataset(keys)
        # Word pairs and gold scores are stored in separate, aligned files.
        return [[d[0], d[1], k[0]] for d, k in zip(data_dataset, keys_dataset)]


@register
class BakerVerb143(WordSimilarityEvaluationDataset):
    """Verb143 dataset.

    - Baker, S., Reichart, R., & Korhonen, A. (2014). An unsupervised model
      for instance level subcategorization acquisition. In A. Moschitti, B.
      Pang, & W. Daelemans, Proceedings of the 2014 Conference on Empirical
      Methods in Natural Language Processing, EMNLP 2014, October 25-29, 2014,
      Doha, Qatar, a meeting of SIGDAT, a Special Interest Group of the ACL
      (pp. 278–289). ACL.

    144 pairs of verbs annotated by 10 annotators following the WS-353
    guidelines.

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 1 (very much related or identical words).

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/verb143'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> bakerverb143 = gluonnlp.data.BakerVerb143(root='./datasets/bakerverb143') #doctest:+SKIP
    -etc-
    >>> len(bakerverb143) #doctest:+SKIP
    144
    >>> bakerverb143[0] #doctest:+SKIP
    ['happen', 'say', 0.19]

    """
    _url = 'https://ie.technion.ac.il/~roiri/papers/EMNLP14.zip'
    _archive_file = ('EMNLP14.zip', '1862e52af784e76e83d472532a75eb797fb8b807')
    _checksums = {
        'verb_similarity dataset.txt':
        'd7e4820c7504cbae56898353e4d94e6408c330fc'
    }
    min = 0
    max = 1

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets',
                                         'verb143')):
        super(BakerVerb143, self).__init__(root=root)

    def _get_data(self):
        path = os.path.join(self.root, 'verb_similarity dataset.txt')
        dataset = CorpusDataset(path)
        # Column 12 holds the similarity score used for evaluation.
        return [[row[0], row[1], row[12]] for row in dataset]


@register
class YangPowersVerb130(WordSimilarityEvaluationDataset):
    """Verb-130 dataset.

    - Yang, D., & Powers, D. M. (2006). Verb similarity on the taxonomy of
      wordnet. In The Third International WordNet Conference: GWC 2006.

    License: Unspecified

    Each sample consists of a pair of words, and a score with scale from
    0 (totally unrelated words) to 4 (very much related or identical words).

    Parameters
    ----------
    root : str, default '$MXNET_HOME/datasets/verb130'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> yangpowersverb130 = gluonnlp.data.YangPowersVerb130(root='./datasets/yangpowersverb130')
    >>> len(yangpowersverb130)
    130
    >>> yangpowersverb130[0]
    ['brag', 'boast', 4.0]

    """
    _words1 = [
        'brag', 'concoct', 'divide', 'build', 'end', 'accentuate',
        'demonstrate', 'solve', 'consume', 'position', 'swear', 'furnish',
        'merit', 'submit', 'seize', 'spin', 'enlarge', 'swing', 'circulate',
        'recognize', 'resolve', 'prolong', 'tap', 'block', 'arrange', 'twist',
        'hail', 'dissipate', 'approve', 'impose', 'hasten', 'rap', 'lean',
        'make', 'show', 'sell', 'weave', 'refer', 'distribute', 'twist',
        'drain', 'depict', 'build', 'hail', 'call', 'swing', 'yield', 'split',
        'challenge', 'hinder', 'welcome', 'need', 'refer', 'finance', 'expect',
        'terminate', 'yell', 'swell', 'rotate', 'seize', 'approve', 'supply',
        'clip', 'divide', 'advise', 'complain', 'want', 'twist', 'swing',
        'make', 'hinder', 'build', 'express', 'resolve', 'bruise', 'swing',
        'catch', 'swear', 'request', 'arrange', 'relieve', 'move', 'weave',
        'swear', 'forget', 'supervise', 'situate', 'explain', 'ache',
        'evaluate', 'recognize', 'dilute', 'hasten', 'scorn', 'swear',
        'arrange', 'discard', 'list', 'stamp', 'market', 'boil', 'sustain',
        'resolve', 'dissipate', 'anger', 'approve', 'research', 'request',
        'boast', 'furnish', 'refine', 'acknowledge', 'clean', 'lean',
        'postpone', 'hail', 'remember', 'scrape', 'sweat', 'highlight',
        'seize', 'levy', 'alter', 'refer', 'empty', 'flush', 'shake',
        'imitate', 'correlate', 'refer'
    ]
    _words2 = [
        'boast', 'devise', 'split', 'construct', 'terminate', 'highlight',
        'show', 'figure', 'eat', 'situate', 'vow', 'supply', 'deserve',
        'yield', 'take', 'twirl', 'swell', 'sway', 'distribute', 'acknowledge',
        'settle', 'sustain', 'knock', 'hinder', 'plan', 'curl', 'acclaim',
        'disperse', 'support', 'levy', 'accelerate', 'tap', 'rest', 'earn',
        'publish', 'market', 'intertwine', 'direct', 'commercialize',
        'intertwine', 'tap', 'recognize', 'organize', 'address', 'refer',
        'bounce', 'seize', 'crush', 'yield', 'assist', 'recognize', 'deserve',
        'explain', 'build', 'deserve', 'postpone', 'boast', 'curl', 'situate',
        'request', 'scorn', 'consume', 'twist', 'figure', 'furnish', 'boast',
        'deserve', 'fasten', 'crash', 'trade', 'yield', 'propose', 'figure',
        'examine', 'split', 'break', 'consume', 'explain', 'levy', 'study',
        'hinder', 'swell', 'print', 'think', 'resolve', 'concoct', 'isolate',
        'boast', 'spin', 'terminate', 'succeed', 'market', 'permit', 'yield',
        'describe', 'explain', 'arrange', 'figure', 'weave', 'sweeten', 'tap',
        'lower', 'publicize', 'isolate', 'approve', 'boast', 'distribute',
        'concoct', 'yield', 'impress', 'sustain', 'distribute', 'concoct',
        'grate', 'show', 'judge', 'hail', 'lean', 'spin', 'restore', 'refer',
        'believe', 'highlight', 'carry', 'situate', 'spin', 'swell',
        'highlight', 'levy', 'lean'
    ]
    _url = ('https://dspace2.flinders.edu.au/xmlui/bitstream/handle/'
            '2328/9557/Yang%20Verb.pdf?sequence=1')
    min = 0
    max = 4

    def __init__(self, root=os.path.join(get_home_dir(), 'datasets',
                                         'verb130')):
        super(YangPowersVerb130, self).__init__(root=root)

    def _get_data(self):
        # The pairs are ordered by decreasing similarity: consecutive blocks
        # of 26 pairs receive the scores 4, 3, 2, 1 and 0 respectively.
        scores = [4] * 26 + [3] * 26 + [2] * 26 + [1] * 26 + [0] * 26
        return list(zip(self._words1, self._words2, scores))

    def _download_data(self):
        # Overwrite download method as this dataset is self-contained
        pass
###############################################################################
# Word analogy datasets
###############################################################################


class WordAnalogyEvaluationDataset(_Dataset):
    """Base class for word analogy task datasets.

    Inheriting classes are assumed to implement datasets of the form
    ['word1', 'word2', 'word3', 'word4'] or ['word1', ['word2a', 'word2b',
    ...], 'word3', ['word4a', 'word4b', ...]].

    """

    def _get_data(self):
        raise NotImplementedError
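

# A minimal usage sketch (an assumption, not part of this module): analogy
# rows can be scored with the classic 3CosAdd rule, predicting the fourth
# word from the other three. `embedding` is a hypothetical mapping from a
# word to a unit-normalized numpy vector and `vocab_matrix` a matrix of all
# such vectors (in practice the three query words are excluded from argmax).
#
#   import numpy as np
#   def predict_fourth(w1, w2, w3):
#       query = embedding[w2] - embedding[w1] + embedding[w3]
#       query /= np.linalg.norm(query)
#       return int(np.argmax(vocab_matrix @ query))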


@register(category=C.GOOGLEANALOGY_CATEGORIES)
class GoogleAnalogyTestSet(WordAnalogyEvaluationDataset):
    """Google analogy test set.

    - Mikolov, T., Chen, K., Corrado, G., & Dean, J. (2013). Efficient
      estimation of word representations in vector space. In Proceedings of
      the International Conference on Learning Representations (ICLR).

    License: Unspecified

    Each sample consists of two analogical pairs of words.

    Parameters
    ----------
    group : {'syntactic', 'semantic'} or None, default None
        The subset for the specified type of analogy. None for the complete
        dataset.
    category : str or None, default None
        The subset for the specified category of analogy. None for the
        complete dataset.
    lowercase : boolean, default True
        Whether to convert words to lowercase.
    root : str, default '$MXNET_HOME/datasets/google_analogy'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> googleanalogytestset = gluonnlp.data.GoogleAnalogyTestSet(
    ...     root='./datasets/googleanalogytestset')
    -etc-
    >>> len(googleanalogytestset)
    19544
    >>> googleanalogytestset[0]
    ['athens', 'greece', 'baghdad', 'iraq']
    >>> googleanalogytestset = gluonnlp.data.GoogleAnalogyTestSet(
    ...     'syntactic', root='./datasets/googleanalogytestset')
    >>> googleanalogytestset[0]
    ['amazing', 'amazingly', 'apparent', 'apparently']
    >>> googleanalogytestset = gluonnlp.data.GoogleAnalogyTestSet(
    ...     'syntactic', 'gram8-plural', root='./datasets/googleanalogytestset')
    >>> googleanalogytestset[0]
    ['banana', 'bananas', 'bird', 'birds']

    """
    _archive_file = ('questions-words.txt',
                     'fa92df4bbe788f2d51827c762c63bd8e470edf31')
    _checksums = {
        'questions-words.txt': 'fa92df4bbe788f2d51827c762c63bd8e470edf31'
    }
    _url = 'http://download.tensorflow.org/data/questions-words.txt'
    groups = ['syntactic', 'semantic']
    categories = C.GOOGLEANALOGY_CATEGORIES

    def __init__(self, group=None, category=None, lowercase=True,
                 root=os.path.join(get_home_dir(), 'datasets',
                                   'google_analogy')):
        assert group is None or group in self.groups
        assert category is None or category in self.categories
        self.category = category
        self.group = group
        self.lowercase = lowercase
        super(GoogleAnalogyTestSet, self).__init__(root=root)

    def _get_data(self):
        words = []
        with open(os.path.join(self.root, self._archive_file[0])) as f:
            for line in f:
                if line.startswith(':'):
                    # Lines such as ': gram8-plural' start a new category;
                    # 'gram*' categories are syntactic, all others semantic.
                    current_category = line.split()[1]
                    if 'gram' in current_category:
                        current_group = 'syntactic'
                    else:
                        current_group = 'semantic'
                else:
                    if self.group is not None and self.group != current_group:
                        continue
                    if self.category is not None and self.category != current_category:
                        continue
                    if self.lowercase:
                        line = line.lower()
                    words.append(line.split())
        return words
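
# For reference, the raw questions-words.txt interleaves category headers and
# space-separated analogy rows (an illustrative excerpt, assuming the
# standard file layout):
#
#   : capital-common-countries
#   Athens Greece Baghdad Iraq
#   : gram8-plural
#   banana bananas bird birds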


@register(category=list(C.BATS_CATEGORIES.keys()))
class BiggerAnalogyTestSet(WordAnalogyEvaluationDataset):
    """Bigger analogy test set.

    - Gladkova, A., Drozd, A., & Matsuoka, S. (2016). Analogy-based detection
      of morphological and semantic relations with word embeddings: what works
      and what doesn’t. In Proceedings of the NAACL-HLT SRW (pp. 47–54). San
      Diego, California, June 12-17, 2016: ACL. Retrieved from
      https://www.aclweb.org/anthology/N/N16/N16-2002.pdf

    License: Unspecified

    Each sample consists of two analogical pairs of words.

    Parameters
    ----------
    category : str or None, default None
        The subset for the specified BATS category. None for the complete
        dataset.
    form_analogy_pairs : bool, default True
        Whether to join the pairs within each category into 4-word analogy
        samples.
    drop_alternative_solutions : bool, default True
        Whether to keep only the first of several '/'-separated alternative
        answer words.
    root : str, default '$MXNET_HOME/datasets/bigger_analogy'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> biggeranalogytestset = gluonnlp.data.BiggerAnalogyTestSet(
    ...     root='./datasets/biggeranalogytestset')
    -etc-
    >>> len(biggeranalogytestset)
    98000
    >>> biggeranalogytestset[0]
    ['arm', 'armless', 'art', 'artless']

    """
    _archive_file = ('BATS_3.0.zip',
                     'bf94d47884be9ea83af369beeea7499ed25dcf0d')
    _checksums = C.BATS_CHECKSUMS
    _url = 'https://s3.amazonaws.com/blackbirdprojects/tut_vsm/BATS_3.0.zip'
    _category_group_map = {
        'I': '1_Inflectional_morphology',
        'D': '2_Derivational_morphology',
        'E': '3_Encyclopedic_semantics',
        'L': '4_Lexicographic_semantics'
    }
    _categories = C.BATS_CATEGORIES

    def __init__(self, category=None, form_analogy_pairs=True,
                 drop_alternative_solutions=True, root=os.path.join(
                     get_home_dir(), 'datasets', 'bigger_analogy')):
        self.form_analogy_pairs = form_analogy_pairs
        self.drop_alternative_solutions = drop_alternative_solutions
        self.category = category
        if self.category is not None:
            assert self.category in self._categories.keys()
        super(BiggerAnalogyTestSet, self).__init__(root=root)
    def _get_data(self):
        if self.category is not None:
            categories = [self.category]
        else:
            categories = sorted(list(self._categories.keys()))
        datasets = []
        for category in categories:
            # The first letter of the category code selects the group
            # directory, e.g. an 'I*' category lives in
            # '1_Inflectional_morphology'.
            group = self._category_group_map[category[0]]
            category_name = self._categories[category]
            path = os.path.join(
                self.root,
                *('BATS_3.0/{group}/{category} {category_name}.txt'.format(
                    group=group, category=category,
                    category_name=category_name).split('/')))
            dataset = CorpusDataset(path)
            dataset = [[row[0], row[1].split('/')] for row in dataset]
            # Drop alternative solutions separated by '/' from word2 column
            if self.drop_alternative_solutions:
                dataset = [[row[0], row[1][0]] for row in dataset]
            # Final dataset consists of all analogy pairs per category
            if self.form_analogy_pairs:
                dataset = [[arow[0], arow[1], brow[0], brow[1]]
                           for arow in dataset for brow in dataset
                           if arow != brow]
            datasets += dataset
        return datasets