# Source code for gluonnlp.data.intent_slot
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Datasets for intent classification and slot labeling."""
import io
import os
import zipfile
import numpy as np
from mxnet.gluon.utils import download, check_sha1, _get_repo_file_url
from mxnet.gluon.data import SimpleDataset
from .dataset import TSVDataset
from .registry import register
from .utils import Splitter
from ..base import get_home_dir
from ..vocab import Vocab
__all__ = ['ATISDataset', 'SNIPSDataset']
class _BaseICSLDataset(SimpleDataset):
    """Base Class of Datasets for Joint Intent Classification and Slot Labeling.

    On construction the dataset archive described by ``_download_info`` is
    fetched (if it is missing or fails its SHA-1 check), extracted into
    ``root``, and the requested segment is read into memory as a list of
    ``(tokens, slot_tags, intent_ids)`` samples.

    Subclasses must override the ``_download_info`` property.

    Parameters
    ----------
    segment : str
        Dataset segment.
    root : str
        Path to temp folder for storing data.
    """
    def __init__(self, segment, root):
        root = os.path.expanduser(root)
        os.makedirs(root, exist_ok=True)
        self._segment = segment
        self._root = root
        # Vocabularies are loaded lazily from JSON files under ``root``.
        self._intent_vocab = None
        self._slot_vocab = None
        # The archive must be on disk before ``_read_data`` runs, because
        # reading intents goes through ``intent_vocab``.
        self._get_data()
        super(_BaseICSLDataset, self).__init__(self._read_data(segment))

    @property
    def _download_info(self):
        """Download file information.

        Returns
        -------
        filename_format : str
            The filename format with slot for short hash.
        sha1_hash : str
            Expected sha1 hash of the file content.
        """
        raise NotImplementedError

    @property
    def intent_vocab(self):
        """Intent vocabulary, loaded on first access from ``intent_vocab.json`` in ``root``."""
        if self._intent_vocab is None:
            # Read explicitly as UTF-8 (matching ``_read_data``) instead of
            # relying on the platform's default encoding.
            with io.open(os.path.join(self._root, 'intent_vocab.json'), 'r',
                         encoding='utf-8') as f:
                self._intent_vocab = Vocab.from_json(f.read())
        return self._intent_vocab

    @property
    def slot_vocab(self):
        """Slot vocabulary, loaded on first access from ``slot_vocab.json`` in ``root``."""
        if self._slot_vocab is None:
            # Read explicitly as UTF-8 (matching ``_read_data``) instead of
            # relying on the platform's default encoding.
            with io.open(os.path.join(self._root, 'slot_vocab.json'), 'r',
                         encoding='utf-8') as f:
                self._slot_vocab = Vocab.from_json(f.read())
        return self._slot_vocab

    def _get_data(self):
        """Download the dataset archive if needed and extract it into ``root``."""
        filename_format, sha1_hash = self._download_info
        # The archive name embeds the first 8 characters of the SHA-1 hash.
        filename = filename_format.format(sha1_hash[:8])
        data_filename = os.path.join(self._root, filename)
        url = _get_repo_file_url('gluon/dataset', filename)
        # Download when the archive is absent or its hash does not match.
        if not os.path.exists(data_filename) or not check_sha1(data_filename, sha1_hash):
            download(url, path=data_filename, sha1_hash=sha1_hash)
        # Extraction runs on every call, whether or not a download happened.
        with zipfile.ZipFile(data_filename, 'r') as zf:
            zf.extractall(self._root)

    def _read_data(self, segment):
        """Read one segment from the extracted files.

        Parameters
        ----------
        segment : str
            Segment name used as the prefix of ``{segment}_sentence.txt``,
            ``{segment}_tags.txt`` and ``{segment}_intent.txt`` under ``root``.

        Returns
        -------
        list of tuple
            One ``(sentence, tags, intents)`` tuple per sample, where
            ``intents`` is an ``int32`` numpy array built from the
            ``intent_vocab`` lookups of the ``';'``-separated labels on the
            corresponding line of the intent file.
        """
        sentences = TSVDataset(os.path.join(self._root, '{}_sentence.txt'.format(segment)),
                               field_separator=Splitter(' '))
        tags = TSVDataset(os.path.join(self._root, '{}_tags.txt'.format(segment)),
                          field_separator=Splitter(' '))
        with io.open(os.path.join(self._root, '{}_intent.txt'.format(segment)), 'r',
                     encoding='utf-8') as f:
            intents = []
            for line in f:
                line = line.strip()
                # A sample may carry several intents, separated by ';'.
                intents.append(np.array([self.intent_vocab[ele] for ele in line.split(';')],
                                        dtype=np.int32))
        # NOTE(review): zip stops at the shortest input, so a length mismatch
        # between the three files would silently drop samples.
        return list(zip(sentences, tags, intents))
@register(segment=['train', 'dev', 'test'])
class ATISDataset(_BaseICSLDataset):
    """Airline Travel Information System dataset from MS CNTK.

    From
      https://github.com/Microsoft/CNTK/tree/master/Examples/LanguageUnderstanding/ATIS/Data

    License: Unspecified

    Each sample has three fields: tokens, slot labels, intent label.

    Parameters
    ----------
    segment : {'train', 'dev', 'test'}, default 'train'
        Dataset segment.
    root : str, default '$MXNET_HOME/datasets/atis'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> atis = gluonnlp.data.ATISDataset(root='./datasets/atis')
    -etc-
    >>> len(atis)
    4478
    >>> len(atis[0])
    3
    >>> len(atis[0][0])
    10
    >>> atis[0][0]
    ['i', 'want', 'to', 'fly', 'from', 'baltimore', 'to', 'dallas', 'round', 'trip']
    >>> len(atis[0][1])
    10
    >>> atis[0][1][:8]
    ['O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name']
    >>> atis[0][2]
    array([10], dtype=int32)
    """
    # The default ``root`` is computed once, when the class body is evaluated.
    def __init__(self, segment='train',
                 root=os.path.join(get_home_dir(), 'datasets', 'atis')):
        super(ATISDataset, self).__init__(segment, root)

    @property
    def _download_info(self):
        """Archive filename format and expected SHA-1 hash for ATIS.

        Returns
        -------
        filename_format : str
            The filename format with slot for short hash.
        sha1_hash : str
            Expected sha1 hash of the file content.
        """
        return 'atis-{}.zip', 'fb75a1b595566d5c5ec06ee6f2296d6629b8c225'
@register(segment=['train', 'dev', 'test'])
class SNIPSDataset(_BaseICSLDataset):
    """Snips Natural Language Understanding Benchmark dataset.

    Coucke et al. (2018). Snips Voice Platform: an embedded Spoken Language Understanding system
    for private-by-design voice interfaces. https://arxiv.org/abs/1805.10190

    From
      https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines

    License: Unspecified

    Each sample has three fields: tokens, slot labels, intent label.

    Parameters
    ----------
    segment : {'train', 'dev', 'test'}, default 'train'
        Dataset segment.
    root : str, default '$MXNET_HOME/datasets/snips'
        Path to temp folder for storing data.
        MXNET_HOME defaults to '~/.mxnet'.

    Examples
    --------
    >>> snips = gluonnlp.data.SNIPSDataset(root='./datasets/snips')
    -etc-
    >>> len(snips)
    13084
    >>> len(snips[0])
    3
    >>> len(snips[1][0])
    8
    >>> snips[1][0]
    ['put', 'United', 'Abominations', 'onto', 'my', 'rare', 'groove', 'playlist']
    >>> len(snips[1][1])
    8
    >>> snips[1][1][:5]
    ['O', 'B-entity_name', 'I-entity_name', 'O', 'B-playlist_owner']
    >>> snips[1][2]
    array([0], dtype=int32)
    """
    # The default ``root`` is computed once, when the class body is evaluated.
    def __init__(self, segment='train',
                 root=os.path.join(get_home_dir(), 'datasets', 'snips')):
        super(SNIPSDataset, self).__init__(segment, root)

    @property
    def _download_info(self):
        """Archive filename format and expected SHA-1 hash for SNIPS.

        Returns
        -------
        filename_format : str
            The filename format with slot for short hash.
        sha1_hash : str
            Expected sha1 hash of the file content.
        """
        return 'snips-{}.zip', 'f22420cc0f2a26078337dc375606be46a4cc8c51'