Source code for gluonnlp.data.batchify.batchify
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Batchify functions. They can be used in Gluon data loader to help combine individual samples
into batches for fast processing."""
__all__ = ['Stack', 'Pad', 'Tuple', 'List', 'NamedTuple', 'Dict']
import warnings
import math
from typing import (Dict as t_Dict, Callable as t_Callable, List as t_List, Tuple as t_Tuple,
AnyStr, Union as t_Union)
import numpy as np
import mxnet as mx
def _pad_arrs_to_max_length(arrs, pad_axis, pad_val, use_shared_mem, dtype, round_to=None):
    """Pad every sample along one axis to a common length and stack the results.

    Parameters
    ----------
    arrs : list
        The samples to batch. The type of the first sample decides how all of
        them are handled: mx.nd.NDArray samples are converted with
        ``asnumpy()``, numpy.ndarray samples are used as they are, and anything
        else is converted with ``numpy.asarray``.
    pad_axis : int
        The axis of each sample along which padding is added.
    pad_val : number
        The value written into the padded positions.
    use_shared_mem : bool
        If True the outputs are created in the ``cpu_shared`` context,
        otherwise in ``mx.cpu()``.
    dtype : str or numpy.dtype or None
        The output type. If None and the samples are NDArray/ndarray, the dtype
        of the first sample is used. For other inputs None is passed through
        unchanged to ``numpy.full`` and ``mx.nd.array``.
    round_to : int, default None
        If given, the padded length is rounded up to a multiple of this value.

    Returns
    -------
    ret : NDArray
        The padded batch. Its shape is ``(len(arrs),)`` followed by the shape of
        the first sample with the `pad_axis` entry replaced by the padded length.
    original_length : NDArray
        int32 array holding the size of every sample along `pad_axis` before
        padding.
    """
    head = arrs[0]
    if isinstance(head, mx.nd.NDArray):
        if dtype is None:
            dtype = head.dtype
        arrs = [arr.asnumpy() for arr in arrs]
    elif isinstance(head, np.ndarray):
        if dtype is None:
            dtype = head.dtype
    else:
        arrs = [np.asarray(ele) for ele in arrs]

    original_length = [sample.shape[pad_axis] for sample in arrs]
    target_len = max(original_length)
    if round_to is not None:
        target_len = round_to * math.ceil(target_len / round_to)

    sample_shape = list(arrs[0].shape)
    sample_shape[pad_axis] = target_len
    batch = np.full(shape=(len(arrs), ) + tuple(sample_shape),
                    fill_value=pad_val, dtype=dtype)

    for idx, sample in enumerate(arrs):
        valid = sample.shape[pad_axis]
        if valid == target_len:
            # Already full length: copy the whole sample in one assignment.
            batch[idx] = sample
            continue
        if valid == 0:
            # Nothing to copy; the row keeps the padding value everywhere.
            continue
        region = [slice(None)] * sample.ndim
        region[pad_axis] = slice(0, valid)
        batch[tuple([slice(idx, idx + 1)] + region)] = sample

    ctx = mx.Context('cpu_shared', 0) if use_shared_mem else mx.cpu()
    ret = mx.nd.array(batch, ctx=ctx, dtype=dtype)
    original_length = mx.nd.array(original_length, ctx=ctx, dtype=np.int32)
    return ret, original_length
def _stack_arrs(arrs, use_shared_mem, dtype):
    """Stack equally shaped samples along a new leading axis.

    Parameters
    ----------
    arrs : list
        The samples to stack. If the first sample is an mx.nd.NDArray the
        samples are stacked with ``mx.nd.stack``; otherwise the whole list is
        converted with ``numpy.asarray`` first.
    use_shared_mem : bool
        If True the result is created in the ``cpu_shared`` context; otherwise
        no explicit context is requested.
    dtype : str or numpy.dtype or None
        The output type. If None, the dtype of the first NDArray sample (or of
        the converted numpy array) is used.

    Returns
    -------
    ret : NDArray
        The stacked batch with the requested dtype.
    """
    first = arrs[0]
    if isinstance(first, mx.nd.NDArray):
        dtype = first.dtype if dtype is None else dtype
        if use_shared_mem:
            out = mx.nd.empty((len(arrs),) + first.shape, dtype=dtype,
                              ctx=mx.Context('cpu_shared', 0))
            return mx.nd.stack(*arrs, out=out)
        # No output buffer carries the requested dtype on this path, so cast the
        # stacked result explicitly; copy=False skips the copy when the dtype
        # already matches.
        return mx.nd.stack(*arrs).astype(dtype, copy=False)

    out = np.asarray(arrs)
    dtype = out.dtype if dtype is None else dtype
    if use_shared_mem:
        return mx.nd.array(out, ctx=mx.Context('cpu_shared', 0), dtype=dtype)
    return mx.nd.array(out, dtype=dtype)
class Stack:
    r"""Batchify function that stacks samples of identical shape into one batch.

    All N input samples must share the same shape/length; the batch gains a new
    leading axis of size N.

    Parameters
    ----------
    dtype : str or numpy.dtype, default None
        The value type of the output. If it is set to None, the input data type is used.

    Examples
    --------
    >>> import gluonnlp.data.batchify as bf
    >>> # Stack multiple lists
    >>> a = [1, 2, 3, 4]
    >>> b = [4, 5, 6, 8]
    >>> c = [8, 9, 1, 2]
    >>> bf.Stack()([a, b, c])
    <BLANKLINE>
    [[1 2 3 4]
     [4 5 6 8]
     [8 9 1 2]]
    <NDArray 3x4 @cpu_shared(0)>
    >>> # Stack multiple numpy.ndarrays
    >>> a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
    >>> b = np.array([[5, 6, 7, 8], [1, 2, 3, 4]])
    >>> bf.Stack()([a, b])
    <BLANKLINE>
    [[[1 2 3 4]
      [5 6 7 8]]
    <BLANKLINE>
     [[5 6 7 8]
      [1 2 3 4]]]
    <NDArray 2x2x4 @cpu_shared(0)>
    >>> # Stack multiple NDArrays
    >>> a = mx.nd.array([[1, 2, 3, 4], [5, 6, 7, 8]])
    >>> b = mx.nd.array([[5, 6, 7, 8], [1, 2, 3, 4]])
    >>> bf.Stack()([a, b])
    <BLANKLINE>
    [[[1. 2. 3. 4.]
      [5. 6. 7. 8.]]
    <BLANKLINE>
     [[5. 6. 7. 8.]
      [1. 2. 3. 4.]]]
    <NDArray 2x2x4 @cpu_shared(0)>
    """

    def __init__(self, dtype=None):
        # Requested output dtype; None defers to the dtype of the input samples.
        self._dtype = dtype

    def __call__(self, data):
        """Stack the given samples into a single batch.

        Parameters
        ----------
        data : list
            The input data samples.

        Returns
        -------
        batch_data : NDArray
            The stacked samples, requested in shared memory.
        """
        # The second argument asks the helper for the shared-memory context.
        batch_data = _stack_arrs(data, True, self._dtype)
        return batch_data
class Pad:
    """Batchify function that pads samples to a common length and stacks them.

    Parameters
    ----------
    axis : int, default 0
        The axis to pad the arrays. The arrays will be padded to the largest dimension at
        `axis`. For example, assume the input arrays have shape
        (10, 8, 5), (6, 8, 5), (3, 8, 5) and the `axis` is 0. Each input will be padded into
        (10, 8, 5) and then stacked to form the final output, which has shape (3, 10, 8, 5).
    pad_val : float or int, default None
        The padding value. If it is not given, 0 is used and a warning is issued.
    ret_length : bool, default False
        Whether to return the valid length in the output.
    dtype : str or numpy.dtype, default None
        The value type of the output. If it is set to None, the input data type is used.
    round_to : int, default None
        If specified, the padded dimension will be rounded to be multiple of this argument.

    Examples
    --------
    >>> import gluonnlp.data.batchify as bf
    >>> # Inputs are multiple lists
    >>> a = [1, 2, 3, 4]
    >>> b = [4, 5, 6]
    >>> c = [8, 2]
    >>> bf.Pad(pad_val=0)([a, b, c])
    <BLANKLINE>
    [[1. 2. 3. 4.]
     [4. 5. 6. 0.]
     [8. 2. 0. 0.]]
    <NDArray 3x4 @cpu_shared(0)>
    >>> # Also output the lengths
    >>> a = [1, 2, 3, 4]
    >>> b = [4, 5, 6]
    >>> c = [8, 2]
    >>> batch, length = bf.Pad(pad_val=0, ret_length=True)([a, b, c])
    >>> batch
    <BLANKLINE>
    [[1. 2. 3. 4.]
     [4. 5. 6. 0.]
     [8. 2. 0. 0.]]
    <NDArray 3x4 @cpu_shared(0)>
    >>> length
    <BLANKLINE>
    [4 3 2]
    <NDArray 3 @cpu_shared(0)>
    >>> # Inputs are multiple ndarrays
    >>> a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
    >>> b = np.array([[5, 8], [1, 2]])
    >>> bf.Pad(axis=1, pad_val=-1)([a, b])
    <BLANKLINE>
    [[[ 1  2  3  4]
      [ 5  6  7  8]]
    <BLANKLINE>
     [[ 5  8 -1 -1]
      [ 1  2 -1 -1]]]
    <NDArray 2x2x4 @cpu_shared(0)>
    """

    def __init__(self, axis=0, pad_val=None, ret_length=False, dtype=None, round_to=None):
        assert isinstance(axis, int), \
            'axis must be an integer! Received axis=%s, type=%s.' % (str(axis), str(type(axis)))
        pad_val_missing = pad_val is None
        self._axis = axis
        self._pad_val = 0 if pad_val_missing else pad_val
        self._ret_length = ret_length
        self._dtype = dtype
        self._round_to = round_to
        # Ensures the "NDArray input is slow" warning is emitted at most once
        # per Pad instance.
        self._warned = False
        if pad_val_missing:
            warnings.warn(
                'Padding value is not given and will be set automatically to 0 '
                'in data.batchify.Pad(). '
                'Please check whether this is intended '
                '(e.g. value of padding index in the vocabulary).')

    def __call__(self, data):
        """Pad and stack the given samples.

        The input can be list of numpy.ndarray, list of numbers or list of
        mxnet.nd.NDArray. Inputting mxnet.nd.NDArray is discouraged as each
        array needs to be converted to numpy for padding.

        The arrays are padded to the largest dimension at `axis` and then
        stacked to form the final output. In addition, the original dimensions
        at `axis` are returned if `ret_length` is turned on.

        Parameters
        ----------
        data : List[np.ndarray] or List[List[dtype]] or List[mx.nd.NDArray]
            List of samples to pad and stack.

        Returns
        -------
        batch_data: NDArray
            Data in the minibatch. Shape is (N, ...)
        valid_length: NDArray, optional
            The sequences' original lengths at the padded axis. Shape is (N,). This will only be
            returned if `ret_length` is True.

        Raises
        ------
        NotImplementedError
            If the first sample is not an NDArray, a numpy.ndarray or a list.
        """
        first = data[0]
        if isinstance(first, mx.nd.NDArray) and not self._warned:
            self._warned = True
            warnings.warn(
                'Using Pad with NDArrays is discouraged for speed reasons. '
                'Instead you should pad your data while it is still a list '
                'and before converting to an NDArray. '
                'Alternatively you can consider inputting a numpy.ndarray.')
        if not isinstance(first, (mx.nd.NDArray, np.ndarray, list)):
            raise NotImplementedError
        # The fourth argument asks the helper for the shared-memory context.
        batch, lengths = _pad_arrs_to_max_length(data, self._axis, self._pad_val, True,
                                                 self._dtype, round_to=self._round_to)
        return (batch, lengths) if self._ret_length else batch
class Tuple:
    """Wrap multiple batchify functions together. The input functions will be applied
    to the corresponding input fields.

    Each data sample should be a list or tuple containing multiple attributes. The `i`th batchify
    function stored in `Tuple` will be applied on the `i`th attribute. For example, each
    data sample is (nd_data, label). You can wrap two batchify functions using
    `Tuple(DataBatchify, LabelBatchify)` to batchify nd_data and label correspondingly.

    Parameters
    ----------
    fn : list or tuple or callable
        The batchify functions to wrap. A list or tuple is copied at construction
        time, so mutating it afterwards does not affect this object.
    *args : tuple of callable
        The additional batchify functions to wrap. Must be empty when `fn` is a
        list or tuple.

    Examples
    --------
    >>> import gluonnlp.data.batchify as bf
    >>> a = ([1, 2, 3, 4], 0)
    >>> b = ([5, 7], 1)
    >>> c = ([1, 2, 3, 4, 5, 6, 7], 0)
    >>> f1, f2 = bf.Tuple(bf.Pad(pad_val=0), bf.Stack())([a, b])
    >>> f1
    <BLANKLINE>
    [[1. 2. 3. 4.]
     [5. 7. 0. 0.]]
    <NDArray 2x4 @cpu_shared(0)>
    >>> f2
    <BLANKLINE>
    [0 1]
    <NDArray 2 @cpu_shared(0)>
    """

    def __init__(self, fn, *args):
        if isinstance(fn, (list, tuple)):
            assert len(args) == 0, 'Input pattern not understood. The input of Tuple can be ' \
                                   'Tuple(A, B, C) or Tuple([A, B, C]) or Tuple((A, B, C)). ' \
                                   'Received fn=%s, args=%s' % (str(fn), str(args))
            # Snapshot the sequence: keeping a reference to a caller-owned list
            # would let later mutations silently change (or break) this object.
            self._fn = tuple(fn)
        else:
            self._fn = (fn, ) + args
        for i, ele_fn in enumerate(self._fn):
            assert callable(ele_fn), 'Batchify functions must be callable! ' \
                                     'type(fn[%d]) = %s' % (i, str(type(ele_fn)))

    def __call__(self, data):
        """Batchify the input data.

        Parameters
        ----------
        data : list
            The samples to batchify. Each sample should contain N attributes, where
            N is the number of wrapped batchify functions. Only the first sample's
            length is checked.

        Returns
        -------
        ret : tuple
            A tuple of length N. Contains the batchified result of each attribute in the input.
        """
        assert len(data[0]) == len(self._fn),\
            'The number of attributes in each data sample should contains' \
            ' {} elements'.format(len(self._fn))
        # Apply the i-th function to the i-th attribute gathered across all samples.
        return tuple(ele_fn([ele[i] for ele in data])
                     for i, ele_fn in enumerate(self._fn))
class List:
    """Batchify function that returns the samples as a plain list.

    This is particularly useful when the Dataset contains textual data
    and in conjunction with the `Tuple` batchify function.

    Examples
    --------
    >>> import gluonnlp.data.batchify as bf
    >>> a = ([1, 2, 3, 4], "I am using MXNet")
    >>> b = ([5, 7, 2, 5], "Gluon rocks!")
    >>> c = ([1, 2, 3, 4], "Batchification!")
    >>> _, l = bf.Tuple(bf.Stack(), bf.List())([a, b, c])
    >>> l
    ['I am using MXNet', 'Gluon rocks!', 'Batchification!']
    """

    def __call__(self, data: t_List) -> t_List:
        """Return the samples collected into a new list.

        Parameters
        ----------
        data
            The list of samples

        Returns
        -------
        ret
            A new list holding the same samples in the same order.
        """
        return [*data]
class Dict:
    """Wrap multiple batchify functions together and apply it to merge inputs from a dict.

    The generated batch samples are stored as a dict with the same keywords.

    Each data sample should be a dict and the fn corresponding to `key` will be applied on the
    input with the keyword `key`.
    For example, each data sample is {'data': nd_data, 'label': nd_label}.
    You can merge the data and labels using
    `Dict({'data': DataBatchify, 'label': LabelBatchify})` to batchify the nd_data and nd_label.

    Parameters
    ----------
    fn_dict
        A dictionary that contains the key-->batchify function mapping. The mapping
        is copied at construction time, so mutating it afterwards does not affect
        this object.

    Raises
    ------
    ValueError
        If `fn_dict` is not a dict or one of its values is not callable.

    Examples
    --------
    >>> from gluonnlp.data.batchify import Dict, Pad, Stack
    >>> a = {'data': [1, 2, 3, 4], 'label': 0}
    >>> b = {'data': [5, 7], 'label': 1}
    >>> c = {'data': [1, 2, 3, 4, 5, 6, 7], 'label': 0}
    >>> batchify_fn = Dict({'data': Pad(pad_val=0), 'label': Stack()})
    >>> sample = batchify_fn([a, b, c])
    >>> sample['data']
    <BLANKLINE>
    [[1. 2. 3. 4. 0. 0. 0.]
     [5. 7. 0. 0. 0. 0. 0.]
     [1. 2. 3. 4. 5. 6. 7.]]
    <NDArray 3x7 @cpu_shared(0)>
    >>> sample['label']
    <BLANKLINE>
    [0 1 0]
    <NDArray 3 @cpu_shared(0)>
    """

    def __init__(self, fn_dict: t_Dict[AnyStr, t_Callable]):
        if not isinstance(fn_dict, dict):
            raise ValueError('Input must be a dictionary! type of input = {}'
                             .format(type(fn_dict)))
        for fn in fn_dict.values():
            if not callable(fn):
                raise ValueError('Elements of the dictionary must be callable!')
        # Store a shallow copy, and only after validation succeeded, so the
        # validated mapping cannot be changed from outside afterwards.
        self._fn_dict = dict(fn_dict)

    def __call__(self, data: t_List[t_Dict]) -> t_Dict:
        """Batchify the input data.

        Parameters
        ----------
        data
            The samples to batchify. Each sample should be a dictionary that
            contains every key of the wrapped mapping.

        Returns
        -------
        ret
            The resulting dictionary that stores the merged samples.
        """
        # For every key, gather that field across all samples and batchify it.
        return {k: ele_fn([ele[k] for ele in data])
                for k, ele_fn in self._fn_dict.items()}
class NamedTuple:
    """Wrap multiple batchify functions together and apply it to merge inputs from a namedtuple.

    The generated batch samples are stored as a namedtuple with the same structure.

    Each data sample should be a namedtuple. The `i`th batchify
    function stored in `NamedTuple` will be applied on the `i`th attribute of the namedtuple data.
    For example, each data sample is Sample(data=nd_data, label=nd_label).
    You can wrap two batchify functions using
    `NamedTuple(Sample, {'data': DataBatchify, 'label': LabelBatchify})` to
    batchify nd_data and nd_label correspondingly. The result will be stored as a Sample object
    and you can access the data and label via `sample.data` and `sample.label`, correspondingly.

    Parameters
    ----------
    container : NamedTuple class
        The object that constructs the NamedTuple.
    fn_info
        The information of the inner batchify functions: either a dict mapping
        every field name of `container` to a callable, or an iterable of callables
        given in field order. The functions are copied into a new list at
        construction time.

    Raises
    ------
    ValueError
        If the number of functions does not match the fields of `container`, a
        field has no function assigned, or a function is not callable.

    Examples
    --------
    >>> from gluonnlp.data.batchify import NamedTuple, Pad, Stack
    >>> from collections import namedtuple
    >>> SampleData = namedtuple('SampleData', ['data', 'label'])
    >>> a = SampleData([1, 2, 3, 4], 0)
    >>> b = SampleData([5, 7], 1)
    >>> c = SampleData([1, 2, 3, 4, 5, 6, 7], 0)
    >>> batchify_fn = NamedTuple(SampleData, {'data': Pad(pad_val=0), 'label': Stack()})
    >>> sample = batchify_fn([a, b, c])
    >>> sample
    SampleData(data=
    [[1. 2. 3. 4. 0. 0. 0.]
     [5. 7. 0. 0. 0. 0. 0.]
     [1. 2. 3. 4. 5. 6. 7.]]
    <NDArray 3x7 @cpu_shared(0)>, label=
    [0 1 0]
    <NDArray 3 @cpu_shared(0)>)
    >>> sample.data
    <BLANKLINE>
    [[1. 2. 3. 4. 0. 0. 0.]
     [5. 7. 0. 0. 0. 0. 0.]
     [1. 2. 3. 4. 5. 6. 7.]]
    <NDArray 3x7 @cpu_shared(0)>
    >>> # Let's consider to use a list
    >>> batchify_fn = NamedTuple(SampleData, [Pad(pad_val=0), Stack()])
    >>> batchify_fn([a, b, c])
    SampleData(data=
    [[1. 2. 3. 4. 0. 0. 0.]
     [5. 7. 0. 0. 0. 0. 0.]
     [1. 2. 3. 4. 5. 6. 7.]]
    <NDArray 3x7 @cpu_shared(0)>, label=
    [0 1 0]
    <NDArray 3 @cpu_shared(0)>)
    """

    def __init__(self, container, fn_info: t_Union[t_List[t_Callable], t_Tuple[t_Callable],
                                                   t_Dict[AnyStr, t_Callable]]):
        self._container = container
        fields = container._fields
        if isinstance(fn_info, dict):
            for name in fields:
                if name not in fn_info:
                    raise ValueError('Attribute {} has not been assigned a callable. '
                                     'Required fields={}, Found fields={}'
                                     .format(name, fields, fn_info.keys()))
            if len(fields) != len(fn_info):
                raise ValueError('Attributes mimatch! Required fields={}, Found fields={}'
                                 .format(fields, fn_info.keys()))
            # Order the functions by the field order of the container.
            fn_l = [fn_info[name] for name in fields]
        else:
            # Materialise once: this copies caller-owned lists and keeps one-shot
            # iterables (e.g. generators) from being exhausted by the checks
            # below before they are stored.
            fn_l = list(fn_info)
            if len(fields) != len(fn_l):
                raise ValueError('Attributes mismatch! Required fields={}, fn_info={}'
                                 .format(fields, fn_info))
        for fn in fn_l:
            if not callable(fn):
                raise ValueError('All batchify functions must be callable.')
        self._fn_l = fn_l

    def __call__(self, data):
        """Batchify the input data.

        Parameters
        ----------
        data : List of NamedTuple
            The samples to batchify. Each sample should be a NamedTuple. Only the
            first sample's type is checked.

        Returns
        -------
        ret : NamedTuple
            A single instance of the stored container whose i-th field holds the
            batchified result of the i-th attribute of the input samples.

        Raises
        ------
        ValueError
            If the first sample is not an instance of the stored container.
        """
        if not isinstance(data[0], self._container):
            raise ValueError('The samples should have the same type as the stored namedtuple.'
                             ' data[0]={}, container={}'.format(data[0], self._container))
        # Apply the i-th function to the i-th attribute gathered across all samples.
        ret = [ele_fn([ele[i] for ele in data]) for i, ele_fn in enumerate(self._fn_l)]
        return self._container(*ret)