Source code for gluonnlp.data.batchify.embedding

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Batchify helpers for embedding training."""

__all__ = ['EmbeddingCenterContextBatchify']

import itertools
import logging
import random

import numpy as np

from ...base import numba_njit, numba_prange
from ..stream import DataStream


class EmbeddingCenterContextBatchify:
    """Helper to create batches of center and context words.

    Batches are created lazily on an optionally shuffled version of the
    Dataset. To create batches from some corpus, first create an
    EmbeddingCenterContextBatchify object and then call it with the corpus.
    Please see the documentation of __call__ for more details.

    Parameters
    ----------
    batch_size : int
        Maximum size of batches returned. The actual batch returned can be
        smaller when running out of samples.
    window_size : int, default 5
        The maximum number of context elements to consider left and right of
        each center element. Fewer elements may be considered if there are
        not sufficient elements left / right of the center element or if a
        reduced window size was drawn.
    reduce_window_size_randomly : bool, default True
        If True, randomly draw a reduced window size for every center element
        uniformly from [1, window].
    shuffle : bool, default True
        If True, shuffle the sentences before lazily generating batches.
    cbow : bool, default False
        Enable CBOW mode. In CBOW mode the returned context contains multiple
        entries per row, one for each context word. If CBOW is False
        (default), there is a separate row for each context word. The
        context_data array always contains weights for the context words
        equal to 1 over the number of context words in the given row of the
        context array.
    weight_dtype : numpy.dtype, default numpy.float32
        Data type for the data array of the sparse COO context
        representation.
    index_dtype : numpy.dtype, default numpy.int64
        Data type for the row index array of the sparse COO context
        representation.

    """

    def __init__(self, batch_size, window_size=5,
                 reduce_window_size_randomly=True, shuffle=True, cbow=False,
                 weight_dtype='float32', index_dtype='int64'):
        self._batch_size = batch_size
        self._window_size = window_size
        self._reduce_window_size_randomly = reduce_window_size_randomly
        self._shuffle = shuffle
        self._cbow = cbow
        self._weight_dtype = weight_dtype
        self._index_dtype = index_dtype
    def __call__(self, corpus):
        """Batchify a dataset.

        Parameters
        ----------
        corpus : list of sentences
            List of sentences. Any list containing, for example, integers or
            strings can be a sentence. Context samples do not cross sentence
            boundaries.

        Returns
        -------
        DataStream
            Each element of the DataStream is a tuple of 2 elements (center,
            context). center is a numpy.ndarray of shape (batch_size, ).
            context is a tuple of 3 numpy.ndarray, representing a sparse COO
            array (data, row, col). The center and context arrays contain the
            center and corresponding context words respectively. A sparse
            representation is used for context as the number of context words
            for one center word varies based on the randomly chosen context
            window size and sentence boundaries. The returned center and col
            arrays are of the same dtype as the sentence elements.

        """
        return _EmbeddingCenterContextBatchify(
            corpus, self._batch_size, self._window_size,
            self._reduce_window_size_randomly, self._shuffle, cbow=self._cbow,
            weight_dtype=self._weight_dtype, index_dtype=self._index_dtype)
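
# Illustrative helper (hypothetical, not part of the gluonnlp API): one way
# to decode the sparse COO context returned per batch. Assumes `center` and
# `context_coo` come from a single element of the DataStream produced by
# EmbeddingCenterContextBatchify.__call__ above.
def _example_group_context_by_center(center, context_coo):
    """Group (context_word, weight) pairs by the batch row of their center word."""
    context_data, context_row, context_col = context_coo
    grouped = {}
    for row, col, weight in zip(context_row, context_col, context_data):
        # Row i of the COO arrays refers to center[i].
        grouped.setdefault(int(row), []).append((col, float(weight)))
    return grouped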
class _EmbeddingCenterContextBatchify(DataStream):
    def __init__(self, sentences, batch_size, window_size,
                 reduce_window_size_randomly, shuffle, cbow, weight_dtype,
                 index_dtype):
        self._sentences = sentences
        self._batch_size = batch_size
        self._window_size = window_size
        self._reduce_window_size_randomly = reduce_window_size_randomly
        self._shuffle = shuffle
        self._cbow = cbow
        self._weight_dtype = weight_dtype
        self._index_dtype = index_dtype

    def __iter__(self):
        if numba_prange is range:
            logging.warning(
                'EmbeddingCenterContextBatchify supports just in time compilation '
                'with numba, but numba is not installed. '
                'Consider "pip install numba" for significant speed-ups.')

        # Convert sentences to numpy arrays of a common dtype (object dtype
        # for string tokens).
        firstelement = next(itertools.chain.from_iterable(self._sentences))
        if isinstance(firstelement, str):
            sentences = [np.asarray(s, dtype='O') for s in self._sentences]
        else:
            dtype = type(firstelement)
            sentences = [np.asarray(s, dtype=dtype) for s in self._sentences]
        if self._shuffle:
            random.shuffle(sentences)

        # Flatten the corpus into one array and remember sentence boundaries.
        sentence_boundaries = np.cumsum([len(c) for c in sentences])
        sentences = np.concatenate(sentences)

        it = iter(
            _context_generator(
                sentence_boundaries, self._window_size, self._batch_size,
                random_window_size=self._reduce_window_size_randomly,
                cbow=self._cbow, seed=random.getrandbits(32)))

        def _closure():
            while True:
                try:
                    (center, context_data, context_row,
                     context_col) = next(it)
                    context_data = np.asarray(context_data,
                                              dtype=self._weight_dtype)
                    context_row = np.asarray(context_row,
                                             dtype=self._index_dtype)
                    context_col = sentences[context_col]
                    context_coo = (context_data, context_row, context_col)
                    yield sentences[center], context_coo
                except StopIteration:
                    return

        return _closure()


@numba_njit
def _get_sentence_start_end(sentence_boundaries, sentence_pointer):
    end = sentence_boundaries[sentence_pointer]
    if sentence_pointer == 0:
        start = 0
    else:
        start = sentence_boundaries[sentence_pointer - 1]
    return start, end


@numba_njit
def _context_generator(sentence_boundaries, window, batch_size,
                       random_window_size, cbow, seed):
    num_rows = batch_size
    word_pointer = 0
    num_context_skip = 0
    while True:
        center_batch = []
        # Prepare arrays for COO sparse matrix format
        context_data = []
        context_row = []
        context_col = []

        i = 0
        while i < num_rows:
            if word_pointer >= sentence_boundaries[-1]:
                # There is no data left
                break

            contexts = _get_context(word_pointer, sentence_boundaries,
                                    window, random_window_size, seed)
            if contexts is None:
                word_pointer += 1
                continue

            center = word_pointer
            for j, context in enumerate(contexts):
                if num_context_skip > j:
                    # In SkipGram mode, there may be some leftover contexts
                    # from the last batch
                    continue
                if i >= num_rows:
                    num_context_skip = j
                    assert not cbow
                    break

                num_context_skip = 0
                context_row.append(i)
                context_col.append(context)
                if cbow:
                    context_data.append(1.0 / len(contexts))
                else:
                    center_batch.append(center)
                    context_data.append(1)
                    i += 1

            if cbow:
                center_batch.append(center)
                i += 1

            if num_context_skip == 0:
                word_pointer += 1
            else:
                assert i == num_rows
                break

        if len(center_batch) == num_rows:
            center_batch_np = np.array(center_batch, dtype=np.int64)
            context_data_np = np.array(context_data, dtype=np.float32)
            context_row_np = np.array(context_row, dtype=np.int64)
            context_col_np = np.array(context_col, dtype=np.int64)
            yield (center_batch_np, context_data_np, context_row_np,
                   context_col_np)
        else:
            assert word_pointer >= sentence_boundaries[-1]
            break


@numba_njit
def _get_context(center_idx, sentence_boundaries, window_size,
                 random_window_size, seed):
    """Compute the context with respect to a center word in a sentence.

    Takes a numpy array of sentence boundaries.

    """
    random.seed(seed + center_idx)

    sentence_index = np.searchsorted(sentence_boundaries, center_idx)
    sentence_start, sentence_end = _get_sentence_start_end(
        sentence_boundaries, sentence_index)

    if random_window_size:
        window_size = random.randint(1, window_size)
    start_idx = max(sentence_start, center_idx - window_size)
    end_idx = min(sentence_end, center_idx + window_size + 1)

    if start_idx != center_idx and center_idx + 1 != end_idx:
        context = np.concatenate((np.arange(start_idx, center_idx),
                                  np.arange(center_idx + 1, end_idx)))
    elif start_idx != center_idx:
        context = np.arange(start_idx, center_idx)
    elif center_idx + 1 != end_idx:
        context = np.arange(center_idx + 1, end_idx)
    else:
        context = None

    return context
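

# Usage sketch (hypothetical example, assuming a toy corpus of integer-coded
# sentences): construct the batchify helper, call it on the corpus and
# inspect the first (center, context) batch. The exact values depend on the
# randomly drawn window sizes and on shuffling.
if __name__ == '__main__':
    toy_corpus = [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9]]
    batchify = EmbeddingCenterContextBatchify(batch_size=4, window_size=2)
    batches = batchify(toy_corpus)
    center, (context_data, context_row, context_col) = next(iter(batches))
    # Rows of the COO triple with context_row == i list the context words of
    # center[i]: context_col holds the words, context_data their weights.
    print('center:', center)
    print('context_data:', context_data)
    print('context_row:', context_row)
    print('context_col:', context_col)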