# Source code for gluonnlp.data.xlnet.squad
"""Utility functions for xlnet squad preprocessing"""
__all__ = ['convert_index', 'lcs_match']
import unicodedata
import numpy as np
def _preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False):
    """Normalize a string: collapse whitespace, unify quote characters, and
    optionally strip accents and lower-case the text.

    Parameters
    ----------
    inputs : str
        Input string.
    lower : bool
        Whether to convert the input string to lower case.
    remove_space : bool
        Whether to collapse consecutive whitespace in the input string.
    keep_accents : bool
        Whether to keep accents in the input string.

    Returns
    -------
    str : the processed input string
    """
    if remove_space:
        outputs = ' '.join(inputs.strip().split())
    else:
        outputs = inputs
    outputs = outputs.replace('``', '"').replace('\'\'', '"')
    if not keep_accents:
        # NFKD splits accented characters into base character plus combining
        # mark; dropping the combining marks strips the accents.
        outputs = unicodedata.normalize('NFKD', outputs)
        outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
    if lower:
        outputs = outputs.lower()
    return outputs
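
# Example (a minimal sketch of the behaviour): whitespace is collapsed,
# LaTeX-style quotes are unified, accents are stripped, and the result is
# lower-cased on request.
# >>> _preprocess_text("Héllo  ``world''", lower=True)
# 'hello "world"'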


def convert_index(index_map, pos, M=None, is_start=True):
    """Convert a token index into the corresponding index in the original
    text. Works best together with lcs_match().

    Parameters
    ----------
    index_map : list of int
        Typically a map from original indices to converted indices; entries
        are None where no direct match was found.
    pos : int
        The original index to be converted.
    M : int
        The maximum index.
    is_start : bool
        True if pos is a start position.

    Returns
    -------
    int : the converted index with respect to index_map
    """
    if index_map[pos] is not None:
        return index_map[pos]
    N = len(index_map)
    rear = pos
    while rear < N - 1 and index_map[rear] is None:
        rear += 1
    front = pos
    while front > 0 and index_map[front] is None:
        front -= 1
    assert index_map[front] is not None or index_map[rear] is not None
    if index_map[front] is None:
        if index_map[rear] >= 1:
            if is_start:
                return 0
            else:
                return index_map[rear] - 1
        return index_map[rear]
    if index_map[rear] is None:
        if M is not None and index_map[front] < M - 1:
            if is_start:
                return index_map[front] + 1
            else:
                return M - 1
        return index_map[front]
    if is_start:
        if index_map[rear] > index_map[front] + 1:
            return index_map[front] + 1
        else:
            return index_map[rear]
    else:
        if index_map[rear] > index_map[front] + 1:
            return index_map[rear] - 1
        else:
            return index_map[front]
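
# Example (a minimal sketch): positions 1 and 2 carry no direct mapping, so
# convert_index falls back to the nearest mapped neighbours, biased by
# is_start.
# >>> index_map = [0, None, None, 5, 6]
# >>> convert_index(index_map, 2, is_start=True)
# 1
# >>> convert_index(index_map, 2, is_start=False)
# 4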


def lcs_match(max_dist, seq1, seq2, max_seq_length=1024, lower=False):
    """Longest common subsequence match.

    Unlike a standard LCS, this version only inspects positions within
    max_dist of the diagonal, which suffices here because the mismatch
    between sentence pieces and the original text is small.

    Parameters
    ----------
    max_dist : int
        The maximum distance between matched positions to be considered.
    seq1 : list
        The first sequence to be matched.
    seq2 : list
        The second sequence to be matched.
    max_seq_length : int
        The minimum size of each dimension of the score matrix f.
    lower : bool
        Whether to match lower-cased tokens.

    Returns
    -------
    numpy.ndarray : token-wise LCS score matrix f of shape
        (max(len(seq1), max_seq_length), max(len(seq2), max_seq_length)).
    dict : the DP path g through the matrix f.
        g[(i, j)] == 2 if token_i in seq1 matches token_j in seq2.
        g[(i, j)] == 1 if token_i in seq1 matches token_{j-1} in seq2.
        g[(i, j)] == 0 if token_{i-1} in seq1 matches token_j in seq2.
    """
    f = np.zeros((max(len(seq1), max_seq_length), max(len(seq2), max_seq_length)),
                 dtype=np.float32)
    g = {}
    for i, token in enumerate(seq1):
        # Banded DP: only positions of seq2 within max_dist of i are scored.
        for j in range(i - max_dist, i + max_dist):
            if j >= len(seq2) or j < 0:
                continue
            if i > 0:
                g[(i, j)] = 0
                f[i, j] = f[i - 1, j]
            if j > 0 and f[i, j - 1] > f[i, j]:
                g[(i, j)] = 1
                f[i, j] = f[i, j - 1]
            f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0
            if (_preprocess_text(token, lower=lower, remove_space=False) == seq2[j]
                    and f_prev + 1 > f[i, j]):
                g[(i, j)] = 2
                f[i, j] = f_prev + 1
    return f, g
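
# Example (a minimal sketch, following the XLNet SQuAD pipeline in which seq1
# and seq2 are character lists): run lcs_match, then walk the DP path g
# backwards to build the index map that convert_index consumes. The names
# seq1_to_seq2 and seq2_to_seq1 are illustrative, not part of this module.
# >>> seq1 = list('Héllo world')            # original text
# >>> seq2 = list('hello world')            # normalized text
# >>> f, g = lcs_match(4, seq1, seq2, lower=True)
# >>> seq1_to_seq2 = [None] * len(seq1)
# >>> seq2_to_seq1 = [None] * len(seq2)
# >>> i, j = len(seq1) - 1, len(seq2) - 1
# >>> while i >= 0 and j >= 0:
# ...     if (i, j) not in g:
# ...         break
# ...     if g[(i, j)] == 2:                # characters matched
# ...         seq1_to_seq2[i], seq2_to_seq1[j] = j, i
# ...         i, j = i - 1, j - 1
# ...     elif g[(i, j)] == 1:              # best score came from (i, j - 1)
# ...         j -= 1
# ...     else:                             # best score came from (i - 1, j)
# ...         i -= 1
# >>> convert_index(seq1_to_seq2, 1)        # 'é' aligns with 'e'
# 1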