Source code for gluonnlp.utils.parameter
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Utility functions for trainer and parameters."""
__all__ = ['grad_global_norm', 'clip_grad_global_norm', 'save_parameters',
'save_states', 'load_parameters', 'load_states']
import warnings
from collections import defaultdict
import mxnet as mx
from mxnet import nd
from .. import _constants as C
from .files import _TempFilePath, _transfer_file_s3
def grad_global_norm(parameters, max_norm=None):
"""Calculate the 2-norm of gradients of parameters, and how much they should be scaled down
such that their 2-norm does not exceed `max_norm`, if `max_norm` if provided.
If gradients exist for more than one context for a parameter, user needs to explicitly call
``trainer.allreduce_grads`` so that the gradients are summed first before calculating
the 2-norm.
.. note::
This function is only for use when `update_on_kvstore` is set to False in trainer.

    Example::

        trainer = Trainer(net.collect_params(), update_on_kvstore=False, ...)
        for x, label in zip(mx.gluon.utils.split_and_load(X, [mx.gpu(0), mx.gpu(1)]),
                            mx.gluon.utils.split_and_load(Y, [mx.gpu(0), mx.gpu(1)])):
            with mx.autograd.record():
                y = net(x)
                loss = loss_fn(y, label)
            loss.backward()
        trainer.allreduce_grads()
        norm = grad_global_norm(net.collect_params().values())
        ...
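
    When `max_norm` is given, three values are returned. A minimal sketch under the same
    placeholder setup as above (``clip_grad_global_norm`` below performs this rescaling
    for you)::

        norm, ratio, is_finite = grad_global_norm(
            net.collect_params().values(), max_norm=1.0)
        for p in net.collect_params().values():
            if p.grad_req != 'null':
                for g in p.list_grad():
                    g /= ratio.as_in_context(g.context)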

    Parameters
    ----------
    parameters : list of Parameters
        The parameters whose gradients are included in the norm.
    max_norm : NDArray, optional
        The maximum L2 norm threshold. If provided, `ratio` and `is_finite` will be returned.

    Returns
    -------
    NDArray
        Total norm. Shape is (1,).
    NDArray
        Ratio for rescaling gradients based on `max_norm`, s.t. grad = grad / ratio.
        If the total norm is NaN, the ratio will be NaN too.
        Returned only if `max_norm` is provided. Shape is (1,).
    NDArray
        Whether the total norm is finite. Returned only if `max_norm` is provided.
        Shape is (1,).
"""
# distribute gradients among contexts
idx = 0
arrays = defaultdict(list)
sum_norms = []
for p in parameters:
if p.grad_req != 'null':
p_grads = p.list_grad()
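            # after trainer.allreduce_grads() every context holds the same summed
            # gradient, so one copy per parameter suffices; round-robin over the
            # contexts to spread the squared-norm computation across devices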
arrays[idx % len(p_grads)].append(p_grads[idx % len(p_grads)])
idx += 1
assert len(arrays) > 0, 'No parameter found available for gradient norm.'
ctx, dtype = arrays[0][0].context, 'float32'
    for arr in arrays.values():
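        # multi_sum_sq returns one sum-of-squares value per input array;
        # add_n then reduces them to a single scalar for this context group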
sum_norm = mx.nd.multi_sum_sq(*arr, num_arrays=len(arr))
sum_norm = nd.add_n(*sum_norm)
sum_norms.append(sum_norm.as_in_context(ctx))
# reduce
total_norm = nd.add_n(*sum_norms).sqrt()
if max_norm is None:
return total_norm
scale = total_norm / max_norm
# is_finite = 0 if NaN or Inf, 1 otherwise.
is_finite = nd.contrib.isfinite(scale)
    # if scale is finite, nd.maximum selects the max between scale and 1. That is,
    # 1 is returned if total_norm does not exceed max_norm.
    # if scale is NaN or Inf, the result of nd.maximum is undefined. Therefore, we use
    # choices.take(is_finite) to deterministically propagate NaN or Inf.
scale_or_one = nd.maximum(nd.ones((1,), dtype=dtype, ctx=ctx), scale)
choices = nd.concat(scale, scale_or_one, dim=0)
chosen_scale = choices.take(is_finite)
return total_norm, chosen_scale, is_finite
def clip_grad_global_norm(parameters, max_norm, check_isfinite=True):
"""Rescales gradients of parameters so that the sum of their 2-norm is smaller than `max_norm`.
If gradients exist for more than one context for a parameter, user needs to explicitly call
``trainer.allreduce_grads`` so that the gradients are summed first before calculating
the 2-norm.
.. note::
This function is only for use when `update_on_kvstore` is set to False in trainer.
In cases where training happens on multiple contexts, this method should be used in
conjunction with ``trainer.allreduce_grads()`` and ``trainer.update()``.
(**not** ``trainer.step()``)

    Example::

        trainer = Trainer(net.collect_params(), update_on_kvstore=False, ...)
        for x, label in zip(mx.gluon.utils.split_and_load(X, [mx.gpu(0), mx.gpu(1)]),
                            mx.gluon.utils.split_and_load(Y, [mx.gpu(0), mx.gpu(1)])):
            with mx.autograd.record():
                y = net(x)
                loss = loss_fn(y, label)
            loss.backward()
        trainer.allreduce_grads()
        nlp.utils.clip_grad_global_norm(net.collect_params().values(), max_norm)
        trainer.update(batch_size)
        ...

    Parameters
    ----------
    parameters : list of Parameters
        The parameters whose gradients are rescaled in place.
    max_norm : float
        The maximum global 2-norm allowed for the gradients.
    check_isfinite : bool, default True
        If True, check whether the total norm is finite (not NaN or Inf). This
        requires a blocking ``.asscalar()`` call.

    Returns
    -------
    NDArray or float
        Total norm. The return type is an NDArray of shape (1,) if `check_isfinite` is
        False; otherwise a float is returned.
"""
    total_norm, ratio, is_finite = grad_global_norm(parameters, max_norm)
    scale = 1 / ratio
    if check_isfinite:
        # blocking calls: synchronize with the device to inspect the computed values,
        # and return the total norm as a float (as documented above)
        total_norm = total_norm.asscalar()
        if is_finite.asscalar() != 1:
            warnings.warn(
                UserWarning('nan or inf is detected. '
                            'Clipping results will be undefined.'), stacklevel=2)
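    # rescale gradients in place on every context on which they live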
for p in parameters:
if p.grad_req != 'null':
for arr in p.list_grad():
arr *= scale.as_in_context(arr.context)
return total_norm
def _s3_compatible_save_load(is_save, save_load_method, filename, *args, **kwargs):
"""Dispatch function for save load with s3."""
if C.S3_PREFIX in filename:
# create temp dir
with _TempFilePath() as temp_path:
if is_save:
# save model
save_load_method(temp_path, *args, **kwargs)
_transfer_file_s3(temp_path, filename, upload=is_save)
else:
# load model
_transfer_file_s3(temp_path, filename, upload=is_save)
save_load_method(temp_path, *args, **kwargs)
else:
save_load_method(filename, *args, **kwargs)
def load_parameters(model, filename, ctx=None, allow_missing=False,
ignore_extra=False, cast_dtype=None):
"""Load parameters from file previously saved by `save_parameters`.
Both local file system path and S3 URI are supported.
For example, 's3://mybucket/folder/net.params', './folder/net.params'.
Parameters
----------
filename : str
Path to parameter file.
ctx : Context or list of Context, default cpu()
Context(s) to initialize loaded parameters on.
allow_missing : bool, default False
Whether to silently skip loading parameters not represents in the file.
ignore_extra : bool, default False
Whether to silently ignore parameters from the file that are not
present in this Block.
cast_dtype : bool, default False
Cast the data type of the NDArray loaded from the checkpoint to the dtype
provided by the Parameter if any.
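
    Example (a minimal sketch; ``net`` and the path are placeholders)::

        net = mx.gluon.nn.Dense(10, in_units=5)
        net.initialize()
        save_parameters(net, './folder/net.params')
        load_parameters(net, './folder/net.params', ctx=mx.cpu())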
"""
if cast_dtype is not None:
_s3_compatible_save_load(False, model.load_parameters, filename, ctx=ctx,
allow_missing=allow_missing, ignore_extra=ignore_extra,
cast_dtype=cast_dtype)
else:
_s3_compatible_save_load(False, model.load_parameters, filename, ctx=ctx,
allow_missing=allow_missing, ignore_extra=ignore_extra)
def save_parameters(model, filename):
"""Save parameters to file.
Saved parameters can only be loaded with `Block.load_parameters`. Note that this
method only saves parameters, not model structure.
Both local file system path and S3 URI are supported.
For example, 's3://mybucket/folder/net.params', './folder/net.params'.
Parameters
----------
model : mx.gluon.Block
The model to save.
uri : str
Path to file.
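
    Example (a minimal sketch; the S3 URI is a placeholder)::

        net = mx.gluon.nn.Dense(10, in_units=5)
        net.initialize()
        save_parameters(net, 's3://mybucket/folder/net.params')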
"""
_s3_compatible_save_load(True, model.save_parameters, filename)
def load_states(trainer, fname):
"""Loads trainer states (e.g. optimizer, momentum) from a file.
Both local file system path and S3 URI are supported.
For example, 's3://mybucket/folder/net.states', './folder/net.states'.
Parameters
----------
trainer : mxnet.gluon.Trainer
The trainer whose states will be loaded.
fname : str
Path to input states file.
Note
----
`optimizer.param_dict`, which contains Parameter information (such as
`lr_mult` and `wd_mult`) will not be loaded from the file, but rather set
based on current Trainer's parameters.
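
    Example (a minimal sketch; ``net`` is assumed to be an initialized Block)::

        trainer = mx.gluon.Trainer(net.collect_params(), 'sgd')
        save_states(trainer, './folder/net.states')
        load_states(trainer, './folder/net.states')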
"""
_s3_compatible_save_load(False, trainer.load_states, fname)
def save_states(trainer, fname):
"""Saves trainer states (e.g. optimizer, momentum) to a file.
Both local file system path and S3 URI are supported.
For example, 's3://mybucket/folder/net.states', './folder/net.states'.
Parameters
----------
trainer : mxnet.gluon.Trainer
The trainer whose states will be saved.
fname : str
Path to output states file.
Note
----
`optimizer.param_dict`, which contains Parameter information (such as
`lr_mult` and `wd_mult`) will not be saved.
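
    Example (a minimal sketch; ``net`` and the S3 URI are placeholders)::

        trainer = mx.gluon.Trainer(net.collect_params(), 'adam')
        save_states(trainer, 's3://mybucket/folder/net.states')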
"""
_s3_compatible_save_load(True, trainer.save_states, fname)