2020-03-23 22:01:54 +08:00
|
|
|
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
|
|
|
|
|
2023-05-15 11:14:31 +08:00
|
|
|
|
2024-06-17 21:10:57 +08:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import paddle
|
2023-11-09 16:23:09 +08:00
|
|
|
from paddle import _C_ops, pir
|
2023-09-07 17:26:19 +08:00
|
|
|
from paddle.base import framework
|
2023-11-09 16:23:09 +08:00
|
|
|
from paddle.base.framework import in_dynamic_or_pir_mode
|
2023-05-15 11:14:31 +08:00
|
|
|
|
2020-09-16 13:02:31 +08:00
|
|
|
__all__ = ['L1Decay', 'L2Decay']
|
|
|
|
|
|
2023-05-15 11:14:31 +08:00
|
|
|
|
|
|
|
|
class WeightDecayRegularizer:
    """Common interface for weight decay regularizers.

    Weight-decay regularizers are applied only during the backward pass,
    which makes regularization faster: they add operations to the network
    that correspond to the gradient of the regularization function.

    This class is not meant to be used directly; use one of its
    implementations instead.
    """

    def __init__(self) -> None:
        """Create the regularizer; the base class stores no state."""

    def __call__(
        self, param: paddle.Tensor, grad: paddle.Tensor, block: pir.Block
    ):
        """Add the corresponding weight decay operations to the network.

        Implementations are expected to override this method.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def __str__(self):
        """Return a debug string describing the regularizer.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()
|
2020-09-16 13:02:31 +08:00
|
|
|
|
|
|
|
|
|
2023-05-15 11:14:31 +08:00
|
|
|
class L1Decay(WeightDecayRegularizer):
    r"""
    L1 Weight Decay Regularization, which encourages the weights to be sparse.

    The regularizer can be set in :ref:`api_paddle_ParamAttr` or in an ``optimizer``
    (such as :ref:`api_paddle_optimizer_Momentum` ). When set in ``ParamAttr`` , it only
    takes effect for the trainable parameters of that layer. When set in ``optimizer`` ,
    it takes effect for all trainable parameters. When both are set, ``ParamAttr`` has
    higher priority than ``optimizer`` : for a trainable parameter whose ParamAttr defines
    a regularizer, the regularizer in Optimizer is ignored; otherwise the regularizer in
    Optimizer is used.

    In the implementation, the loss function of L1 Weight Decay Regularization is as follows:

    .. math::

        loss = coeff * reduce\_sum(abs(x))

    Args:
        coeff(float, optional): regularization coeff. Default:0.0.

    Examples:
        .. code-block:: pycon
            :name: code-example1

            >>> # Example1: set Regularizer in optimizer
            >>> import paddle
            >>> from paddle.regularizer import L1Decay

            >>> linear = paddle.nn.Linear(10, 10)
            >>> inp = paddle.rand(shape=[10, 10], dtype="float32")
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
            >>> beta2 = paddle.to_tensor([0.99], dtype="float32")
            >>> momentum = paddle.optimizer.Momentum(
            ...     learning_rate=0.1,
            ...     parameters=linear.parameters(),
            ...     weight_decay=L1Decay(0.0001),
            ... )
            >>> back = out.backward()
            >>> momentum.step()
            >>> momentum.clear_grad()

        .. code-block:: pycon
            :name: code-example2

            >>> # Example2: set Regularizer in parameters
            >>> # Set L1 regularization in parameters.
            >>> # Global regularizer does not take effect on my_conv2d for this case.
            >>> from paddle.nn import Conv2D
            >>> from paddle import ParamAttr
            >>> from paddle.regularizer import L1Decay

            >>> my_conv2d = Conv2D(
            ...     in_channels=10,
            ...     out_channels=10,
            ...     kernel_size=1,
            ...     stride=1,
            ...     padding=0,
            ...     weight_attr=ParamAttr(regularizer=L1Decay(coeff=0.01)),
            ...     bias_attr=False,
            ... )
    """

    def __init__(self, coeff: float = 0.0) -> None:
        """Store the regularization coefficient.

        Args:
            coeff: regularization coefficient; must not be ``None``.
        """
        assert coeff is not None
        super().__init__()
        self._coeff = coeff

    def __call__(
        self,
        param: paddle.Tensor,
        grad: paddle.Tensor,
        block: pir.Block,
    ):
        """Add L1 weight decay ops to the network.

        The decay term is computed as::

            L1WeightDecay = reg_coeff * sign(parameter)

        Args:
            param: parameter variable for which regularization is applied
            grad: gradient of ``param``; accepted for interface
                compatibility and not read by this method
            block: block in which the new variables and ops are created

        Returns:
            new variable for weight decay
        """
        assert isinstance(
            param, (framework.Variable, pir.Value, pir.core.ParameterMeta)
        )
        assert isinstance(block, (framework.Block, pir.Block))

        # Dynamic / PIR mode: compute the result directly, no block edits.
        if in_dynamic_or_pir_mode():
            return _C_ops.scale(_C_ops.sign(param), self._coeff, 0.0, True)

        def _new_var_like_param():
            # Fresh variable in `block` mirroring param's dtype/shape/lod.
            return block.create_var(
                dtype=param.dtype, shape=param.shape, lod_level=param.lod_level
            )

        sign_var = _new_var_like_param()
        decay_var = _new_var_like_param()

        # First op: sign(param) -> sign_var
        block.append_op(
            type='sign', inputs={"X": param}, outputs={"Out": sign_var}
        )

        # Second op: scale the sign output by the coefficient -> decay_var
        block.append_op(
            type='scale',
            inputs={"X": sign_var},
            outputs={"Out": decay_var},
            attrs={"scale": self._coeff},
        )
        return decay_var

    def __str__(self) -> str:
        """Return a debug string with the regularizer name and coefficient."""
        coeff = self._coeff
        return f"L1Decay, coeff={coeff:f}"
|
2023-05-15 11:14:31 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class L2Decay(WeightDecayRegularizer):
    r"""
    L2 Weight Decay Regularization, which helps to prevent the model over-fitting.

    The regularizer can be set in :ref:`api_paddle_ParamAttr` or in an ``optimizer``
    (such as :ref:`api_paddle_optimizer_Momentum` ). When set in ``ParamAttr`` , it only
    takes effect for the trainable parameters of that layer. When set in ``optimizer`` ,
    it takes effect for all trainable parameters. When both are set, ``ParamAttr`` has
    higher priority than ``optimizer`` : for a trainable parameter whose ParamAttr defines
    a regularizer, the regularizer in Optimizer is ignored; otherwise the regularizer in
    Optimizer is used.

    In the implementation, the loss function of L2 Weight Decay Regularization is as follows:

    .. math::

        loss = 0.5 * coeff * reduce\_sum(square(x))

    Args:
        coeff(float, optional): regularization coeff. Default:0.0

    Examples:
        .. code-block:: pycon
            :name: code-example1

            >>> # Example1: set Regularizer in optimizer
            >>> import paddle
            >>> from paddle.regularizer import L2Decay
            >>> linear = paddle.nn.Linear(10, 10)
            >>> inp = paddle.rand(shape=[10, 10], dtype="float32")
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
            >>> beta2 = paddle.to_tensor([0.99], dtype="float32")
            >>> momentum = paddle.optimizer.Momentum(
            ...     learning_rate=0.1,
            ...     parameters=linear.parameters(),
            ...     weight_decay=L2Decay(0.0001),
            ... )
            >>> back = out.backward()
            >>> momentum.step()
            >>> momentum.clear_grad()

        .. code-block:: pycon
            :name: code-example2

            >>> # Example2: set Regularizer in parameters
            >>> # Set L2 regularization in parameters.
            >>> # Global regularizer does not take effect on my_conv2d for this case.
            >>> from paddle.nn import Conv2D
            >>> from paddle import ParamAttr
            >>> from paddle.regularizer import L2Decay

            >>> my_conv2d = Conv2D(
            ...     in_channels=10,
            ...     out_channels=10,
            ...     kernel_size=1,
            ...     stride=1,
            ...     padding=0,
            ...     weight_attr=ParamAttr(regularizer=L2Decay(coeff=0.01)),
            ...     bias_attr=False,
            ... )
    """

    def __init__(self, coeff: float = 0.0) -> None:
        """Store the regularization coefficient.

        Args:
            coeff: regularization coefficient; must not be ``None``.
        """
        assert coeff is not None
        super().__init__()
        self._coeff = coeff

    def __call__(
        self,
        param: paddle.Tensor,
        grad: paddle.Tensor,
        block: pir.Block,
    ):
        """Add L2 weight decay ops to the network.

        The decay term is computed as::

            L2WeightDecay = reg_coeff * parameter

        Args:
            param: parameter variable for which regularization is applied
            grad: gradient of ``param``; accepted for interface
                compatibility and not read by this method
            block: block in which the new variable and op are created

        Returns:
            new variable for weight decay
        """
        assert isinstance(
            param, (framework.Variable, pir.Value, pir.core.ParameterMeta)
        )
        assert isinstance(block, (framework.Block, pir.Block))

        # Dynamic / PIR mode: compute the result directly, no block edits.
        if in_dynamic_or_pir_mode():
            return _C_ops.scale(param, self._coeff, 0.0, True)

        # Static graph: output variable mirrors param's dtype/shape/lod.
        decay_var = block.create_var(
            dtype=param.dtype, shape=param.shape, lod_level=param.lod_level
        )

        # Single scale op: param scaled by the coefficient -> decay_var
        block.append_op(
            type='scale',
            inputs={"X": param},
            outputs={"Out": decay_var},
            attrs={"scale": self._coeff},
        )
        return decay_var

    def __str__(self) -> str:
        """Return a debug string with the regularizer name and coefficient."""
        coeff = self._coeff
        return f"L2Decay, coeff={coeff:f}"
|