# PaddlePaddle / Paddle
# PArallel Distributed Deep LEarning: a machine-learning framework from
# industrial practice.

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: import all neural network related api under this directory,
# including layers, linear, conv, rnn etc.
from .activation import (
celu,
elu,
elu_,
gelu,
glu,
gumbel_softmax,
hardshrink,
hardsigmoid,
hardswish,
hardtanh,
hardtanh_,
leaky_relu,
leaky_relu_,
log_sigmoid,
log_softmax,
maxout,
mish,
prelu,
relu,
relu6,
relu_,
rrelu,
selu,
sigmoid,
silu,
softmax,
softmax_,
softplus,
softshrink,
softsign,
swiglu,
swish,
tanh,
tanh_,
tanhshrink,
thresholded_relu,
thresholded_relu_,
)
from .common import (
alpha_dropout,
bilinear,
class_center_sample,
cosine_similarity,
dropout,
dropout1d,
dropout2d,
dropout3d,
feature_alpha_dropout,
fold,
interpolate,
label_smooth,
linear,
pad,
unfold,
upsample,
zeropad2d,
)
from .conv import (
conv1d,
conv1d_transpose,
conv2d,
conv2d_transpose,
conv3d,
conv3d_transpose,
)
from .distance import pairwise_distance, pdist # noqa: F401
from .extension import (
diag_embed, # noqa: F401
gather_tree,
sequence_mask,
temporal_shift,
)
from .flash_attention import (
flash_attention_v3_varlen,
flash_attn_qkvpacked,
flash_attn_varlen_qkvpacked,
flashmask_attention,
sdp_kernel, # noqa: F401
)
from .input import (
embedding,
embedding_renorm_, # noqa: F401
one_hot,
)
# Loss functions exposed through paddle.nn.functional.
# NOTE: a stray commit-timestamp line embedded in this import list was a
# merge/extraction artifact and has been removed (it was a syntax error).
from .loss import (
    adaptive_log_softmax_with_loss,
    binary_cross_entropy,
    binary_cross_entropy_with_logits,
    cosine_embedding_loss,
    cross_entropy,
    ctc_loss,
    dice_loss,
    gaussian_nll_loss,
    hinge_embedding_loss,
    hsigmoid_loss,
    kl_div,
    l1_loss,
    log_loss,
    margin_cross_entropy,
    margin_ranking_loss,
    mse_loss,
    multi_label_margin_loss,
    multi_label_soft_margin_loss,
    multi_margin_loss,
    nll_loss,
    npair_loss,
    poisson_nll_loss,
    rnnt_loss,
    sigmoid_focal_loss,
    smooth_l1_loss,
    soft_margin_loss,
    softmax_with_cross_entropy,
    square_error_cost,
    triplet_margin_loss,
    triplet_margin_with_distance_loss,
)
from .moe_permute import moe_permute
from .moe_unpermute import moe_unpermute
from .norm import (
batch_norm,
group_norm,
instance_norm,
layer_norm,
local_response_norm,
normalize,
rms_norm,
)
from .pooling import (
adaptive_avg_pool1d,
adaptive_avg_pool2d,
adaptive_avg_pool3d,
adaptive_max_pool1d,
adaptive_max_pool2d,
adaptive_max_pool3d,
avg_pool1d,
avg_pool2d,
avg_pool3d,
fractional_max_pool2d,
fractional_max_pool3d,
lp_pool1d,
lp_pool2d,
max_pool1d,
max_pool2d,
max_pool3d,
max_unpool1d,
max_unpool2d,
max_unpool3d,
)
# Attention APIs: scaled dot-product attention (SDPA) and sparse attention.
# NOTE: commit-message/timestamp lines that had leaked into the file here were
# merge/extraction artifacts and have been removed (they were syntax errors).
from .sdpa import scaled_dot_product_attention
from .sparse_attention import sparse_attention
from .vision import (
affine_grid,
channel_shuffle,
grid_sample,
pixel_shuffle,
pixel_unshuffle,
)
# Backward/PyTorch-compatible aliases for APIs imported above.
# NOTE: commit-message/timestamp lines that had leaked into the file here were
# merge/extraction artifacts and have been removed (they were syntax errors).
logsigmoid = log_sigmoid
conv_transpose1d = conv1d_transpose
conv_transpose2d = conv2d_transpose
conv_transpose3d = conv3d_transpose
__all__ = [
'celu',
'conv1d',
'conv1d_transpose',
'conv2d',
'conv2d_transpose',
'conv3d',
'conv3d_transpose',
'conv_transpose1d',
'conv_transpose2d',
'conv_transpose3d',
'pairwise_distance',
'elu',
'elu_',
'gelu',
'hardshrink',
'hardtanh',
'hardtanh_',
'hardsigmoid',
'hardswish',
'leaky_relu',
'leaky_relu_',
'log_sigmoid',
[API Compatibility] Add paddle.Tensor.clamp_ ,paddle.nn.functional.logsigmoid, paddle.functional.meshgrid, paddle.nn.init.calculate_fan_in_and_fan_out ,paddle.autocast (#76206) * sharding stage3 bugfix * sharding stage3 bugfix * sharding stage3 bugfix * sharding stage3 bugfix * sharding stage3 bugfix * sharding stage3 bugfix * support recompute's forward and backward in pipeline mode * [API Compatibility] Add paddle.Tensor.clip_ * Revert "support recompute's forward and backward in pipeline mode" This reverts commit 7fd48d9060b292136bce9cdf79983530d5c5d52f. * Revert "[API Compatibility] Add paddle.Tensor.clip_" This reverts commit 025efc33f3daad27e6b8eda75d032c91c1a7a020. * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast * [API Compatibility] Add clip_、logsigmoid、_calculate_fan_in_and_fan_out、meshgrid、autocast
2025-11-06 14:13:32 +08:00
'logsigmoid',
'maxout',
'prelu',
'relu',
'relu_',
'relu6',
'selu',
'softmax',
'softmax_',
'softplus',
'softshrink',
'softsign',
'sigmoid',
'silu',
'swiglu',
'swish',
'mish',
'tanh',
'tanh_',
'tanhshrink',
'thresholded_relu',
'thresholded_relu_',
'log_softmax',
'glu',
'gumbel_softmax',
'sequence_mask',
'dropout',
'dropout1d',
'dropout2d',
'dropout3d',
'alpha_dropout',
'feature_alpha_dropout',
'label_smooth',
'linear',
'pad',
'zeropad2d',
'unfold',
'interpolate',
'upsample',
'bilinear',
'cosine_similarity',
'avg_pool1d',
'avg_pool2d',
'avg_pool3d',
'lp_pool1d',
'lp_pool2d',
'max_pool1d',
'max_pool2d',
'max_pool3d',
'max_unpool1d',
'max_unpool2d',
'max_unpool3d',
'moe_permute',
'moe_unpermute',
'adaptive_avg_pool1d',
'adaptive_avg_pool2d',
'adaptive_avg_pool3d',
'adaptive_max_pool1d',
'adaptive_max_pool2d',
'adaptive_max_pool3d',
'fractional_max_pool2d',
'fractional_max_pool3d',
'binary_cross_entropy',
'binary_cross_entropy_with_logits',
'cross_entropy',
'dice_loss',
'hsigmoid_loss',
'kl_div',
'l1_loss',
'log_loss',
'mse_loss',
'margin_ranking_loss',
'multi_label_soft_margin_loss',
'nll_loss',
'poisson_nll_loss',
'npair_loss',
'sigmoid_focal_loss',
'smooth_l1_loss',
'softmax_with_cross_entropy',
'margin_cross_entropy',
'square_error_cost',
'ctc_loss',
'rnnt_loss',
'hinge_embedding_loss',
'affine_grid',
'grid_sample',
'local_response_norm',
'pixel_shuffle',
'pixel_unshuffle',
'channel_shuffle',
'embedding',
'gather_tree',
'one_hot',
'normalize',
'temporal_shift',
'batch_norm',
'layer_norm',
'rms_norm',
'instance_norm',
'class_center_sample',
'sparse_attention',
'fold',
'cosine_embedding_loss',
'rrelu',
'triplet_margin_with_distance_loss',
'triplet_margin_loss',
'adaptive_log_softmax_with_loss',
'multi_margin_loss',
2025-07-09 21:26:06 +08:00
'multi_label_margin_loss',
'soft_margin_loss',
'gaussian_nll_loss',
'scaled_dot_product_attention',
'flashmask_attention',
'flash_attn_qkvpacked',
"flash_attention_v3_varlen",
'flash_attn_varlen_qkvpacked',
'group_norm',
]