# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import TYPE_CHECKING, Literal

import paddle
from paddle import _C_ops
from paddle.base.data_feeder import check_dtype
from paddle.device import (
    is_compiled_with_cuda,
)
from paddle.device.cuda import get_device_capability
from paddle.framework import (
    LayerHelper,
    in_dynamic_or_pir_mode,
)

if TYPE_CHECKING:
    from typing_extensions import TypeAlias

    from paddle import Tensor
    from paddle._typing import DTypeLike

    _Algo: TypeAlias = Literal[
        'weight_only_int8', 'weight_only_int4', 'llm.int8'
    ]
    _GroupSize: TypeAlias = Literal[-1, 64, 128]


def _get_arch_info():
    # Get the SM version from the device.
    if is_compiled_with_cuda():
        cuda_version = paddle.version.cuda()
        if (
            cuda_version is not None and cuda_version != 'False'
        ) or paddle.is_compiled_with_rocm():
            major, minor = get_device_capability()
            arch = int(major * 10 + minor)
            return arch
        else:
            raise ValueError(
                "Paddle is not compiled with CUDA; cannot get the SM version from the device. Please compile Paddle with CUDA."
            )
    else:
        # Default arch value for type checking.
        return 0


def weight_quantize(
    x: Tensor,
    algo: _Algo = "weight_only_int8",
    arch: int | None = None,
    group_size: _GroupSize = -1,
) -> tuple[Tensor, Tensor]:
    """
    Quantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be quantized, the data type is float16 or bfloat16.
        algo (str): The algorithm applied to x, must be one of 'weight_only_int8',
            'weight_only_int4', 'llm.int8', 'w4a8' and 'w4afp8', default: 'weight_only_int8'.
        arch (int): The compute arch of the target device. For example, A100 is 80, V100 is 70.
            If you do not assign arch, we will get the arch from your device, default: None.
        group_size (int): The group size for weight quantization. -1 stands for the default
            per-channel mode. Currently only 64 and 128 are supported.

    Returns:
        out (Tensor): The Tensor holding the quantized results, the data type is int8, the shape is the transposition of x.
        scale (Tensor): The per-channel scale Tensor, the data type is float32.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize
            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> print(out.shape)
            [32, 64]
            >>> print(scale.shape)
            [32]
    """
    if arch is None:
        arch = _get_arch_info()

    if is_compiled_with_cuda():
        assert arch in (70, 75, 80, 86, 89, 90, 92, 100), (
            f"Currently weight_quantize only supports SM70/75/80/86/89/90/92/100, but got {arch}."
        )
    assert group_size in (-1, 64, 128), (
        f"Currently group_size only supports -1/64/128, but got {group_size}."
    )
    if in_dynamic_or_pir_mode():
        return _C_ops.weight_quantize(x, algo, arch, group_size)
    else:
        type = "weight_quantize"
        helper = LayerHelper(type, **locals())
        out = helper.create_variable_for_type_inference('int8')
        scale = helper.create_variable_for_type_inference('float')
        helper.append_op(
            type=type,
            inputs={"x": x},
            outputs={'out': out, "scale": scale},
            attrs={"algo": algo, "arch": arch, "group_size": group_size},
        )
        return (out, scale)
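

# The following private helper is an illustrative sketch, not part of the
# public API: it shows the shapes and dtypes one should expect from
# `weight_quantize` in the default per-channel mode. It assumes a CUDA build
# on a supported device (SM >= 70); the call asserts on other architectures.
def _weight_quantize_shape_sketch():
    # A [k, n] = [64, 32] float16 weight, quantized per output channel.
    w = paddle.rand(shape=[64, 32], dtype=paddle.float16)
    qw, scale = weight_quantize(w, algo='weight_only_int8')
    # The quantized weight is int8 and stored transposed: [32, 64].
    print(qw.dtype, qw.shape)
    # One float32 scale per output channel: [32].
    print(scale.dtype, scale.shape)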


def weight_dequantize(
    x: Tensor,
    scale: Tensor,
    algo: _Algo = "weight_only_int8",
    out_dtype: DTypeLike = "float16",
    group_size: _GroupSize = -1,
) -> Tensor:
    """
    Dequantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be dequantized, the data type is int8.
        scale (Tensor): The scale Tensor produced by weight_quantize, the data type is float32.
        algo (str): The algorithm applied to x, must be one of 'weight_only_int8',
            'weight_only_int4' and 'llm.int8', default: 'weight_only_int8'.
        out_dtype (str|np.dtype): [Deprecated][Not used] The output Tensor's data type,
            must be one of 'float16' and 'bfloat16', default: 'float16'.
        group_size (int): The group size for weight quantization. -1 stands for the default
            per-channel mode. Currently only 64 and 128 are supported.

    Returns:
        out (Tensor): The Tensor holding the dequantized results, the data type is float16 or bfloat16, the shape is the transposition of x.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize, weight_dequantize
            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> x_dequant = weight_dequantize(out, scale)
    """
    assert group_size in (-1, 64, 128), (
        f"Currently group_size only supports -1/64/128, but got {group_size}."
    )
    if in_dynamic_or_pir_mode():
        return _C_ops.weight_dequantize(x, scale, algo, group_size)
    else:
        type = "weight_dequantize"
        helper = LayerHelper(type, **locals())
        out_dtype = scale.dtype
        out = helper.create_variable_for_type_inference(out_dtype)
        helper.append_op(
            type=type,
            inputs={"x": x, "scale": scale},
            outputs={'out': out},
            attrs={
                "algo": algo,
                "group_size": group_size,
            },
        )
        return out
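

# Illustrative sketch, not part of the public API: `weight_dequantize`
# inverts `weight_quantize` up to int8 rounding error, so a round trip
# should stay close to the original weight. The error bound printed below is
# only observed, not guaranteed; a CUDA build on SM >= 70 is assumed.
def _weight_roundtrip_sketch():
    paddle.seed(2023)
    w = paddle.rand(shape=[64, 32], dtype=paddle.float16)
    qw, scale = weight_quantize(w, algo='weight_only_int8')
    # Dequantizing the [32, 64] int8 weight restores the original [64, 32] layout.
    w_restored = weight_dequantize(qw, scale)
    # Weight-only int8 quantization is lossy; inspect the worst-case error.
    diff = w.astype('float32') - w_restored.astype('float32')
    print(f"max round-trip error: {float(diff.abs().max()):.4f}")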


def weight_only_linear(
    x: Tensor,
    weight: Tensor,
    bias: Tensor | None = None,
    weight_scale: Tensor | None = None,
    weight_dtype: DTypeLike = "int8",
    arch: int | None = None,
    group_size: _GroupSize = -1,
) -> Tensor:
    """
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): The first input Tensor to be multiplied, the data type is float16 or bfloat16.
        weight (Tensor): The second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): The input bias Tensor. If it is None, no bias addition would
            be performed. Otherwise, the bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): The input scale Tensor provided to weight for dequantization. Its rank must be 1.
        weight_dtype (str): The dtype of the weight Tensor, must be one of 'int8' and 'int4', default: 'int8'.
        arch (int): The compute arch of the target device. For example, A100 is 80, V100 is 70.
            If you do not assign arch, we will get the arch from your device, default: None.
        group_size (int): The group size for weight quantization. -1 stands for the default
            per-channel mode. Currently only 64 and 128 are supported.

    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_only_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...     out = weight_only_linear(
            ...         x,
            ...         weight,
            ...         bias=bias,
            ...         weight_scale=scale,
            ...         weight_dtype='int8',
            ...     )
            ...     print(out.shape)
            [1, 2, 32]
    """
    if arch is None:
        arch = _get_arch_info()

    if is_compiled_with_cuda():
        assert arch in (70, 75, 80, 86, 89, 90, 92, 100), (
            f"Currently weight_only_linear only supports SM70/75/80/86/89/90/92/100, but got {arch}."
        )
    assert group_size in (-1, 64, 128), (
        f"Currently weight_only_linear only supports group sizes of -1, 64 or 128, but got {group_size}."
    )
    if in_dynamic_or_pir_mode():
        out = _C_ops.weight_only_linear(
            x, weight, bias, weight_scale, weight_dtype, arch, group_size
        )
        return out
    else:
        check_dtype(
            weight_dtype, 'weight_dtype', ['int8', 'int4'], 'weight_only_linear'
        )
        type = "weight_only_linear"
        helper = LayerHelper(type, **locals())
        dtype = x.dtype

        inputs = {
            'x': [x],
            'weight': [weight],
            'weight_scale': [weight_scale],
        }
        if bias is not None:
            inputs["bias"] = [bias]
        attrs = {
            'weight_dtype': weight_dtype,
            'arch': arch,
            'group_size': group_size,
        }

        out = helper.create_variable_for_type_inference(dtype)
        helper.append_op(
            type=type,
            inputs=inputs,
            outputs={'out': out},
            attrs=attrs,
        )
        return out
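

# Illustrative sketch, not part of the public API: the intended flow is to
# feed `weight_quantize` outputs straight into `weight_only_linear`, which
# then behaves like `x @ w + bias` on the original float16 weight, up to
# quantization error. Shapes follow the docstring examples above; a CUDA
# build on SM >= 70 is assumed.
def _weight_only_linear_sketch():
    paddle.seed(2023)
    act = paddle.rand(shape=[1, 2, 64], dtype=paddle.float16)
    w = paddle.rand(shape=[64, 32], dtype=paddle.float16)
    bias = paddle.rand(shape=[32], dtype=paddle.float16)
    qw, scale = weight_quantize(w, algo='weight_only_int8')
    out = weight_only_linear(
        act, qw, bias=bias, weight_scale=scale, weight_dtype='int8'
    )
    # Reference result computed on the unquantized weight.
    ref = paddle.matmul(act, w) + bias
    diff = out.astype('float32') - ref.astype('float32')
    # Both are [1, 2, 32]; they agree up to int8 quantization error.
    print(out.shape, float(diff.abs().max()))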


def llm_int8_linear(
    x: Tensor,
    weight: Tensor,
    bias: Tensor | None = None,
    weight_scale: Tensor | None = None,
    threshold: float = 6.0,
) -> Tensor:
    """
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): The first input Tensor to be multiplied, the data type is float16 or bfloat16.
        weight (Tensor): The second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): The input bias Tensor. If it is None, no bias addition would
            be performed. Otherwise, the bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): The input scale Tensor provided to weight for dequantization. Its rank must be 1.
        threshold (float): The minimum absolute value for an activation to be treated as an
            outlier; outlier channels are computed in x.dtype instead of int8, default: 6.0.

    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import llm_int8_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...     out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
            ...     print(out.shape)
            [1, 2, 32]
    """
    if in_dynamic_or_pir_mode():
        out = _C_ops.llm_int8_linear(x, weight, bias, weight_scale, threshold)
        return out
    else:
        type = "llm_int8_linear"
        helper = LayerHelper(type, **locals())
        dtype = x.dtype

        inputs = {
            'x': [x],
            'weight': [weight],
            'weight_scale': [weight_scale],
        }
        if bias is not None:
            inputs["bias"] = [bias]
        attrs = {'threshold': threshold}

        out = helper.create_variable_for_type_inference(dtype)
        helper.append_op(
            type=type,
            inputs=inputs,
            outputs={'out': out},
            attrs=attrs,
        )
        return out


def apply_per_channel_scale(x: Tensor, scales: Tensor) -> Tensor:
    """
    Apply a pre-quant per-channel scale on activations.

    Args:
        x (Tensor): Input tensor representing the activations, the data type can be float16 or bfloat16.
        scales (Tensor): Per-channel scale factors for pre-quantization. The data type should be compatible with x.

    Returns:
        out (Tensor): The Tensor holding the pre-quant results, the data type is compatible with x.

    Examples:
        .. code-block:: pycon

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import apply_per_channel_scale
            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> scales = paddle.rand(shape=[32], dtype=paddle.float16)
            >>> out = apply_per_channel_scale(x, scales)
    """
    if in_dynamic_or_pir_mode():
        return _C_ops.apply_per_channel_scale(x, scales)
    else:
        type = "apply_per_channel_scale"
        helper = LayerHelper(type, **locals())
        out = helper.create_variable_for_type_inference(x.dtype)
        helper.append_op(
            type=type,
            inputs={"x": [x], "scales": [scales]},
            outputs={"out": out},
        )
        return out
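

# Illustrative sketch, not part of the public API: `apply_per_channel_scale`
# rescales activations channel-wise before quantization (as in
# SmoothQuant-style pipelines), and `llm_int8_linear` then runs the mixed
# int8/outlier matmul. The exact scaling semantics are left to the kernel;
# this sketch only exercises shapes and assumes a CUDA device with SM >= 80.
def _llm_int8_sketch():
    paddle.seed(2023)
    act = paddle.rand(shape=[2, 64], dtype=paddle.float16)
    w = paddle.rand(shape=[64, 32], dtype=paddle.float16)
    qw, scale = weight_quantize(w, algo='llm.int8')
    # Pre-scale the activations along the 64 input channels.
    smooth = paddle.rand(shape=[64], dtype=paddle.float16)
    act_scaled = apply_per_channel_scale(act, smooth)
    out = llm_int8_linear(act_scaled, qw, weight_scale=scale, threshold=6.0)
    print(out.shape)  # [2, 32]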