# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import annotations import copy import hashlib import inspect import warnings from typing import TYPE_CHECKING, Any, Callable import numpy as np import numpy.typing as npt from typing_extensions import overload import paddle from paddle import _C_ops, profiler from paddle.base.data_feeder import ( _PADDLE_DTYPE_2_NUMPY_DTYPE, convert_uint16_to_float, ) from paddle.base.libpaddle import Place from paddle.profiler.utils import in_profiler_mode from paddle.utils import deprecated from paddle.utils.decorator_utils import tensor_cuda_decorator from paddle.utils.dlpack import DLDeviceType from paddle.utils.download import check_and_create_dir from .. import core, framework, unique_name from ..framework import ( EagerParamBase, Parameter, Variable, convert_np_dtype_to_dtype_, ) from .base import switch_to_static_graph from .math_op_patch import monkey_patch_math_tensor if TYPE_CHECKING: from enum import IntEnum from typing_extensions import CapsuleType from paddle import Tensor from paddle._typing import DTypeLike, PlaceLike, TensorIndex from paddle.cuda import DeviceLike _grad_scalar = None class TensorHookRemoveHelper: """ A helper class that for removing Tensor gradient's hook. NOTE(wuweilong):the operation weakref.ref(tensor) will cause some unexpected errors in eager mode. 
""" def __init__(self, tensor: Tensor, hook_id: int) -> None: self._tensor = tensor self._hook_id = hook_id def remove(self) -> bool: """ Remove reference Tensor's hook. Returns: bool: Return True if removed successfully """ tensor = self._tensor if tensor is not None: res = tensor._remove_grad_hook(self._hook_id) if res is True: return True else: warnings.warn( f"The backward hook (ID: {self._hook_id}) of Tensor `{tensor.name}` you want to remove does not exist or has been removed.", RuntimeWarning, ) return False _already_patch_repr = False def monkey_patch_tensor(): # TODO(cleanup-legacy-ir): This method is for dy2st in legacy ir only # and should be removed after legacy ir is removed. @switch_to_static_graph def _to_static_var(self, to_parameter=False, **kwargs): """ **Notes**: **This API is ONLY available in Dygraph mode** Transform a Tensor into static Variable with same attributes. It's a low level interface used in dy2static and shall not be called directly. Args: to_parameter (bool): It takes effect only if the input a Tensor. If set True, the Tensor will be converted into framework.Parameters. Otherwise, it will be converted into framework.Variable. Default False. Examples: .. code-block:: pycon >>> import paddle.base as base >>> import paddle >>> import numpy as np >>> data = np.ones([3, 1024], dtype='float32') >>> with base.dygraph.guard(): ... tensor = paddle.to_tensor(data) ... static_var = tensor._to_static_var() """ # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. # It will fail. So, for property that different between dynamic and static graph, should not getattr(self, attr, None). 
        # Attribute names that are never read through getattr below (see the
        # note above: some properties only work in dygraph mode).
        attr_not_need_keys = [
            'grad',
            'T',
            'mT',
            'place',
            '_place_str',
            'data',
            'grad_',
            'strides',
            'offset',
            '__cuda_array_interface__',
            'itemsize',
            'is_cuda',
            'is_cpu',
        ]
        param_keys = ['stop_gradient', 'trainable']
        if isinstance(self, EagerParamBase):
            # Parameters: start from the instance dict and add the param-only keys.
            attr_kwargs = self.__dict__.copy()
            for key in param_keys:
                attr_kwargs[key] = getattr(self, key)
        else:
            # Plain tensors: collect every public, non-method attribute that is
            # not in the exclusion list.
            attr_names = []
            for name in dir(self):
                if name not in attr_not_need_keys:
                    if not inspect.ismethod(
                        getattr(self, name)
                    ) and not name.startswith('_'):
                        attr_names.append(name)
            attr_kwargs = {name: getattr(self, name) for name in attr_names}

        # These keys are always (re)read from the tensor, defaulting to None.
        attr_keys = ['block', 'shape', 'dtype', 'type', 'name', 'persistable']
        for attr in attr_keys:
            attr_kwargs[attr] = getattr(self, attr, None)

        # If specify block, use it instead of self.block
        if 'block' in kwargs:
            attr_kwargs['block'] = kwargs['block']

        # Caller-supplied kwargs override anything collected above.
        attr_kwargs.update(kwargs)

        if to_parameter or isinstance(self, EagerParamBase):
            del attr_kwargs['persistable']
            # NOTE(Aurelius84): All parameters should be placed into global block.
            attr_kwargs['block'] = attr_kwargs['block'].program.global_block()
            static_var = Parameter(**attr_kwargs)
        else:
            static_var = Variable(**attr_kwargs)

        if self.placements is not None:
            # import for shard tensor api
            import paddle.distributed as dist

            static_var = dist.shard_tensor(
                static_var,
                self.process_mesh,
                self.placements,
                stop_gradient=static_var.stop_gradient,
            )
        return static_var

    # TODO(jiabin): move this to cplusplus end if we find some performance issue on it
    @framework.dygraph_only
    def set_value(
        self: Tensor, value: Tensor | npt.NDArray[Any] | dict[str, int] | str
    ) -> None:
        """
        **Notes**:
            **This API is ONLY available in Dygraph mode**

        Set a new value for this Variable.

        Args:
            value (Tensor|np.ndarray|dict|str): the new value. ``dict`` and ``str``
                values are forwarded to ``set_vocab`` / ``set_string_list`` and are
                rejected for dist tensors. Array-like values must match this
                tensor's shape and dtype.

        Raises:
            AssertionError: If the type, length, shape, dtype (or, for dist
                tensors, the process mesh / placements) of ``value`` does not match.

        Examples:
            .. code-block:: pycon

                >>> import paddle.base as base
                >>> import paddle
                >>> from paddle.nn import Linear
                >>> import numpy as np

                >>> data = np.ones([3, 1024], dtype='float32')
                >>> with base.dygraph.guard():
                ...     linear = Linear(1024, 4)
                ...     t = paddle.to_tensor(data)
                ...     linear(t)  # call with default weight
                ...     custom_weight = np.random.randn(1024, 4).astype("float32")
                ...     linear.weight.set_value(custom_weight)  # change existing weight
                ...     out = linear(t)  # call with different weight
        """
        # Assigning a tensor to itself is a no-op.
        if id(self) == id(value):
            return
        assert isinstance(value, (np.ndarray, paddle.Tensor, dict, str)), (
            "Variable set_value function, arguments type only support Variable, numpy, Tensor, dict, string."
        )
        if self.is_dist():
            assert isinstance(value, (np.ndarray, paddle.Tensor)), (
                "For set_value function of dist tensor, arguments type only support numpy or Tensor."
            )

        if isinstance(value, (dict, str)):
            assert len(self) == len(value), (
                f"Variable length not match, Variable [ {self.name} ] need tensor with length {len(self)} but load set tensor with length {len(value)}"
            )
            if isinstance(value, dict):
                self.value().set_vocab(value)
            else:
                self.value().set_string_list(value)
        else:
            # NOTE(review): the nesting of the remainder of this function under
            # this ``else`` was reconstructed from the control flow (dict/str
            # values must not reach the tensor-copy code); confirm against upstream.
            assert self.shape == list(value.shape), (
                f"Variable Shape not match, Variable [ {self.name} ] need tensor with shape {self.shape} but load set tensor with shape {value.shape}"
            )

            # Normalise the incoming dtype to a Paddle dtype before comparing.
            if isinstance(value, paddle.Tensor):
                dtype = value.dtype
            elif paddle.framework.use_pir_api():
                dtype = paddle.pir.core.convert_np_dtype_to_dtype_(value.dtype)
            else:
                dtype = convert_np_dtype_to_dtype_(value.dtype)

            assert self.dtype == dtype, (
                f"Variable dtype not match, Variable [ {self.name} ] need tensor with dtype {self.dtype} but load tensor with dtype {dtype}"
            )

            # NOTE(wuweilong): self could be Tensor, the subsequent behavior are defined in different files
            # if self is Tensor, method value() return self that defined in this file, get_tensor() defined in eager_method.cc
            # this Interface behavior will be unified in the future.
            if self.is_dist():
                if isinstance(value, paddle.Tensor) and value.is_dist():
                    from paddle.distributed.auto_parallel.placement_type import (
                        check_placements_equal,
                    )

                    # TODO: support reshard later
                    assert (
                        value.process_mesh == self.value().process_mesh
                        or check_placements_equal(
                            value.placements, self.value().placements
                        )
                    ), (
                        f"process_mesh:{value.process_mesh} != {self.value().process_mesh} or placements:{value.placements} != {self.value().placements} not match"
                    )
                else:
                    # calling set method bound for DistTensor
                    value = paddle.distributed.shard_tensor(
                        value,
                        self.value().process_mesh,
                        self.value().placements,
                    )
                if isinstance(value, paddle.Tensor):
                    self.value().set_tensor(value)
                else:
                    self.value().get_tensor().set(value.get_tensor())
                return
            if isinstance(value, paddle.Tensor):
                self.value().set_tensor(value)
            else:
                # Non-tensor (numpy) values are copied to the current expected place.
                self.value().get_tensor().set(
                    value, framework._current_expected_place()
                )

    @framework.dygraph_only
    def backward(
        self: Tensor,
        grad_tensor: Tensor | None = None,
        retain_graph: bool = False,
        *,
        dump_backward_graph_path: str | None = None,
    ) -> None:
        """
        Run backward of current Graph which starts from current Tensor.

        The new gradient will accumulate on previous gradient.

        You can clear gradient by ``Tensor.clear_grad()`` .

        Args:
            grad_tensor(Tensor|None, optional): initial gradient values of the current Tensor. If `grad_tensor` is None,
                the initial gradient values of the current Tensor would be Tensor filled with 1.0;
                if `grad_tensor` is not None, it must have the same length as the current Tensor.
                The default value is None.
            retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
                like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
                :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
                Defaults to False.
            dump_backward_graph_path(str, optional): Specifies the directory path for storing the debug file.
If this parameter is specified, the backward-related graph (in dot format) and the debugging call stack information will be generated in this directory. Returns: None Examples: .. code-block:: pycon >>> import paddle >>> x = paddle.to_tensor(5.0, stop_gradient=False) >>> for i in range(5): ... y = paddle.pow(x, 4.0) ... y.backward() ... print("{}: {}".format(i, x.grad)) 0: 500.0 1: 1000.0 2: 1500.0 3: 2000.0 4: 2500.0 >>> x.clear_grad() >>> print("{}".format(x.grad)) 0.0 >>> grad_tensor = paddle.to_tensor(2.0) >>> for i in range(5): ... y = paddle.pow(x, 4.0) ... y.backward(grad_tensor) ... print("{}: {}".format(i, x.grad)) 0: 1000.0 1: 2000.0 2: 3000.0 3: 4000.0 4: 5000.0 """ if framework.in_dygraph_mode(): if in_profiler_mode(): record_event = profiler.RecordEvent( "Gradient Backward", profiler.TracerEventType.Backward ) record_event.begin() if grad_tensor is not None: assert isinstance(grad_tensor, core.eager.Tensor), ( "The type of grad_tensor must be paddle.Tensor" ) assert grad_tensor.shape == self.shape, ( f"Tensor shape not match, Tensor of grad_tensor [ {grad_tensor.name} ] with shape {grad_tensor.shape} mismatch Tensor [ {self.name} ] with shape {self.shape}" ) if grad_tensor is None: grad_tensor = [] else: grad_tensor = [grad_tensor] if _grad_scalar: # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly. self = _grad_scalar.scale(self) check_and_create_dir(dump_backward_graph_path) core.eager.run_backward( [self], grad_tensor, retain_graph, dump_backward_graph_path ) if in_profiler_mode(): record_event.end() else: raise ValueError( "Variable.backward() is only available in DyGraph mode" ) @framework.dygraph_only @deprecated( since="2.1.0", level=1, reason="Please use tensor.grad, which returns the tensor value of the gradient.", ) def gradient( self: Tensor, ) -> npt.NDArray[Any] | tuple[npt.NDArray[Any], npt.NDArray[Any]] | None: """ .. 
warning:: This API will be deprecated in the future, it is recommended to use :code:`x.grad` which returns the tensor value of the gradient. Get the Gradient of Current Tensor. Returns: ndarray: Numpy value of the gradient of current Tensor Examples: .. code-block:: pycon >>> import paddle >>> x = paddle.to_tensor(5.0, stop_gradient=False) >>> y = paddle.pow(x, 4.0) >>> y.backward() >>> print("grad of x: {}".format(x.gradient())) grad of x: 500.0 """ if self.grad is None: return None if self.grad.is_selected_rows(): return (np.array(self.grad), np.array(self.grad.rows())) return np.array(self.grad) @framework.dygraph_only def apply_(self: Tensor, func: Callable[[Tensor], Tensor]) -> Tensor: """ Inplace apply the python function to the tensor. Returns: None Examples: .. code-block:: pycon >>> # doctest: +REQUIRES(env:GPU) >>> import paddle >>> x = paddle.to_tensor([[0.3, 0.5, 0.1], >>> [0.9, 0.9, 0.7], >>> [0.4, 0.8, 0.2]]).to("cpu", "float64") >>> f = lambda x: 3 * x + 2 >>> x.apply_(f) >>> print(x) Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True, [[2.90000004, 3.50000000, 2.30000000], [4.69999993, 4.69999993, 4.09999996], [3.20000002, 4.40000004, 2.60000001]]) >>> x = paddle.to_tensor([[0.3, 0.5, 0.1], >>> [0.9, 0.9, 0.7], >>> [0.4, 0.8, 0.2]]).to("cpu", "float16") >>> x.apply_(f) >>> x = paddle.to_tensor([[0.3, 0.5, 0.1], >>> [0.9, 0.9, 0.7], >>> [0.4, 0.8, 0.2]]).to("cpu", "bfloat16") >>> x.apply_(f) >>> if paddle.is_compiled_with_cuda(): >>> x = paddle.to_tensor([[0.3, 0.5, 0.1], >>> [0.9, 0.9, 0.7], >>> [0.4, 0.8, 0.2]]).to("gpu", "float32") >>> x.apply_(f) """ if not self.stop_gradient: raise RuntimeError( "Cannot apply function on a tensor that required gradient." ) return self._apply_(func) def apply(self, func: Callable[[Tensor], Tensor]) -> Tensor: """ Apply the python function to the tensor. Returns: None Examples: .. 
code-block:: pycon >>> # doctest: +REQUIRES(env:GPU) >>> import paddle >>> x = paddle.to_tensor([[0.3, 0.5, 0.1], >>> [0.9, 0.9, 0.7], >>> [0.4, 0.8, 0.2]]).to("cpu", "float64") >>> f = lambda x: 3 * x + 2 >>> y = x.apply(f) >>> print(y) Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True, [[2.90000004, 3.50000000, 2.30000000], [4.69999993, 4.69999993, 4.09999996], [3.20000002, 4.40000004, 2.60000001]]) >>> x = paddle.to_tensor([[0.3, 0.5, 0.1], >>> [0.9, 0.9, 0.7], >>> [0.4, 0.8, 0.2]]).to("cpu", "float16") >>> y = x.apply(f) >>> x = paddle.to_tensor([[0.3, 0.5, 0.1], >>> [0.9, 0.9, 0.7], >>> [0.4, 0.8, 0.2]]).to("cpu", "bfloat16") >>> y = x.apply(f) >>> if paddle.is_compiled_with_cuda(): >>> x = paddle.to_tensor([[0.3, 0.5, 0.1], >>> [0.9, 0.9, 0.7], >>> [0.4, 0.8, 0.2]]).to("gpu", "float32") >>> y = x.apply(f) """ if not self.stop_gradient: raise RuntimeError( "Cannot apply function on a tensor that required gradient." ) return self._apply(func) @framework.dygraph_only def register_hook( self: Tensor, hook: Callable[[Tensor], Tensor | None] ) -> TensorHookRemoveHelper: """ Registers a backward hook for current Tensor. The hook will be called every time the gradient Tensor of current Tensor is computed. The hook should not modify the input gradient Tensor, but it can optionally return a new gradient Tensor which will be used in place of current Tensor's gradient. The hook should have the following signature: hook(grad) -> Tensor or None Args: hook(function): A backward hook to be registered for Tensor.grad Returns: TensorHookRemoveHelper: A helper object that can be used to remove the registered hook by calling `remove()` method. Examples: .. code-block:: pycon >>> import paddle >>> # hook function return None >>> def print_hook_fn(grad): ... print(grad) >>> # hook function return Tensor >>> def double_hook_fn(grad): ... grad = grad * 2 ... 
return grad >>> x = paddle.to_tensor([0.0, 1.0, 2.0, 3.0], stop_gradient=False) >>> y = paddle.to_tensor([4.0, 5.0, 6.0, 7.0], stop_gradient=False) >>> z = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) >>> # one Tensor can register multiple hooks >>> h = x.register_hook(print_hook_fn) >>> x.register_hook(double_hook_fn) >>> w = x + y >>> # register hook by lambda function >>> w.register_hook(lambda grad: grad * 2) >>> o = z.matmul(w) >>> o.backward() >>> # print_hook_fn print content in backward Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=False, [2., 4., 6., 8.]) >>> print("w.grad:", w.grad) w.grad: None >>> print("x.grad:", x.grad) x.grad: Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=False, [4. , 8. , 12., 16.]) >>> print("y.grad:", y.grad) y.grad: Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=False, [2., 4., 6., 8.]) >>> # remove hook >>> h.remove() """ if self.stop_gradient is True: raise RuntimeError( "Cannot register hook on a tensor that stop gradient." 
            )

        hook_id = self._register_grad_hook(hook)
        # The helper keeps the tensor and the id so the hook can be removed later.
        helper = TensorHookRemoveHelper(self, hook_id)
        return helper

    @framework.dygraph_only
    def _to(
        self: Tensor,
        device: PlaceLike | None = None,
        dtype: DTypeLike | None = None,
        blocking: bool | None = None,
        copy_tensor: bool | None = None,
    ) -> Tensor:
        """
        Move and/or cast this Tensor; implementation behind ``Tensor.to``.

        Args:
            device: Target place (string or place object), or None to keep the
                current place.
            dtype: Target dtype, or None to keep the current dtype.
            blocking: Passed to ``_copy_to``; None is treated as True.
            copy_tensor: When truthy and no cast/copy happened, a
                ``copy.deepcopy`` of the tensor is returned instead of the
                tensor itself.

        Returns:
            Tensor: ``self`` when ``device``, ``dtype`` and ``blocking`` are all
            None, otherwise the converted tensor.

        Raises:
            ValueError: If ``device`` is not a string or a supported place type.
            AssertionError: If ``blocking`` is neither None nor a bool.
        """
        if device is None and dtype is None and blocking is None:
            return self

        def is_cuda_place(place: PlaceLike):
            return isinstance(place, core.CUDAPlace) or (
                isinstance(place, Place) and place.is_gpu_place()
            )

        def get_device_id(place: PlaceLike):
            if isinstance(
                place,
                (
                    core.CUDAPlace,
                    core.XPUPlace,
                    core.IPUPlace,
                    core.CustomPlace,
                ),
            ):
                return place.get_device_id()
            elif isinstance(place, Place):
                if place.is_gpu_place():
                    return place.gpu_device_id()
                elif place.is_xpu_place():
                    return place.xpu_device_id()
                elif place.is_ipu_place():
                    return place.ipu_device_id()
                elif place.is_custom_place():
                    return place.custom_device_id()
            # NOTE(review): this ``else`` is attached to the outer type check;
            # the nesting was reconstructed — confirm against upstream.
            else:
                raise ValueError(
                    f"Invalid place: {place}, only support getting device id from CUDAPlace/XPUPlace/IPUPlace/CustomPlace"
                )

        if device is not None:
            if isinstance(device, str):
                device = paddle.device._convert_to_place(device)
            elif isinstance(
                device,
                (
                    core.Place,
                    core.CPUPlace,
                    core.CUDAPlace,
                    core.CUDAPinnedPlace,
                    core.XPUPlace,
                    core.CustomPlace,
                ),
            ):
                pass
            else:
                raise ValueError(
                    "device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace(), paddle.XPUPlace() or paddle.CustomPlace(), but the type of device is "
                    + type(device).__name__
                )

        if blocking is None:
            blocking = True
        else:
            assert isinstance(blocking, bool), (
                "blocking value error, must be the True, False or None"
            )

        def transform(t, device, dtype, blocking, copy_tensor):
            # Missing targets default to the tensor's current place / dtype.
            if device is None:
                device = t.place
            if dtype is None:
                dtype = t.dtype
            # 1. gpu place need to determine whether the memory is sufficient for allocation.
            if t.place.is_gpu_place() and (
                # NOTE: Only copy memory when place or device id is different,
                # otherwise, it may frequently call GpuMemGetInfo in
                # core.gpu_memory_available, leading to abnormal overhead.
                not is_cuda_place(device)
                or t.place.gpu_device_id() != get_device_id(device)
            ):
                proto_dtype = framework.convert_to_proto_type(dtype)
                size_dtype = core.size_of_dtype(proto_dtype)
                # Note(weilong wu): Paddle GPU minimum memory allocation unit is 256 bytes,
                # waiting_alloc_memory will compute the memory space occupied by 't'.
                # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
                waiting_alloc_memory = (
                    ((t._numel() * size_dtype) / 256 + 1) * 256 * 1.2
                )
                gpu_memory_available = core.gpu_memory_available()
                if gpu_memory_available < waiting_alloc_memory:
                    # Copy Tensor to cpu if needed
                    t_used = t._copy_to(paddle.CPUPlace(), blocking)
                    # Release memory of t
                    t._clear()
                    copy_tensor = False
                else:
                    # Tensor still in GPU
                    t_used = t
            else:
                t_used = t

            # 2. cast Tensor to dtype if needed
            if dtype is not None and dtype != t_used.dtype:
                with paddle.base.framework._dygraph_place_guard(
                    place=t_used.place
                ):
                    t_casted = t_used.cast(dtype=dtype)
                copy_tensor = False
            else:
                t_casted = t_used

            # 3. Copy casted Tensor(in CPU or GPU) to device if needed
            if device is not None and not t_casted.place._equals(device):
                new_t = t_casted._copy_to(device, blocking)
                copy_tensor = False
            else:
                new_t = t_casted

            new_t.stop_gradient = t.stop_gradient
            # A deep copy is only made when no step above already produced a
            # new tensor (each of those steps resets copy_tensor to False).
            if copy_tensor:
                return copy.deepcopy(new_t)
            else:
                return new_t

        # UserWarning is filtered out for the duration of the conversion.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            return transform(self, device, dtype, blocking, copy_tensor)

    # Typing-only overloads for the three calling conventions of ``to``.
    @overload
    def to(
        self: Tensor,
        device: PlaceLike,
        dtype: DTypeLike | None = ...,
        blocking: bool = ...,
        copy: bool = ...,
        *,
        non_blocking: bool = ...,
    ) -> Tensor: ...

    @overload
    def to(
        self: Tensor,
        dtype: DTypeLike,
        blocking: bool = ...,
        copy: bool = ...,
        *,
        non_blocking: bool = ...,
    ) -> Tensor: ...

    @overload
    def to(
        self: Tensor,
        other: Tensor,
        blocking: bool = ...,
        copy: bool = ...,
        *,
        non_blocking: bool = ...,
    ) -> Tensor: ...

    @framework.dygraph_only
    def to(self: Tensor, *args, **kwargs):
        """
        Performs Tensor dtype and/or device conversion. A paddle.dtype and place
        are inferred from the arguments of ``self.to(*args, **kwargs)``.

        This API has three calling conventions:

            1. ``to(device=None, dtype=None, blocking=True, copy=False, *, non_blocking=False)``:
               Moves and/or casts the Tensor.

            2. ``to(dtype, blocking=True, copy=False, *, non_blocking=False)``:
               Equivalent to ``self.to(device=None, dtype=dtype, ...)``.

            3. ``to(other, blocking=True, copy=False, *, non_blocking=False)``:
               Equivalent to ``self.to(device=other.place, dtype=other.dtype, ...)``.

        .. note::
            If the self Tensor already has the correct dtype and device,
            then self is returned. Otherwise, the returned tensor is a copy of self with
            the desired dtype and device.

        Args:
            device (str|paddle.CPUPlace()|paddle.CUDAPlace()|paddle.CUDAPinnedPlace()|paddle.XPUPlace()|None, optional):
                The device to move to. Default: ``None``.
            dtype (str|numpy.dtype|paddle.dtype|None, optional):
                The desired data type. Default: ``None``.
            blocking (bool, optional):
                If ``False`` and the source is in pinned memory, the copy will be
                asynchronous with respect to the host. Default: ``True``.
            copy (bool, optional):
                If ``True``, a new Tensor is created even when the Tensor already
                matches the desired conversion. Default: ``False``.
            other (Tensor, optional):
                Tensor whose dtype and device are the desired dtype and device.

        Keyword args:
            non_blocking (bool, optional):
                If ``True`` and the source is in pinned memory, the copy will be
                asynchronous with respect to the host. Default: ``False``.
                ``non_blocking`` and ``blocking`` are mutually exclusive and cannot both be set at the same time.

        Returns:
            Tensor: ``self`` when no conversion is needed, otherwise the
            converted Tensor.

        Examples:
            .. code-block:: pycon

                >>> # doctest: +REQUIRES(env:GPU)
                >>> import paddle
                >>> x = paddle.to_tensor([1, 2, 3])
                >>> print(x)
                Tensor(shape=[3], dtype=int64, place=Place(gpu:0), stop_gradient=True,
                    [1, 2, 3])

                >>> x = x.to("cpu")
                >>> print(x.place)
                Place(cpu)

                >>> x = x.to("float32")
                >>> print(x.dtype)
                paddle.float32

                >>> x = x.to("gpu", "int16")
                >>> print(x)
                Tensor(shape=[3], dtype=int16, place=Place(gpu:0), stop_gradient=True,
                    [1, 2, 3])
                >>> y = paddle.to_tensor([4, 5, 6])
                >>> y
                Tensor(shape=[3], dtype=int64, place=Place(gpu:0), stop_gradient=True,
                    [4, 5, 6])
                >>> y = y.to(x)
                >>> print(y)
                Tensor(shape=[3], dtype=int16, place=Place(gpu:0), stop_gradient=True,
                    [4, 5, 6])
        """
        # Imported lazily inside the function; argument parsing is shared with
        # paddle.nn.layer.layers.
        from paddle.nn.layer.layers import _parse_to_args

        device, dtype, blocking, copy_tensor = _parse_to_args(*args, **kwargs)
        return self._to(device, dtype, blocking, copy_tensor)

    def clear_grad(self: Tensor) -> None:
        """
        The alias of clear_gradient().
        """
        self.clear_gradient()

    def item(self: Tensor, *args: int) -> float | bool | complex | None:
        """
        Convert element at specific position in Tensor into Python scalars. If the position is not specified, the Tensor must be a
        single-element Tensor.

        Args:
            *args(int): The input coordinates. If it's single int, the data in the corresponding order of flattened Tensor will be returned.
                Default: None, and it must be in the case where Tensor has only one element.

        Returns(Python scalar): A Python scalar whose type corresponds to the dtype of the Tensor.
            ``None`` is returned for a dist tensor that is not initialized on this device.

        Raises:
            ValueError: If the Tensor has more than one element, there must be coordinates.

        Examples:
            .. code-block:: pycon

                >>> import paddle

                >>> x = paddle.to_tensor(1)
                >>> print(x.item())
                1
                >>> print(type(x.item()))
                <class 'int'>

                >>> x = paddle.to_tensor(1.0)
                >>> print(x.item())
                1.0
                >>> print(type(x.item()))
                <class 'float'>

                >>> x = paddle.to_tensor(True)
                >>> print(x.item())
                True
                >>> print(type(x.item()))
                <class 'bool'>

                >>> x = paddle.to_tensor(1 + 1j)
                >>> print(x.item())
                (1+1j)
                >>> print(type(x.item()))
                <class 'complex'>

                >>> x = paddle.to_tensor([[1.1, 2.2, 3.3]])
                >>> print(x.item(2))
                3.299999952316284
                >>> print(x.item(0, 2))
                3.299999952316284

        """
        # resolve the error issue in scenario of pipeline parallel
        # where some devices do not have self data, return None does not affect
        # the execution result in those devices, so currently we return None
        if self.is_dist() and not self._is_initialized():
            return None
        scalar = self._getitem_from_offset(*args)
        # A uint16 scalar is converted with convert_uint16_to_float before
        # being turned into a Python scalar.
        if scalar.dtype == np.uint16:
            return convert_uint16_to_float(scalar).item()
        return scalar.item()

    @property
    def inplace_version(self: Tensor) -> int:
        """
        The inplace version of current Tensor.
        The version number is incremented whenever the current Tensor is modified through an inplace operation.

        **Notes: This is a read-only property**

        Examples:
            .. code-block:: pycon

                >>> import paddle
                >>> var = paddle.ones(shape=[4, 2, 3], dtype="float32")
                >>> print(var.inplace_version)
                0

                >>> var[1] = 2.2
                >>> print(var.inplace_version)
                1

        """
        return self._inplace_version()

    def __str__(self: Tensor) -> str:
        """
        Convert a Tensor object to a readable string.

        Returns(str): A readable string.

        Examples:
            .. code-block:: pycon

                >>> import paddle
                >>> paddle.seed(2023)
                >>> x = paddle.rand([2, 5])
                >>> print(x)
                Tensor(shape=[2, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
                [[0.86583614, 0.52014720, 0.25960937, 0.90525323, 0.42400089],
                 [0.40641287, 0.97020894, 0.74437362, 0.51785129, 0.73292869]])

        """
        from paddle.tensor.to_string import tensor_to_string

        return tensor_to_string(self)

    def __format__(self, format_spec: str) -> str:
        """
        Format the tensor with ``format_spec``.

        A 0-D tensor is formatted through its Python scalar (``self.item()``);
        any other tensor falls back to ``object.__format__``.
        """
        if self.ndim == 0:
            return self.item().__format__(format_spec)

        return object.__format__(self, format_spec)

    def __deepcopy__(self, memo: dict[int, Tensor]) -> Tensor:
        """
        Deep copy Tensor, it will always performs Tensor copy.

        Examples:
            .. code-block:: pycon

                >>> import paddle
                >>> import copy
                >>> x = paddle.to_tensor(2.0)
                >>> y = copy.deepcopy(x)
                >>> print(x)
                Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
                2.)
                >>> print(y)
                Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
                2.)
        """
        new_tensor = core.eager.Tensor()
        # The copy gets a fresh unique name derived from the source name.
        new_tensor.name = self.name + unique_name.generate("_deepcopy")
        # Register the copy in ``memo`` before the data is copied.
        memo[id(self)] = new_tensor
        new_tensor.copy_(self, True)
        return new_tensor

    # TODO(cleanup-legacy-ir): This method is for dy2st in legacy ir only
    # and should be removed after legacy ir is removed.
    @property
    def block(self):
        # Always the global block of the default main program.
        return framework.default_main_program().global_block()

    def __nonzero__(self: Tensor) -> bool:
        # np.prod([]) -> np.float64, so use int
        numel = int(np.prod(self.shape))
        assert numel == 1, (
            "When Variable is used as the condition of if/while , Variable can only contain one element."
) # resolve the error issue in scenario of pipeline parallel # where some devices do not have this data, return True or False does not affect # the execution result in those devices, so currently we return False if self.is_dist() and not self._is_initialized(): return False assert self._is_initialized(), "tensor not initialized" return bool(np.array(self) > 0) def __bool__(self: Tensor) -> bool: return self.__nonzero__() def __array__( self: Tensor, dtype: npt.DTypeLike | None = None, copy: bool | None = None, ) -> npt.NDArray[Any]: """ Returns a numpy array shows the value of current Tensor. Returns: ndarray: The numpy value of current Tensor. Returns type: ndarray: dtype is same as current Tensor Examples: .. code-block:: pycon >>> import paddle >>> import numpy as np >>> x = paddle.randn([2, 2]) >>> x_array = np.array(x) >>> print(type(x_array)) >>> print(x_array.shape) (2, 2) """ array = self.numpy(False) if dtype: array = array.astype(dtype) return array def pre_deal_index(self, item): # since in pybind there is no efficiency way to transfer Py_Tuple/Py_List/Py_Range to Tensor # we call this function in python level. 
        # Normalise the index to a list so individual entries can be replaced.
        item = list(item) if isinstance(item, tuple) else [item]
        for i, slice_item in enumerate(item):
            # list / tuple / range entries are converted to numpy arrays;
            # every other entry is passed through unchanged.
            if isinstance(slice_item, (list, tuple)):
                item[i] = np.array(slice_item)
            elif isinstance(slice_item, range):
                item[i] = np.array(list(slice_item))

        return tuple(item)

    def __getitem__(
        self,
        item: TensorIndex,
    ) -> Tensor:
        """Index the tensor; the index is normalised by ``pre_deal_index`` first."""
        item = pre_deal_index(self, item)
        return self._getitem_dygraph(item)

    def __setitem__(
        self,
        item: TensorIndex,
        value: Tensor | npt.NDArray[Any] | complex | bool,
    ) -> None:
        """Assign ``value`` at ``item``; the index is normalised by ``pre_deal_index`` first."""
        item = pre_deal_index(self, item)
        return self._setitem_dygraph(item, value)

    @framework.dygraph_only
    def _set_grad_ivar(self, value):
        """
        Set ``value`` as the gradient of a parameter tensor.

        Raises:
            TypeError: If ``self`` is not an ``EagerParamBase``.
        """
        if isinstance(self, EagerParamBase):
            self.grad = value
            self._unset_fake_empty()
        else:
            raise TypeError(
                "_set_grad_ivar is only supported for Parameter Tensor"
            )

    @framework.dygraph_only
    def value(self: Tensor) -> Tensor:
        """Return ``self``."""
        return self

    @framework.dygraph_only
    def _slice(self: Tensor, begin_idx: int, end_idx: int) -> Tensor:
        """Wrap ``self.get_tensor()._slice(begin_idx, end_idx)`` in a new eager Tensor."""
        return core.eager.Tensor(self.get_tensor()._slice(begin_idx, end_idx))

    @framework.dygraph_only
    def _numel(self: Tensor) -> int:
        """Return ``self.get_tensor()._numel()``."""
        return self.get_tensor()._numel()

    @framework.dygraph_only
    def _clear_data(self: Tensor) -> None:
        """Call ``_clear()`` on the underlying tensor."""
        self.get_tensor()._clear()

    @framework.dygraph_only
    def _use_gpudnn(self, use_gpudnn=True):
        """Forward ``use_gpudnn`` to ``self._tensor_use_gpudnn`` and return its result."""
        return self._tensor_use_gpudnn(use_gpudnn)

    @framework.dygraph_only
    def _uva(self: Tensor, device_id: int = 0) -> None:
        '''
        Apply UVA (unified virtual addressing) to this tensor by calling
        ``self._tensor_uva(device_id)``. Nothing is returned.

        Args:
            device_id(int, optional): The destination GPU device id. Default: 0.

        Examples:
            .. code-block:: pycon

                >>> # doctest: +REQUIRES(env:GPU)
                >>> import paddle
                >>> paddle.device.set_device('gpu')
                >>> x = paddle.to_tensor([1, 2, 3], place=paddle.CPUPlace())
                >>> x._uva()
                >>> print(x)
        '''
        self._tensor_uva(device_id)

    @framework.dygraph_only
    def cpu(self: Tensor) -> Tensor:
        """
        Return this tensor on the CPU.

        ``self`` is returned when it already lives on a CPU place; otherwise a
        blocking copy is made that keeps ``stop_gradient`` and ``persistable``.
        """
        if self.place.is_cpu_place():
            return self
        else:
            res = self._copy_to(core.CPUPlace(), True)
            res.stop_gradient = self.stop_gradient
            res.persistable = self.persistable
            return res

    # Typing-only overloads for ``cuda``.
    @overload
    def cuda(
        self: Tensor, device_id: DeviceLike = None, blocking: bool = True
    ) -> Tensor: ...

    @overload
    def cuda(
        self: Tensor,
        device: DeviceLike = None,
        non_blocking: bool = False,
    ) -> Tensor: ...

    @framework.dygraph_only
    @tensor_cuda_decorator()
    def cuda(
        self: Tensor,
        device_id: DeviceLike = None,
        blocking: bool = True,
    ) -> Tensor:
        """
        Return this tensor on an accelerator place.

        Args:
            device_id: None, a ``paddle.device.Device``, an int index, a device
                string, or a CUDAPlace/CustomPlace/XPUPlace.
            blocking: Passed to ``_copy_to`` when a copy is needed.

        Returns:
            Tensor: ``self`` when it is already on the resolved place, otherwise
            a copy that keeps ``stop_gradient`` and ``persistable``.

        Raises:
            ValueError: If no supported device class is available or
                ``device_id`` has an unsupported type.
        """
        # Pick the place class to build from an int / default device id:
        # custom device first, then XPU, then CUDA.
        device_type = paddle.device.get_all_device_type()
        if len(
            device_type
        ) > 0 and paddle.device.is_compiled_with_custom_device(device_type[-1]):
            res_place_class = core.CustomPlace
        elif paddle.device.is_compiled_with_xpu():
            res_place_class = core.XPUPlace
        elif paddle.device.is_compiled_with_cuda():
            res_place_class = core.CUDAPlace
        else:
            raise ValueError("No available device found.")

        if device_id is None:
            # None
            res_place = framework._current_expected_place()
            if not isinstance(res_place, res_place_class):
                res_place = res_place_class(0)
        elif isinstance(device_id, paddle.device.Device):
            # Device
            res_place = device_id._to_place()
        elif isinstance(device_id, int):
            # int
            res_place = res_place_class(device_id)
        elif isinstance(device_id, str):
            # str
            device = paddle.device(device_id)
            res_place = device._to_place()
        elif isinstance(
            device_id, (core.CUDAPlace, core.CustomPlace, core.XPUPlace)
        ):
            # Place
            res_place = device_id
        else:
            raise ValueError(
                "device_id must be DeviceLike, which is paddle.CUDAPlace|paddle.CustomPlace|paddle.XPUPlace|int|str|None"
            )

        if self.place._equals(res_place):
            return self
        else:
            res = self._copy_to(res_place, blocking)
            res.stop_gradient = self.stop_gradient
            res.persistable = self.persistable
            return res

    @property
    def is_cuda(self: Tensor) -> bool:
        """Whether ``self.place`` is a GPU place."""
        return self.place.is_gpu_place()

    @property
    def is_cpu(self: Tensor) -> bool:
        """Whether ``self.place`` is a CPU place."""
        return self.place.is_cpu_place()

    @framework.dygraph_only
    def pin_memory(self: Tensor, blocking: bool = True) -> Tensor:
        """
        Return this tensor in pinned memory.

        ``self`` is returned when it is already on a CUDA- or XPU-pinned place.
        Otherwise it is copied to ``XPUPinnedPlace`` (when compiled with XPU) or
        ``CUDAPinnedPlace``, keeping ``stop_gradient`` and ``persistable``.

        Args:
            blocking (bool, optional): Passed to ``_copy_to``. Default: True.
        """
        if (
            self.place.is_cuda_pinned_place()
            or self.place.is_xpu_pinned_place()
        ):
            return self
        else:
            if paddle.device.is_compiled_with_xpu():
                res = self._copy_to(core.XPUPinnedPlace(), blocking)
            else:
                res = self._copy_to(core.CUDAPinnedPlace(), blocking)
            res.stop_gradient = self.stop_gradient
            res.persistable = self.persistable
            return res

    @framework.dygraph_only
    def values(self: Tensor) -> Tensor:
        """
        **Notes**:
            **This API is ONLY available in Dygraph mode**

        Get the values of current SparseTensor(COO or CSR).

        Returns:
            Tensor: A DenseTensor

        Examples:
            .. code-block:: pycon

                >>> import paddle
                >>> indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
                >>> values = [1, 2, 3, 4, 5]
                >>> dense_shape = [3, 4]
                >>> sparse_x = paddle.sparse.sparse_coo_tensor(
                ...     paddle.to_tensor(indices, dtype='int32'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape
                ... )
                >>> print(sparse_x.values())
                Tensor(shape=[5], dtype=float32, place=Place(cpu), stop_gradient=True,
                [1., 2., 3., 4., 5.])
        """
        return _C_ops.sparse_values(self)

    @framework.dygraph_only
    def to_dense(self: Tensor) -> Tensor:
        """
        **Notes**:
            **This API is ONLY available in Dygraph mode**

        Convert the current SparseTensor(COO or CSR) to DenseTensor.

        Returns:
            Tensor: A DenseTensor

        Examples:
            .. code-block:: pycon

                >>> import paddle
                >>> indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
                >>> values = [1, 2, 3, 4, 5]
                >>> dense_shape = [3, 4]
                >>> sparse_x = paddle.sparse.sparse_coo_tensor(
                ...     paddle.to_tensor(indices, dtype='int64'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape
                ... )
                >>> dense_x = sparse_x.to_dense()
                >>> print(dense_x)
                Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
                [[0., 1., 0., 2.],
                 [0., 0., 3., 0.],
                 [4., 5., 0., 0.]])
        """

        return _C_ops.sparse_to_dense(self)

    @framework.dygraph_only
    def to_sparse_coo(self: Tensor, sparse_dim: int) -> Tensor:
        """
        **Notes**:
            **This API is ONLY available in Dygraph mode**

        Convert the current DenseTensor to SparseTensor in COO format. When the input is already a SparseCooTensor, this function will directly return
        the input itself without performing any conversion.

        Args:
            sparse_dim (int): Forwarded unchanged to ``_C_ops.sparse_to_sparse_coo``.

        Returns:
            Tensor: A SparseCooTensor

        Examples:
            .. code-block:: pycon

                >>> import paddle
                >>> dense_x = [[0, 1, 0, 2], [0, 0, 3, 4]]
                >>> dense_x = paddle.to_tensor(dense_x, dtype='float32')
                >>> sparse_x = dense_x.to_sparse_coo(sparse_dim=2)
                >>> print(sparse_x)
                Tensor(shape=[2, 4], dtype=paddle.float32, place=Place(cpu), stop_gradient=True,
                       indices=[[0, 0, 1, 1],
                                [1, 3, 2, 3]],
                       values=[1., 2., 3., 4.])
        """
        if self.is_sparse_coo():
            return self

        return _C_ops.sparse_to_sparse_coo(self, sparse_dim)

    @framework.dygraph_only
    def _md5sum(self: Tensor) -> str:
        """
        **Notes**:
            **This API is ONLY available in Dygraph mode**

        Calculate the md5sum of current Tensor.

        Returns:
            str: The md5sum of current Tensor.

        Examples:
            .. code-block:: pycon

                >>> import paddle
                >>> x = paddle.to_tensor([1, 2, 3])
                >>> print(x._md5sum())
                >>> #'1f68049372c5b2a4e0d049044450
        """
        # The digest is computed over the raw bytes of the numpy view of the tensor.
        numpy_array = np.array(self)
        array_bytes = numpy_array.tobytes()
        return hashlib.md5(array_bytes).hexdigest()

    def __hash__(self):
        # Identity-based hash: two distinct Tensor objects hash differently
        # regardless of their contents.
        return hash(id(self))

    @framework.dygraph_only
    def coalesce(self: Tensor, name: str | None = None) -> Tensor:
        r"""
        the coalesced operator include sorted and merge, after coalesced, the indices of x is sorted and unique.

        Parameters:
            self (Tensor): the input SparseCooTensor.
            name (str, optional): Name for the operation (optional, default is None).
                For more information, please refer to :ref:`api_guide_Name`.
Returns: Tensor: return the SparseCooTensor after coalesced. Examples: .. code-block:: pycon >>> import paddle >>> indices = [[0, 0, 1], [1, 1, 2]] >>> values = [1.0, 2.0, 3.0] >>> sp_x = paddle.sparse.sparse_coo_tensor(indices, values) >>> sp_x = sp_x.coalesce() >>> print(sp_x.indices()) Tensor(shape=[2, 2], dtype=int64, place=Place(cpu), stop_gradient=True, [[0, 1], [1, 2]]) >>> print(sp_x.values()) Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True, [3., 3.]) """ return _C_ops.sparse_coalesce(self) @framework.dygraph_only def sparse_mask( self: Tensor, mask: Tensor, name: str | None = None ) -> Tensor: r""" constructs a sparse tensor by extracting values from a dense source at the unique, sorted indices defined by a sparse mask. Args: self (Tensor): The input dense tensor (will be filtered). mask (Tensor): Sparse tensor (SparseCooTensor or SparseCsrTensor) used as mask. name (str, optional): Operation name (ignored in this implementation). Returns: SparseTensor: A sparse tensor with the same indices as `mask`, containing values from `self` at mask positions. Examples: .. 
code-block:: pycon >>> import paddle >>> paddle.set_device('cpu') >>> paddle.seed(2024) >>> crows = [0, 2, 3, 5] >>> cols = [1, 3, 2, 0, 1] >>> values = [1.0, 2.0, 3.0, 4.0, 5.0] >>> dense_shape = [3, 4] >>> csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) >>> x = paddle.rand(dense_shape) >>> out = x.sparse_mask(csr) >>> print(out) Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(cpu), stop_gradient=True, crows=[0, 2, 3, 5], cols=[1, 3, 2, 0, 1], values=[0.23659813, 0.08467803, 0.64152628, 0.66596609, 0.90394485]) >>> paddle.seed(2024) >>> indices = [[0, 1, 2], [1, 2, 0]] >>> values = [1.0, 2.0, 3.0] >>> dense_shape = [3, 3] >>> coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) >>> x = paddle.rand(dense_shape) >>> out = x.sparse_mask(coo) >>> print(out) Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(cpu), stop_gradient=True, indices=[[0, 1, 2], [1, 2, 0]], values=[0.23659813, 0.40340215, 0.64152628]) """ return _C_ops.sparse_mask_as(self, mask) @framework.dygraph_only def __dlpack_device__(self): """ Extract the DLPack device type and device ID for the current tensor. Returns: tuple: A tuple containing the DLPack device type and device ID. - device_type (DLDeviceType): The type of device (e.g., kDLCPU, kDLCUDA, etc.). - device_id (int): The device ID. 
""" place = self.place if isinstance(place, Place): if place.is_gpu_place(): return DLDeviceType.kDLCUDA, place.gpu_device_id() elif place.is_cpu_place(): return DLDeviceType.kDLCPU, None elif place.is_cuda_pinned_place(): return DLDeviceType.kDLCUDAHost, None elif place.is_xpu_place(): return DLDeviceType.kDLOneAPI, place.xpu_device_id() else: raise RuntimeError(f"Unsupported Paddle device type {place}") elif place.is_cpu_place(): return DLDeviceType.kDLCPU, None elif place.is_cuda_pinned_place(): return DLDeviceType.kDLCUDAHost, None elif place.is_gpu_place(): return DLDeviceType.kDLCUDA, place.get_device_id() elif place.is_xpu_place(): return DLDeviceType.kDLOneAPI, place.get_device_id() else: raise ValueError(f"Unsupported tensor place: {place}") @property def device(self: Tensor) -> str: """ Return the device descriptor string indicating where the tensor is located. Returns: str: A string representing the device where the tensor resides. Possible formats include: - 'cpu' for CPU tensors - 'cuda:{device_id}' for GPU tensors (e.g., 'cuda:0') - 'xpu:{device_id}' for XPU tensors (e.g., 'xpu:0') - '{device_type}:{device_id}' for custom device tensors Examples: .. code-block:: pycon >>> import paddle >>> # CPU tensor >>> cpu_tensor = paddle.to_tensor([1, 2, 3]).to("cpu") >>> print(cpu_tensor.device) 'cpu' """ place = self.place return paddle.device(place) @property def __cuda_array_interface__(self): """Array view description for cuda tensors. See: CUDA Array Interface (Version 2) https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html """ # raise AttributeError for unsupported tensors, so that # hasattr(cpu_tensor, "__cuda_array_interface__") is False. if not self.place.is_gpu_place(): raise AttributeError( "Can't get __cuda_array_interface__ on non-CUDA tensor. " "If CUDA data is required use tensor.cuda() to copy tensor to device memory." ) if self.is_sparse(): raise AttributeError( "Can't get __cuda_array_interface__ on sparse tensor. 
" "Use Tensor.to_dense() to convert to a dense tensor first." ) # RuntimeError, matching tensor.__array__() behavior. if not self.stop_gradient: raise RuntimeError( "Can't get __cuda_array_interface__ on Tensor that requires grad. " "If gradients aren't required, use var.detach() to get Tensor that doesn't require grad." ) # CUDA devices are little-endian and tensors are stored in native byte # order. 1-byte entries are endian-agnostic. typestr = { paddle.complex64: " 0 else 0 data = (data_ptr, False) # read-only is false return { "typestr": typestr, "shape": shape, "strides": strides, "data": data, "version": 2, } def __dlpack__( self, *, stream: int | None = None, max_version: tuple[int, int] | None = None, dl_device: tuple[IntEnum, int] | None = None, copy: bool | None = None, ) -> CapsuleType: """ Creates a DLPack capsule of the current tensor to be exported to other libraries. Args: stream (int | None, optional): An optional Python integer representing a pointer to a CUDA stream. Synchronizes the tensor with this stream before exporting. If None or -1, no synchronization is performed. If 0, the default stream is used. max_version (tuple[int, int] | None): An optional Python tuple with 2 integers, representing the maximum version the caller supports. If None (default), we will fallback to DLPack 0.8. dl_device (tuple[IntEnum, int] | None, optional): The DLPack device type. Default is None, meaning the exported capsule should be on the same device as self is. When specified, the format must be a 2-tuple, following that of the return value of array.__dlpack_device__(). copy (bool | None, optional): Whether or not to copy the input. If True, the output tensor always copied. If False, the output tensor must never copied, and raise a BufferError in case a copy is deemed necessary. If None, the output tensor must reuse the existing memory buffer if possible and copy otherwise. Default: None. 
""" if self.is_sparse(): raise BufferError( "Can't get __dlpack__ from a Tensor from sparse storage." ) if not self.stop_gradient: raise BufferError( "Can't get __dlpack__ from Tensor that requires gradients. " "If gradients aren't required, use tensor.detach() to get a tensor without gradient." ) if stream is not None and not isinstance(stream, int): raise TypeError("stream must be an integer or None.") elif self.place.is_gpu_place() and stream != -1: is_rocm = paddle.is_compiled_with_rocm() is_cuda = paddle.is_compiled_with_cuda() if not (is_rocm or is_cuda): raise RuntimeError( "DLPack with stream synchronization is only supported " "when Paddle is compiled with CUDA or ROCm." ) if is_cuda and stream == 0: raise ValueError( "For CUDA, stream=0 is ambiguityous, please use None for default stream." ) if is_cuda and stream == 2: raise ValueError( "For CUDA, stream=2 means per-thread default stream, which is not supported." ) if is_rocm and stream in {1, 2}: raise ValueError("For ROCm, stream=1 or 2 is not supported.") if ( stream is None # For CUDA, stream=1 means default stream or (is_cuda and stream == 1) # For ROCm, stream=0 means default stream or (is_rocm and stream == 0) ): consumer_stream = paddle.device.Stream( stream_base=core._get_legacy_default_stream( paddle.framework._current_expected_place_().get_device_id() ) ) else: assert stream > 2, "stream should be a valid stream pointer." consumer_stream = paddle.device.get_stream_from_external(stream) current_stream = paddle.device.current_stream() def is_same_stream( lhs: paddle.device.Stream, rhs: paddle.device.Stream ) -> bool: return ( lhs.stream_base.raw_stream == rhs.stream_base.raw_stream ) and (lhs.device == rhs.device) if not is_same_stream(consumer_stream, current_stream): event = paddle.device.Event() event.record(current_stream) consumer_stream.wait_event(event) elif self.place.is_cpu_place(): assert stream is None, "CPU tensor stream must be None." 
if max_version is None or max_version[0] < 1: return self.get_tensor()._to_dlpack(dl_device=dl_device, copy=copy) return self.get_tensor()._to_dlpack_versioned( dl_device=dl_device, copy=copy ) def get_device(self: Tensor) -> int: """ Return the device id where the Tensor is located. Returns: int: The device id of the Tensor. Returns -1 for CPU tensors; for GPU tensors, returns the CUDA device id (e.g., 0 for `gpu:0`). Examples: .. code-block:: pycon >>> import paddle >>> x = paddle.to_tensor([1, 2, 3], place=paddle.CPUPlace()) >>> x.get_device() -1 >>> # doctest: +REQUIRES(env:GPU) >>> y = paddle.to_tensor([1, 2, 3], place=paddle.CUDAPlace(0)) >>> y.get_device() 0 """ if self.place.is_cpu_place(): return -1 else: return self.place.gpu_device_id() def __tvm_ffi_env_stream__(self) -> int: """ Returns the raw stream pointer of the current tensor's device context. This is used for TVM FFI environment integration. """ if self.place.is_gpu_place(): return paddle.base.libpaddle._get_current_raw_stream( self.place.gpu_device_id() ) else: # TODO: Add XPU and custom device support. raise RuntimeError( "Currently, the __tvm_ffi_env_stream__ method is only supported for GPU tensors." 
) for method_name, method in ( ("__bool__", __bool__), ("__nonzero__", __nonzero__), ("_to_static_var", _to_static_var), ("set_value", set_value), ("block", block), ("backward", backward), ("clear_grad", clear_grad), ("inplace_version", inplace_version), ("is_cuda", is_cuda), ("is_cpu", is_cpu), ("gradient", gradient), ("apply_", apply_), ("apply", apply), ("register_hook", register_hook), ("__str__", __str__), ("__repr__", __str__), ("__format__", __format__), ("__deepcopy__", __deepcopy__), ("__module__", "paddle"), ("__array__", __array__), ("__getitem__", __getitem__), ("item", item), ("__setitem__", __setitem__), ("_to", _to), ("to", to), ("values", values), ("to_dense", to_dense), ("to_sparse_coo", to_sparse_coo), ("coalesce", coalesce), ("sparse_mask", sparse_mask), ("_set_grad_ivar", _set_grad_ivar), ("value", value), ("cpu", cpu), ("cuda", cuda), ("pin_memory", pin_memory), ("_slice", _slice), ("_numel", _numel), ("_uva", _uva), ("_clear_data", _clear_data), ("__hash__", __hash__), ("_use_gpudnn", _use_gpudnn), ("_md5sum", _md5sum), ("__cuda_array_interface__", __cuda_array_interface__), ("__dlpack__", __dlpack__), ("__dlpack_device__", __dlpack_device__), ("get_device", get_device), ("__tvm_ffi_env_stream__", __tvm_ffi_env_stream__), # For TVM FFI 0.1.0-0.1.4 ("__c_dlpack_exchange_api__", core.dlpack_exchange_api_ptr()), # For TVM FFI 0.1.5+ ("__dlpack_c_exchange_api__", core.dlpack_exchange_api_pycapsule()), ("device", device), ): setattr(core.eager.Tensor, method_name, method) global _already_patch_repr if not _already_patch_repr: # NOTE(zhiqiu): pybind11 will set a default __str__ method of enum class. # So, we need to overwrite it to a more readable one. # See details in https://github.com/pybind/pybind11/issues/2537. origin = core.VarDesc.VarType.__str__ def dtype_str(dtype): if dtype in _PADDLE_DTYPE_2_NUMPY_DTYPE: numpy_dtype = _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] if numpy_dtype == 'uint16': numpy_dtype = 'bfloat16' prefix = 'paddle.' 
return prefix + numpy_dtype else: # for example, paddle.base.core.VarDesc.VarType.DENSE_TENSOR return origin(dtype) core.VarDesc.VarType.__str__ = dtype_str _already_patch_repr = True # patch math methods for tensor monkey_patch_math_tensor()