Open3D (C++ API)  0.19.0
DLPack.h File Reference

The common header of DLPack. More...

#include <stddef.h>
#include <stdint.h>
#include <fmt/core.h>
#include <fmt/format.h>


Data Structures

struct  DLPackVersion
 The DLPack version. More...
 
struct  DLDevice
 A Device for Tensor and operator. More...
 
struct  DLDataType
 The data type the tensor can hold. The data type is assumed to follow the native endianness. An explicit error message should be raised when attempting to export an array with non-native endianness. More...
 
struct  DLTensor
 Plain C Tensor object, does not manage memory. More...
 
struct  DLManagedTensor
 C Tensor object that manages the memory of a DLTensor. This data structure is intended to facilitate the borrowing of a DLTensor by another framework. It is not meant to transfer the tensor. When the borrowing framework no longer needs the tensor, it should call the deleter to notify the host that the resource is no longer needed. More...
 
struct  DLManagedTensorVersioned
 A versioned and managed C Tensor object that manages the memory of a DLTensor. More...
 
struct  DLPackExchangeAPIHeader
 DLPackExchangeAPI stable header. More...
 
struct  DLPackExchangeAPI
 Framework-specific function pointers table for DLPack exchange. More...
 
struct  fmt::formatter< DLDeviceType >
 

Namespaces

 fmt
 

Macros

#define DLPACK_EXTERN_C
 Compatibility with C++. More...
 
#define DLPACK_MAJOR_VERSION   1
 The current major version of dlpack. More...
 
#define DLPACK_MINOR_VERSION   2
 The current minor version of dlpack. More...
 
#define DLPACK_DLL
 DLPACK_DLL prefix for Windows. More...
 
#define DLPACK_FLAG_BITMASK_READ_ONLY   (1UL << 0UL)
 bit mask to indicate that the tensor is read only. More...
 
#define DLPACK_FLAG_BITMASK_IS_COPIED   (1UL << 1UL)
 bit mask to indicate that the tensor is a copy made by the producer. More...
 
#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED   (1UL << 2UL)
 bit mask to indicate whether a sub-byte type is packed or padded. More...
 

Typedefs

typedef struct DLManagedTensor DLManagedTensor
 C Tensor object that manages the memory of a DLTensor. This data structure is intended to facilitate the borrowing of a DLTensor by another framework. It is not meant to transfer the tensor. When the borrowing framework no longer needs the tensor, it should call the deleter to notify the host that the resource is no longer needed. More...
 
typedef struct DLManagedTensorVersioned DLManagedTensorVersioned
 A versioned and managed C Tensor object that manages the memory of a DLTensor. More...
 
typedef int(* DLPackManagedTensorAllocator) (DLTensor *prototype, DLManagedTensorVersioned **out, void *error_ctx, void(*SetError) (void *error_ctx, const char *kind, const char *message))
 Request a producer library to create a new tensor. More...
 
typedef int(* DLPackManagedTensorFromPyObjectNoSync) (void *py_object, DLManagedTensorVersioned **out)
 Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned. More...
 
typedef int(* DLPackDLTensorFromPyObjectNoSync) (void *py_object, DLTensor *out)
 Exports a PyObject* Tensor/NDArray to a provided DLTensor. More...
 
typedef int(* DLPackCurrentWorkStream) (DLDeviceType device_type, int32_t device_id, void **out_current_stream)
 Obtain the current work stream of a device. More...
 
typedef int(* DLPackManagedTensorToPyObjectNoSync) (DLManagedTensorVersioned *tensor, void **out_py_object)
 Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray. More...
 
typedef struct DLPackExchangeAPIHeader DLPackExchangeAPIHeader
 DLPackExchangeAPI stable header. More...
 
typedef struct DLPackExchangeAPI DLPackExchangeAPI
 Framework-specific function pointers table for DLPack exchange. More...
 

Enumerations

enum  DLDeviceType {
  kDLCPU = 1 , kDLCUDA = 2 , kDLCUDAHost = 3 , kDLOpenCL = 4 ,
  kDLVulkan = 7 , kDLMetal = 8 , kDLVPI = 9 , kDLROCM = 10 ,
  kDLROCMHost = 11 , kDLExtDev = 12 , kDLCUDAManaged = 13 , kDLOneAPI = 14 ,
  kDLWebGPU = 15 , kDLHexagon = 16 , kDLMAIA = 17 , kDLTrn = 18
}
 The device type in DLDevice. More...
 
enum  DLDataTypeCode {
  kDLInt = 0U , kDLUInt = 1U , kDLFloat = 2U , kDLOpaqueHandle = 3U ,
  kDLBfloat = 4U , kDLComplex = 5U , kDLBool = 6U , kDLFloat8_e3m4 = 7U ,
  kDLFloat8_e4m3 = 8U , kDLFloat8_e4m3b11fnuz = 9U , kDLFloat8_e4m3fn = 10U , kDLFloat8_e4m3fnuz = 11U ,
  kDLFloat8_e5m2 = 12U , kDLFloat8_e5m2fnuz = 13U , kDLFloat8_e8m0fnu = 14U , kDLFloat6_e2m3fn = 15U ,
  kDLFloat6_e3m2fn = 16U , kDLFloat4_e2m1fn = 17U
}
 The type code options of DLDataType. More...
 

Detailed Description

The common header of DLPack.

Copyright (c) 2017 - by Contributors

Macro Definition Documentation

◆ DLPACK_DLL

#define DLPACK_DLL

DLPACK_DLL prefix for Windows.

◆ DLPACK_EXTERN_C

#define DLPACK_EXTERN_C

Compatibility with C++.

◆ DLPACK_FLAG_BITMASK_IS_COPIED

#define DLPACK_FLAG_BITMASK_IS_COPIED   (1UL << 1UL)

bit mask to indicate that the tensor is a copy made by the producer.

If set, the tensor is considered solely owned throughout its lifetime by the consumer, until the producer-provided deleter is invoked.

◆ DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED

#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED   (1UL << 2UL)

bit mask to indicate whether a sub-byte type is packed or padded.

The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can be set by the producer to signal that a tensor of sub-byte type is padded.

◆ DLPACK_FLAG_BITMASK_READ_ONLY

#define DLPACK_FLAG_BITMASK_READ_ONLY   (1UL << 0UL)

bit mask to indicate that the tensor is read only.
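A consumer typically tests these bits against the flags field of a DLManagedTensorVersioned. A minimal sketch, with the flag values reproduced from the macros above so it compiles standalone; the helper names are illustrative, not part of DLPack.h:

```c
#include <stdint.h>

/* Flag values reproduced from DLPack.h so this sketch compiles standalone. */
#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)

/* Illustrative consumer-side checks on DLManagedTensorVersioned::flags. */
int is_writable(uint64_t flags) {
    return (flags & DLPACK_FLAG_BITMASK_READ_ONLY) == 0;
}

int is_producer_copy(uint64_t flags) {
    return (flags & DLPACK_FLAG_BITMASK_IS_COPIED) != 0;
}
```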

◆ DLPACK_MAJOR_VERSION

#define DLPACK_MAJOR_VERSION   1

The current major version of dlpack.

◆ DLPACK_MINOR_VERSION

#define DLPACK_MINOR_VERSION   2

The current minor version of dlpack.

Typedef Documentation

◆ DLManagedTensor

C Tensor object that manages the memory of a DLTensor. This data structure is intended to facilitate the borrowing of a DLTensor by another framework. It is not meant to transfer the tensor. When the borrowing framework no longer needs the tensor, it should call the deleter to notify the host that the resource is no longer needed.

Note
This data structure is used as the legacy DLManagedTensor in DLPack exchange and is deprecated after DLPack v0.8. Use DLManagedTensorVersioned instead. This data structure may get renamed or deleted in future versions.
See also
DLManagedTensorVersioned

◆ DLManagedTensorVersioned

A versioned and managed C Tensor object that manages the memory of a DLTensor.

This data structure is intended to facilitate the borrowing of DLTensor by another framework. It is not meant to transfer the tensor. When the borrowing framework doesn't need the tensor, it should call the deleter to notify the host that the resource is no longer needed.

Note
This is the current standard DLPack exchange data structure.

◆ DLPackCurrentWorkStream

typedef int(* DLPackCurrentWorkStream) ( DLDeviceType device_type, int32_t device_id, void **out_current_stream)

Obtain the current work stream of a device.

Obtain the current work stream of a device from the producer framework. For example, it should map to torch.cuda.current_stream in PyTorch.

When device_type is kDLCPU, the consumer does not have to query the stream and the producer can simply return NULL when queried. The consumer does not have to do anything for stream synchronization or stream setting, so a CPU-only framework can simply provide a dummy implementation that always sets out_current_stream[0] to NULL.

Parameters
device_type: The device type.
device_id: The device id.
out_current_stream: The output current work stream.
Returns
0 on success, -1 on failure with a Python exception set.
Note
- As a C function, it must not throw C++ exceptions.
See also
DLPackExchangeAPI
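The dummy CPU-only implementation described above, together with the consumer-side query, can be sketched as follows. The DLDeviceType subset is mirrored locally so the snippet compiles standalone, and the function names are illustrative:

```c
#include <stddef.h>
#include <stdint.h>

/* Subset of DLDeviceType, mirrored here so the sketch compiles standalone. */
typedef enum { kDLCPU = 1, kDLCUDA = 2 } DLDeviceType;

typedef int (*DLPackCurrentWorkStream)(DLDeviceType device_type,
                                       int32_t device_id,
                                       void **out_current_stream);

/* A CPU-only framework's dummy implementation: always reports a NULL
 * stream and succeeds, as the documentation above permits. */
static int CpuOnlyCurrentWorkStream(DLDeviceType device_type,
                                    int32_t device_id,
                                    void **out_current_stream) {
    (void)device_type;
    (void)device_id;
    out_current_stream[0] = NULL;
    return 0;
}

/* Consumer side: query the stream before launching kernels on it. */
int cpu_stream_is_null(void) {
    void *stream = (void *)1; /* poison value, must be overwritten */
    if (CpuOnlyCurrentWorkStream(kDLCPU, 0, &stream) != 0) return 0;
    return stream == NULL;
}
```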

◆ DLPackDLTensorFromPyObjectNoSync

typedef int(* DLPackDLTensorFromPyObjectNoSync) ( void *py_object, DLTensor *out)

Exports a PyObject* Tensor/NDArray to a provided DLTensor.

This function provides a faster interface for temporary, non-owning exchange. The producer (implementor) still owns the memory of data, strides, and shape. The liveness of the DLTensor and the data it views is only guaranteed until control is returned.

This function currently assumes that the producer (implementor) can fill in the DLTensor shape and strides without the need for temporary allocations.

This function does not perform any stream synchronization. The consumer should query DLPackCurrentWorkStream to get the current work stream and launch kernels on it.

This function is exposed by the framework through the DLPackExchangeAPI.

Parameters
py_object: The Python object to convert. Must have the same type as the one the DLPackExchangeAPI was discovered from.
out: The output DLTensor, whose space is pre-allocated on the stack.
Returns
0 on success, -1 on failure with a Python exception set.
Note
- As a C function, it must not throw C++ exceptions.
See also
DLPackExchangeAPI, DLPackCurrentWorkStream
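The borrowing pattern can be sketched as follows, using minimal stand-ins for the DLPack structs and a hypothetical producer (FakeFromPyObjectNoSync) that fills in a non-owning 1-D view; none of the stand-in names below come from DLPack.h itself:

```c
#include <stddef.h>
#include <stdint.h>

/* Minimal stand-in for DLTensor (see DLPack.h for the real layout). */
typedef struct {
    void *data;
    int32_t ndim;
    int64_t *shape;
} DLTensor;

typedef int (*DLPackDLTensorFromPyObjectNoSync)(void *py_object, DLTensor *out);

/* Hypothetical producer: fills in a 1-D view over storage it owns.
 * No temporary allocations are needed for shape/strides. */
static int64_t g_shape[1] = {4};
static float g_data[4] = {1.0f, 2.0f, 3.0f, 4.0f};

static int FakeFromPyObjectNoSync(void *py_object, DLTensor *out) {
    (void)py_object;
    out->data = g_data;   /* producer still owns this memory */
    out->ndim = 1;
    out->shape = g_shape; /* valid only until control returns */
    return 0;
}

/* Consumer pattern: the DLTensor lives on the stack; there is no deleter
 * to call, and the view must be consumed before returning control. */
int consume(DLPackDLTensorFromPyObjectNoSync from_py_object) {
    DLTensor view; /* space pre-allocated on the stack */
    if (from_py_object(NULL, &view) != 0) return -1;
    return (int)view.shape[0];
}
```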

◆ DLPackExchangeAPI

Framework-specific function pointers table for DLPack exchange.

In addition to __dlpack__(), we define a C function table sharable by Python implementations via __c_dlpack_exchange_api__. This attribute must be set on the type as a Python integer compatible with PyLong_FromVoidPtr/PyLong_AsVoidPtr.

A consumer library may use a pattern such as:

PyObject *api_obj = type(tensor_obj).__c_dlpack_exchange_api__; // as C code
MyDLPackExchangeAPI *api = (MyDLPackExchangeAPI *)PyLong_AsVoidPtr(api_obj);
if (api == NULL && PyErr_Occurred()) { goto handle_error; }

Note that this must be defined on the type. The consumer should look up the attribute on the type and may cache the result for each unique type.

The precise API table is given by:

struct MyDLPackExchangeAPI : public DLPackExchangeAPI {
  MyDLPackExchangeAPI() {
    header.version.major = DLPACK_MAJOR_VERSION;
    header.version.minor = DLPACK_MINOR_VERSION;
    header.prev_version_api = nullptr;
    managed_tensor_allocator = MyDLPackManagedTensorAllocator;
    managed_tensor_from_py_object_no_sync = MyDLPackManagedTensorFromPyObjectNoSync;
    managed_tensor_to_py_object_no_sync = MyDLPackManagedTensorToPyObjectNoSync;
    dltensor_from_py_object_no_sync = MyDLPackDLTensorFromPyObjectNoSync;
    current_work_stream = MyDLPackCurrentWorkStream;
  }
  static const DLPackExchangeAPI* Global() {
    static MyDLPackExchangeAPI inst;
    return &inst;
  }
};

Guidelines for leveraging DLPackExchangeAPI:

There are generally two kinds of consumer needs for DLPack exchange:

  • N0: library support, where consumer.kernel(x, y, z) would like to run a kernel on the data from x, y, z. The consumer is also expected to run the kernel in the same stream context as the producer. For example, when x, y, z are torch.Tensor, the consumer should query exchange_api->current_work_stream to get the current stream and launch the kernel on that stream. This setup avoids synchronization at kernel launch and gives maximum compatibility with CUDA graph capture in the producer. This is the desirable behavior for library extension support for frameworks like PyTorch.
  • N1: data ingestion and retention

Note that obj.__dlpack__() API should provide useful ways for N1. The primary focus of the current DLPackExchangeAPI is to enable faster exchange N0 with the support of the function pointer current_work_stream.

Array/Tensor libraries should statically create and initialize this structure, then return a pointer to the DLPackExchangeAPI as an int value on the Tensor/Array type. The DLPackExchangeAPI* must stay alive throughout the lifetime of the process.

One simple way to do so is to create a static instance of DLPackExchangeAPI within the framework and return a pointer to it, as the C++ example above shows. It should also be reasonably easy to do so in other languages.
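Since the consumer may cache the looked-up API per unique type, one possible consumer-side cache looks like the sketch below. Keys are (opaque) Python type pointers, a linear scan suffices for the handful of tensor types a consumer typically sees, and all names and the capacity are illustrative:

```c
#include <stddef.h>

/* A tiny per-type cache for exchange-API lookups. */
typedef struct {
    void *type;
    const void *api; /* would be const DLPackExchangeAPI* in real code */
} ApiCacheEntry;

#define API_CACHE_CAP 8
static ApiCacheEntry g_api_cache[API_CACHE_CAP];
static int g_api_cache_len = 0;

/* Return the cached API pointer for a type, or NULL if not yet cached. */
const void *api_cache_get(void *type) {
    for (int i = 0; i < g_api_cache_len; ++i) {
        if (g_api_cache[i].type == type) return g_api_cache[i].api;
    }
    return NULL;
}

/* Record a type -> API mapping; returns 0 on success, -1 if full. */
int api_cache_put(void *type, const void *api) {
    if (g_api_cache_len >= API_CACHE_CAP) return -1;
    g_api_cache[g_api_cache_len].type = type;
    g_api_cache[g_api_cache_len].api = api;
    ++g_api_cache_len;
    return 0;
}
```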

◆ DLPackExchangeAPIHeader

DLPackExchangeAPI stable header.

See also
DLPackExchangeAPI

◆ DLPackManagedTensorAllocator

typedef int(* DLPackManagedTensorAllocator) ( DLTensor *prototype, DLManagedTensorVersioned **out, void *error_ctx, void(*SetError)(void *error_ctx, const char *kind, const char *message))

Request a producer library to create a new tensor.

Create a new DLManagedTensorVersioned within the context of the producer library. The allocation is defined via the prototype DLTensor.

This function is exposed by the framework through the DLPackExchangeAPI.

Parameters
prototype: The prototype DLTensor. Only the dtype, ndim, shape, and device fields are used.
out: The output DLManagedTensorVersioned.
error_ctx: Context for SetError.
SetError: The function to set the error.
Returns
0 on success, -1 on failure. SetError is called exactly when a failure is returned (the implementor must ensure this).
Note
- As a C function, it must not throw C++ exceptions.
  • Errors are propagated via SetError to avoid any direct need for the Python API. Because of this, SetError may have to ensure the GIL is held, since it will presumably set a Python error.
See also
DLPackExchangeAPI
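A sketch of how a producer might implement this allocator contract, using minimal stand-ins for the DLPack structs (see DLPack.h for the real layouts); note that SetError is called on exactly the failure paths. All names here are illustrative:

```c
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Minimal stand-ins for the DLPack structs. */
typedef struct { void *data; int32_t ndim; } DLTensor;
typedef struct DLManagedTensorVersioned DLManagedTensorVersioned;
struct DLManagedTensorVersioned {
    DLTensor dl_tensor;
    void (*deleter)(DLManagedTensorVersioned *self);
};

static void MyDeleter(DLManagedTensorVersioned *self) { free(self); }

/* A recording SetError for the demo (a real one would set a Python error). */
static int g_error_count = 0;
static void RecordError(void *error_ctx, const char *kind, const char *message) {
    (void)error_ctx; (void)kind; (void)message;
    ++g_error_count;
}

static int MyAllocator(DLTensor *prototype, DLManagedTensorVersioned **out,
                       void *error_ctx,
                       void (*SetError)(void *, const char *, const char *)) {
    if (prototype->ndim < 0) { /* reject invalid requests */
        SetError(error_ctx, "ValueError", "ndim must be non-negative");
        return -1; /* SetError called exactly when failure is returned */
    }
    DLManagedTensorVersioned *t = malloc(sizeof *t);
    if (t == NULL) {
        SetError(error_ctx, "MemoryError", "allocation failed");
        return -1;
    }
    t->dl_tensor = *prototype;
    t->deleter = MyDeleter;
    *out = t;
    return 0;
}

int demo_alloc_ok(void) {
    DLTensor proto = { NULL, 2 };
    DLManagedTensorVersioned *t = NULL;
    if (MyAllocator(&proto, &t, NULL, RecordError) != 0) return 0;
    int ok = (t != NULL) && (g_error_count == 0);
    t->deleter(t); /* consumer releases via the deleter */
    return ok;
}

int demo_alloc_fail(void) {
    DLTensor proto = { NULL, -1 };
    DLManagedTensorVersioned *t = NULL;
    int rc = MyAllocator(&proto, &t, NULL, RecordError);
    return rc == -1 && g_error_count == 1;
}
```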

◆ DLPackManagedTensorFromPyObjectNoSync

typedef int(* DLPackManagedTensorFromPyObjectNoSync) ( void *py_object, DLManagedTensorVersioned **out)

Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.

This function does not perform any stream synchronization. The consumer should query DLPackCurrentWorkStream to get the current work stream and launch kernels on it.

This function is exposed by the framework through the DLPackExchangeAPI.

Parameters
py_object: The Python object to convert. Must have the same type as the one the DLPackExchangeAPI was discovered from.
out: The output DLManagedTensorVersioned.
Returns
0 on success, -1 on failure with a Python exception set. If the data cannot be described using DLPack, the exception should be a BufferError if possible.
Note
- As a C function, it must not throw C++ exceptions.
See also
DLPackExchangeAPI, DLPackCurrentWorkStream

◆ DLPackManagedTensorToPyObjectNoSync

typedef int(* DLPackManagedTensorToPyObjectNoSync) ( DLManagedTensorVersioned *tensor, void **out_py_object)

Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.

Convert an owning DLManagedTensorVersioned* to the Python tensor of the producer (implementor) library with the correct type.

This function does not perform any stream synchronization.

This function is exposed by the framework through the DLPackExchangeAPI.

Parameters
tensor: The DLManagedTensorVersioned to convert; ownership of the tensor is stolen.
out_py_object: The output Python object.
Returns
0 on success, -1 on failure with a Python exception set.
See also
DLPackExchangeAPI

Enumeration Type Documentation

◆ DLDataTypeCode

The type code options of DLDataType.

Enumerator
kDLInt 

signed integer

kDLUInt 

unsigned integer

kDLFloat 

IEEE floating point.

kDLOpaqueHandle 

Opaque handle type, reserved for testing purposes. Frameworks need to agree on the handle data type for the exchange to be well-defined.

kDLBfloat 

bfloat16

kDLComplex 

complex number (C/C++/Python layout: compact struct per complex number)

kDLBool 

boolean

kDLFloat8_e3m4 

FP8 data types.

kDLFloat8_e4m3 
kDLFloat8_e4m3b11fnuz 
kDLFloat8_e4m3fn 
kDLFloat8_e4m3fnuz 
kDLFloat8_e5m2 
kDLFloat8_e5m2fnuz 
kDLFloat8_e8m0fnu 
kDLFloat6_e2m3fn 

FP6 data types. Setting bits != 6 is currently unspecified; the producer must ensure bits is set correctly, and the consumer must stop importing if the value is unexpected.

kDLFloat6_e3m2fn 
kDLFloat4_e2m1fn 

FP4 data types. Setting bits != 4 is currently unspecified; the producer must ensure bits is set correctly, and the consumer must stop importing if the value is unexpected.
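A DLDataType is a (code, bits, lanes) triple; float32, for example, is (kDLFloat, 32, 1). A sketch with the struct and a code subset mirrored locally so it compiles standalone (see DLPack.h for the real definition); dtype_size_bytes is an illustrative helper, not part of the header:

```c
#include <stddef.h>
#include <stdint.h>

/* Subset of DLDataTypeCode and the DLDataType layout, mirrored from
 * DLPack.h so this sketch compiles standalone. */
typedef enum { kDLInt = 0U, kDLUInt = 1U, kDLFloat = 2U } DLDataTypeCode;

typedef struct {
    uint8_t code;   /* a DLDataTypeCode value */
    uint8_t bits;   /* bits per lane, e.g. 32 for float32, 4 for fp4 */
    uint16_t lanes; /* >1 for vector types */
} DLDataType;

/* float32 is (kDLFloat, 32, 1). */
DLDataType make_float32(void) {
    DLDataType dt = {(uint8_t)kDLFloat, 32, 1};
    return dt;
}

/* Illustrative helper (not part of DLPack.h): bytes per element, rounding
 * packed sub-byte types such as fp4/fp6 up to whole bytes. */
size_t dtype_size_bytes(DLDataType dt) {
    return ((size_t)dt.bits * dt.lanes + 7) / 8;
}
```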

◆ DLDeviceType

The device type in DLDevice.

Enumerator
kDLCPU 

CPU device.

kDLCUDA 

CUDA GPU device.

kDLCUDAHost 

Pinned CUDA CPU memory allocated by cudaMallocHost.

kDLOpenCL 

OpenCL devices.

kDLVulkan 

Vulkan buffer for next generation graphics.

kDLMetal 

Metal for Apple GPU.

kDLVPI 

Verilog simulator buffer.

kDLROCM 

ROCm device for AMD GPUs.

kDLROCMHost 

Pinned ROCm CPU memory allocated by hipMallocHost.

kDLExtDev 

Reserved extension device type, used to quickly test extension devices. The semantics can differ depending on the implementation.

kDLCUDAManaged 

CUDA managed/unified memory allocated by cudaMallocManaged.

kDLOneAPI 

Unified shared memory allocated on a oneAPI non-partitioned device. A call to the oneAPI runtime is required to determine the device type, the USM allocation type, and the SYCL context it is bound to.

kDLWebGPU 

GPU support for next generation WebGPU standard.

kDLHexagon 

Qualcomm Hexagon DSP.

kDLMAIA 

Microsoft MAIA devices.

kDLTrn 

AWS Trainium.
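As the enumerator descriptions above note, several device types (kDLCPU, kDLCUDAHost, kDLROCMHost, kDLCUDAManaged) describe memory that host code can read directly. A small illustrative helper over a locally mirrored subset of the enum; dldevice_cpu_accessible is an assumption-level helper, not part of DLPack.h:

```c
#include <stdint.h>

/* Subset of DLDeviceType and the DLDevice pair, mirrored from DLPack.h
 * so this sketch compiles standalone. */
typedef enum {
    kDLCPU = 1,
    kDLCUDA = 2,
    kDLCUDAHost = 3,
    kDLROCMHost = 11,
    kDLCUDAManaged = 13
} DLDeviceType;

typedef struct {
    DLDeviceType device_type;
    int32_t device_id;
} DLDevice;

/* Illustrative helper: whether memory on this device can be dereferenced
 * directly from host code. kDLCUDAHost and kDLROCMHost are pinned host
 * memory; kDLCUDAManaged is unified memory. */
int dldevice_cpu_accessible(DLDevice dev) {
    switch (dev.device_type) {
        case kDLCPU:
        case kDLCUDAHost:
        case kDLROCMHost:
        case kDLCUDAManaged:
            return 1;
        default:
            return 0;
    }
}
```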