30 #include <type_traits> 40 #include <cuda_runtime.h> 50 static constexpr int64_t OPEN3D_PARFOR_BLOCK = 128;
51 static constexpr int64_t OPEN3D_PARFOR_THREAD = 4;
54 template <
int64_t block_size,
int64_t thread_size,
typename func_t>
55 __global__
void ElementWiseKernel_(int64_t n, func_t f) {
56 int64_t items_per_block = block_size * thread_size;
57 int64_t idx = blockIdx.x * items_per_block + threadIdx.x;
59 for (int64_t i = 0; i < thread_size; ++i) {
68 template <
typename func_t>
69 void ParallelForCUDA_(
const Device& device, int64_t n,
const func_t& func) {
78 CUDAScopedDevice scoped_device(device);
79 int64_t items_per_block = OPEN3D_PARFOR_BLOCK * OPEN3D_PARFOR_THREAD;
80 int64_t grid_size = (n + items_per_block - 1) / items_per_block;
82 ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>
83 <<<grid_size, OPEN3D_PARFOR_BLOCK, 0, core::cuda::GetStream()>>>(
91 template <
typename func_t>
101 #pragma omp parallel for num_threads(utility::EstimateMaxThreads()) 102 for (int64_t i = 0; i < n; ++i) {
121 template <
typename func_t>
124 ParallelForCUDA_(device, n, func);
176 template <
typename vec_func_t,
typename func_t>
180 const vec_func_t& vec_func) {
181 #ifdef BUILD_ISPC_MODULE 184 ParallelForCUDA_(device, n, func);
188 int64_t start = n * i / num_threads;
189 int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
190 vec_func(start, end);
197 ParallelForCUDA_(device, n, func);
205 #ifdef BUILD_ISPC_MODULE 208 #define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \ 209 using namespace ispc; \ 210 ISPCKernel(start, end, __VA_ARGS__); 215 #define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \ 217 "ISPC module disabled. Unable to call vectorized kernel {}", \ 218 OPEN3D_STRINGIFY(ISPCKernel)); 223 #define OPEN3D_OVERLOADED_LAMBDA_(T, ISPCKernel, ...) \ 224 [&](T, int64_t start, int64_t end) { \ 225 OPEN3D_CALL_ISPC_KERNEL_( \ 226 OPEN3D_CONCAT(ISPCKernel, OPEN3D_CONCAT(_, T)), start, end, \ 239 #define OPEN3D_VECTORIZED(ISPCKernel, ...) \ 240 [&](int64_t start, int64_t end) { \ 241 OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, __VA_ARGS__); \ 257 #define OPEN3D_TEMPLATE_VECTORIZED(T, ISPCKernel, ...) \ 258 [&](int64_t start, int64_t end) { \ 259 static_assert(std::is_arithmetic<T>::value, \ 260 "Data type is not an arithmetic type"); \ 262 OPEN3D_OVERLOADED_LAMBDA_(bool, ISPCKernel, __VA_ARGS__), \ 263 OPEN3D_OVERLOADED_LAMBDA_(uint8_t, ISPCKernel, __VA_ARGS__), \ 264 OPEN3D_OVERLOADED_LAMBDA_(int8_t, ISPCKernel, __VA_ARGS__), \ 265 OPEN3D_OVERLOADED_LAMBDA_(uint16_t, ISPCKernel, __VA_ARGS__), \ 266 OPEN3D_OVERLOADED_LAMBDA_(int16_t, ISPCKernel, __VA_ARGS__), \ 267 OPEN3D_OVERLOADED_LAMBDA_(uint32_t, ISPCKernel, __VA_ARGS__), \ 268 OPEN3D_OVERLOADED_LAMBDA_(int32_t, ISPCKernel, __VA_ARGS__), \ 269 OPEN3D_OVERLOADED_LAMBDA_(uint64_t, ISPCKernel, __VA_ARGS__), \ 270 OPEN3D_OVERLOADED_LAMBDA_(int64_t, ISPCKernel, __VA_ARGS__), \ 271 OPEN3D_OVERLOADED_LAMBDA_(float, ISPCKernel, __VA_ARGS__), \ 272 OPEN3D_OVERLOADED_LAMBDA_(double, ISPCKernel, __VA_ARGS__), \ 273 [&](auto&& generic, int64_t start, int64_t end) { \ 275 "Unsupported data type {} for calling " \ 276 "vectorized kernel {}", \ 277 typeid(generic).name(), \ 278 OPEN3D_STRINGIFY(ISPCKernel)); \ 279 })(T{}, start, end); \
#define OPEN3D_GET_LAST_CUDA_ERROR(message)
Definition: CUDAUtils.h:67
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition: ParallelFor.h:122
DeviceType GetType() const
Definition: Device.h:91
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Definition: Parallel.cpp:50
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
Definition: ParallelFor.h:92
Definition: PinholeCameraIntrinsic.cpp:35
std::string ToString() const
Definition: Device.h:75
#define LogError(...)
Definition: Logging.h:72