Loading [MathJax]/extensions/TeX/AMSsymbols.js
Open3D (C++ API)  0.14.1
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
ParallelFor.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // The MIT License (MIT)
5 //
6 // Copyright (c) 2018-2021 www.open3d.org
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
14 //
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
17 //
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 // IN THE SOFTWARE.
25 // ----------------------------------------------------------------------------
26 
27 #pragma once
28 
29 #include <cstdint>
30 #include <type_traits>
31 
32 #include "open3d/core/Device.h"
33 #include "open3d/utility/Logging.h"
37 
38 #ifdef __CUDACC__
39 #include <cuda.h>
40 #include <cuda_runtime.h>
41 
42 #include "open3d/core/CUDAUtils.h"
43 #endif
44 
45 namespace open3d {
46 namespace core {
47 
48 #ifdef __CUDACC__
49 
/// Block-size parameter of the element-wise CUDA launch: it is used both as
/// the number of threads per block and as the kernel's block_size template
/// argument.
static constexpr int64_t OPEN3D_PARFOR_BLOCK{128};
/// Thread-size parameter of the element-wise CUDA launch: the maximum number
/// of indices a single thread visits inside the kernel.
static constexpr int64_t OPEN3D_PARFOR_THREAD{4};
52 
54 template <int64_t block_size, int64_t thread_size, typename func_t>
55 __global__ void ElementWiseKernel_(int64_t n, func_t f) {
56  int64_t items_per_block = block_size * thread_size;
57  int64_t idx = blockIdx.x * items_per_block + threadIdx.x;
58 #pragma unroll
59  for (int64_t i = 0; i < thread_size; ++i) {
60  if (idx < n) {
61  f(idx);
62  idx += block_size;
63  }
64  }
65 }
66 
68 template <typename func_t>
69 void ParallelForCUDA_(const Device& device, int64_t n, const func_t& func) {
70  if (device.GetType() != Device::DeviceType::CUDA) {
71  utility::LogError("ParallelFor for CUDA cannot run on device {}.",
72  device.ToString());
73  }
74  if (n == 0) {
75  return;
76  }
77 
78  CUDAScopedDevice scoped_device(device);
79  int64_t items_per_block = OPEN3D_PARFOR_BLOCK * OPEN3D_PARFOR_THREAD;
80  int64_t grid_size = (n + items_per_block - 1) / items_per_block;
81 
82  ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>
83  <<<grid_size, OPEN3D_PARFOR_BLOCK, 0, core::cuda::GetStream()>>>(
84  n, func);
85  OPEN3D_GET_LAST_CUDA_ERROR("ParallelFor failed.");
86 }
87 
88 #else
89 
91 template <typename func_t>
92 void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) {
93  if (device.GetType() != Device::DeviceType::CPU) {
94  utility::LogError("ParallelFor for CPU cannot run on device {}.",
95  device.ToString());
96  }
97  if (n == 0) {
98  return;
99  }
100 
101 #pragma omp parallel for num_threads(utility::EstimateMaxThreads())
102  for (int64_t i = 0; i < n; ++i) {
103  func(i);
104  }
105 }
106 
107 #endif
108 
121 template <typename func_t>
122 void ParallelFor(const Device& device, int64_t n, const func_t& func) {
123 #ifdef __CUDACC__
124  ParallelForCUDA_(device, n, func);
125 #else
126  ParallelForCPU_(device, n, func);
127 #endif
128 }
129 
176 template <typename vec_func_t, typename func_t>
177 void ParallelFor(const Device& device,
178  int64_t n,
179  const func_t& func,
180  const vec_func_t& vec_func) {
181 #ifdef BUILD_ISPC_MODULE
182 
183 #ifdef __CUDACC__
184  ParallelForCUDA_(device, n, func);
185 #else
186  int num_threads = utility::EstimateMaxThreads();
187  ParallelForCPU_(device, num_threads, [&](int64_t i) {
188  int64_t start = n * i / num_threads;
189  int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
190  vec_func(start, end);
191  });
192 #endif
193 
194 #else
195 
196 #ifdef __CUDACC__
197  ParallelForCUDA_(device, n, func);
198 #else
199  ParallelForCPU_(device, n, func);
200 #endif
201 
202 #endif
203 }
204 
// Internal helper macro.
//
// Expands to the statement(s) that invoke a vectorized kernel on the range
// given by first/last followed by the extra arguments. When
// BUILD_ISPC_MODULE is not defined, the expansion logs an error that names
// the kernel instead, and the range and extra arguments are not used.
#ifndef BUILD_ISPC_MODULE

#define OPEN3D_CALL_ISPC_KERNEL_(Kernel, first, last, ...)               \
    utility::LogError(                                                   \
            "ISPC module disabled. Unable to call vectorized kernel {}", \
            OPEN3D_STRINGIFY(Kernel));

#else

#define OPEN3D_CALL_ISPC_KERNEL_(Kernel, first, last, ...) \
    using namespace ispc;                                  \
    Kernel(first, last, __VA_ARGS__);

#endif
221 
// Internal helper macro.
//
// Expands to a lambda taking (DType, int64_t start, int64_t end). The first
// parameter is unnamed and only selects the overload. The body forwards
// start, end and the extra arguments to the kernel whose name is formed from
// KernelPrefix, an underscore and DType via OPEN3D_CONCAT.
#define OPEN3D_OVERLOADED_LAMBDA_(DType, KernelPrefix, ...)                \
    [&](DType, int64_t start, int64_t end) {                               \
        OPEN3D_CALL_ISPC_KERNEL_(                                          \
                OPEN3D_CONCAT(KernelPrefix, OPEN3D_CONCAT(_, DType)),      \
                start, end, __VA_ARGS__);                                  \
    }
229 
// Expands to a lambda taking (int64_t start, int64_t end) that calls the
// given vectorized kernel through OPEN3D_CALL_ISPC_KERNEL_, passing start,
// end and the extra arguments. The lambda captures by reference and has the
// call shape of the vec_func argument of ParallelFor.
#define OPEN3D_VECTORIZED(Kernel, ...)                             \
    [&](int64_t start, int64_t end) {                              \
        OPEN3D_CALL_ISPC_KERNEL_(Kernel, start, end, __VA_ARGS__); \
    }
243 
// Expands to a lambda taking (int64_t start, int64_t end) that selects a
// type-specific vectorized kernel for the data type DType.
//
// DType must be an arithmetic type (checked with static_assert). A
// value-initialized DType is passed as a tag to an overload set built with
// utility::Overload: one lambda per supported type (bool, the 8/16/32/64-bit
// signed and unsigned integers, float, double), each forwarding to the
// kernel named after KernelPrefix and that type, plus a generic fallback
// that logs an error naming the unsupported type and the kernel.
#define OPEN3D_TEMPLATE_VECTORIZED(DType, KernelPrefix, ...)                  \
    [&](int64_t start, int64_t end) {                                         \
        static_assert(std::is_arithmetic<DType>::value,                       \
                      "Data type is not an arithmetic type");                 \
        utility::Overload(                                                    \
                OPEN3D_OVERLOADED_LAMBDA_(bool, KernelPrefix, __VA_ARGS__),   \
                OPEN3D_OVERLOADED_LAMBDA_(uint8_t, KernelPrefix,              \
                                          __VA_ARGS__),                       \
                OPEN3D_OVERLOADED_LAMBDA_(int8_t, KernelPrefix, __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(uint16_t, KernelPrefix,             \
                                          __VA_ARGS__),                       \
                OPEN3D_OVERLOADED_LAMBDA_(int16_t, KernelPrefix,              \
                                          __VA_ARGS__),                       \
                OPEN3D_OVERLOADED_LAMBDA_(uint32_t, KernelPrefix,             \
                                          __VA_ARGS__),                       \
                OPEN3D_OVERLOADED_LAMBDA_(int32_t, KernelPrefix,              \
                                          __VA_ARGS__),                       \
                OPEN3D_OVERLOADED_LAMBDA_(uint64_t, KernelPrefix,             \
                                          __VA_ARGS__),                       \
                OPEN3D_OVERLOADED_LAMBDA_(int64_t, KernelPrefix,              \
                                          __VA_ARGS__),                       \
                OPEN3D_OVERLOADED_LAMBDA_(float, KernelPrefix, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(double, KernelPrefix, __VA_ARGS__), \
                [&](auto&& unsupported, int64_t start, int64_t end) {         \
                    utility::LogError(                                        \
                            "Unsupported data type {} for calling "           \
                            "vectorized kernel {}",                           \
                            typeid(unsupported).name(),                       \
                            OPEN3D_STRINGIFY(KernelPrefix));                  \
                })(DType{}, start, end);                                      \
    }
281 
282 } // namespace core
283 } // namespace open3d
#define OPEN3D_GET_LAST_CUDA_ERROR(message)
Definition: CUDAUtils.h:67
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition: ParallelFor.h:122
DeviceType GetType() const
Definition: Device.h:91
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Definition: Parallel.cpp:50
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
Definition: ParallelFor.h:92
Definition: Device.h:39
Definition: PinholeCameraIntrinsic.cpp:35
Common CUDA utilities.
std::string ToString() const
Definition: Device.h:75
#define LogError(...)
Definition: Logging.h:72