Loading [MathJax]/extensions/TeX/AMSsymbols.js
Open3D (C++ API)  0.14.1
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
StdGPUHashBackend.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // The MIT License (MIT)
5 //
6 // Copyright (c) 2018-2021 www.open3d.org
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
14 //
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
17 //
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 // IN THE SOFTWARE.
25 // ----------------------------------------------------------------------------
26 
#pragma once

#include <stdgpu/memory.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

#include <stdgpu/unordered_map.cuh>
#include <type_traits>

#include "open3d/core/CUDAUtils.h"
#include "open3d/core/StdAllocator.h"
#include "open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h"
#include "open3d/core/hashmap/DeviceHashBackend.h"
#include "open3d/utility/Logging.h"

42 namespace open3d {
43 namespace core {
44 
50 template <typename T>
52 public:
54  using value_type = T;
55 
57  StdGPUAllocator() = default;
58 
60  explicit StdGPUAllocator(const Device& device) : std_allocator_(device) {}
61 
63  StdGPUAllocator(const StdGPUAllocator&) = default;
64 
66  StdGPUAllocator& operator=(const StdGPUAllocator&) = default;
67 
69  StdGPUAllocator(StdGPUAllocator&&) = default;
70 
73 
75  template <typename U>
77  : std_allocator_(other.std_allocator_) {}
78 
81  if (GetDevice().GetType() != Device::DeviceType::CUDA) {
82  utility::LogError("Unsupported device.");
83  }
84 
85  T* p = std_allocator_.allocate(n);
86  stdgpu::register_memory(p, n, stdgpu::dynamic_memory_type::device);
87  return p;
88  }
89 
91  void deallocate(T* p, std::size_t n) {
92  if (GetDevice().GetType() != Device::DeviceType::CUDA) {
93  utility::LogError("Unsupported device.");
94  }
95 
96  stdgpu::deregister_memory(p, n, stdgpu::dynamic_memory_type::device);
97  std_allocator_.deallocate(p, n);
98  }
99 
101  bool operator==(const StdGPUAllocator& other) {
102  return std_allocator_ == other.std_allocator_;
103  }
104 
106  bool operator!=(const StdGPUAllocator& other) { return !operator==(other); }
107 
109  Device GetDevice() const { return std_allocator_.GetDevice(); }
110 
111 private:
112  // Allow access in rebind constructor.
113  template <typename T2>
114  friend class StdGPUAllocator;
115 
116  StdAllocator<T> std_allocator_;
117 };
118 
119 // These typedefs must be defined outside of StdGPUHashBackend to make them
120 // accessible in raw CUDA kernels.
121 template <typename Key>
124 
125 template <typename Key, typename Hash, typename Eq>
127  stdgpu::unordered_map<Key,
128  buf_index_t,
129  Hash,
130  Eq,
132 
133 template <typename Key, typename Hash, typename Eq>
135 public:
136  StdGPUHashBackend(int64_t init_capacity,
137  int64_t key_dsize,
138  const std::vector<int64_t>& value_dsizes,
139  const Device& device);
141 
142  void Reserve(int64_t capacity) override;
143 
144  void Insert(const void* input_keys,
145  const std::vector<const void*>& input_values_soa,
146  buf_index_t* output_buf_indices,
147  bool* output_masks,
148  int64_t count) override;
149 
150  void Find(const void* input_keys,
151  buf_index_t* output_buf_indices,
152  bool* output_masks,
153  int64_t count) override;
154 
155  void Erase(const void* input_keys,
156  bool* output_masks,
157  int64_t count) override;
158 
159  int64_t GetActiveIndices(buf_index_t* output_indices) override;
160 
161  void Clear() override;
162 
163  int64_t Size() const override;
164 
165  int64_t GetBucketCount() const override;
166  std::vector<int64_t> BucketSizes() const override;
167  float LoadFactor() const override;
168 
170 
171  void Allocate(int64_t capacity);
172  void Free();
173 
174 protected:
175  // Use reference, since the structure itself is implicitly handled as a
176  // pointer directly by stdgpu.
178 
180 };
181 
182 template <typename Key, typename Hash, typename Eq>
184  int64_t init_capacity,
185  int64_t key_dsize,
186  const std::vector<int64_t>& value_dsizes,
187  const Device& device)
188  : DeviceHashBackend(init_capacity, key_dsize, value_dsizes, device) {
189  Allocate(init_capacity);
190 }
191 
192 template <typename Key, typename Hash, typename Eq>
194  Free();
195 }
196 
197 template <typename Key, typename Hash, typename Eq>
199  return impl_.size();
200 }
201 
202 // Need an explicit kernel for non-const access to map
203 template <typename Key, typename Hash, typename Eq>
205  CUDAHashBackendBufferAccessor buffer_accessor,
206  const Key* input_keys,
207  buf_index_t* output_buf_indices,
208  bool* output_masks,
209  int64_t count) {
210  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
211  if (tid >= count) return;
212 
213  Key key = input_keys[tid];
214  auto iter = map.find(key);
215  bool flag = (iter != map.end());
216  output_masks[tid] = flag;
217  output_buf_indices[tid] = flag ? iter->second : 0;
218 }
219 
220 template <typename Key, typename Hash, typename Eq>
221 void StdGPUHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
222  buf_index_t* output_buf_indices,
223  bool* output_masks,
224  int64_t count) {
225  uint32_t threads = 128;
226  uint32_t blocks = (count + threads - 1) / threads;
227 
228  STDGPUFindKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
229  impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
230  output_buf_indices, output_masks, count);
231  cuda::Synchronize(this->device_);
232 }
233 
234 // Need an explicit kernel for non-const access to map
235 template <typename Key, typename Hash, typename Eq>
237  CUDAHashBackendBufferAccessor buffer_accessor,
238  const Key* input_keys,
239  buf_index_t* output_buf_indices,
240  bool* output_masks,
241  int64_t count) {
242  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
243  if (tid >= count) return;
244 
245  Key key = input_keys[tid];
246  auto iter = map.find(key);
247  bool flag = (iter != map.end());
248  output_masks[tid] = flag;
249  output_buf_indices[tid] = flag ? iter->second : 0;
250 
251  if (output_masks[tid]) {
252  output_masks[tid] = map.erase(key);
253  if (output_masks[tid]) {
254  buffer_accessor.DeviceFree(output_buf_indices[tid]);
255  }
256  }
257 }
258 
259 template <typename Key, typename Hash, typename Eq>
260 void StdGPUHashBackend<Key, Hash, Eq>::Erase(const void* input_keys,
261  bool* output_masks,
262  int64_t count) {
263  uint32_t threads = 128;
264  uint32_t blocks = (count + threads - 1) / threads;
265 
266  core::Tensor toutput_buf_indices =
267  core::Tensor({count}, core::Int32, this->device_);
268  buf_index_t* output_buf_indices =
269  static_cast<buf_index_t*>(toutput_buf_indices.GetDataPtr());
270 
271  STDGPUEraseKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
272  impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
273  output_buf_indices, output_masks, count);
274  cuda::Synchronize(this->device_);
275 }
276 
277 template <typename Key>
279  OPEN3D_HOST_DEVICE buf_index_t
280  operator()(const thrust::pair<Key, buf_index_t>& x) const {
281  return x.second;
282  }
283 };
284 
285 template <typename Key, typename Hash, typename Eq>
287  buf_index_t* output_indices) {
288  auto range = impl_.device_range();
289 
290  thrust::transform(range.begin(), range.end(), output_indices,
292 
293  return impl_.size();
294 }
295 
296 template <typename Key, typename Hash, typename Eq>
298  impl_.clear();
299  this->buffer_->ResetHeap();
300 }
301 
302 template <typename Key, typename Hash, typename Eq>
304 
305 template <typename Key, typename Hash, typename Eq>
307  return impl_.bucket_count();
308 }
309 
310 template <typename Key, typename Hash, typename Eq>
311 std::vector<int64_t> StdGPUHashBackend<Key, Hash, Eq>::BucketSizes() const {
312  utility::LogError("Unimplemented");
313 }
314 
315 template <typename Key, typename Hash, typename Eq>
317  return impl_.load_factor();
318 }
319 
320 // Need an explicit kernel for non-const access to map
321 template <typename Key, typename Hash, typename Eq, typename block_t>
322 __global__ void STDGPUInsertKernel(
324  CUDAHashBackendBufferAccessor buffer_accessor,
325  const Key* input_keys,
326  const void* const* input_values_soa,
327  buf_index_t* output_buf_indices,
328  bool* output_masks,
329  int64_t count,
330  int64_t n_values) {
331  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
332  if (tid >= count) return;
333 
334  Key key = input_keys[tid];
335  output_buf_indices[tid] = 0;
336  output_masks[tid] = false;
337 
338  // First apply 'try insert' with a dummy index
339  auto res = map.emplace(key, 0);
340 
341  // If success, change the iterator and provide the actual index
342  if (res.second) {
343  buf_index_t buf_index = buffer_accessor.DeviceAllocate();
344  auto key_ptr = buffer_accessor.GetKeyPtr(buf_index);
345 
346  // Copy templated key to buffer (duplicate)
347  // TODO: hack stdgpu inside and take out the buffer directly
348  *static_cast<Key*>(key_ptr) = key;
349 
350  // Copy/reset non-templated value in buffer
351  for (int j = 0; j < n_values; ++j) {
352  const int64_t blocks_per_element =
353  buffer_accessor.value_blocks_per_element_[j];
354 
355  block_t* dst_value = static_cast<block_t*>(
356  buffer_accessor.GetValuePtr(buf_index, j));
357  const block_t* src_value =
358  static_cast<const block_t*>(input_values_soa[j]) +
359  blocks_per_element * tid;
360  for (int b = 0; b < blocks_per_element; ++b) {
361  dst_value[b] = src_value[b];
362  }
363  }
364 
365  // Update from the dummy index
366  res.first->second = buf_index;
367 
368  // Write to return variables
369  output_buf_indices[tid] = buf_index;
370  output_masks[tid] = true;
371  }
372 }
373 
374 template <typename Key, typename Hash, typename Eq>
376  const void* input_keys,
377  const std::vector<const void*>& input_values_soa,
378  buf_index_t* output_buf_indices,
379  bool* output_masks,
380  int64_t count) {
381  uint32_t threads = 128;
382  uint32_t blocks = (count + threads - 1) / threads;
383 
384  thrust::device_vector<const void*> input_values_soa_device(
385  input_values_soa.begin(), input_values_soa.end());
386 
387  int64_t n_values = input_values_soa.size();
388  const void* const* ptr_input_values_soa =
389  thrust::raw_pointer_cast(input_values_soa_device.data());
390 
391  DISPATCH_DIVISOR_SIZE_TO_BLOCK_T(
393  STDGPUInsertKernel<Key, Hash, Eq, block_t>
394  <<<blocks, threads, 0, core::cuda::GetStream()>>>(
395  impl_, buffer_accessor_,
396  static_cast<const Key*>(input_keys),
397  ptr_input_values_soa, output_buf_indices,
398  output_masks, count, n_values);
399  });
400  cuda::Synchronize(this->device_);
401 }
402 
403 template <typename Key, typename Hash, typename Eq>
405  this->capacity_ = capacity;
406 
407  // Allocate buffer for key values.
408  this->buffer_ = std::make_shared<HashBackendBuffer>(
409  this->capacity_, this->key_dsize_, this->value_dsizes_,
410  this->device_);
412 
413  // stdgpu initializes on the default stream. Set the current stream to
414  // ensure correct behavior.
415  {
416  CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
417 
419  this->capacity_,
421  cuda::Synchronize(this->device_);
422  }
423 }
424 
425 template <typename Key, typename Hash, typename Eq>
427  // Buffer is automatically handled by the smart pointer.
429 
430  // stdgpu initializes on the default stream. Set the current stream to
431  // ensure correct behavior.
432  {
433  CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
434 
436  }
437 }
438 } // namespace core
439 } // namespace open3d
std::shared_ptr< HashBackendBuffer > buffer_
Definition: DeviceHashBackend.h:121
__device__ void DeviceFree(buf_index_t ptr)
Definition: CUDAHashBackendBufferAccessor.h:102
CUDAHashBackendBufferAccessor buffer_accessor_
Definition: StdGPUHashBackend.h:179
Definition: StdGPUHashBackend.h:278
Definition: CUDAHashBackendBufferAccessor.h:43
bool operator==(const StdGPUAllocator &other)
Returns true if the instances are equal, false otherwise.
Definition: StdGPUHashBackend.h:101
Definition: StdGPUHashBackend.h:134
StdGPUAllocator(const StdGPUAllocator< U > &other)
Rebind copy constructor.
Definition: StdGPUHashBackend.h:76
bool operator!=(const StdGPUAllocator &other)
Returns true if the instances are not equal, false otherwise.
Definition: StdGPUHashBackend.h:106
int64_t GetActiveIndices(buf_index_t *output_indices) override
Parallel collect all iterators in the hash table.
Definition: StdGPUHashBackend.h:286
T value_type
T.
Definition: StdGPUHashBackend.h:54
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:557
std::vector< int64_t > value_dsizes_
Definition: DeviceHashBackend.h:117
int64_t * value_blocks_per_element_
Definition: CUDAHashBackendBufferAccessor.h:127
StdGPUHashBackend(int64_t init_capacity, int64_t key_dsize, const std::vector< int64_t > &value_dsizes, const Device &device)
Definition: StdGPUHashBackend.h:183
~StdGPUHashBackend()
Definition: StdGPUHashBackend.h:193
void deallocate(T *p, std::size_t n)
Deallocates memory from pointer p of size n .
Definition: StdGPUHashBackend.h:91
Device device_
Definition: DeviceHashBackend.h:119
void Reserve(int64_t capacity) override
Definition: StdGPUHashBackend.h:303
uint32_t buf_index_t
Definition: HashBackendBuffer.h:63
int64_t key_dsize_
Definition: DeviceHashBackend.h:116
Definition: StdAllocator.h:42
int count
Definition: FilePCD.cpp:61
int64_t capacity_
Definition: DeviceHashBackend.h:114
void Allocate(int64_t capacity)
Definition: StdGPUHashBackend.h:404
void Synchronize()
Definition: CUDAUtils.cpp:78
__global__ void STDGPUFindKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: StdGPUHashBackend.h:204
void Find(const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
Definition: StdGPUHashBackend.h:221
#define OPEN3D_HOST_DEVICE
Definition: CUDAUtils.h:63
stdgpu::unordered_map< Key, buf_index_t, Hash, Eq, InternalStdGPUHashBackendAllocator< Key > > InternalStdGPUHashBackend
Definition: StdGPUHashBackend.h:131
const Dtype Int32
Definition: Dtype.cpp:65
InternalStdGPUHashBackend< Key, Hash, Eq > GetImpl() const
Definition: StdGPUHashBackend.h:169
__host__ void Setup(HashBackendBuffer &hashmap_buffer)
Definition: CUDAHashBackendBufferAccessor.h:45
__host__ void Shutdown(const Device &device)
Definition: CUDAHashBackendBufferAccessor.h:92
Definition: Device.h:39
StdGPUAllocator(const Device &device)
Constructor from device.
Definition: StdGPUHashBackend.h:60
Definition: StdGPUHashBackend.h:51
int64_t GetBucketCount() const override
Get the number of buckets of the hash map.
Definition: StdGPUHashBackend.h:306
T * allocate(std::size_t n)
Allocates memory of size n.
Definition: StdGPUHashBackend.h:80
StdGPUAllocator()=default
Default constructor.
__global__ void STDGPUEraseKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: StdGPUHashBackend.h:236
__device__ void * GetKeyPtr(buf_index_t ptr)
Definition: CUDAHashBackendBufferAccessor.h:107
void Clear() override
Clear stored map without reallocating memory.
Definition: StdGPUHashBackend.h:297
void Free()
Definition: StdGPUHashBackend.h:426
Definition: PinholeCameraIntrinsic.cpp:35
InternalStdGPUHashBackend< Key, Hash, Eq > impl_
Definition: StdGPUHashBackend.h:177
int64_t common_block_size_
Definition: CUDAHashBackendBufferAccessor.h:124
float LoadFactor() const override
Get the current load factor, defined as size / bucket count.
Definition: StdGPUHashBackend.h:316
Definition: DeviceHashBackend.h:39
OPEN3D_HOST_DEVICE buf_index_t operator()(const thrust::pair< Key, buf_index_t > &x) const
Definition: StdGPUHashBackend.h:280
__device__ buf_index_t DeviceAllocate()
Definition: CUDAHashBackendBufferAccessor.h:98
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
Definition: StdGPUHashBackend.h:260
__global__ void STDGPUInsertKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, const void *const *input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count, int64_t n_values)
Definition: StdGPUHashBackend.h:322
StdGPUAllocator & operator=(const StdGPUAllocator &)=default
Default copy assignment operator.
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle image_handle timestamp_usec white_balance image_handle k4a_device_configuration_t config device_handle char size_t serial_number_size bool int32_t int32_t int32_t int32_t k4a_color_control_mode_t default_mode value const const k4a_calibration_t calibration char size_t
Definition: K4aPlugin.cpp:724
T * GetDataPtr()
Definition: Tensor.h:1074
void Insert(const void *input_keys, const std::vector< const void *> &input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
Definition: StdGPUHashBackend.h:375
Common CUDA utilities.
__device__ void * GetValuePtr(buf_index_t ptr, int value_idx=0)
Definition: CUDAHashBackendBufferAccessor.h:110
Device GetDevice() const
Returns the device on which memory is allocated.
Definition: StdGPUHashBackend.h:109
#define LogError(...)
Definition: Logging.h:72
std::vector< int64_t > BucketSizes() const override
Get the number of entries per bucket.
Definition: StdGPUHashBackend.h:311
int64_t Size() const override
Get the size (number of valid entries) of the hash map.
Definition: StdGPUHashBackend.h:198