Loading [MathJax]/extensions/TeX/AMSsymbols.js
Open3D (C++ API)  0.13.0+8be9a53
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
StdGPUHashmap.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // The MIT License (MIT)
5 //
6 // Copyright (c) 2018-2021 www.open3d.org
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
14 //
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
17 //
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 // IN THE SOFTWARE.
25 // ----------------------------------------------------------------------------
26 
27 #pragma once
28 
#include <stdgpu/memory.h>
#include <thrust/transform.h>

#include <stdgpu/unordered_map.cuh>
#include <type_traits>

#include "open3d/core/CUDAUtils.h"
#include "open3d/core/StdAllocator.h"
#include "open3d/core/hashmap/CUDAHashmapBufferAccessor.h"
#include "open3d/core/hashmap/DeviceHashmap.h"
40 namespace open3d {
41 namespace core {
42 
48 template <typename T>
50 public:
52  using value_type = T;
53 
55  StdGPUAllocator() = default;
56 
58  explicit StdGPUAllocator(const Device& device) : std_allocator_(device) {}
59 
61  StdGPUAllocator(const StdGPUAllocator&) = default;
62 
64  StdGPUAllocator& operator=(const StdGPUAllocator&) = default;
65 
67  StdGPUAllocator(StdGPUAllocator&&) = default;
68 
71 
73  template <typename U>
75  : std_allocator_(other.std_allocator_) {}
76 
79  if (GetDevice().GetType() != Device::DeviceType::CUDA) {
80  utility::LogError("Unsupported device.");
81  }
82 
83  T* p = std_allocator_.allocate(n);
84  stdgpu::register_memory(p, n, stdgpu::dynamic_memory_type::device);
85  return p;
86  }
87 
89  void deallocate(T* p, std::size_t n) {
90  if (GetDevice().GetType() != Device::DeviceType::CUDA) {
91  utility::LogError("Unsupported device.");
92  }
93 
94  stdgpu::deregister_memory(p, n, stdgpu::dynamic_memory_type::device);
95  std_allocator_.deallocate(p, n);
96  }
97 
99  bool operator==(const StdGPUAllocator& other) {
100  return std_allocator_ == other.std_allocator_;
101  }
102 
104  bool operator!=(const StdGPUAllocator& other) { return !operator==(other); }
105 
107  Device GetDevice() const { return std_allocator_.GetDevice(); }
108 
109 private:
110  // Allow access in rebind constructor.
111  template <typename T2>
112  friend class StdGPUAllocator;
113 
114  StdAllocator<T> std_allocator_;
115 };
116 
117 // These typedefs must be defined outside of StdGPUHashmap to make them
118 // accessible in raw CUDA kernels.
119 template <typename Key>
122 
123 template <typename Key, typename Hash>
124 using InternalStdGPUHashmap =
125  stdgpu::unordered_map<Key,
126  addr_t,
127  Hash,
128  stdgpu::equal_to<Key>,
130 
131 template <typename Key, typename Hash>
132 class StdGPUHashmap : public DeviceHashmap {
133 public:
134  StdGPUHashmap(int64_t init_capacity,
135  int64_t dsize_key,
136  int64_t dsize_value,
137  const Device& device);
138  ~StdGPUHashmap();
139 
140  void Rehash(int64_t buckets) override;
141 
142  void Insert(const void* input_keys,
143  const void* input_values,
144  addr_t* output_addrs,
145  bool* output_masks,
146  int64_t count) override;
147 
148  void Activate(const void* input_keys,
149  addr_t* output_addrs,
150  bool* output_masks,
151  int64_t count) override;
152 
153  void Find(const void* input_keys,
154  addr_t* output_addrs,
155  bool* output_masks,
156  int64_t count) override;
157 
158  void Erase(const void* input_keys,
159  bool* output_masks,
160  int64_t count) override;
161 
162  int64_t GetActiveIndices(addr_t* output_indices) override;
163 
164  void Clear() override;
165 
166  int64_t Size() const override;
167 
168  int64_t GetBucketCount() const override;
169  std::vector<int64_t> BucketSizes() const override;
170  float LoadFactor() const override;
171 
172  InternalStdGPUHashmap<Key, Hash> GetImpl() const { return impl_; }
173 
174 protected:
175  // Use reference, since the structure itself is implicitly handled as a
176  // pointer directly by stdgpu.
178 
180 
181  void InsertImpl(const void* input_keys,
182  const void* input_values,
183  addr_t* output_addrs,
184  bool* output_masks,
185  int64_t count);
186 
187  void Allocate(int64_t capacity);
188  void Free();
189 };
190 
191 template <typename Key, typename Hash>
193  int64_t dsize_key,
194  int64_t dsize_value,
195  const Device& device)
196  : DeviceHashmap(init_capacity, dsize_key, dsize_value, device) {
197  Allocate(init_capacity);
198 }
199 
200 template <typename Key, typename Hash>
202  Free();
203 }
204 
205 template <typename Key, typename Hash>
207  return impl_.size();
208 }
209 
210 template <typename Key, typename Hash>
211 void StdGPUHashmap<Key, Hash>::Insert(const void* input_keys,
212  const void* input_values,
213  addr_t* output_addrs,
214  bool* output_masks,
215  int64_t count) {
216  int64_t new_size = Size() + count;
217  if (new_size > this->capacity_) {
218  int64_t bucket_count = GetBucketCount();
219  float avg_capacity_per_bucket =
220  float(this->capacity_) / float(bucket_count);
221  int64_t expected_buckets = std::max(
222  bucket_count * 2,
223  int64_t(std::ceil(new_size / avg_capacity_per_bucket)));
224  Rehash(expected_buckets);
225  }
226  InsertImpl(input_keys, input_values, output_addrs, output_masks, count);
227 }
228 
229 template <typename Key, typename Hash>
230 void StdGPUHashmap<Key, Hash>::Activate(const void* input_keys,
231  addr_t* output_addrs,
232  bool* output_masks,
233  int64_t count) {
234  Insert(input_keys, nullptr, output_addrs, output_masks, count);
235 }
236 
237 // Need an explicit kernel for non-const access to map
238 template <typename Key, typename Hash>
240  CUDAHashmapBufferAccessor buffer_accessor,
241  const Key* input_keys,
242  addr_t* output_addrs,
243  bool* output_masks,
244  int64_t count) {
245  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
246  if (tid >= count) return;
247 
248  Key key = input_keys[tid];
249  auto iter = map.find(key);
250  bool flag = (iter != map.end());
251  output_masks[tid] = flag;
252  output_addrs[tid] = flag ? iter->second : 0;
253 }
254 
255 template <typename Key, typename Hash>
256 void StdGPUHashmap<Key, Hash>::Find(const void* input_keys,
257  addr_t* output_addrs,
258  bool* output_masks,
259  int64_t count) {
260  uint32_t threads = 128;
261  uint32_t blocks = (count + threads - 1) / threads;
262 
263  STDGPUFindKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
264  impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
265  output_addrs, output_masks, count);
266  cuda::Synchronize(this->device_);
267 }
268 
269 // Need an explicit kernel for non-const access to map
270 template <typename Key, typename Hash>
272  CUDAHashmapBufferAccessor buffer_accessor,
273  const Key* input_keys,
274  addr_t* output_addrs,
275  bool* output_masks,
276  int64_t count) {
277  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
278  if (tid >= count) return;
279 
280  Key key = input_keys[tid];
281  auto iter = map.find(key);
282  bool flag = (iter != map.end());
283  output_masks[tid] = flag;
284  output_addrs[tid] = flag ? iter->second : 0;
285 
286  if (output_masks[tid]) {
287  output_masks[tid] = map.erase(key);
288  if (output_masks[tid]) {
289  buffer_accessor.DeviceFree(output_addrs[tid]);
290  }
291  }
292 }
293 
294 template <typename Key, typename Hash>
295 void StdGPUHashmap<Key, Hash>::Erase(const void* input_keys,
296  bool* output_masks,
297  int64_t count) {
298  uint32_t threads = 128;
299  uint32_t blocks = (count + threads - 1) / threads;
300 
301  core::Tensor toutput_addrs =
302  core::Tensor({count}, core::Int32, this->device_);
303  addr_t* output_addrs = static_cast<addr_t*>(toutput_addrs.GetDataPtr());
304 
305  STDGPUEraseKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
306  impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
307  output_addrs, output_masks, count);
308  cuda::Synchronize(this->device_);
309 }
310 
311 template <typename Key>
313  OPEN3D_HOST_DEVICE addr_t
314  operator()(const thrust::pair<Key, addr_t>& x) const {
315  return x.second;
316  }
317 };
318 
319 template <typename Key, typename Hash>
320 int64_t StdGPUHashmap<Key, Hash>::GetActiveIndices(addr_t* output_indices) {
321  auto range = impl_.device_range();
322 
323  thrust::transform(range.begin(), range.end(), output_indices,
325 
326  return impl_.size();
327 }
328 
329 template <typename Key, typename Hash>
331  impl_.clear();
333 }
334 
335 template <typename Key, typename Hash>
336 void StdGPUHashmap<Key, Hash>::Rehash(int64_t buckets) {
337  int64_t iterator_count = Size();
338 
339  Tensor active_keys;
340  Tensor active_values;
341 
342  if (iterator_count > 0) {
343  Tensor active_addrs({iterator_count}, core::Int32, this->device_);
344  GetActiveIndices(static_cast<addr_t*>(active_addrs.GetDataPtr()));
345 
346  Tensor active_indices = active_addrs.To(core::Int64);
347  active_keys = this->GetKeyBuffer().IndexGet({active_indices});
348  active_values = this->GetValueBuffer().IndexGet({active_indices});
349  }
350 
351  float avg_capacity_per_bucket =
352  float(this->capacity_) / float(GetBucketCount());
353 
354  Free();
355  int64_t new_capacity =
356  int64_t(std::ceil(buckets * avg_capacity_per_bucket));
357  Allocate(new_capacity);
358 
359  if (iterator_count > 0) {
360  Tensor output_addrs({iterator_count}, core::Int32, this->device_);
361  Tensor output_masks({iterator_count}, core::Bool, this->device_);
362 
363  InsertImpl(active_keys.GetDataPtr(), active_values.GetDataPtr(),
364  static_cast<addr_t*>(output_addrs.GetDataPtr()),
365  output_masks.GetDataPtr<bool>(), iterator_count);
366  }
367 }
368 
369 template <typename Key, typename Hash>
371  return impl_.bucket_count();
372 }
373 
374 template <typename Key, typename Hash>
375 std::vector<int64_t> StdGPUHashmap<Key, Hash>::BucketSizes() const {
376  utility::LogError("Unimplemented");
377 }
378 
379 template <typename Key, typename Hash>
381  return impl_.load_factor();
382 }
383 
384 // Need an explicit kernel for non-const access to map
385 template <typename Key, typename Hash>
387  CUDAHashmapBufferAccessor buffer_accessor,
388  const Key* input_keys,
389  const void* input_values,
390  int64_t dsize_value,
391  addr_t* output_addrs,
392  bool* output_masks,
393  int64_t count) {
394  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
395  if (tid >= count) return;
396 
397  Key key = input_keys[tid];
398  output_addrs[tid] = 0;
399  output_masks[tid] = false;
400 
401  // First apply 'try insert' with a dummy index
402  auto res = map.emplace(key, 0);
403 
404  // If success, change the iterator and provide the actual index
405  if (res.second) {
406  addr_t dst_kv_addr = buffer_accessor.DeviceAllocate();
407  auto dst_kv_iter = buffer_accessor.ExtractIterator(dst_kv_addr);
408 
409  // Copy templated key to buffer (duplicate)
410  // TODO: hack stdgpu inside and take out the buffer directly
411  *static_cast<Key*>(dst_kv_iter.first) = key;
412 
413  // Copy/reset non-templated value in buffer
414  uint8_t* dst_value = static_cast<uint8_t*>(dst_kv_iter.second);
415  if (input_values != nullptr) {
416  const uint8_t* src_value =
417  static_cast<const uint8_t*>(input_values) +
418  dsize_value * tid;
419  for (int byte = 0; byte < dsize_value; ++byte) {
420  dst_value[byte] = src_value[byte];
421  }
422  }
423 
424  // Update from the dummy index
425  res.first->second = dst_kv_addr;
426 
427  // Write to return variables
428  output_addrs[tid] = dst_kv_addr;
429  output_masks[tid] = true;
430  }
431 }
432 
433 template <typename Key, typename Hash>
434 void StdGPUHashmap<Key, Hash>::InsertImpl(const void* input_keys,
435  const void* input_values,
436  addr_t* output_addrs,
437  bool* output_masks,
438  int64_t count) {
439  uint32_t threads = 128;
440  uint32_t blocks = (count + threads - 1) / threads;
441 
442  STDGPUInsertKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
443  impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
444  input_values, this->dsize_value_, output_addrs, output_masks,
445  count);
446  cuda::Synchronize(this->device_);
447 }
448 
449 template <typename Key, typename Hash>
450 void StdGPUHashmap<Key, Hash>::Allocate(int64_t capacity) {
451  this->capacity_ = capacity;
452 
453  // Allocate buffer for key values.
454  this->buffer_ =
455  std::make_shared<HashmapBuffer>(this->capacity_, this->dsize_key_,
456  this->dsize_value_, this->device_);
457 
459  buffer_accessor_.Setup(this->capacity_, this->dsize_key_,
460  this->dsize_value_, this->buffer_->GetKeyBuffer(),
461  this->buffer_->GetValueBuffer(),
462  this->buffer_->GetHeap());
464 
465  // stdgpu initializes on the default stream. Set the current stream to
466  // ensure correct behavior.
467  {
468  CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
469 
471  this->capacity_,
473  cuda::Synchronize(this->device_);
474  }
475 }
476 
477 template <typename Key, typename Hash>
479  // Buffer is automatically handled by the smart pointer.
480 
482 
483  // stdgpu initializes on the default stream. Set the current stream to
484  // ensure correct behavior.
485  {
486  CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
487 
489  }
490 }
491 } // namespace core
492 } // namespace open3d
void * first
Definition: SlabTraits.h:54
void Clear() override
Clear stored map without reallocating memory.
Definition: StdGPUHashmap.h:330
void Insert(const void *input_keys, const void *input_values, addr_t *output_addrs, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
Definition: StdGPUHashmap.h:211
Definition: StdGPUHashmap.h:312
__global__ void STDGPUEraseKernel(InternalStdGPUHashmap< Key, Hash > map, CUDAHashmapBufferAccessor buffer_accessor, const Key *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count)
Definition: StdGPUHashmap.h:271
bool operator==(const StdGPUAllocator &other)
Returns true if the instances are equal, false otherwise.
Definition: StdGPUHashmap.h:99
void Free()
Definition: StdGPUHashmap.h:478
StdGPUAllocator(const StdGPUAllocator< U > &other)
Rebind copy constructor.
Definition: StdGPUHashmap.h:74
bool operator!=(const StdGPUAllocator &other)
Returns true if the instances are not equal, false otherwise.
Definition: StdGPUHashmap.h:104
const Dtype Bool
Definition: Dtype.cpp:72
const Dtype Int64
Definition: Dtype.cpp:67
T value_type
T.
Definition: StdGPUHashmap.h:52
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:557
StdGPUHashmap(int64_t init_capacity, int64_t dsize_key, int64_t dsize_value, const Device &device)
Definition: StdGPUHashmap.h:192
~StdGPUHashmap()
Definition: StdGPUHashmap.h:201
void Allocate(int64_t capacity)
Definition: StdGPUHashmap.h:450
__host__ void Setup(int64_t capacity, int64_t dsize_key, int64_t dsize_value, Tensor &keys, Tensor &values, Tensor &heap)
Definition: CUDAHashmapBufferAccessor.h:50
__host__ void HostAllocate(const Device &device)
Definition: CUDAHashmapBufferAccessor.h:80
void deallocate(T *p, std::size_t n)
Deallocates memory from pointer p of size n .
Definition: StdGPUHashmap.h:89
Definition: StdGPUHashmap.h:132
FN_SPECIFIERS MiniVec< float, N > ceil(const MiniVec< float, N > &a)
Definition: MiniVec.h:108
Definition: DeviceHashmap.h:39
__device__ addr_t DeviceAllocate()
Definition: CUDAHashmapBufferAccessor.h:92
Definition: StdAllocator.h:42
void Synchronize()
Definition: CUDAUtils.cpp:72
Tensor IndexGet(const std::vector< Tensor > &index_tensors) const
Advanced indexing getter.
Definition: Tensor.cpp:707
Definition: CUDAHashmapBufferAccessor.h:48
#define OPEN3D_HOST_DEVICE
Definition: CUDAUtils.h:63
void Rehash(int64_t buckets) override
Definition: StdGPUHashmap.h:336
const Dtype Int32
Definition: Dtype.cpp:66
__device__ void DeviceFree(addr_t ptr)
Definition: CUDAHashmapBufferAccessor.h:97
InternalStdGPUHashmap< Key, Hash > impl_
Definition: StdGPUHashmap.h:177
Tensor To(Dtype dtype, bool copy=false) const
Definition: Tensor.cpp:541
__device__ iterator_t ExtractIterator(addr_t ptr)
Definition: CUDAHashmapBufferAccessor.h:109
int64_t GetActiveIndices(addr_t *output_indices) override
Parallel collect all iterators in the hash table.
Definition: StdGPUHashmap.h:320
Device device_
Definition: DeviceHashmap.h:113
Definition: Device.h:39
StdGPUAllocator(const Device &device)
Constructor from device.
Definition: StdGPUHashmap.h:58
void Activate(const void *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count) override
Definition: StdGPUHashmap.h:230
Definition: StdGPUHashmap.h:49
int64_t GetBucketCount() const override
Definition: StdGPUHashmap.h:370
T * allocate(std::size_t n)
Allocates memory of size n.
Definition: StdGPUHashmap.h:78
StdGPUAllocator()=default
Default constructor.
int count
Definition: FilePCD.cpp:61
float LoadFactor() const override
Definition: StdGPUHashmap.h:380
int64_t Size() const override
Definition: StdGPUHashmap.h:206
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
Definition: StdGPUHashmap.h:295
Definition: PinholeCameraIntrinsic.cpp:35
CUDAHashmapBufferAccessor buffer_accessor_
Definition: StdGPUHashmap.h:179
Definition: Tensor.h:49
__global__ void STDGPUFindKernel(InternalStdGPUHashmap< Key, Hash > map, CUDAHashmapBufferAccessor buffer_accessor, const Key *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count)
Definition: StdGPUHashmap.h:239
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle float
Definition: K4aPlugin.cpp:465
int64_t capacity_
Definition: DeviceHashmap.h:109
__global__ void STDGPUInsertKernel(InternalStdGPUHashmap< Key, Hash > map, CUDAHashmapBufferAccessor buffer_accessor, const Key *input_keys, const void *input_values, int64_t dsize_value, addr_t *output_addrs, bool *output_masks, int64_t count)
Definition: StdGPUHashmap.h:386
void InsertImpl(const void *input_keys, const void *input_values, addr_t *output_addrs, bool *output_masks, int64_t count)
Definition: StdGPUHashmap.h:434
Tensor & GetKeyBuffer()
Definition: DeviceHashmap.h:101
uint32_t addr_t
Definition: HashmapBuffer.h:58
int64_t dsize_key_
Definition: DeviceHashmap.h:110
StdGPUAllocator & operator=(const StdGPUAllocator &)=default
Default copy assignment operator.
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle image_handle timestamp_usec white_balance image_handle k4a_device_configuration_t config device_handle char size_t serial_number_size bool int32_t int32_t int32_t int32_t k4a_color_control_mode_t default_mode value const const k4a_calibration_t calibration char size_t
Definition: K4aPlugin.cpp:724
T * GetDataPtr()
Definition: Tensor.h:1004
stdgpu::unordered_map< Key, addr_t, Hash, stdgpu::equal_to< Key >, InternalStdGPUHashmapAllocator< Key > > InternalStdGPUHashmap
Definition: StdGPUHashmap.h:129
OPEN3D_HOST_DEVICE addr_t operator()(const thrust::pair< Key, addr_t > &x) const
Definition: StdGPUHashmap.h:314
__host__ void HostFree(const Device &device)
Definition: CUDAHashmapBufferAccessor.h:85
__host__ void Reset(const Device &device)
Definition: CUDAHashmapBufferAccessor.h:67
int64_t dsize_value_
Definition: DeviceHashmap.h:111
Common CUDA utilities.
Tensor & GetValueBuffer()
Definition: DeviceHashmap.h:102
Device GetDevice() const
Returns the device on which memory is allocated.
Definition: StdGPUHashmap.h:107
InternalStdGPUHashmap< Key, Hash > GetImpl() const
Definition: StdGPUHashmap.h:172
std::shared_ptr< HashmapBuffer > buffer_
Definition: DeviceHashmap.h:115
void Find(const void *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
Definition: StdGPUHashmap.h:256
std::vector< int64_t > BucketSizes() const override
Definition: StdGPUHashmap.h:375
#define LogError(...)
Definition: Logging.h:78