Open3D (C++ API)  0.13.0
StdGPUHashmap.h
// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// The MIT License (MIT)
//
// Copyright (c) 2018 www.open3d.org
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// ----------------------------------------------------------------------------

#pragma once

#include <stdgpu/iterator.h>  // device_begin, device_end
#include <stdgpu/memory.h>    // createDeviceArray, destroyDeviceArray
#include <stdgpu/platform.h>  // STDGPU_HOST_DEVICE
#include <thrust/for_each.h>
#include <thrust/transform.h>

#include <limits>
#include <stdgpu/unordered_map.cuh>  // stdgpu::unordered_map
#include <unordered_map>

#include "open3d/core/CUDAUtils.h"
#include "open3d/core/hashmap/CUDA/CUDAHashmapBufferAccessor.h"
#include "open3d/core/hashmap/DeviceHashmap.h"

namespace open3d {
namespace core {
template <typename Key, typename Hash>
class StdGPUHashmap : public DeviceHashmap {
public:
    StdGPUHashmap(int64_t init_capacity,
                  int64_t dsize_key,
                  int64_t dsize_value,
                  const Device& device);
    ~StdGPUHashmap();

    void Rehash(int64_t buckets) override;

    void Insert(const void* input_keys,
                const void* input_values,
                addr_t* output_addrs,
                bool* output_masks,
                int64_t count) override;

    void Activate(const void* input_keys,
                  addr_t* output_addrs,
                  bool* output_masks,
                  int64_t count) override;

    void Find(const void* input_keys,
              addr_t* output_addrs,
              bool* output_masks,
              int64_t count) override;

    void Erase(const void* input_keys,
               bool* output_masks,
               int64_t count) override;

    int64_t GetActiveIndices(addr_t* output_indices) override;

    void Clear() override;

    int64_t Size() const override;

    int64_t GetBucketCount() const override;
    std::vector<int64_t> BucketSizes() const override;
    float LoadFactor() const override;

    stdgpu::unordered_map<Key, addr_t, Hash> GetImpl() const { return impl_; }

protected:
    // Stored by value: stdgpu handles the map internally as a pointer, so
    // this member behaves like a reference to device-side storage.
    stdgpu::unordered_map<Key, addr_t, Hash> impl_;

    CUDAHashmapBufferAccessor buffer_accessor_;

    void InsertImpl(const void* input_keys,
                    const void* input_values,
                    addr_t* output_addrs,
                    bool* output_masks,
                    int64_t count);

    void Allocate(int64_t capacity);
    void Free();
};
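
// Minimal usage sketch (illustrative only; in Open3D this backend is normally
// constructed through the generic Hashmap frontend rather than directly).
// Key must be trivially copyable and Hash callable on the device; the
// variable names below are hypothetical:
//
//     core::Device device("CUDA:0");
//     StdGPUHashmap<Key, Hash> hashmap(/*init_capacity=*/1000,
//                                      /*dsize_key=*/sizeof(Key),
//                                      /*dsize_value=*/sizeof(float), device);
//     // keys/values/addrs/masks are device pointers of length count.
//     hashmap.Insert(keys, values, addrs, masks, count);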

template <typename Key, typename Hash>
StdGPUHashmap<Key, Hash>::StdGPUHashmap(int64_t init_capacity,
                                        int64_t dsize_key,
                                        int64_t dsize_value,
                                        const Device& device)
    : DeviceHashmap(init_capacity, dsize_key, dsize_value, device) {
    Allocate(init_capacity);
}

template <typename Key, typename Hash>
StdGPUHashmap<Key, Hash>::~StdGPUHashmap() {
    Free();
}

template <typename Key, typename Hash>
int64_t StdGPUHashmap<Key, Hash>::Size() const {
    return impl_.size();
}

template <typename Key, typename Hash>
void StdGPUHashmap<Key, Hash>::Insert(const void* input_keys,
                                      const void* input_values,
                                      addr_t* output_addrs,
                                      bool* output_masks,
                                      int64_t count) {
    int64_t new_size = Size() + count;
    if (new_size > this->capacity_) {
        int64_t bucket_count = GetBucketCount();
        float avg_capacity_per_bucket =
                float(this->capacity_) / float(bucket_count);
        int64_t expected_buckets = std::max(
                bucket_count * 2,
                int64_t(std::ceil(new_size / avg_capacity_per_bucket)));
        Rehash(expected_buckets);
    }
    InsertImpl(input_keys, input_values, output_addrs, output_masks, count);
}
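
// Worked example of the growth policy above (illustrative numbers): with
// capacity_ = 1000 spread over 500 buckets, avg_capacity_per_bucket = 2.
// If Size() = 800 and count = 2000, then new_size = 2800 > 1000, so
//     expected_buckets = max(2 * 500, ceil(2800 / 2)) = max(1000, 1400)
//                      = 1400,
// and Rehash(1400) re-allocates to roughly ceil(1400 * 2) = 2800 slots.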

template <typename Key, typename Hash>
void StdGPUHashmap<Key, Hash>::Activate(const void* input_keys,
                                        addr_t* output_addrs,
                                        bool* output_masks,
                                        int64_t count) {
    // Activate is Insert without values: passing nullptr skips the value
    // copy in the insert kernel and only reserves buffer slots.
    Insert(input_keys, nullptr, output_addrs, output_masks, count);
}

// Need an explicit kernel for non-const access to map
template <typename Key, typename Hash>
__global__ void STDGPUFindKernel(stdgpu::unordered_map<Key, addr_t, Hash> map,
                                 CUDAHashmapBufferAccessor buffer_accessor,
                                 const Key* input_keys,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    auto iter = map.find(key);
    bool flag = (iter != map.end());
    output_masks[tid] = flag;
    output_addrs[tid] = flag ? iter->second : 0;
}

template <typename Key, typename Hash>
void StdGPUHashmap<Key, Hash>::Find(const void* input_keys,
                                    addr_t* output_addrs,
                                    bool* output_masks,
                                    int64_t count) {
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    STDGPUFindKernel<<<blocks, threads>>>(impl_, buffer_accessor_,
                                          static_cast<const Key*>(input_keys),
                                          output_addrs, output_masks, count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
}
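
// Sketch of a host-side Find round trip (hypothetical variable names; assumes
// the map was built over Int32 keys on a CUDA device):
//
//     Tensor addrs({n}, Dtype::Int32, device);
//     Tensor masks({n}, Dtype::Bool, device);
//     hashmap.Find(keys.GetDataPtr(),
//                  static_cast<addr_t*>(addrs.GetDataPtr()),
//                  masks.GetDataPtr<bool>(), n);
//
// After the call, masks[i] tells whether keys[i] is present; addrs[i] holds
// the buffer address of the entry and is meaningful only where masks[i] is
// true.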

// Need an explicit kernel for non-const access to map
template <typename Key, typename Hash>
__global__ void STDGPUEraseKernel(stdgpu::unordered_map<Key, addr_t, Hash> map,
                                  CUDAHashmapBufferAccessor buffer_accessor,
                                  const Key* input_keys,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    auto iter = map.find(key);
    bool flag = (iter != map.end());
    output_masks[tid] = flag;
    output_addrs[tid] = flag ? iter->second : 0;

    if (output_masks[tid]) {
        output_masks[tid] = map.erase(key);
        if (output_masks[tid]) {
            buffer_accessor.DeviceFree(output_addrs[tid]);
        }
    }
}

template <typename Key, typename Hash>
void StdGPUHashmap<Key, Hash>::Erase(const void* input_keys,
                                     bool* output_masks,
                                     int64_t count) {
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    // Erase does not expose addresses to the caller, but the kernel needs a
    // scratch array to record them so that freed slots can be returned to
    // the buffer allocator.
    core::Tensor toutput_addrs =
            core::Tensor({count}, Dtype::Int32, this->device_);
    addr_t* output_addrs = static_cast<addr_t*>(toutput_addrs.GetDataPtr());

    STDGPUEraseKernel<<<blocks, threads>>>(impl_, buffer_accessor_,
                                           static_cast<const Key*>(input_keys),
                                           output_addrs, output_masks, count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
}

template <typename Key>
struct ValueExtractor {
    OPEN3D_HOST_DEVICE addr_t
    operator()(const thrust::pair<Key, addr_t>& x) const {
        return x.second;
    }
};

template <typename Key, typename Hash>
int64_t StdGPUHashmap<Key, Hash>::GetActiveIndices(addr_t* output_indices) {
    auto range = impl_.device_range();

    thrust::transform(range.begin(), range.end(), output_indices,
                      ValueExtractor<Key>());

    return impl_.size();
}
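
// The device_range()/transform pattern above visits only the occupied slots
// of the stdgpu map. A minimal standalone sketch of the same idea
// (hypothetical, independent of this class):
//
//     thrust::device_vector<thrust::pair<int, uint32_t>> pairs = /* ... */;
//     thrust::device_vector<uint32_t> vals(pairs.size());
//     thrust::transform(pairs.begin(), pairs.end(), vals.begin(),
//                       ValueExtractor<int>());
//
// i.e., a unary functor maps each (key, addr) pair to its addr field.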

template <typename Key, typename Hash>
void StdGPUHashmap<Key, Hash>::Clear() {
    impl_.clear();
    buffer_accessor_.Reset(this->device_);
}

template <typename Key, typename Hash>
void StdGPUHashmap<Key, Hash>::Rehash(int64_t buckets) {
    int64_t iterator_count = Size();

    Tensor active_keys;
    Tensor active_values;

    if (iterator_count > 0) {
        Tensor active_addrs({iterator_count}, Dtype::Int32, this->device_);
        GetActiveIndices(static_cast<addr_t*>(active_addrs.GetDataPtr()));

        Tensor active_indices = active_addrs.To(Dtype::Int64);
        active_keys = this->GetKeyBuffer().IndexGet({active_indices});
        active_values = this->GetValueBuffer().IndexGet({active_indices});
    }

    float avg_capacity_per_bucket =
            float(this->capacity_) / float(GetBucketCount());

    Free();
    int64_t new_capacity =
            int64_t(std::ceil(buckets * avg_capacity_per_bucket));
    Allocate(new_capacity);

    if (iterator_count > 0) {
        Tensor output_addrs({iterator_count}, Dtype::Int32, this->device_);
        Tensor output_masks({iterator_count}, Dtype::Bool, this->device_);

        InsertImpl(active_keys.GetDataPtr(), active_values.GetDataPtr(),
                   static_cast<addr_t*>(output_addrs.GetDataPtr()),
                   output_masks.GetDataPtr<bool>(), iterator_count);
    }
}
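
// Rehash is destructive: Free() releases both the stdgpu map and the
// key/value buffer, so active entries are first copied out through
// GetActiveIndices + IndexGet, then re-inserted with InsertImpl once
// Allocate has built the larger containers. The average capacity per bucket
// is approximately preserved, e.g. 1000 slots / 500 buckets = 2 slots per
// bucket both before and after the rehash.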

template <typename Key, typename Hash>
int64_t StdGPUHashmap<Key, Hash>::GetBucketCount() const {
    return impl_.bucket_count();
}

template <typename Key, typename Hash>
std::vector<int64_t> StdGPUHashmap<Key, Hash>::BucketSizes() const {
    utility::LogError("Unimplemented");
}

template <typename Key, typename Hash>
float StdGPUHashmap<Key, Hash>::LoadFactor() const {
    return impl_.load_factor();
}

// Need an explicit kernel for non-const access to map
template <typename Key, typename Hash>
__global__ void STDGPUInsertKernel(stdgpu::unordered_map<Key, addr_t, Hash> map,
                                   CUDAHashmapBufferAccessor buffer_accessor,
                                   const Key* input_keys,
                                   const void* input_values,
                                   int64_t dsize_value,
                                   addr_t* output_addrs,
                                   bool* output_masks,
                                   int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    output_addrs[tid] = 0;
    output_masks[tid] = false;

    // First apply 'try insert' with a dummy index.
    auto res = map.emplace(key, 0);

    // On success, allocate a buffer slot and replace the dummy index with
    // the actual address.
    if (res.second) {
        addr_t dst_kv_addr = buffer_accessor.DeviceAllocate();
        auto dst_kv_iter = buffer_accessor.ExtractIterator(dst_kv_addr);

        // Copy templated key to buffer (duplicate).
        // TODO: hack stdgpu inside and take out the buffer directly.
        *static_cast<Key*>(dst_kv_iter.first) = key;

        // Copy/reset non-templated value in buffer.
        uint8_t* dst_value = static_cast<uint8_t*>(dst_kv_iter.second);
        if (input_values != nullptr) {
            const uint8_t* src_value =
                    static_cast<const uint8_t*>(input_values) +
                    dsize_value * tid;
            for (int byte = 0; byte < dsize_value; ++byte) {
                dst_value[byte] = src_value[byte];
            }
        }

        // Update from the dummy index.
        res.first->second = dst_kv_addr;

        // Write to return variables.
        output_addrs[tid] = dst_kv_addr;
        output_masks[tid] = true;
    }
}
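
// The two-phase pattern above keeps insertion race-free: map.emplace() is
// the only synchronization point, so if two threads insert the same key
// concurrently, exactly one observes res.second == true and proceeds to
// allocate and fill a buffer slot; the loser reports output_masks = false
// and touches nothing. The value copy is a plain byte loop because the
// value type is opaque here — only its size, dsize_value, is known.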

template <typename Key, typename Hash>
void StdGPUHashmap<Key, Hash>::InsertImpl(const void* input_keys,
                                          const void* input_values,
                                          addr_t* output_addrs,
                                          bool* output_masks,
                                          int64_t count) {
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    STDGPUInsertKernel<<<blocks, threads>>>(impl_, buffer_accessor_,
                                            static_cast<const Key*>(input_keys),
                                            input_values, this->dsize_value_,
                                            output_addrs, output_masks, count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
}

template <typename Key, typename Hash>
void StdGPUHashmap<Key, Hash>::Allocate(int64_t capacity) {
    this->capacity_ = capacity;

    // Allocate the flat buffer for keys and values.
    this->buffer_ =
            std::make_shared<HashmapBuffer>(this->capacity_, this->dsize_key_,
                                            this->dsize_value_, this->device_);

    buffer_accessor_.HostAllocate(this->device_);
    buffer_accessor_.Setup(this->capacity_, this->dsize_key_,
                           this->dsize_value_, this->buffer_->GetKeyBuffer(),
                           this->buffer_->GetValueBuffer(),
                           this->buffer_->GetHeap());
    buffer_accessor_.Reset(this->device_);

    impl_ = stdgpu::unordered_map<Key, addr_t, Hash>::createDeviceObject(
            this->capacity_);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
}
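
// Allocation lifecycle, summarizing the code above and Free() below:
// Allocate() builds three coupled structures sized to `capacity` — the
// HashmapBuffer holding flat key/value storage, the buffer accessor with its
// device-side heap of free slot addresses, and the stdgpu map from keys to
// slot addresses. Free() tears down the accessor and the stdgpu map; the
// buffer itself is released by its shared_ptr.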

template <typename Key, typename Hash>
void StdGPUHashmap<Key, Hash>::Free() {
    // The key/value buffer is automatically freed by its shared_ptr.

    buffer_accessor_.HostFree(this->device_);

    stdgpu::unordered_map<Key, addr_t, Hash>::destroyDeviceObject(impl_);
}
}  // namespace core
}  // namespace open3d