29 #include <stdgpu/memory.h> 30 #include <thrust/transform.h> 32 #include <stdgpu/unordered_map.cuh> 33 #include <type_traits> 75 : std_allocator_(other.std_allocator_) {}
83 T* p = std_allocator_.allocate(n);
84 stdgpu::register_memory(p, n, stdgpu::dynamic_memory_type::device);
94 stdgpu::deregister_memory(p, n, stdgpu::dynamic_memory_type::device);
95 std_allocator_.deallocate(p, n);
100 return std_allocator_ == other.std_allocator_;
111 template <
typename T2>
119 template <
typename Key>
123 template <
typename Key,
typename Hash>
125 stdgpu::unordered_map<Key,
128 stdgpu::equal_to<Key>,
131 template <
typename Key,
typename Hash>
140 void Rehash(int64_t buckets)
override;
142 void Insert(
const void* input_keys,
143 const void* input_values,
144 addr_t* output_addrs,
146 int64_t
count)
override;
148 void Activate(
const void* input_keys,
149 addr_t* output_addrs,
151 int64_t count)
override;
153 void Find(
const void* input_keys,
154 addr_t* output_addrs,
156 int64_t count)
override;
158 void Erase(
const void* input_keys,
160 int64_t count)
override;
162 int64_t GetActiveIndices(addr_t* output_indices)
override;
164 void Clear()
override;
166 int64_t Size()
const override;
168 int64_t GetBucketCount()
const override;
169 std::vector<int64_t> BucketSizes()
const override;
170 float LoadFactor()
const override;
181 void InsertImpl(
const void* input_keys,
182 const void* input_values,
183 addr_t* output_addrs,
187 void Allocate(int64_t capacity);
191 template <
typename Key,
typename Hash>
196 :
DeviceHashmap(init_capacity, dsize_key, dsize_value, device) {
200 template <
typename Key,
typename Hash>
205 template <
typename Key,
typename Hash>
210 template <
typename Key,
typename Hash>
212 const void* input_values,
213 addr_t* output_addrs,
219 float avg_capacity_per_bucket =
221 int64_t expected_buckets = std::max(
223 int64_t(
std::ceil(new_size / avg_capacity_per_bucket)));
226 InsertImpl(input_keys, input_values, output_addrs, output_masks, count);
229 template <
typename Key,
typename Hash>
231 addr_t* output_addrs,
234 Insert(input_keys,
nullptr, output_addrs, output_masks, count);
238 template <
typename Key,
typename Hash>
241 const Key* input_keys,
242 addr_t* output_addrs,
245 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
246 if (tid >= count)
return;
248 Key key = input_keys[tid];
249 auto iter = map.find(key);
250 bool flag = (iter != map.end());
251 output_masks[tid] = flag;
252 output_addrs[tid] = flag ? iter->second : 0;
255 template <
typename Key,
typename Hash>
257 addr_t* output_addrs,
261 uint32_t blocks = (count + threads - 1) / threads;
263 STDGPUFindKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
265 output_addrs, output_masks, count);
270 template <
typename Key,
typename Hash>
273 const Key* input_keys,
274 addr_t* output_addrs,
277 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
278 if (tid >= count)
return;
280 Key key = input_keys[tid];
281 auto iter = map.find(key);
282 bool flag = (iter != map.end());
283 output_masks[tid] = flag;
284 output_addrs[tid] = flag ? iter->second : 0;
286 if (output_masks[tid]) {
287 output_masks[tid] = map.erase(key);
288 if (output_masks[tid]) {
289 buffer_accessor.
DeviceFree(output_addrs[tid]);
294 template <
typename Key,
typename Hash>
299 uint32_t blocks = (count + threads - 1) / threads;
303 addr_t* output_addrs =
static_cast<addr_t*
>(toutput_addrs.
GetDataPtr());
305 STDGPUEraseKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
307 output_addrs, output_masks, count);
311 template <
typename Key>
319 template <
typename Key,
typename Hash>
321 auto range =
impl_.device_range();
323 thrust::transform(range.begin(), range.end(), output_indices,
329 template <
typename Key,
typename Hash>
335 template <
typename Key,
typename Hash>
337 int64_t iterator_count =
Size();
342 if (iterator_count > 0) {
351 float avg_capacity_per_bucket =
355 int64_t new_capacity =
356 int64_t(
std::ceil(buckets * avg_capacity_per_bucket));
359 if (iterator_count > 0) {
364 static_cast<addr_t*
>(output_addrs.GetDataPtr()),
365 output_masks.GetDataPtr<
bool>(), iterator_count);
369 template <
typename Key,
typename Hash>
371 return impl_.bucket_count();
374 template <
typename Key,
typename Hash>
379 template <
typename Key,
typename Hash>
381 return impl_.load_factor();
385 template <
typename Key,
typename Hash>
388 const Key* input_keys,
389 const void* input_values,
391 addr_t* output_addrs,
394 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
395 if (tid >= count)
return;
397 Key key = input_keys[tid];
398 output_addrs[tid] = 0;
399 output_masks[tid] =
false;
402 auto res = map.emplace(key, 0);
411 *
static_cast<Key*
>(dst_kv_iter.first) = key;
414 uint8_t* dst_value =
static_cast<uint8_t*
>(dst_kv_iter.second);
415 if (input_values !=
nullptr) {
416 const uint8_t* src_value =
417 static_cast<const uint8_t*
>(input_values) +
419 for (
int byte = 0; byte < dsize_value; ++byte) {
420 dst_value[byte] = src_value[byte];
425 res.
first->second = dst_kv_addr;
428 output_addrs[tid] = dst_kv_addr;
429 output_masks[tid] =
true;
433 template <
typename Key,
typename Hash>
435 const void* input_values,
436 addr_t* output_addrs,
440 uint32_t blocks = (count + threads - 1) / threads;
442 STDGPUInsertKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
444 input_values, this->
dsize_value_, output_addrs, output_masks,
449 template <
typename Key,
typename Hash>
460 this->dsize_value_, this->
buffer_->GetKeyBuffer(),
461 this->
buffer_->GetValueBuffer(),
468 CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
477 template <
typename Key,
typename Hash>
486 CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
void * first
Definition: SlabTraits.h:54
void Clear() override
Clear stored map without reallocating memory.
Definition: StdGPUHashmap.h:330
void Insert(const void *input_keys, const void *input_values, addr_t *output_addrs, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
Definition: StdGPUHashmap.h:211
__global__ void STDGPUEraseKernel(InternalStdGPUHashmap< Key, Hash > map, CUDAHashmapBufferAccessor buffer_accessor, const Key *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count)
Definition: StdGPUHashmap.h:271
bool operator==(const StdGPUAllocator &other)
Returns true if the instances are equal, false otherwise.
Definition: StdGPUHashmap.h:99
void Free()
Definition: StdGPUHashmap.h:478
StdGPUAllocator(const StdGPUAllocator< U > &other)
Rebind copy constructor.
Definition: StdGPUHashmap.h:74
bool operator!=(const StdGPUAllocator &other)
Returns true if the instances are not equal, false otherwise.
Definition: StdGPUHashmap.h:104
const Dtype Bool
Definition: Dtype.cpp:72
const Dtype Int64
Definition: Dtype.cpp:67
T value_type
T.
Definition: StdGPUHashmap.h:52
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:557
StdGPUHashmap(int64_t init_capacity, int64_t dsize_key, int64_t dsize_value, const Device &device)
Definition: StdGPUHashmap.h:192
~StdGPUHashmap()
Definition: StdGPUHashmap.h:201
void Allocate(int64_t capacity)
Definition: StdGPUHashmap.h:450
__host__ void Setup(int64_t capacity, int64_t dsize_key, int64_t dsize_value, Tensor &keys, Tensor &values, Tensor &heap)
Definition: CUDAHashmapBufferAccessor.h:50
__host__ void HostAllocate(const Device &device)
Definition: CUDAHashmapBufferAccessor.h:80
void deallocate(T *p, std::size_t n)
Deallocates memory from pointer p of size n .
Definition: StdGPUHashmap.h:89
Definition: StdGPUHashmap.h:132
FN_SPECIFIERS MiniVec< float, N > ceil(const MiniVec< float, N > &a)
Definition: MiniVec.h:108
Definition: DeviceHashmap.h:39
__device__ addr_t DeviceAllocate()
Definition: CUDAHashmapBufferAccessor.h:92
Definition: StdAllocator.h:42
void Synchronize()
Definition: CUDAUtils.cpp:72
Tensor IndexGet(const std::vector< Tensor > &index_tensors) const
Advanced indexing getter.
Definition: Tensor.cpp:707
Definition: CUDAHashmapBufferAccessor.h:48
#define OPEN3D_HOST_DEVICE
Definition: CUDAUtils.h:63
void Rehash(int64_t buckets) override
Definition: StdGPUHashmap.h:336
const Dtype Int32
Definition: Dtype.cpp:66
__device__ void DeviceFree(addr_t ptr)
Definition: CUDAHashmapBufferAccessor.h:97
InternalStdGPUHashmap< Key, Hash > impl_
Definition: StdGPUHashmap.h:177
Tensor To(Dtype dtype, bool copy=false) const
Definition: Tensor.cpp:541
__device__ iterator_t ExtractIterator(addr_t ptr)
Definition: CUDAHashmapBufferAccessor.h:109
int64_t GetActiveIndices(addr_t *output_indices) override
Parallel collect all iterators in the hash table.
Definition: StdGPUHashmap.h:320
Device device_
Definition: DeviceHashmap.h:113
StdGPUAllocator(const Device &device)
Constructor from device.
Definition: StdGPUHashmap.h:58
void Activate(const void *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count) override
Definition: StdGPUHashmap.h:230
Definition: StdGPUHashmap.h:49
int64_t GetBucketCount() const override
Definition: StdGPUHashmap.h:370
T * allocate(std::size_t n)
Allocates memory of size n.
Definition: StdGPUHashmap.h:78
StdGPUAllocator()=default
Default constructor.
int count
Definition: FilePCD.cpp:61
float LoadFactor() const override
Definition: StdGPUHashmap.h:380
int64_t Size() const override
Definition: StdGPUHashmap.h:206
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
Definition: StdGPUHashmap.h:295
Definition: PinholeCameraIntrinsic.cpp:35
CUDAHashmapBufferAccessor buffer_accessor_
Definition: StdGPUHashmap.h:179
__global__ void STDGPUFindKernel(InternalStdGPUHashmap< Key, Hash > map, CUDAHashmapBufferAccessor buffer_accessor, const Key *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count)
Definition: StdGPUHashmap.h:239
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle float
Definition: K4aPlugin.cpp:465
int64_t capacity_
Definition: DeviceHashmap.h:109
__global__ void STDGPUInsertKernel(InternalStdGPUHashmap< Key, Hash > map, CUDAHashmapBufferAccessor buffer_accessor, const Key *input_keys, const void *input_values, int64_t dsize_value, addr_t *output_addrs, bool *output_masks, int64_t count)
Definition: StdGPUHashmap.h:386
void InsertImpl(const void *input_keys, const void *input_values, addr_t *output_addrs, bool *output_masks, int64_t count)
Definition: StdGPUHashmap.h:434
Tensor & GetKeyBuffer()
Definition: DeviceHashmap.h:101
uint32_t addr_t
Definition: HashmapBuffer.h:58
int64_t dsize_key_
Definition: DeviceHashmap.h:110
StdGPUAllocator & operator=(const StdGPUAllocator &)=default
Default copy assignment operator.
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle image_handle timestamp_usec white_balance image_handle k4a_device_configuration_t config device_handle char size_t serial_number_size bool int32_t int32_t int32_t int32_t k4a_color_control_mode_t default_mode value const const k4a_calibration_t calibration char size_t
Definition: K4aPlugin.cpp:724
T * GetDataPtr()
Definition: Tensor.h:1004
stdgpu::unordered_map< Key, addr_t, Hash, stdgpu::equal_to< Key >, InternalStdGPUHashmapAllocator< Key > > InternalStdGPUHashmap
Definition: StdGPUHashmap.h:129
__host__ void HostFree(const Device &device)
Definition: CUDAHashmapBufferAccessor.h:85
__host__ void Reset(const Device &device)
Definition: CUDAHashmapBufferAccessor.h:67
int64_t dsize_value_
Definition: DeviceHashmap.h:111
Tensor & GetValueBuffer()
Definition: DeviceHashmap.h:102
Device GetDevice() const
Returns the device on which memory is allocated.
Definition: StdGPUHashmap.h:107
InternalStdGPUHashmap< Key, Hash > GetImpl() const
Definition: StdGPUHashmap.h:172
std::shared_ptr< HashmapBuffer > buffer_
Definition: DeviceHashmap.h:115
void Find(const void *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
Definition: StdGPUHashmap.h:256
std::vector< int64_t > BucketSizes() const override
Definition: StdGPUHashmap.h:375
#define LogError(...)
Definition: Logging.h:78