template <typename Key, typename Hash>
class SlabHashmapImpl {
public:
    SlabHashmapImpl();

    __host__ void Setup(int64_t init_buckets,
                        int64_t init_capacity,
                        int64_t dsize_key,
                        int64_t dsize_value,
                        const SlabNodeManagerImpl& node_mgr_impl,
                        const CUDAHashmapBufferAccessor& buffer_accessor);

    __device__ bool Insert(bool lane_active,
                           uint32_t lane_id,
                           uint32_t bucket_id,
                           const Key& key,
                           addr_t iterator_addr);

    __device__ Pair<addr_t, bool> Find(bool lane_active,
                                       uint32_t lane_id,
                                       uint32_t bucket_id,
                                       const Key& key);

    __device__ Pair<addr_t, bool> Erase(bool lane_active,
                                        uint32_t lane_id,
                                        uint32_t bucket_id,
                                        const Key& key);

    __device__ void WarpSyncKey(const Key& key, uint32_t lane_id, Key& ret_key);
    __device__ int32_t WarpFindKey(const Key& src_key,
                                   uint32_t lane_id,
                                   addr_t ptr);
    __device__ int32_t WarpFindEmpty(addr_t unit_data);
    __device__ int64_t ComputeBucket(const Key& key) const;
    __device__ addr_t AllocateSlab(uint32_t lane_id);
    __device__ void FreeSlab(addr_t slab_ptr);

    __device__ addr_t* get_unit_ptr_from_list_nodes(addr_t slab_ptr,
                                                    uint32_t lane_id) {
        return node_mgr_impl_.get_unit_ptr_from_slab(slab_ptr, lane_id);
    }
    __device__ addr_t* get_unit_ptr_from_list_head(uint32_t bucket_id,
                                                   uint32_t lane_id) {
        return reinterpret_cast<uint32_t*>(bucket_list_head_) +
               bucket_id * kWarpSize + lane_id;
    }

public:
    Hash hash_fn_;

    int64_t bucket_count_;
    int64_t capacity_;
    int64_t dsize_key_;
    int64_t dsize_value_;

    Slab* bucket_list_head_;
    SlabNodeManagerImpl node_mgr_impl_;
    CUDAHashmapBufferAccessor buffer_accessor_;

    int key_size_in_int_ = sizeof(Key) / sizeof(int);
};
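// A hedged host-side usage sketch (not part of the original header): wiring a
// SlabHashmapImpl to its allocators before any kernel launches. The helper
// name `SetupSketch`, the bucket/capacity numbers, and the float value type
// are illustrative assumptions; the node manager and buffer accessor are
// assumed to be initialized elsewhere.
template <typename Key, typename Hash>
void SetupSketch(SlabHashmapImpl<Key, Hash>& impl,
                 const SlabNodeManagerImpl& node_mgr_impl,
                 const CUDAHashmapBufferAccessor& buffer_accessor) {
    impl.Setup(/*init_buckets=*/1024, /*init_capacity=*/1 << 16,
               /*dsize_key=*/sizeof(Key), /*dsize_value=*/sizeof(float),
               node_mgr_impl, buffer_accessor);
}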
template <typename Key, typename Hash>
__global__ void InsertKernelPass0(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  int heap_counter_prev,
                                  int64_t count);

template <typename Key, typename Hash>
__global__ void InsertKernelPass1(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count);

template <typename Key, typename Hash>
__global__ void InsertKernelPass2(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_values,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count);

template <typename Key, typename Hash>
__global__ void FindKernel(SlabHashmapImpl<Key, Hash> impl,
                           const void* input_keys,
                           addr_t* output_addrs,
                           bool* output_masks,
                           int64_t count);

template <typename Key, typename Hash>
__global__ void EraseKernelPass0(SlabHashmapImpl<Key, Hash> impl,
                                 const void* input_keys,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count);

template <typename Key, typename Hash>
__global__ void EraseKernelPass1(SlabHashmapImpl<Key, Hash> impl,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count);

template <typename Key, typename Hash>
__global__ void GetActiveIndicesKernel(SlabHashmapImpl<Key, Hash> impl,
                                       addr_t* output_addrs,
                                       uint32_t* output_iterator_count);

template <typename Key, typename Hash>
__global__ void CountElemsPerBucketKernel(SlabHashmapImpl<Key, Hash> impl,
                                          int64_t* bucket_elem_counts);
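// A hedged host-side sketch (not part of the original file) of how the three
// insert passes are typically chained: pass 0 stages keys into the buffer,
// pass 1 performs the warp-cooperative slab insertion and fills
// `output_masks`, and pass 2 copies values for surviving entries. The helper
// name and the 128-thread block size are assumptions.
template <typename Key, typename Hash>
void LaunchInsertSketch(SlabHashmapImpl<Key, Hash>& impl,
                        const void* input_keys,
                        const void* input_values,
                        addr_t* output_addrs,
                        bool* output_masks,
                        int heap_counter_prev,
                        int64_t count) {
    const uint32_t threads = 128;
    const uint32_t blocks =
            static_cast<uint32_t>((count + threads - 1) / threads);
    InsertKernelPass0<<<blocks, threads>>>(impl, input_keys, output_addrs,
                                           heap_counter_prev, count);
    InsertKernelPass1<<<blocks, threads>>>(impl, input_keys, output_addrs,
                                           output_masks, count);
    InsertKernelPass2<<<blocks, threads>>>(impl, input_values, output_addrs,
                                           output_masks, count);
    cudaDeviceSynchronize();
}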
template <typename Key, typename Hash>
SlabHashmapImpl<Key, Hash>::SlabHashmapImpl()
    : bucket_count_(0), bucket_list_head_(nullptr) {}
template <typename Key, typename Hash>
void SlabHashmapImpl<Key, Hash>::Setup(
        int64_t init_buckets,
        int64_t init_capacity,
        int64_t dsize_key,
        int64_t dsize_value,
        const SlabNodeManagerImpl& allocator_impl,
        const CUDAHashmapBufferAccessor& pair_allocator_impl) {
    bucket_count_ = init_buckets;
    capacity_ = init_capacity;
    dsize_key_ = dsize_key;
    dsize_value_ = dsize_value;

    node_mgr_impl_ = allocator_impl;
    buffer_accessor_ = pair_allocator_impl;
}
template <typename Key, typename Hash>
__device__ bool SlabHashmapImpl<Key, Hash>::Insert(bool lane_active,
                                                   uint32_t lane_id,
                                                   uint32_t bucket_id,
                                                   const Key& key,
                                                   addr_t iterator_addr) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    uint32_t curr_slab_ptr = kHeadSlabAddr;
    Key src_key;
    bool mask = false;

    // Loop while any lane in the warp still has work to do.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // Restart from the linked-list head if the last request is done.
        curr_slab_ptr =
                (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
        WarpSyncKey(key, src_lane, src_key);

        // Each lane reads one unit of the current slab.
        uint32_t unit_data =
                (curr_slab_ptr == kHeadSlabAddr)
                        ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
                        : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                         lane_id));
        int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);
        int32_t lane_empty = WarpFindEmpty(unit_data);

        // Branch 1: the key already exists, abort.
        if (lane_found >= 0) {
            if (lane_id == src_lane) {
                lane_active = false;
            }
        }
        // Branch 2: an empty slot exists, try to claim it with a CAS.
        else if (lane_empty >= 0) {
            if (lane_id == src_lane) {
                uint32_t* unit_data_ptr =
                        (curr_slab_ptr == kHeadSlabAddr)
                                ? get_unit_ptr_from_list_head(src_bucket,
                                                              lane_empty)
                                : get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                               lane_empty);

                addr_t old_iterator_addr =
                        atomicCAS((unsigned int*)unit_data_ptr, kEmptyNodeAddr,
                                  iterator_addr);

                // CAS succeeded: insertion done. Otherwise another thread
                // claimed the slot first; this lane retries next round.
                if (old_iterator_addr == kEmptyNodeAddr) {
                    lane_active = false;
                    mask = true;
                }
            }
        }
        // Branch 3: the slab is full, move to (or allocate) the next slab.
        else {
            addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
                                               kNextSlabPtrLaneId, kWarpSize);

            // Branch 3.1: a next slab exists, restart from it.
            if (next_slab_ptr != kEmptySlabAddr) {
                curr_slab_ptr = next_slab_ptr;
            }
            // Branch 3.2: no next slab, try to link a freshly allocated one.
            else {
                addr_t new_next_slab_ptr = AllocateSlab(lane_id);

                if (lane_id == kNextSlabPtrLaneId) {
                    uint32_t* unit_data_ptr =
                            (curr_slab_ptr == kHeadSlabAddr)
                                    ? get_unit_ptr_from_list_head(
                                              src_bucket, kNextSlabPtrLaneId)
                                    : get_unit_ptr_from_list_nodes(
                                              curr_slab_ptr,
                                              kNextSlabPtrLaneId);

                    addr_t old_next_slab_ptr =
                            atomicCAS((unsigned int*)unit_data_ptr,
                                      kEmptySlabAddr, new_next_slab_ptr);

                    // Another thread linked a slab first: free ours, retry.
                    if (old_next_slab_ptr != kEmptySlabAddr) {
                        FreeSlab(new_next_slab_ptr);
                    }
                }
            }
        }

        prev_work_queue = work_queue;
    }

    return mask;
}
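// Insert() above is built on a warp-cooperative "work queue": every lane with
// pending work raises a ballot bit, __ffs picks the lowest active lane, and
// all 32 lanes then serve that lane's request together. The standalone toy
// kernel below (an illustration, not part of this header; assumes blockDim.x
// is a multiple of 32) shows the same __ballot_sync / __ffs / __shfl_sync
// skeleton in isolation.
__global__ void WarpWorkQueueDemo(const int* inputs, int* outputs, int n) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;
    bool lane_active = tid < n;
    int my_input = lane_active ? inputs[tid] : 0;

    uint32_t work_queue;
    while ((work_queue = __ballot_sync(0xFFFFFFFF, lane_active))) {
        // Pick the lowest-indexed lane that still has work.
        uint32_t src_lane = __ffs(work_queue) - 1;
        // Broadcast its request to the whole warp; in the hash map, the warp
        // would now cooperatively scan a slab for this request.
        int request = __shfl_sync(0xFFFFFFFF, my_input, src_lane, 32);
        if (lane_id == src_lane) {
            outputs[tid] = request;  // Toy "service": echo the request.
            lane_active = false;     // Retire this lane's work item.
        }
    }
}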
template <typename Key, typename Hash>
__device__ Pair<addr_t, bool> SlabHashmapImpl<Key, Hash>::Find(
        bool lane_active,
        uint32_t lane_id,
        uint32_t bucket_id,
        const Key& query_key) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = work_queue;
    uint32_t curr_slab_ptr = kHeadSlabAddr;

    addr_t iterator = kNullAddr;
    bool mask = false;

    // Loop while any lane in the warp still has work to do.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // Restart from the linked-list head if the last query is done.
        curr_slab_ptr =
                (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);

        Key src_key;
        WarpSyncKey(query_key, src_lane, src_key);

        // Each lane reads one unit of the current slab.
        const uint32_t unit_data =
                (curr_slab_ptr == kHeadSlabAddr)
                        ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
                        : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                         lane_id));
        int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);

        // Branch 1: the key is in this slab; broadcast the pair address.
        if (lane_found >= 0) {
            addr_t found_pair_internal_ptr = __shfl_sync(
                    kSyncLanesMask, unit_data, lane_found, kWarpSize);

            if (lane_id == src_lane) {
                lane_active = false;
                iterator = found_pair_internal_ptr;
                mask = true;
            }
        }
        // Branch 2: not in this slab; follow the next-slab pointer.
        else {
            addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
                                               kNextSlabPtrLaneId, kWarpSize);

            // Branch 2.1: no next slab, the key does not exist.
            if (next_slab_ptr == kEmptySlabAddr) {
                if (lane_id == src_lane) {
                    lane_active = false;
                }
            }
            // Branch 2.2: move on to the next slab.
            else {
                curr_slab_ptr = next_slab_ptr;
            }
        }

        prev_work_queue = work_queue;
    }

    return make_pair(iterator, mask);
}
template <typename Key, typename Hash>
__device__ Pair<addr_t, bool> SlabHashmapImpl<Key, Hash>::Erase(
        bool lane_active,
        uint32_t lane_id,
        uint32_t bucket_id,
        const Key& key) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    uint32_t curr_slab_ptr = kHeadSlabAddr;
    Key src_key;

    addr_t iterator_addr = 0;
    bool mask = false;

    // Loop while any lane in the warp still has work to do.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // Restart from the linked-list head if the last request is done.
        curr_slab_ptr =
                (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);

        WarpSyncKey(key, src_lane, src_key);

        const uint32_t unit_data =
                (curr_slab_ptr == kHeadSlabAddr)
                        ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
                        : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                         lane_id));
        int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);

        // Branch 1: key found; atomically clear the slot. atomicExch leaves
        // exactly one winner if several warps erase the same key, avoiding a
        // double free.
        if (lane_found >= 0) {
            if (lane_id == src_lane) {
                uint32_t* unit_data_ptr =
                        (curr_slab_ptr == kHeadSlabAddr)
                                ? get_unit_ptr_from_list_head(src_bucket,
                                                              lane_found)
                                : get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                               lane_found);

                uint32_t pair_to_delete = atomicExch(
                        (unsigned int*)unit_data_ptr, kEmptyNodeAddr);
                mask = pair_to_delete != kEmptyNodeAddr;
                iterator_addr = pair_to_delete;
                lane_active = false;
            }
        }
        // Branch 2: not in this slab; follow the next-slab pointer or abort.
        else {
            addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
                                               kNextSlabPtrLaneId, kWarpSize);
            if (next_slab_ptr == kEmptySlabAddr) {
                // No next slab: the key does not exist.
                if (lane_id == src_lane) {
                    lane_active = false;
                }
            } else {
                curr_slab_ptr = next_slab_ptr;
            }
        }

        prev_work_queue = work_queue;
    }

    return make_pair(iterator_addr, mask);
}
template <typename Key, typename Hash>
__device__ void SlabHashmapImpl<Key, Hash>::WarpSyncKey(const Key& key,
                                                        uint32_t lane_id,
                                                        Key& ret_key) {
    auto dst_key_ptr = reinterpret_cast<int*>(&ret_key);
    auto src_key_ptr = reinterpret_cast<const int*>(&key);
    // Broadcast the key from lane_id to all lanes, one int at a time.
    for (int i = 0; i < key_size_in_int_; ++i) {
        dst_key_ptr[i] =
                __shfl_sync(kSyncLanesMask, src_key_ptr[i], lane_id, kWarpSize);
    }
}
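// WarpSyncKey moves the key in 32-bit chunks because __shfl_sync operates on
// register-sized values, which implies sizeof(Key) should be a multiple of
// sizeof(int). A hypothetical key type that satisfies this contract (used
// only in the sketches below, not part of the original file):
struct Int3Key {
    int x, y, z;  // key_size_in_int_ == sizeof(Int3Key) / sizeof(int) == 3
};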
template <typename Key, typename Hash>
__device__ int32_t SlabHashmapImpl<Key, Hash>::WarpFindKey(const Key& src_key,
                                                           uint32_t lane_id,
                                                           addr_t ptr) {
    bool is_lane_found =
            // Select the key lanes.
            ((1 << lane_id) & kNodePtrLanesMask)
            // Validate the key address.
            && (ptr != kEmptyNodeAddr)
            // Compare against the key stored in the buffer.
            && *static_cast<Key*>(buffer_accessor_.ExtractIterator(ptr).first)
                       == src_key;

    return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_found)) - 1;
}
template <typename Key, typename Hash>
__device__ int32_t SlabHashmapImpl<Key, Hash>::WarpFindEmpty(addr_t ptr) {
    bool is_lane_empty = (ptr == kEmptyNodeAddr);
    return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_empty)) - 1;
}
template <typename Key, typename Hash>
__device__ int64_t SlabHashmapImpl<Key, Hash>::ComputeBucket(
        const Key& key) const {
    return hash_fn_(key) % bucket_count_;
}

template <typename Key, typename Hash>
__device__ addr_t SlabHashmapImpl<Key, Hash>::AllocateSlab(uint32_t lane_id) {
    return node_mgr_impl_.WarpAllocate(lane_id);
}

template <typename Key, typename Hash>
__device__ void SlabHashmapImpl<Key, Hash>::FreeSlab(addr_t slab_ptr) {
    node_mgr_impl_.FreeUntouched(slab_ptr);
}
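// ComputeBucket only requires that `Hash` is callable on a Key and that the
// result can be reduced modulo bucket_count_. A hedged sketch of a compatible
// functor for the hypothetical Int3Key above; the mixing constants are
// arbitrary choices, not from the original file.
struct Int3KeyHash {
    __host__ __device__ uint64_t operator()(const Int3Key& key) const {
        // Simple multiplicative mixing; any deterministic, device-callable
        // hash works here.
        return (static_cast<uint64_t>(key.x) * 73856093ULL) ^
               (static_cast<uint64_t>(key.y) * 19349669ULL) ^
               (static_cast<uint64_t>(key.z) * 83492791ULL);
    }
};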
template <typename Key, typename Hash>
__global__ void InsertKernelPass0(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  int heap_counter_prev,
                                  int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < count) {
        // Stage ALL input keys first to avoid thread conflicts later.
        addr_t iterator_addr =
                impl.buffer_accessor_.heap_[heap_counter_prev + tid];
        iterator_t iterator =
                impl.buffer_accessor_.ExtractIterator(iterator_addr);

        *static_cast<Key*>(iterator.first) = input_keys_templated[tid];
        output_addrs[tid] = iterator_addr;
    }
}
template <typename Key, typename Hash>
__global__ void InsertKernelPass1(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // This warp is idle.
    if (tid - lane_id >= count) {
        return;
    }

    // Initialize the memory allocator on each warp.
    impl.node_mgr_impl_.Init(tid, lane_id);

    bool lane_active = false;
    uint32_t bucket_id = 0;
    addr_t iterator_addr = 0;

    // Dummy for warp sync.
    Key key;
    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        iterator_addr = output_addrs[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    // Out-of-bound threads still run Insert for warp synchronization.
    bool mask = impl.Insert(lane_active, lane_id, bucket_id, key,
                            iterator_addr);

    if (tid < count) {
        output_masks[tid] = mask;
    }
}
template <typename Key, typename Hash>
__global__ void InsertKernelPass2(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_values,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < count) {
        addr_t iterator_addr = output_addrs[tid];

        if (output_masks[tid]) {
            iterator_t iterator =
                    impl.buffer_accessor_.ExtractIterator(iterator_addr);

            // Success: copy the remaining input values.
            if (input_values != nullptr) {
                MEMCPY_AS_INTS(iterator.second,
                               static_cast<const uint8_t*>(input_values) +
                                       tid * impl.dsize_value_,
                               impl.dsize_value_);
            }
        } else {
            // Failure: release the staged key/value pair.
            impl.buffer_accessor_.DeviceFree(iterator_addr);
        }
    }
}
template <typename Key, typename Hash>
__global__ void FindKernel(SlabHashmapImpl<Key, Hash> impl,
                           const void* input_keys,
                           addr_t* output_addrs,
                           bool* output_masks,
                           int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // This warp is idle.
    if ((tid - lane_id) >= count) {
        return;
    }

    // Initialize the memory allocator on each warp.
    impl.node_mgr_impl_.Init(tid, lane_id);

    bool lane_active = false;
    uint32_t bucket_id = 0;

    // Dummy for warp sync.
    Key key;
    Pair<addr_t, bool> result;

    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    result = impl.Find(lane_active, lane_id, bucket_id, key);

    if (tid < count) {
        output_addrs[tid] = result.first;
        output_masks[tid] = result.second;
    }
}
template <typename Key, typename Hash>
__global__ void EraseKernelPass0(SlabHashmapImpl<Key, Hash> impl,
                                 const void* input_keys,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // This warp is idle.
    if (tid - lane_id >= count) {
        return;
    }

    // Initialize the memory allocator on each warp.
    impl.node_mgr_impl_.Init(tid, lane_id);

    bool lane_active = false;
    uint32_t bucket_id = 0;

    // Dummy for warp sync.
    Key key;
    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    auto result = impl.Erase(lane_active, lane_id, bucket_id, key);

    if (tid < count) {
        output_addrs[tid] = result.first;
        output_masks[tid] = result.second;
    }
}
template <typename Key, typename Hash>
__global__ void EraseKernelPass1(SlabHashmapImpl<Key, Hash> impl,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    // Free the key/value pair of every successfully erased entry.
    if (tid < count && output_masks[tid]) {
        impl.buffer_accessor_.DeviceFree(output_addrs[tid]);
    }
}
template <typename Key, typename Hash>
__global__ void GetActiveIndicesKernel(SlabHashmapImpl<Key, Hash> impl,
                                       addr_t* output_addrs,
                                       uint32_t* output_iterator_count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // Assign one warp per bucket.
    uint32_t bucket_id = tid >> 5;
    if (bucket_id >= impl.bucket_count_) {
        return;
    }

    impl.node_mgr_impl_.Init(tid, lane_id);

    // Scan the head slab.
    uint32_t src_unit_data =
            *impl.get_unit_ptr_from_list_head(bucket_id, lane_id);
    bool is_active = src_unit_data != kEmptyNodeAddr;

    if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
        uint32_t index = atomicAdd(output_iterator_count, 1);
        output_addrs[index] = src_unit_data;
    }

    addr_t next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                              kWarpSize);

    // Scan the remaining slabs in the list.
    while (next != kEmptySlabAddr) {
        src_unit_data = *impl.get_unit_ptr_from_list_nodes(next, lane_id);
        is_active = (src_unit_data != kEmptyNodeAddr);

        if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
            uint32_t index = atomicAdd(output_iterator_count, 1);
            output_addrs[index] = src_unit_data;
        }
        next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                           kWarpSize);
    }
}
template <typename Key, typename Hash>
__global__ void CountElemsPerBucketKernel(SlabHashmapImpl<Key, Hash> impl,
                                          int64_t* bucket_elem_counts) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // Assign one warp per bucket.
    uint32_t bucket_id = tid >> 5;
    if (bucket_id >= impl.bucket_count_) {
        return;
    }

    impl.node_mgr_impl_.Init(tid, lane_id);

    uint32_t count = 0;

    // Count the head slab.
    uint32_t src_unit_data =
            *impl.get_unit_ptr_from_list_head(bucket_id, lane_id);
    count += __popc(
            __ballot_sync(kNodePtrLanesMask, src_unit_data != kEmptyNodeAddr));
    addr_t next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                              kWarpSize);

    // Count the remaining slabs in the list.
    while (next != kEmptySlabAddr) {
        src_unit_data = *impl.get_unit_ptr_from_list_nodes(next, lane_id);
        count += __popc(__ballot_sync(kNodePtrLanesMask,
                                      src_unit_data != kEmptyNodeAddr));
        next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                           kWarpSize);
    }

    // Lane 0 writes the per-bucket element count.
    if (lane_id == 0) {
        bucket_elem_counts[bucket_id] = count;
    }
}
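// A hedged host-side sketch (not part of the original file): launching the
// counting kernel with one warp per bucket and deriving the average bucket
// load. The helper name and block size are assumptions; <vector> is assumed
// to be included, and `bucket_elem_counts` must point to impl.bucket_count_
// device-side int64 slots.
template <typename Key, typename Hash>
double AverageElemsPerBucketSketch(SlabHashmapImpl<Key, Hash>& impl,
                                   int64_t* bucket_elem_counts) {
    const uint32_t threads = 128;
    const uint32_t blocks = static_cast<uint32_t>(
            (impl.bucket_count_ * kWarpSize + threads - 1) / threads);
    CountElemsPerBucketKernel<<<blocks, threads>>>(impl, bucket_elem_counts);
    cudaDeviceSynchronize();

    std::vector<int64_t> counts(impl.bucket_count_);
    cudaMemcpy(counts.data(), bucket_elem_counts,
               counts.size() * sizeof(int64_t), cudaMemcpyDeviceToHost);
    int64_t total = 0;
    for (int64_t c : counts) {
        total += c;
    }
    return static_cast<double>(total) /
           static_cast<double>(impl.bucket_count_);
}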