63 template <typename Key, typename Hash, typename Eq>
68 __host__ void Setup(int64_t init_buckets,
73 __device__ bool Insert(bool lane_active,
113 return (slab_ptr == kHeadSlabAddr)
125 bucket_id * kWarpSize + lane_id;
142 template <typename Key, typename Hash, typename Eq>
144 const void* input_keys,
146 int heap_counter_prev,
149 template <typename Key, typename Hash, typename Eq>
151 const void* input_keys,
156 template <typename Key, typename Hash, typename Eq, typename block_t>
158 const void* const* input_values_soa,
164 template <typename Key, typename Hash, typename Eq>
166 const void* input_keys,
171 template <typename Key, typename Hash, typename Eq>
173 const void* input_keys,
178 template <typename Key, typename Hash, typename Eq>
184 template <typename Key, typename Hash, typename Eq>
189 template <typename Key, typename Hash, typename Eq>
193 template <typename Key, typename Hash, typename Eq>
195 : bucket_count_(0), bucket_list_head_(nullptr) {}
197 template <typename Key, typename Hash, typename Eq>
199 int64_t init_buckets,
202 bucket_count_ = init_buckets;
203 node_mgr_impl_ = allocator_impl;
204 buffer_accessor_ = buffer_accessor;
207 template <typename Key, typename Hash, typename Eq>
222 while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
224 slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
225 uint32_t src_lane = __ffs(work_queue) - 1;
227 __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
236 if (lane_found >= 0) {
237 if (lane_id == src_lane) {
243 else if (lane_empty >= 0) {
246 if (lane_id == src_lane) {
252 atomicCAS((unsigned int*)empty_entry_ptr,
253 kEmptyNodeAddr, buf_index);
256 if (old_empty_entry_value == kEmptyNodeAddr) {
272 uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
273 kNextSlabPtrLaneId, kWarpSize);
276 if (next_slab_ptr != kEmptySlabAddr) {
277 slab_ptr = next_slab_ptr;
286 if (lane_id == kNextSlabPtrLaneId) {
288 src_bucket, kNextSlabPtrLaneId, slab_ptr);
290 uint32_t old_next_slab_entry_value =
291 atomicCAS((unsigned int*)next_slab_entry_ptr,
292 kEmptySlabAddr, new_next_slab_ptr);
296 if (old_next_slab_entry_value != kEmptySlabAddr) {
306 prev_work_queue = work_queue;
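The loop above is the warp-cooperative work-queue pattern used throughout this file: every lane votes whether it still has a pending request, the lowest active lane is elected, and its operands are broadcast so that all 32 lanes probe one slab together. A minimal standalone sketch of that pattern (hypothetical helper; kFullMask and the payload name are assumptions, not names from this header):

__device__ void WarpCooperativeLoop(bool lane_active, uint32_t lane_id, uint32_t my_payload) {
    const uint32_t kFullMask = 0xFFFFFFFFu;
    uint32_t work_queue = 0;
    // Loop until no lane in the warp has pending work.
    while ((work_queue = __ballot_sync(kFullMask, lane_active))) {
        uint32_t src_lane = __ffs(work_queue) - 1;  // lowest-indexed lane with work
        // Broadcast that lane's operand so all 32 lanes cooperate on one request.
        uint32_t payload = __shfl_sync(kFullMask, my_payload, src_lane, 32);
        // ... probe or modify one slab on behalf of src_lane using `payload` ...
        if (lane_id == src_lane) {
            lane_active = false;  // the served lane retires its request
        }
    }
}

In the Insert above, the served lane retires only after the atomicCAS outcome has been checked; a failed CAS leaves the lane active so the warp retries.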
312 template <typename Key, typename Hash, typename Eq>
317 const Key& query_key) {
319 uint32_t prev_work_queue = work_queue;
326 while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
328 slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
329 uint32_t src_lane = __ffs(work_queue) - 1;
331 __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
343 if (lane_found >= 0) {
345 uint32_t found_buf_index = __shfl_sync(kSyncLanesMask, slab_entry,
346 lane_found, kWarpSize);
348 if (lane_id == src_lane) {
350 buf_index = found_buf_index;
358 uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
359 kNextSlabPtrLaneId, kWarpSize);
362 if (next_slab_ptr == kEmptySlabAddr) {
363 if (lane_id == src_lane) {
369 slab_ptr = next_slab_ptr;
373 prev_work_queue = work_queue;
379 template <typename Key, typename Hash, typename Eq>
394 while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
396 slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
397 uint32_t src_lane = __ffs(work_queue) - 1;
399 __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
409 if (lane_found >= 0) {
410 if (lane_id == src_lane) {
414 uint32_t old_found_entry_value = atomicExch(
415 (unsigned int*)found_entry_ptr, kEmptyNodeAddr);
419 mask = (old_found_entry_value != kEmptyNodeAddr);
420 buf_index = old_found_entry_value;
423 uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
424 kNextSlabPtrLaneId, kWarpSize);
425 if (next_slab_ptr == kEmptySlabAddr) {
427 if (lane_id == src_lane) {
431 slab_ptr = next_slab_ptr;
434 prev_work_queue = work_queue;
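Erase detaches the matching entry with an atomicExch rather than a CAS, and the exchanged-out value carries the result. A hedged sketch (hypothetical helper and parameter names) of why that old value matters:

__device__ bool DetachEntry(unsigned int* slot,
                            unsigned int empty_sentinel,
                            unsigned int* prev_value) {
    // Swap the slot back to the empty sentinel and keep whatever was there.
    unsigned int prev = atomicExch(slot, empty_sentinel);
    *prev_value = prev;
    // Only the thread that observed a non-empty previous value actually removed
    // the entry, so concurrent erases of the same key succeed exactly once.
    return prev != empty_sentinel;
}

This matches the bookkeeping above: the old entry value becomes the buffer index handed back to the caller, and the mask is set only when the exchange observed a live entry.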
440 template <typename Key, typename Hash, typename Eq>
442 const Key& key, uint32_t lane_id, Key& ret_key) {
443 auto dst_key_ptr = reinterpret_cast<int*>(&ret_key);
444 auto src_key_ptr = reinterpret_cast<const int*>(&key);
447 __shfl_sync(kSyncLanesMask, src_key_ptr[i], lane_id, kWarpSize);
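WarpSyncKey broadcasts the query key from one lane to the whole warp one 32-bit word at a time. A self-contained sketch under the assumption that Key is a POD type whose size is a multiple of 4 bytes (the function and mask names here are illustrative, not from this header):

template <typename Key>
__device__ void BroadcastKey(const Key& key, uint32_t src_lane, Key& ret_key) {
    const uint32_t kFullMask = 0xFFFFFFFFu;
    auto dst = reinterpret_cast<int*>(&ret_key);
    auto src = reinterpret_cast<const int*>(&key);
    int n_words = static_cast<int>(sizeof(Key) / sizeof(int));
    for (int i = 0; i < n_words; ++i) {
        // Every lane receives word i of src_lane's key.
        dst[i] = __shfl_sync(kFullMask, src[i], src_lane, 32);
    }
}

In the header itself the loop bound is the stored member key_size_in_int_ rather than sizeof(Key).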
451 template <typename Key, typename Hash, typename Eq>
456 ((1 << lane_id) & kNodePtrLanesMask)
458 && (slab_entry != kEmptyNodeAddr)
464 return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_found)) - 1;
467 template <typename Key, typename Hash, typename Eq>
470 bool is_lane_empty = (slab_entry == kEmptyNodeAddr);
471 return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_empty)) - 1;
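Both WarpFindKey and WarpFindEmpty reduce to the same ballot-then-ffs idiom. A minimal sketch (illustrative name; a full-warp mask is assumed here, whereas the original restricts the ballot to the 31 entry-holding lanes via kNodePtrLanesMask):

__device__ int32_t FirstLaneWhere(bool pred) {
    const uint32_t kFullMask = 0xFFFFFFFFu;
    uint32_t votes = __ballot_sync(kFullMask, pred);  // one bit per lane
    return __ffs(votes) - 1;  // lane id of the first hit, or -1 if no lane voted true
}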
474 template <typename Key, typename Hash, typename Eq>
480 template <typename Key, typename Hash, typename Eq>
486 template <typename Key, typename Hash, typename Eq>
492 template <typename Key, typename Hash, typename Eq>
494 const void* input_keys,
496 int heap_counter_prev,
498 const Key* input_keys_templated = static_cast<const Key*>(input_keys);
499 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
506 *static_cast<Key*>(key) = input_keys_templated[tid];
507 output_buf_indices[tid] = buf_index;
511 template <typename Key, typename Hash, typename Eq>
513 const void* input_keys,
517 const Key* input_keys_templated = static_cast<const Key*>(input_keys);
518 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
521 if (tid - lane_id >= count) {
527 bool lane_active = false;
535 key = input_keys_templated[tid];
536 buf_index = output_buf_indices[tid];
541 bool mask = impl.Insert(lane_active, lane_id, bucket_id, key, buf_index);
544 output_masks[tid] = mask;
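The kernels in this file share one launch convention: one thread per key, but a warp returns early only when all 32 of its threads are out of range, so the warp-synchronous Insert/Find/Erase always execute with a full warp. A hedged skeleton (kernel and parameter names are illustrative, not the real signatures):

__global__ void PerKeyKernelSketch(int64_t count /* plus the real inputs/outputs */) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // Return only if the whole warp is past the end of the input.
    if (tid - lane_id >= count) {
        return;
    }

    bool lane_active = false;
    if (tid < count) {
        lane_active = true;
        // ... load this thread's key (and buf_index) ...
    }
    // ... call the warp-cooperative Insert/Find/Erase; out-of-range lanes
    //     participate in the warp intrinsics but never enqueue work ...
}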
548 template <typename Key, typename Hash, typename Eq, typename block_t>
550 const void* const* input_values_soa,
555 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
560 if (output_masks[tid]) {
561 for (int j = 0; j < n_values; ++j) {
562 int64_t blocks_per_element =
565 block_t* dst_value = static_cast<block_t*>(
567 const block_t* src_value =
568 static_cast<const block_t*>(input_values_soa[j]) +
569 blocks_per_element * tid;
570 for (int b = 0; b < blocks_per_element; ++b) {
571 dst_value[b] = src_value[b];
580 template <typename Key, typename Hash, typename Eq>
582 const void* input_keys,
586 const Key* input_keys_templated = static_cast<const Key*>(input_keys);
587 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
588 uint32_t lane_id = threadIdx.x & 0x1F;
591 if ((tid - lane_id) >= count) {
598 bool lane_active = false;
607 key = input_keys_templated[tid];
611 result = impl.Find(lane_active, lane_id, bucket_id, key);
614 output_buf_indices[tid] = result.first;
615 output_masks[tid] = result.second;
619 template <typename Key, typename Hash, typename Eq>
621 const void* input_keys,
625 const Key* input_keys_templated = static_cast<const Key*>(input_keys);
626 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
627 uint32_t lane_id = threadIdx.x & 0x1F;
629 if (tid - lane_id >= count) {
635 bool lane_active = false;
642 key = input_keys_templated[tid];
646 auto result = impl.Erase(lane_active, lane_id, bucket_id, key);
649 output_buf_indices[tid] = result.first;
650 output_masks[tid] = result.second;
654 template <typename Key, typename Hash, typename Eq>
659 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
660 if (tid < count && output_masks[tid]) {
665 template <typename Key, typename Hash, typename Eq>
669 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
670 uint32_t lane_id = threadIdx.x & 0x1F;
681 bool is_active = slab_entry != kEmptyNodeAddr;
683 if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
684 uint32_t index = atomicAdd(output_count, 1);
685 output_buf_indices[index] = slab_entry;
688 uint32_t slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
689 kNextSlabPtrLaneId, kWarpSize);
692 while (slab_ptr != kEmptySlabAddr) {
694 is_active = (slab_entry != kEmptyNodeAddr);
696 if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
697 uint32_t index = atomicAdd(output_count, 1);
698 output_buf_indices[index] = slab_entry;
700 slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry, kNextSlabPtrLaneId,
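GetActiveIndicesKernel compacts the live entries into a dense index array by letting each lane that holds one reserve an output slot with atomicAdd. A reduced sketch of that step (hypothetical helper name; the output order is arbitrary):

__device__ void EmitIfActive(uint32_t slab_entry,
                             uint32_t empty_sentinel,
                             uint32_t* output_buf_indices,
                             uint32_t* output_count) {
    if (slab_entry != empty_sentinel) {
        // Reserve one slot in the output array and store this lane's entry there.
        uint32_t index = atomicAdd(output_count, 1);
        output_buf_indices[index] = slab_entry;
    }
}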
705 template <typename Key, typename Hash, typename Eq>
708 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
709 uint32_t lane_id = threadIdx.x & 0x1F;
724 __ballot_sync(kNodePtrLanesMask, slab_entry != kEmptyNodeAddr));
725 uint32_t slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
726 kNextSlabPtrLaneId, kWarpSize);
729 while (slab_ptr != kEmptySlabAddr) {
732 __ballot_sync(kNodePtrLanesMask, slab_entry != kEmptyNodeAddr));
733 slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry, kNextSlabPtrLaneId,
739 bucket_elem_counts[bucket_id] = count;
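CountElemsPerBucketKernel counts bucket occupancy without per-element atomics: each lane reads one slot, a ballot turns the per-lane "occupied" predicate into a 32-bit mask, and __popc adds its set bits to the running count while the warp chases next-slab pointers. A reduced sketch (hypothetical helper; a full-warp mask is assumed, whereas the original ballots over kNodePtrLanesMask):

__device__ uint32_t CountOccupiedSlots(uint32_t slab_entry, uint32_t empty_sentinel) {
    const uint32_t kFullMask = 0xFFFFFFFFu;
    uint32_t occupied = __ballot_sync(kFullMask, slab_entry != empty_sentinel);
    return __popc(occupied);  // number of lanes holding a valid buffer index
}

The kernel accumulates this per slab and finally writes the total into bucket_elem_counts[bucket_id].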