Open3D (C++ API) 0.13.0
SlabHashmapImpl.h
// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// The MIT License (MIT)
//
// Copyright (c) 2018 www.open3d.org
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// ----------------------------------------------------------------------------

// Copyright 2019 Saman Ashkiani
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing permissions
// and limitations under the License.

#pragma once

#include "open3d/core/CUDAUtils.h"
#include "open3d/core/hashmap/CUDA/CUDAHashmapBufferAccessor.h"
#include "open3d/core/hashmap/CUDA/SlabMacros.h"
#include "open3d/core/hashmap/CUDA/SlabNodeManager.h"
#include "open3d/core/hashmap/CUDA/SlabTraits.h"
#include "open3d/core/hashmap/HashmapBuffer.h"

namespace open3d {
namespace core {

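// Warp-cooperative slab hash map: each bucket owns a linked list of 32-unit
// "slabs", and a full warp traverses a slab together. Lanes selected by
// kNodePtrLanesMask hold addresses of key-value pairs in the memory heap,
// while lane kNextSlabPtrLaneId (lane 31) holds the pointer to the next slab.
// See Insert/Find/Erase below for the traversal protocol.
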
template <typename Key, typename Hash>
class SlabHashmapImpl {
public:
    SlabHashmapImpl();

    __host__ void Setup(int64_t init_buckets,
                        int64_t init_capacity,
                        int64_t dsize_key,
                        int64_t dsize_value,
                        const SlabNodeManagerImpl& node_mgr_impl,
                        const CUDAHashmapBufferAccessor& buffer_accessor);

    __device__ bool Insert(bool lane_active,
                           uint32_t lane_id,
                           uint32_t bucket_id,
                           const Key& key,
                           addr_t iterator_addr);

    __device__ Pair<addr_t, bool> Find(bool lane_active,
                                       uint32_t lane_id,
                                       uint32_t bucket_id,
                                       const Key& key);

    __device__ Pair<addr_t, bool> Erase(bool lane_active,
                                        uint32_t lane_id,
                                        uint32_t bucket_id,
                                        const Key& key);

    __device__ void WarpSyncKey(const Key& key, uint32_t lane_id, Key& ret_key);
    __device__ int32_t WarpFindKey(const Key& src_key,
                                   uint32_t lane_id,
                                   addr_t ptr);
    __device__ int32_t WarpFindEmpty(addr_t unit_data);

    // Hash function.
    __device__ int64_t ComputeBucket(const Key& key) const;

    // Node manager.
    __device__ addr_t AllocateSlab(uint32_t lane_id);
    __device__ void FreeSlab(addr_t slab_ptr);

    // Helpers.
    __device__ addr_t* get_unit_ptr_from_list_nodes(addr_t slab_ptr,
                                                    uint32_t lane_id) {
        return node_mgr_impl_.get_unit_ptr_from_slab(slab_ptr, lane_id);
    }
    __device__ addr_t* get_unit_ptr_from_list_head(uint32_t bucket_id,
                                                   uint32_t lane_id) {
        return reinterpret_cast<uint32_t*>(bucket_list_head_) +
               bucket_id * kWarpSize + lane_id;
    }

public:
    Hash hash_fn_;

    int64_t bucket_count_;
    int64_t capacity_;
    int64_t dsize_key_;
    int64_t dsize_value_;

    Slab* bucket_list_head_;
    SlabNodeManagerImpl node_mgr_impl_;
    CUDAHashmapBufferAccessor buffer_accessor_;

    // TODO: verify size with alignment
    int key_size_in_int_ = sizeof(Key) / sizeof(int);
};
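
// The Hash template parameter is a device-callable functor; ComputeBucket
// below only requires hash_fn_(key) to return an integer. A minimal sketch of
// such a functor (illustrative only, not part of the Open3D API):
//
//   struct IntHasher {
//       __device__ uint64_t operator()(const int& key) const {
//           // Multiplicative hashing; any well-mixing function works.
//           return static_cast<uint64_t>(key) * 2654435761ull;
//       }
//   };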

/// Kernels.
template <typename Key, typename Hash>
__global__ void InsertKernelPass0(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  int heap_counter_prev,
                                  int64_t count);

template <typename Key, typename Hash>
__global__ void InsertKernelPass1(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count);

template <typename Key, typename Hash>
__global__ void InsertKernelPass2(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_values,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count);

template <typename Key, typename Hash>
__global__ void FindKernel(SlabHashmapImpl<Key, Hash> impl,
                           const void* input_keys,
                           addr_t* output_addrs,
                           bool* output_masks,
                           int64_t count);

template <typename Key, typename Hash>
__global__ void EraseKernelPass0(SlabHashmapImpl<Key, Hash> impl,
                                 const void* input_keys,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count);

template <typename Key, typename Hash>
__global__ void EraseKernelPass1(SlabHashmapImpl<Key, Hash> impl,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count);

template <typename Key, typename Hash>
__global__ void GetActiveIndicesKernel(SlabHashmapImpl<Key, Hash> impl,
                                       addr_t* output_addrs,
                                       uint32_t* output_iterator_count);

template <typename Key, typename Hash>
__global__ void CountElemsPerBucketKernel(SlabHashmapImpl<Key, Hash> impl,
                                          int64_t* bucket_elem_counts);

template <typename Key, typename Hash>
SlabHashmapImpl<Key, Hash>::SlabHashmapImpl()
        : bucket_count_(0), bucket_list_head_(nullptr) {}

template <typename Key, typename Hash>
__host__ void SlabHashmapImpl<Key, Hash>::Setup(
        int64_t init_buckets,
        int64_t init_capacity,
        int64_t dsize_key,
        int64_t dsize_value,
        const SlabNodeManagerImpl& allocator_impl,
        const CUDAHashmapBufferAccessor& pair_allocator_impl) {
    bucket_count_ = init_buckets;
    capacity_ = init_capacity;
    dsize_key_ = dsize_key;
    dsize_value_ = dsize_value;

    node_mgr_impl_ = allocator_impl;
    buffer_accessor_ = pair_allocator_impl;
}

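// Setup() only copies plain handles; slab and heap memory is allocated by the
// owning hashmap beforehand. A hedged host-side sketch (the node manager and
// buffer accessor instances here are hypothetical stand-ins):
//
//   SlabHashmapImpl<int, IntHasher> impl;
//   impl.Setup(/*init_buckets=*/1024, /*init_capacity=*/4096,
//              /*dsize_key=*/sizeof(int), /*dsize_value=*/sizeof(float),
//              node_mgr_impl, buffer_accessor);
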
template <typename Key, typename Hash>
__device__ bool SlabHashmapImpl<Key, Hash>::Insert(bool lane_active,
                                                   uint32_t lane_id,
                                                   uint32_t bucket_id,
                                                   const Key& key,
                                                   addr_t iterator_addr) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    uint32_t curr_slab_ptr = kHeadSlabAddr;
    Key src_key;

    bool mask = false;

    // > Loop while we have active lanes.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // 0. Restart from the linked list head if the last insertion is done.
        curr_slab_ptr =
                (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
        WarpSyncKey(key, src_lane, src_key);

        // Each lane in the warp reads a unit in the slab.
        uint32_t unit_data =
                (curr_slab_ptr == kHeadSlabAddr)
                        ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
                        : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                         lane_id));

        int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);
        int32_t lane_empty = WarpFindEmpty(unit_data);

        // Branch 1: the key already exists, ABORT.
        if (lane_found >= 0) {
            if (lane_id == src_lane) {
                // The pre-allocated heap entry is freed in another pass
                // (see InsertKernelPass2).
                lane_active = false;
            }
        }

        // Branch 2: an empty slot is available, try to insert.
        else if (lane_empty >= 0) {
            if (lane_id == src_lane) {
                // TODO: check why we cannot put malloc here
                const uint32_t* unit_data_ptr =
                        (curr_slab_ptr == kHeadSlabAddr)
                                ? get_unit_ptr_from_list_head(src_bucket,
                                                              lane_empty)
                                : get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                               lane_empty);

                addr_t old_iterator_addr =
                        atomicCAS((unsigned int*)unit_data_ptr, kEmptyNodeAddr,
                                  iterator_addr);

                // Remember to clean up in another pass.
                // Branch 2.1: SUCCEED.
                if (old_iterator_addr == kEmptyNodeAddr) {
                    lane_active = false;
                    mask = true;
                }
                // Branch 2.2: failed, RESTART.
                // In the subsequent attempt,
                // > if the same key was inserted in this slot,
                //   we fall back to Branch 1;
                // > if a different key was inserted,
                //   we go to Branch 2 or 3.
            }
        }

        // Branch 3: nothing found in this slab, go to the next slab.
        else {
            // Broadcast the next slab pointer.
            addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
                                               kNextSlabPtrLaneId, kWarpSize);

            // Branch 3.1: the next slab exists, RESTART this lane.
            if (next_slab_ptr != kEmptySlabAddr) {
                curr_slab_ptr = next_slab_ptr;
            }

            // Branch 3.2: the next slab is empty, try to allocate one.
            else {
                addr_t new_next_slab_ptr = AllocateSlab(lane_id);

                if (lane_id == kNextSlabPtrLaneId) {
                    const uint32_t* unit_data_ptr =
                            (curr_slab_ptr == kHeadSlabAddr)
                                    ? get_unit_ptr_from_list_head(
                                              src_bucket, kNextSlabPtrLaneId)
                                    : get_unit_ptr_from_list_nodes(
                                              curr_slab_ptr,
                                              kNextSlabPtrLaneId);

                    addr_t old_next_slab_ptr =
                            atomicCAS((unsigned int*)unit_data_ptr,
                                      kEmptySlabAddr, new_next_slab_ptr);

                    // Branch 3.2.1: another thread allocated first, RESTART
                    // this lane. In the subsequent attempt, 'goto Branch 2'.
                    if (old_next_slab_ptr != kEmptySlabAddr) {
                        FreeSlab(new_next_slab_ptr);
                    }
                    // Branch 3.2.2: this thread allocated, RESTART this lane,
                    // 'goto Branch 2'.
                }
            }
        }

        prev_work_queue = work_queue;
    }

    return mask;
}

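// Work-queue protocol used by Insert/Find/Erase: __ballot_sync collects a
// 32-bit mask of lanes that still have work; __ffs picks the lowest set bit
// as src_lane, and the whole warp serves that lane's key until it clears its
// lane_active flag. For example, with lanes 3 and 7 active the ballot is
// 0x88, and __ffs(0x88) - 1 == 3, so lane 3's query is processed first; the
// next ballot then yields 0x80 and the warp moves on to lane 7.
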
template <typename Key, typename Hash>
__device__ Pair<addr_t, bool> SlabHashmapImpl<Key, Hash>::Find(
        bool lane_active,
        uint32_t lane_id,
        uint32_t bucket_id,
        const Key& query_key) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = work_queue;
    uint32_t curr_slab_ptr = kHeadSlabAddr;

    addr_t iterator = kNullAddr;
    bool mask = false;

    // > Loop while we have active lanes.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // 0. Restart from the linked list head if the last query is done.
        curr_slab_ptr =
                (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);

        Key src_key;
        WarpSyncKey(query_key, src_lane, src_key);

        // Each lane in the warp reads a unit in the slab in parallel.
        const uint32_t unit_data =
                (curr_slab_ptr == kHeadSlabAddr)
                        ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
                        : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                         lane_id));

        int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);

        // 1. Found in this slab, SUCCEED.
        if (lane_found >= 0) {
            // Broadcast the found value.
            addr_t found_pair_internal_ptr = __shfl_sync(
                    kSyncLanesMask, unit_data, lane_found, kWarpSize);

            if (lane_id == src_lane) {
                lane_active = false;

                // Actually iterator_addr.
                iterator = found_pair_internal_ptr;
                mask = true;
            }
        }

        // 2. Not found in this slab.
        else {
            // Broadcast the next slab: lane 31 reads 'next'.
            addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
                                               kNextSlabPtrLaneId, kWarpSize);

            // 2.1. The next slab is empty, ABORT.
            if (next_slab_ptr == kEmptySlabAddr) {
                if (lane_id == src_lane) {
                    lane_active = false;
                }
            }
            // 2.2. The next slab exists, RESTART.
            else {
                curr_slab_ptr = next_slab_ptr;
            }
        }

        prev_work_queue = work_queue;
    }

    return make_pair(iterator, mask);
}

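// Find returns {addr, mask}: addr indexes the key-value pair in the memory
// heap (resolved with buffer_accessor_.ExtractIterator), and mask tells a hit
// from a miss; on a miss, addr stays kNullAddr.
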
template <typename Key, typename Hash>
__device__ Pair<addr_t, bool> SlabHashmapImpl<Key, Hash>::Erase(
        bool lane_active,
        uint32_t lane_id,
        uint32_t bucket_id,
        const Key& key) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    uint32_t curr_slab_ptr = kHeadSlabAddr;
    Key src_key;

    addr_t iterator_addr = 0;
    bool mask = false;

    // > Loop while we have active lanes.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // 0. Restart from the linked list head if the last query is done.
        curr_slab_ptr =
                (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);

        WarpSyncKey(key, src_lane, src_key);

        const uint32_t unit_data =
                (curr_slab_ptr == kHeadSlabAddr)
                        ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
                        : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                         lane_id));

        int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);

        // Branch 1: the key is found.
        if (lane_found >= 0) {
            if (lane_id == src_lane) {
                uint32_t* unit_data_ptr =
                        (curr_slab_ptr == kHeadSlabAddr)
                                ? get_unit_ptr_from_list_head(src_bucket,
                                                              lane_found)
                                : get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                               lane_found);

                uint32_t pair_to_delete = atomicExch(
                        (unsigned int*)unit_data_ptr, kEmptyNodeAddr);
                mask = pair_to_delete != kEmptyNodeAddr;
                iterator_addr = pair_to_delete;
                // Branch 1.2: another thread did the job; avoid double free.
            }
        } else {  // Branch 2: no matching slot found.
            addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
                                               kNextSlabPtrLaneId, kWarpSize);
            if (next_slab_ptr == kEmptySlabAddr) {
                // Not found, ABORT.
                if (lane_id == src_lane) {
                    lane_active = false;
                }
            } else {
                curr_slab_ptr = next_slab_ptr;
            }
        }
        prev_work_queue = work_queue;
    }

    return make_pair(iterator_addr, mask);
}

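// Erase only detaches the pair from the slab list; the heap entry it pointed
// to is released later (see EraseKernelPass1), which is why the detached
// iterator_addr is returned to the caller.
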
template <typename Key, typename Hash>
__device__ void SlabHashmapImpl<Key, Hash>::WarpSyncKey(const Key& key,
                                                        uint32_t lane_id,
                                                        Key& ret_key) {
    auto dst_key_ptr = reinterpret_cast<int*>(&ret_key);
    auto src_key_ptr = reinterpret_cast<const int*>(&key);
    for (int i = 0; i < key_size_in_int_; ++i) {
        dst_key_ptr[i] =
                __shfl_sync(kSyncLanesMask, src_key_ptr[i], lane_id, kWarpSize);
    }
}

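// WarpSyncKey broadcasts src_lane's key to every lane, one 32-bit word per
// __shfl_sync. Note the assumption encoded in key_size_in_int_: sizeof(Key)
// must be a multiple of sizeof(int) (see the TODO in the class body).
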
template <typename Key, typename Hash>
__device__ int32_t SlabHashmapImpl<Key, Hash>::WarpFindKey(const Key& key,
                                                           uint32_t lane_id,
                                                           addr_t ptr) {
    bool is_lane_found =
            // Select key lanes.
            ((1 << lane_id) & kNodePtrLanesMask)
            // Validate key addrs.
            && (ptr != kEmptyNodeAddr)
            // Find keys in the memory heap.
            &&
            *static_cast<Key*>(buffer_accessor_.ExtractIterator(ptr).first) ==
                    key;

    return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_found)) - 1;
}

template <typename Key, typename Hash>
__device__ int32_t SlabHashmapImpl<Key, Hash>::WarpFindEmpty(addr_t ptr) {
    bool is_lane_empty = (ptr == kEmptyNodeAddr);
    return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_empty)) - 1;
}

template <typename Key, typename Hash>
__device__ int64_t SlabHashmapImpl<Key, Hash>::ComputeBucket(
        const Key& key) const {
    return hash_fn_(key) % bucket_count_;
}

template <typename Key, typename Hash>
__device__ addr_t SlabHashmapImpl<Key, Hash>::AllocateSlab(uint32_t lane_id) {
    return node_mgr_impl_.WarpAllocate(lane_id);
}

template <typename Key, typename Hash>
__device__ __forceinline__ void SlabHashmapImpl<Key, Hash>::FreeSlab(
        addr_t slab_ptr) {
    node_mgr_impl_.FreeUntouched(slab_ptr);
}

template <typename Key, typename Hash>
__global__ void InsertKernelPass0(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  int heap_counter_prev,
                                  int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < count) {
        // First write ALL input_keys to avoid potential thread conflicts.
        addr_t iterator_addr =
                impl.buffer_accessor_.heap_[heap_counter_prev + tid];
        iterator_t iterator =
                impl.buffer_accessor_.ExtractIterator(iterator_addr);

        *static_cast<Key*>(iterator.first) = input_keys_templated[tid];
        output_addrs[tid] = iterator_addr;
    }
}

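// Insertion is split into three passes: Pass0 pre-allocates a heap slot per
// input and writes the key into it; Pass1 links the slot's address into the
// slab lists (setting output_masks); Pass2 writes values for successful
// inserts and returns the pre-allocated slots of failed ones to the heap.
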
template <typename Key, typename Hash>
__global__ void InsertKernelPass1(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = tid & 0x1F;

    if (tid - lane_id >= count) {
        return;
    }

    impl.node_mgr_impl_.Init(tid, lane_id);

    bool lane_active = false;
    uint32_t bucket_id = 0;
    addr_t iterator_addr = 0;

    // Dummy for warp sync.
    Key key;
    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        iterator_addr = output_addrs[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    // Index out-of-bound threads still have to run for warp synchronization.
    bool mask =
            impl.Insert(lane_active, lane_id, bucket_id, key, iterator_addr);

    if (tid < count) {
        output_masks[tid] = mask;
    }
}

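// The early return above retires only warps that are entirely out of range
// (tid - lane_id >= count). Lanes of a partially filled tail warp must keep
// running with lane_active == false so that the __ballot_sync/__shfl_sync
// calls inside Insert() see a full warp; hence the "dummy" key.
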
template <typename Key, typename Hash>
__global__ void InsertKernelPass2(SlabHashmapImpl<Key, Hash> impl,
                                  const void* input_values,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < count) {
        addr_t iterator_addr = output_addrs[tid];

        if (output_masks[tid]) {
            iterator_t iterator =
                    impl.buffer_accessor_.ExtractIterator(iterator_addr);

            // Success: copy the remaining input_values.
            if (input_values != nullptr) {
                MEMCPY_AS_INTS(iterator.second,
                               static_cast<const uint8_t*>(input_values) +
                                       tid * impl.dsize_value_,
                               impl.dsize_value_);
            }
        } else {
            impl.buffer_accessor_.DeviceFree(iterator_addr);
        }
    }
}

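// A hedged launch sketch for the three passes (the block size and the heap
// counter bookkeeping are assumptions; the actual driver is the CUDA hashmap
// that owns this impl):
//
//   const int64_t threads = 128;
//   const int64_t blocks = (count + threads - 1) / threads;
//   InsertKernelPass0<<<blocks, threads>>>(impl, keys, addrs,
//                                          heap_counter_prev, count);
//   InsertKernelPass1<<<blocks, threads>>>(impl, keys, addrs, masks, count);
//   InsertKernelPass2<<<blocks, threads>>>(impl, vals, addrs, masks, count);
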
template <typename Key, typename Hash>
__global__ void FindKernel(SlabHashmapImpl<Key, Hash> impl,
                           const void* input_keys,
                           addr_t* output_addrs,
                           bool* output_masks,
                           int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // This warp is idle.
    if ((tid - lane_id) >= count) {
        return;
    }

    // Initialize the memory allocator on each warp.
    impl.node_mgr_impl_.Init(tid, lane_id);

    bool lane_active = false;
    uint32_t bucket_id = 0;

    // Dummy for warp sync.
    Key key;
    Pair<addr_t, bool> result;

    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    result = impl.Find(lane_active, lane_id, bucket_id, key);

    if (tid < count) {
        output_addrs[tid] = result.first;
        output_masks[tid] = result.second;
    }
}

template <typename Key, typename Hash>
__global__ void EraseKernelPass0(SlabHashmapImpl<Key, Hash> impl,
                                 const void* input_keys,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    if (tid - lane_id >= count) {
        return;
    }

    impl.node_mgr_impl_.Init(tid, lane_id);

    bool lane_active = false;
    uint32_t bucket_id = 0;

    // Dummy for warp sync.
    Key key;
    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    auto result = impl.Erase(lane_active, lane_id, bucket_id, key);

    if (tid < count) {
        output_addrs[tid] = result.first;
        output_masks[tid] = result.second;
    }
}

template <typename Key, typename Hash>
__global__ void EraseKernelPass1(SlabHashmapImpl<Key, Hash> impl,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < count && output_masks[tid]) {
        impl.buffer_accessor_.DeviceFree(output_addrs[tid]);
    }
}

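// Erase mirrors the insert pipeline: Pass0 detaches pairs from the slab lists
// and records their heap addresses, and Pass1 frees the recorded addresses
// only for lanes whose mask is set, so a pair already detached by another
// thread (mask == false) is never freed twice.
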
template <typename Key, typename Hash>
__global__ void GetActiveIndicesKernel(SlabHashmapImpl<Key, Hash> impl,
                                       addr_t* output_addrs,
                                       uint32_t* output_iterator_count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // Assign a warp per bucket.
    uint32_t bucket_id = tid >> 5;
    if (bucket_id >= impl.bucket_count_) {
        return;
    }

    impl.node_mgr_impl_.Init(tid, lane_id);

    // Collect from the head node.
    uint32_t src_unit_data =
            *impl.get_unit_ptr_from_list_head(bucket_id, lane_id);
    bool is_active = src_unit_data != kEmptyNodeAddr;

    if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
        uint32_t index = atomicAdd(output_iterator_count, 1);
        output_addrs[index] = src_unit_data;
    }

    addr_t next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                              kWarpSize);

    // Collect from the following nodes.
    while (next != kEmptySlabAddr) {
        src_unit_data = *impl.get_unit_ptr_from_list_nodes(next, lane_id);
        is_active = (src_unit_data != kEmptyNodeAddr);

        if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
            uint32_t index = atomicAdd(output_iterator_count, 1);
            output_addrs[index] = src_unit_data;
        }
        next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                           kWarpSize);
    }
}

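// One warp serves one bucket (bucket_id = tid >> 5). Key lanes append their
// occupied unit addresses through a global atomicAdd cursor, so the order of
// output_addrs is nondeterministic, and output_iterator_count must be zeroed
// before launch for the final value to equal the number of active entries.
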
template <typename Key, typename Hash>
__global__ void CountElemsPerBucketKernel(SlabHashmapImpl<Key, Hash> impl,
                                          int64_t* bucket_elem_counts) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // Assign a warp per bucket.
    uint32_t bucket_id = tid >> 5;
    if (bucket_id >= impl.bucket_count_) {
        return;
    }

    impl.node_mgr_impl_.Init(tid, lane_id);

    uint32_t count = 0;

    // Count the head node.
    uint32_t src_unit_data =
            *impl.get_unit_ptr_from_list_head(bucket_id, lane_id);
    count += __popc(
            __ballot_sync(kNodePtrLanesMask, src_unit_data != kEmptyNodeAddr));
    addr_t next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                              kWarpSize);

    // Count the following nodes.
    while (next != kEmptySlabAddr) {
        src_unit_data = *impl.get_unit_ptr_from_list_nodes(next, lane_id);
        count += __popc(__ballot_sync(kNodePtrLanesMask,
                                      src_unit_data != kEmptyNodeAddr));
        next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                           kWarpSize);
    }

    // Write back the results.
    if (lane_id == 0) {
        bucket_elem_counts[bucket_id] = count;
    }
}

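// __popc(__ballot_sync(...)) counts occupied key lanes in one instruction
// pair: each lane votes whether its unit holds a valid address, the ballot
// packs the votes into a 32-bit mask, and the population count adds them up.
// E.g. if only lanes 0 and 2 of a slab are occupied, the ballot over
// kNodePtrLanesMask is 0b101 and __popc yields 2.
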
}  // namespace core
}  // namespace open3d