Open3D (C++ API)  0.14.1
SlabHashBackendImpl.h
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // The MIT License (MIT)
5 //
6 // Copyright (c) 2018-2021 www.open3d.org
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
14 //
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
17 //
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 // IN THE SOFTWARE.
25 // ----------------------------------------------------------------------------
26 
27 // Copyright 2019 Saman Ashkiani
28 //
29 // Licensed under the Apache License, Version 2.0 (the "License");
30 // you may not use this file except in compliance with the License.
31 // You may obtain a copy of the License at
32 //
33 // http://www.apache.org/licenses/LICENSE-2.0
34 //
35 // Unless required by applicable law or agreed to in writing, software
36 // distributed under the License is distributed on an "AS IS" BASIS,
37 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
38 // implied. See the License for the specific language governing permissions
39 // and limitations under the License.
40 
41 #pragma once
42 
43 #include "open3d/core/CUDAUtils.h"
44 #include "open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h"
45 #include "open3d/core/hashmap/CUDA/SlabMacros.h"
46 #include "open3d/core/hashmap/CUDA/SlabNodeManager.h"
47 #include "open3d/core/hashmap/CUDA/SlabTraits.h"
48 #include "open3d/core/hashmap/HashBackendBuffer.h"
49 
50 namespace open3d {
51 namespace core {
52 
53 // Each slab contains a collection of uint32_t entries.
54 // Each uint32_t entry can represent:
55 // 0) an empty placeholder;
56 // 1) a stored buf_index;
57 // 2) a ptr to the next slab if at the end of the slab.
58 // In cases 0) and 1), it is interpreted as a buf_index_t.
59 // In case 2), it is interpreted as a uint32_t.
60 // They are equivalent, but we differentiate them in the implementation to
61 // emphasize the differences.
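// For illustration (kWarpSize == 32): a slab is a group of 32 consecutive
// uint32_t entries, read by one warp with one entry per lane. Lanes 0..30 hold
// buf_index_t values (or the empty marker); lane 31 (kNextSlabPtrLaneId) holds
// the pointer to the next slab, or kEmptySlabAddr if this slab is the last one
// in the bucket's list:
//
//   lane:    0          1          2       ...     30         31
//   entry:  [buf_idx]  [buf_idx]  [EMPTY]  ...    [buf_idx]  [next slab ptr]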
62 
63 template <typename Key, typename Hash, typename Eq>
64 class SlabHashBackendImpl {
65 public:
66  SlabHashBackendImpl();
67 
68  __host__ void Setup(int64_t init_buckets,
69  const SlabNodeManagerImpl& node_mgr_impl,
70  const CUDAHashBackendBufferAccessor& buffer_accessor);
71 
72  /// Warp-insert a pre-allocated buf_index at key.
73  __device__ bool Insert(bool lane_active,
74  uint32_t lane_id,
75  uint32_t bucket_id,
76  const Key& key,
77  buf_index_t buf_index);
78 
79  /// Warp-find a buf_index and its mask at key.
80  __device__ Pair<buf_index_t, bool> Find(bool lane_active,
81  uint32_t lane_id,
82  uint32_t bucket_id,
83  const Key& key);
84 
85  /// Warp-erase an entry at key.
86  __device__ Pair<buf_index_t, bool> Erase(bool lane_active,
87  uint32_t lane_id,
88  uint32_t bucket_id,
89  const Key& key);
90 
91  /// Warp-synchronize a key in a slab.
92  __device__ void WarpSyncKey(const Key& key, uint32_t lane_id, Key& ret_key);
93 
94  /// Warp-find a key in a slab.
95  __device__ int32_t WarpFindKey(const Key& src_key,
96  uint32_t lane_id,
97  uint32_t slab_entry);
98 
99  /// Warp-find the first empty slot in a slab.
100  __device__ int32_t WarpFindEmpty(uint32_t slab_entry);
101 
102  // Hash function.
103  __device__ int64_t ComputeBucket(const Key& key) const;
104 
105  // Node manager.
106  __device__ uint32_t AllocateSlab(uint32_t lane_id);
107  __device__ void FreeSlab(uint32_t slab_ptr);
108 
109  // Helpers.
110  __device__ uint32_t* SlabEntryPtr(uint32_t bucket_id,
111  uint32_t lane_id,
112  uint32_t slab_ptr) {
113  return (slab_ptr == kHeadSlabAddr)
114  ? SlabEntryPtrFromHead(bucket_id, lane_id)
115  : SlabEntryPtrFromNodes(slab_ptr, lane_id);
116  }
117 
118  __device__ uint32_t* SlabEntryPtrFromNodes(uint32_t slab_ptr,
119  uint32_t lane_id) {
120  return node_mgr_impl_.get_unit_ptr_from_slab(slab_ptr, lane_id);
121  }
122  __device__ uint32_t* SlabEntryPtrFromHead(uint32_t bucket_id,
123  uint32_t lane_id) {
124  return reinterpret_cast<uint32_t*>(bucket_list_head_) +
125  bucket_id * kWarpSize + lane_id;
126  }
127 
128 public:
129  Hash hash_fn_;
130  Eq eq_fn_;
131  int64_t bucket_count_;
132 
133  Slab* bucket_list_head_;
134  SlabNodeManagerImpl node_mgr_impl_;
135  CUDAHashBackendBufferAccessor buffer_accessor_;
136 
137  // TODO: verify size with alignment
138  int key_size_in_int_ = sizeof(Key) / sizeof(int);
139 };
140 
141 /// Kernels.
142 template <typename Key, typename Hash, typename Eq>
143 __global__ void InsertKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
144  const void* input_keys,
145  buf_index_t* output_buf_indices,
146  int heap_counter_prev,
147  int64_t count);
148 
149 template <typename Key, typename Hash, typename Eq>
150 __global__ void InsertKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
151  const void* input_keys,
152  buf_index_t* output_buf_indices,
153  bool* output_masks,
154  int64_t count);
155 
156 template <typename Key, typename Hash, typename Eq, typename block_t>
157 __global__ void InsertKernelPass2(SlabHashBackendImpl<Key, Hash, Eq> impl,
158  const void* const* input_values_soa,
159  buf_index_t* output_buf_indices,
160  bool* output_masks,
161  int64_t count,
162  int64_t n_values);
163 
164 template <typename Key, typename Hash, typename Eq>
165 __global__ void FindKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
166  const void* input_keys,
167  buf_index_t* output_buf_indices,
168  bool* output_masks,
169  int64_t count);
170 
171 template <typename Key, typename Hash, typename Eq>
172 __global__ void EraseKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
173  const void* input_keys,
174  buf_index_t* output_buf_indices,
175  bool* output_masks,
176  int64_t count);
177 
178 template <typename Key, typename Hash, typename Eq>
179 __global__ void EraseKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
180  buf_index_t* output_buf_indices,
181  bool* output_masks,
182  int64_t count);
183 
184 template <typename Key, typename Hash, typename Eq>
185 __global__ void GetActiveIndicesKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
186  buf_index_t* output_buf_indices,
187  uint32_t* output_count);
188 
189 template <typename Key, typename Hash, typename Eq>
190 __global__ void CountElemsPerBucketKernel(
191  SlabHashBackendImpl<Key, Hash, Eq> impl, int64_t* bucket_elem_counts);
192 
193 template <typename Key, typename Hash, typename Eq>
194 SlabHashBackendImpl<Key, Hash, Eq>::SlabHashBackendImpl()
195  : bucket_count_(0), bucket_list_head_(nullptr) {}
196 
197 template <typename Key, typename Hash, typename Eq>
198 __host__ void SlabHashBackendImpl<Key, Hash, Eq>::Setup(
199  int64_t init_buckets,
200  const SlabNodeManagerImpl& allocator_impl,
201  const CUDAHashBackendBufferAccessor& buffer_accessor) {
202  bucket_count_ = init_buckets;
203  node_mgr_impl_ = allocator_impl;
204  buffer_accessor_ = buffer_accessor;
205 }
206 
207 template <typename Key, typename Hash, typename Eq>
208 __device__ bool SlabHashBackendImpl<Key, Hash, Eq>::Insert(
209  bool lane_active,
210  uint32_t lane_id,
211  uint32_t bucket_id,
212  const Key& key,
213  buf_index_t buf_index) {
214  uint32_t work_queue = 0;
215  uint32_t prev_work_queue = 0;
216  uint32_t slab_ptr = kHeadSlabAddr;
217  Key src_key;
218 
219  bool mask = false;
220 
221  // > Loop when we have active lanes
222  while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
223  // 0. Restart from linked list head if last insertion is finished
224  slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
225  uint32_t src_lane = __ffs(work_queue) - 1;
226  uint32_t src_bucket =
227  __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
228  WarpSyncKey(key, src_lane, src_key);
229 
230  uint32_t slab_entry = *SlabEntryPtr(src_bucket, lane_id, slab_ptr);
231 
232  int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);
233  int32_t lane_empty = WarpFindEmpty(slab_entry);
234 
235  // Branch 1: key already existing, ABORT
236  if (lane_found >= 0) {
237  if (lane_id == src_lane) {
238  lane_active = false;
239  }
240  }
241 
242  // Branch 2: empty slot available, try to insert
243  else if (lane_empty >= 0) {
244  // Do not merge the if statements;
245  // otherwise the warp control flow will be interrupted.
246  if (lane_id == src_lane) {
247  // Now regard the entry as a value of buf_index
248  const uint32_t* empty_entry_ptr =
249  SlabEntryPtr(src_bucket, lane_empty, slab_ptr);
250 
251  uint32_t old_empty_entry_value =
252  atomicCAS((unsigned int*)empty_entry_ptr,
253  kEmptyNodeAddr, buf_index);
254 
255  // Branch 2.1: SUCCEED
256  if (old_empty_entry_value == kEmptyNodeAddr) {
257  lane_active = false;
258  mask = true;
259  }
260  // Branch 2.2: failed: RESTART
261  // In the subsequent attempt,
262  // > if the same key was inserted in this slot,
263  // we fall back to Branch 1;
264  // > if a different key was inserted,
265  // we go to Branch 2 or 3.
266  }
267  }
268 
269  // Branch 3: nothing found in this slab, goto next slab
270  else {
271  // broadcast next slab
272  uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
273  kNextSlabPtrLaneId, kWarpSize);
274 
275  // Branch 3.1: next slab existing, RESTART at updated slab ptr
276  if (next_slab_ptr != kEmptySlabAddr) {
277  slab_ptr = next_slab_ptr;
278  }
279 
280  // Branch 3.2: next slab empty, try to allocate one from the Slab
281  // buffer.
282  else {
283  // Warp allocate, must be outside the condition clause.
284  uint32_t new_next_slab_ptr = AllocateSlab(lane_id);
285 
286  if (lane_id == kNextSlabPtrLaneId) {
287  const uint32_t* next_slab_entry_ptr = SlabEntryPtr(
288  src_bucket, kNextSlabPtrLaneId, slab_ptr);
289 
290  uint32_t old_next_slab_entry_value =
291  atomicCAS((unsigned int*)next_slab_entry_ptr,
292  kEmptySlabAddr, new_next_slab_ptr);
293 
294  // Branch 3.2.1: another thread has already allocated;
295  // RESTART. In the subsequent attempt, go to Branch 2.
296  if (old_next_slab_entry_value != kEmptySlabAddr) {
297  FreeSlab(new_next_slab_ptr);
298  }
299 
300  // Branch 3.2.2: this thread allocated successfully.
301  // RESTART, go to Branch 2.
302  }
303  }
304  }
305 
306  prev_work_queue = work_queue;
307  }
308 
309  return mask;
310 }
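// Note on the loop structure shared by Insert, Find, and Erase: __ballot_sync
// gathers the still-active lanes into work_queue, __ffs selects the lowest
// active lane as src_lane, and the whole warp then cooperates on that lane's
// key (one slab per iteration, each lane reading one entry) until that query
// is resolved; the loop ends once no lane remains active.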
311 
312 template <typename Key, typename Hash, typename Eq>
313 __device__ Pair<buf_index_t, bool> SlabHashBackendImpl<Key, Hash, Eq>::Find(
314  bool lane_active,
315  uint32_t lane_id,
316  uint32_t bucket_id,
317  const Key& query_key) {
318  uint32_t work_queue = 0;
319  uint32_t prev_work_queue = work_queue;
320  uint32_t slab_ptr = kHeadSlabAddr;
321 
322  buf_index_t buf_index = kNullAddr;
323  bool mask = false;
324 
325  // > Loop when we have active lanes.
326  while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
327  // 0. Restart from linked list head if the last query is finished.
328  slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
329  uint32_t src_lane = __ffs(work_queue) - 1;
330  uint32_t src_bucket =
331  __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
332 
333  Key src_key;
334  WarpSyncKey(query_key, src_lane, src_key);
335 
336  // Each lane in the warp reads a unit in the slab in parallel.
337  const uint32_t slab_entry =
338  *SlabEntryPtr(src_bucket, lane_id, slab_ptr);
339 
340  int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);
341 
342  // 1. Found in this slab, SUCCEED.
343  if (lane_found >= 0) {
344  // broadcast found value
345  uint32_t found_buf_index = __shfl_sync(kSyncLanesMask, slab_entry,
346  lane_found, kWarpSize);
347 
348  if (lane_id == src_lane) {
349  lane_active = false;
350  buf_index = found_buf_index;
351  mask = true;
352  }
353  }
354 
355  // 2. Not found in this slab.
356  else {
357  // Broadcast next slab: lane 31 reads 'next'.
358  uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
359  kNextSlabPtrLaneId, kWarpSize);
360 
361  // 2.1. Next slab is empty, ABORT.
362  if (next_slab_ptr == kEmptySlabAddr) {
363  if (lane_id == src_lane) {
364  lane_active = false;
365  }
366  }
367  // 2.2. Next slab exists, RESTART.
368  else {
369  slab_ptr = next_slab_ptr;
370  }
371  }
372 
373  prev_work_queue = work_queue;
374  }
375 
376  return make_pair(buf_index, mask);
377 }
378 
379 template <typename Key, typename Hash, typename Eq>
380 __device__ Pair<buf_index_t, bool> SlabHashBackendImpl<Key, Hash, Eq>::Erase(
381  bool lane_active,
382  uint32_t lane_id,
383  uint32_t bucket_id,
384  const Key& key) {
385  uint32_t work_queue = 0;
386  uint32_t prev_work_queue = 0;
387  uint32_t slab_ptr = kHeadSlabAddr;
388  Key src_key;
389 
390  buf_index_t buf_index = 0;
391  bool mask = false;
392 
393  // > Loop when we have active lanes.
394  while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
395  // 0. Restart from linked list head if last insertion is finished.
396  slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
397  uint32_t src_lane = __ffs(work_queue) - 1;
398  uint32_t src_bucket =
399  __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
400 
401  WarpSyncKey(key, src_lane, src_key);
402 
403  const uint32_t slab_entry =
404  *SlabEntryPtr(src_bucket, lane_id, slab_ptr);
405 
406  int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);
407 
408  // Branch 1: key found.
409  if (lane_found >= 0) {
410  if (lane_id == src_lane) {
411  uint32_t* found_entry_ptr =
412  SlabEntryPtr(src_bucket, lane_found, slab_ptr);
413 
414  uint32_t old_found_entry_value = atomicExch(
415  (unsigned int*)found_entry_ptr, kEmptyNodeAddr);
416 
417  // Branch 1.2: another thread might have already done the job;
418  // avoid a double free.
419  mask = (old_found_entry_value != kEmptyNodeAddr);
420  buf_index = old_found_entry_value;
421  }
422  } else { // no matching slot found:
423  uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
424  kNextSlabPtrLaneId, kWarpSize);
425  if (next_slab_ptr == kEmptySlabAddr) {
426  // not found:
427  if (lane_id == src_lane) {
428  lane_active = false;
429  }
430  } else {
431  slab_ptr = next_slab_ptr;
432  }
433  }
434  prev_work_queue = work_queue;
435  }
436 
437  return make_pair(buf_index, mask);
438 }
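// Note: Erase() only unlinks the entry from its slab (atomicExch with
// kEmptyNodeAddr) and reports the freed buf_index; the underlying key/value
// slot is returned to the buffer by EraseKernelPass1 below, which calls
// DeviceFree on every index whose output mask is set.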
439 
440 template <typename Key, typename Hash, typename Eq>
441 __device__ void SlabHashBackendImpl<Key, Hash, Eq>::WarpSyncKey(
442  const Key& key, uint32_t lane_id, Key& ret_key) {
443  auto dst_key_ptr = reinterpret_cast<int*>(&ret_key);
444  auto src_key_ptr = reinterpret_cast<const int*>(&key);
445  for (int i = 0; i < key_size_in_int_; ++i) {
446  dst_key_ptr[i] =
447  __shfl_sync(kSyncLanesMask, src_key_ptr[i], lane_id, kWarpSize);
448  }
449 }
450 
451 template <typename Key, typename Hash, typename Eq>
452 __device__ int32_t SlabHashBackendImpl<Key, Hash, Eq>::WarpFindKey(
453  const Key& key, uint32_t lane_id, uint32_t slab_entry) {
454  bool is_lane_found =
455  // Select key lanes.
456  ((1 << lane_id) & kNodePtrLanesMask)
457  // Validate key buf_indices.
458  && (slab_entry != kEmptyNodeAddr)
459  // Find keys in buffer. Now slab_entry is interpreted as buf_index.
460  &&
461  eq_fn_(*static_cast<Key*>(buffer_accessor_.GetKeyPtr(slab_entry)),
462  key);
463 
464  return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_found)) - 1;
465 }
466 
467 template <typename Key, typename Hash, typename Eq>
468 __device__ int32_t
469 SlabHashBackendImpl<Key, Hash, Eq>::WarpFindEmpty(uint32_t slab_entry) {
470  bool is_lane_empty = (slab_entry == kEmptyNodeAddr);
471  return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_empty)) - 1;
472 }
473 
474 template <typename Key, typename Hash, typename Eq>
475 __device__ int64_t
476 SlabHashBackendImpl<Key, Hash, Eq>::ComputeBucket(const Key& key) const {
477  return hash_fn_(key) % bucket_count_;
478 }
479 
480 template <typename Key, typename Hash, typename Eq>
481 __device__ uint32_t
482 SlabHashBackendImpl<Key, Hash, Eq>::AllocateSlab(uint32_t lane_id) {
483  return node_mgr_impl_.WarpAllocate(lane_id);
484 }
485 
486 template <typename Key, typename Hash, typename Eq>
487 __device__ __forceinline__ void SlabHashBackendImpl<Key, Hash, Eq>::FreeSlab(
488  uint32_t slab_ptr) {
489  node_mgr_impl_.FreeUntouched(slab_ptr);
490 }
491 
492 template <typename Key, typename Hash, typename Eq>
493 __global__ void InsertKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
494  const void* input_keys,
495  buf_index_t* output_buf_indices,
496  int heap_counter_prev,
497  int64_t count) {
498  const Key* input_keys_templated = static_cast<const Key*>(input_keys);
499  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
500 
501  if (tid < count) {
502  // First write ALL input_keys to avoid potential thread conflicts.
503  buf_index_t buf_index =
504  impl.buffer_accessor_.heap_[heap_counter_prev + tid];
505  void* key = impl.buffer_accessor_.GetKeyPtr(buf_index);
506  *static_cast<Key*>(key) = input_keys_templated[tid];
507  output_buf_indices[tid] = buf_index;
508  }
509 }
510 
511 template <typename Key, typename Hash, typename Eq>
512 __global__ void InsertKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
513  const void* input_keys,
514  buf_index_t* output_buf_indices,
515  bool* output_masks,
516  int64_t count) {
517  const Key* input_keys_templated = static_cast<const Key*>(input_keys);
518  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
519  uint32_t lane_id = tid & 0x1F;
520 
521  if (tid - lane_id >= count) {
522  return;
523  }
524 
525  impl.node_mgr_impl_.Init(tid, lane_id);
526 
527  bool lane_active = false;
528  uint32_t bucket_id = 0;
529  buf_index_t buf_index = 0;
530 
531  // Dummy for warp sync.
532  Key key;
533  if (tid < count) {
534  lane_active = true;
535  key = input_keys_templated[tid];
536  buf_index = output_buf_indices[tid];
537  bucket_id = impl.ComputeBucket(key);
538  }
539 
540  // Index out-of-bound threads still have to run for warp synchronization.
541  bool mask = impl.Insert(lane_active, lane_id, bucket_id, key, buf_index);
542 
543  if (tid < count) {
544  output_masks[tid] = mask;
545  }
546 }
547 
548 template <typename Key, typename Hash, typename Eq, typename block_t>
549 __global__ void InsertKernelPass2(SlabHashBackendImpl<Key, Hash, Eq> impl,
550  const void* const* input_values_soa,
551  buf_index_t* output_buf_indices,
552  bool* output_masks,
553  int64_t count,
554  int64_t n_values) {
555  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
556 
557  if (tid < count) {
558  buf_index_t buf_index = output_buf_indices[tid];
559 
560  if (output_masks[tid]) {
561  for (int j = 0; j < n_values; ++j) {
562  int64_t blocks_per_element =
563  impl.buffer_accessor_.value_blocks_per_element_[j];
564 
565  block_t* dst_value = static_cast<block_t*>(
566  impl.buffer_accessor_.GetValuePtr(buf_index, j));
567  const block_t* src_value =
568  static_cast<const block_t*>(input_values_soa[j]) +
569  blocks_per_element * tid;
570  for (int b = 0; b < blocks_per_element; ++b) {
571  dst_value[b] = src_value[b];
572  }
573  }
574  } else {
575  impl.buffer_accessor_.DeviceFree(buf_index);
576  }
577  }
578 }
579 
580 template <typename Key, typename Hash, typename Eq>
581 __global__ void FindKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
582  const void* input_keys,
583  buf_index_t* output_buf_indices,
584  bool* output_masks,
585  int64_t count) {
586  const Key* input_keys_templated = static_cast<const Key*>(input_keys);
587  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
588  uint32_t lane_id = threadIdx.x & 0x1F;
589 
590  // This warp is idle.
591  if ((tid - lane_id) >= count) {
592  return;
593  }
594 
595  // Initialize the memory allocator on each warp.
596  impl.node_mgr_impl_.Init(tid, lane_id);
597 
598  bool lane_active = false;
599  uint32_t bucket_id = 0;
600 
601  // Dummy for warp sync
602  Key key;
603  Pair<buf_index_t, bool> result;
604 
605  if (tid < count) {
606  lane_active = true;
607  key = input_keys_templated[tid];
608  bucket_id = impl.ComputeBucket(key);
609  }
610 
611  result = impl.Find(lane_active, lane_id, bucket_id, key);
612 
613  if (tid < count) {
614  output_buf_indices[tid] = result.first;
615  output_masks[tid] = result.second;
616  }
617 }
618 
619 template <typename Key, typename Hash, typename Eq>
620 __global__ void EraseKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
621  const void* input_keys,
622  buf_index_t* output_buf_indices,
623  bool* output_masks,
624  int64_t count) {
625  const Key* input_keys_templated = static_cast<const Key*>(input_keys);
626  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
627  uint32_t lane_id = threadIdx.x & 0x1F;
628 
629  if (tid - lane_id >= count) {
630  return;
631  }
632 
633  impl.node_mgr_impl_.Init(tid, lane_id);
634 
635  bool lane_active = false;
636  uint32_t bucket_id = 0;
637 
638  // Dummy for warp sync
639  Key key;
640  if (tid < count) {
641  lane_active = true;
642  key = input_keys_templated[tid];
643  bucket_id = impl.ComputeBucket(key);
644  }
645 
646  auto result = impl.Erase(lane_active, lane_id, bucket_id, key);
647 
648  if (tid < count) {
649  output_buf_indices[tid] = result.first;
650  output_masks[tid] = result.second;
651  }
652 }
653 
654 template <typename Key, typename Hash, typename Eq>
655 __global__ void EraseKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
656  buf_index_t* output_buf_indices,
657  bool* output_masks,
658  int64_t count) {
659  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
660  if (tid < count && output_masks[tid]) {
661  impl.buffer_accessor_.DeviceFree(output_buf_indices[tid]);
662  }
663 }
664 
665 template <typename Key, typename Hash, typename Eq>
666 __global__ void GetActiveIndicesKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
667  buf_index_t* output_buf_indices,
668  uint32_t* output_count) {
669  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
670  uint32_t lane_id = threadIdx.x & 0x1F;
671 
672  // Assigning a warp per bucket.
673  uint32_t bucket_id = tid >> 5;
674  if (bucket_id >= impl.bucket_count_) {
675  return;
676  }
677 
678  impl.node_mgr_impl_.Init(tid, lane_id);
679 
680  uint32_t slab_entry = *impl.SlabEntryPtrFromHead(bucket_id, lane_id);
681  bool is_active = slab_entry != kEmptyNodeAddr;
682 
683  if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
684  uint32_t index = atomicAdd(output_count, 1);
685  output_buf_indices[index] = slab_entry;
686  }
687 
688  uint32_t slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
689  kNextSlabPtrLaneId, kWarpSize);
690 
691  // Collect indices from the following slabs.
692  while (slab_ptr != kEmptySlabAddr) {
693  slab_entry = *impl.SlabEntryPtrFromNodes(slab_ptr, lane_id);
694  is_active = (slab_entry != kEmptyNodeAddr);
695 
696  if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
697  uint32_t index = atomicAdd(output_count, 1);
698  output_buf_indices[index] = slab_entry;
699  }
700  slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry, kNextSlabPtrLaneId,
701  kWarpSize);
702  }
703 }
704 
705 template <typename Key, typename Hash, typename Eq>
706 __global__ void CountElemsPerBucketKernel(
707  SlabHashBackendImpl<Key, Hash, Eq> impl, int64_t* bucket_elem_counts) {
708  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
709  uint32_t lane_id = threadIdx.x & 0x1F;
710 
711  // Assigning a warp per bucket.
712  uint32_t bucket_id = tid >> 5;
713  if (bucket_id >= impl.bucket_count_) {
714  return;
715  }
716 
717  impl.node_mgr_impl_.Init(tid, lane_id);
718 
719  uint32_t count = 0;
720 
721  // Count head node.
722  uint32_t slab_entry = *impl.SlabEntryPtrFromHead(bucket_id, lane_id);
723  count += __popc(
724  __ballot_sync(kNodePtrLanesMask, slab_entry != kEmptyNodeAddr));
725  uint32_t slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
726  kNextSlabPtrLaneId, kWarpSize);
727 
728  // Count following nodes.
729  while (slab_ptr != kEmptySlabAddr) {
730  slab_entry = *impl.SlabEntryPtrFromNodes(slab_ptr, lane_id);
731  count += __popc(
732  __ballot_sync(kNodePtrLanesMask, slab_entry != kEmptyNodeAddr));
733  slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry, kNextSlabPtrLaneId,
734  kWarpSize);
735  }
736 
737  // Write back the results.
738  if (lane_id == 0) {
739  bucket_elem_counts[bucket_id] = count;
740  }
741 }
742 
743 } // namespace core
744 } // namespace open3d
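Usage note (illustrative, not part of this header): insertion is split across the three kernels defined above, which a host-side wrapper is expected to launch back to back. The sketch below shows one plausible sequence; the wrapper name LaunchInsertSketch, the block size of 128, and the assumption that `count` buffer slots have already been reserved starting at `heap_counter_prev` are placeholders for this example, and the actual dispatch lives in the backend that owns this Impl.

// Hypothetical host-side driver for the insert pipeline (sketch only).
// Assumes impl.Setup(...) has been called and that `count` key/value slots
// were already reserved in the buffer starting at `heap_counter_prev`.
template <typename Key, typename Hash, typename Eq, typename block_t>
void LaunchInsertSketch(open3d::core::SlabHashBackendImpl<Key, Hash, Eq> impl,
                        const void* input_keys,
                        const void* const* input_values_soa,
                        open3d::core::buf_index_t* output_buf_indices,
                        bool* output_masks,
                        int heap_counter_prev,
                        int64_t count,
                        int64_t n_values) {
    if (count == 0) return;
    const uint32_t threads = 128;  // must be a multiple of the 32-lane warp
    const uint32_t blocks =
            static_cast<uint32_t>((count + threads - 1) / threads);

    // Pass 0: copy every input key into the key buffer, record its buf_index.
    open3d::core::InsertKernelPass0<<<blocks, threads>>>(
            impl, input_keys, output_buf_indices, heap_counter_prev, count);
    // Pass 1: warp-cooperatively link the buf_indices into the slab lists;
    // duplicate keys end up with output_masks[i] == false.
    open3d::core::InsertKernelPass1<<<blocks, threads>>>(
            impl, input_keys, output_buf_indices, output_masks, count);
    // Pass 2: copy values for successful insertions and return the slots of
    // failed (duplicate) insertions to the buffer.
    open3d::core::InsertKernelPass2<Key, Hash, Eq, block_t>
            <<<blocks, threads>>>(impl, input_values_soa, output_buf_indices,
                                  output_masks, count, n_values);
}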