29 #include <stdgpu/memory.h> 30 #include <thrust/device_vector.h> 31 #include <thrust/transform.h> 33 #include <stdgpu/unordered_map.cuh> 34 #include <type_traits> 77 : std_allocator_(other.std_allocator_) {}
85 T* p = std_allocator_.allocate(n);
86 stdgpu::register_memory(p, n, stdgpu::dynamic_memory_type::device);
96 stdgpu::deregister_memory(p, n, stdgpu::dynamic_memory_type::device);
97 std_allocator_.deallocate(p, n);
102 return std_allocator_ == other.std_allocator_;
113 template <
typename T2>
121 template <
typename Key>
125 template <
typename Key,
typename Hash,
typename Eq>
127 stdgpu::unordered_map<Key,
133 template <
typename Key,
typename Hash,
typename Eq>
138 const std::vector<int64_t>& value_dsizes,
142 void Reserve(int64_t capacity)
override;
144 void Insert(
const void* input_keys,
145 const std::vector<const void*>& input_values_soa,
146 buf_index_t* output_buf_indices,
148 int64_t
count)
override;
150 void Find(
const void* input_keys,
151 buf_index_t* output_buf_indices,
153 int64_t count)
override;
155 void Erase(
const void* input_keys,
157 int64_t count)
override;
159 int64_t GetActiveIndices(buf_index_t* output_indices)
override;
161 void Clear()
override;
163 int64_t Size()
const override;
165 int64_t GetBucketCount()
const override;
166 std::vector<int64_t> BucketSizes()
const override;
167 float LoadFactor()
const override;
171 void Allocate(int64_t capacity);
182 template <
typename Key,
typename Hash,
typename Eq>
184 int64_t init_capacity,
186 const std::vector<int64_t>& value_dsizes,
192 template <
typename Key,
typename Hash,
typename Eq>
197 template <
typename Key,
typename Hash,
typename Eq>
203 template <
typename Key,
typename Hash,
typename Eq>
206 const Key* input_keys,
207 buf_index_t* output_buf_indices,
210 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
211 if (tid >= count)
return;
213 Key key = input_keys[tid];
214 auto iter = map.find(key);
215 bool flag = (iter != map.end());
216 output_masks[tid] = flag;
217 output_buf_indices[tid] = flag ? iter->second : 0;
220 template <
typename Key,
typename Hash,
typename Eq>
222 buf_index_t* output_buf_indices,
226 uint32_t blocks = (count + threads - 1) / threads;
228 STDGPUFindKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
230 output_buf_indices, output_masks, count);
235 template <
typename Key,
typename Hash,
typename Eq>
238 const Key* input_keys,
239 buf_index_t* output_buf_indices,
242 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
243 if (tid >= count)
return;
245 Key key = input_keys[tid];
246 auto iter = map.find(key);
247 bool flag = (iter != map.end());
248 output_masks[tid] = flag;
249 output_buf_indices[tid] = flag ? iter->second : 0;
251 if (output_masks[tid]) {
252 output_masks[tid] = map.erase(key);
253 if (output_masks[tid]) {
254 buffer_accessor.
DeviceFree(output_buf_indices[tid]);
259 template <
typename Key,
typename Hash,
typename Eq>
264 uint32_t blocks = (count + threads - 1) / threads;
268 buf_index_t* output_buf_indices =
269 static_cast<buf_index_t*
>(toutput_buf_indices.
GetDataPtr());
271 STDGPUEraseKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
273 output_buf_indices, output_masks, count);
277 template <
typename Key>
285 template <
typename Key,
typename Hash,
typename Eq>
287 buf_index_t* output_indices) {
288 auto range =
impl_.device_range();
290 thrust::transform(range.begin(), range.end(), output_indices,
296 template <
typename Key,
typename Hash,
typename Eq>
302 template <
typename Key,
typename Hash,
typename Eq>
305 template <
typename Key,
typename Hash,
typename Eq>
307 return impl_.bucket_count();
310 template <
typename Key,
typename Hash,
typename Eq>
315 template <
typename Key,
typename Hash,
typename Eq>
317 return impl_.load_factor();
321 template <
typename Key,
typename Hash,
typename Eq,
typename block_t>
325 const Key* input_keys,
326 const void*
const* input_values_soa,
327 buf_index_t* output_buf_indices,
331 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
332 if (tid >= count)
return;
334 Key key = input_keys[tid];
335 output_buf_indices[tid] = 0;
336 output_masks[tid] =
false;
339 auto res = map.emplace(key, 0);
344 auto key_ptr = buffer_accessor.
GetKeyPtr(buf_index);
348 *
static_cast<Key*
>(key_ptr) = key;
351 for (
int j = 0; j < n_values; ++j) {
352 const int64_t blocks_per_element =
355 block_t* dst_value =
static_cast<block_t*
>(
357 const block_t* src_value =
358 static_cast<const block_t*
>(input_values_soa[j]) +
359 blocks_per_element * tid;
360 for (
int b = 0; b < blocks_per_element; ++b) {
361 dst_value[b] = src_value[b];
366 res.first->second = buf_index;
369 output_buf_indices[tid] = buf_index;
370 output_masks[tid] =
true;
374 template <
typename Key,
typename Hash,
typename Eq>
376 const void* input_keys,
377 const std::vector<const void*>& input_values_soa,
378 buf_index_t* output_buf_indices,
382 uint32_t blocks = (count + threads - 1) / threads;
384 thrust::device_vector<const void*> input_values_soa_device(
385 input_values_soa.begin(), input_values_soa.end());
387 int64_t n_values = input_values_soa.size();
388 const void*
const* ptr_input_values_soa =
389 thrust::raw_pointer_cast(input_values_soa_device.data());
391 DISPATCH_DIVISOR_SIZE_TO_BLOCK_T(
393 STDGPUInsertKernel<Key, Hash, Eq, block_t>
394 <<<blocks, threads, 0, core::cuda::GetStream()>>>(
395 impl_, buffer_accessor_,
396 static_cast<const Key*>(input_keys),
397 ptr_input_values_soa, output_buf_indices,
398 output_masks, count, n_values);
403 template <
typename Key,
typename Hash,
typename Eq>
408 this->
buffer_ = std::make_shared<HashBackendBuffer>(
416 CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
425 template <
typename Key,
typename Hash,
typename Eq>
433 CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
std::shared_ptr< HashBackendBuffer > buffer_
Definition: DeviceHashBackend.h:121
__device__ void DeviceFree(buf_index_t ptr)
Definition: CUDAHashBackendBufferAccessor.h:102
CUDAHashBackendBufferAccessor buffer_accessor_
Definition: StdGPUHashBackend.h:179
Definition: CUDAHashBackendBufferAccessor.h:43
bool operator==(const StdGPUAllocator &other)
Returns true if the instances are equal, false otherwise.
Definition: StdGPUHashBackend.h:101
Definition: StdGPUHashBackend.h:134
StdGPUAllocator(const StdGPUAllocator< U > &other)
Rebind copy constructor.
Definition: StdGPUHashBackend.h:76
bool operator!=(const StdGPUAllocator &other)
Returns true if the instances are not equal, false otherwise.
Definition: StdGPUHashBackend.h:106
int64_t GetActiveIndices(buf_index_t *output_indices) override
Parallel collect all iterators in the hash table.
Definition: StdGPUHashBackend.h:286
T value_type
T.
Definition: StdGPUHashBackend.h:54
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:557
std::vector< int64_t > value_dsizes_
Definition: DeviceHashBackend.h:117
int64_t * value_blocks_per_element_
Definition: CUDAHashBackendBufferAccessor.h:127
StdGPUHashBackend(int64_t init_capacity, int64_t key_dsize, const std::vector< int64_t > &value_dsizes, const Device &device)
Definition: StdGPUHashBackend.h:183
~StdGPUHashBackend()
Definition: StdGPUHashBackend.h:193
void deallocate(T *p, std::size_t n)
Deallocates memory from pointer p of size n .
Definition: StdGPUHashBackend.h:91
Device device_
Definition: DeviceHashBackend.h:119
void Reserve(int64_t capacity) override
Definition: StdGPUHashBackend.h:303
uint32_t buf_index_t
Definition: HashBackendBuffer.h:63
int64_t key_dsize_
Definition: DeviceHashBackend.h:116
Definition: StdAllocator.h:42
int64_t capacity_
Definition: DeviceHashBackend.h:114
void Allocate(int64_t capacity)
Definition: StdGPUHashBackend.h:404
void Synchronize()
Definition: CUDAUtils.cpp:78
__global__ void STDGPUFindKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: StdGPUHashBackend.h:204
void Find(const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
Definition: StdGPUHashBackend.h:221
#define OPEN3D_HOST_DEVICE
Definition: CUDAUtils.h:63
stdgpu::unordered_map< Key, buf_index_t, Hash, Eq, InternalStdGPUHashBackendAllocator< Key > > InternalStdGPUHashBackend
Definition: StdGPUHashBackend.h:131
const Dtype Int32
Definition: Dtype.cpp:65
InternalStdGPUHashBackend< Key, Hash, Eq > GetImpl() const
Definition: StdGPUHashBackend.h:169
__host__ void Setup(HashBackendBuffer &hashmap_buffer)
Definition: CUDAHashBackendBufferAccessor.h:45
__host__ void Shutdown(const Device &device)
Definition: CUDAHashBackendBufferAccessor.h:92
StdGPUAllocator(const Device &device)
Constructor from device.
Definition: StdGPUHashBackend.h:60
Definition: StdGPUHashBackend.h:51
int64_t GetBucketCount() const override
Get the number of buckets of the hash map.
Definition: StdGPUHashBackend.h:306
T * allocate(std::size_t n)
Allocates memory of size n.
Definition: StdGPUHashBackend.h:80
StdGPUAllocator()=default
Default constructor.
__global__ void STDGPUEraseKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: StdGPUHashBackend.h:236
__device__ void * GetKeyPtr(buf_index_t ptr)
Definition: CUDAHashBackendBufferAccessor.h:107
void Clear() override
Clear stored map without reallocating memory.
Definition: StdGPUHashBackend.h:297
void Free()
Definition: StdGPUHashBackend.h:426
Definition: PinholeCameraIntrinsic.cpp:35
InternalStdGPUHashBackend< Key, Hash, Eq > impl_
Definition: StdGPUHashBackend.h:177
int64_t common_block_size_
Definition: CUDAHashBackendBufferAccessor.h:124
float LoadFactor() const override
Get the current load factor, defined as size / bucket count.
Definition: StdGPUHashBackend.h:316
Definition: DeviceHashBackend.h:39
__device__ buf_index_t DeviceAllocate()
Definition: CUDAHashBackendBufferAccessor.h:98
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
Definition: StdGPUHashBackend.h:260
__global__ void STDGPUInsertKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, const void *const *input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count, int64_t n_values)
Definition: StdGPUHashBackend.h:322
StdGPUAllocator & operator=(const StdGPUAllocator &)=default
Default copy assignment operator.
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle image_handle timestamp_usec white_balance image_handle k4a_device_configuration_t config device_handle char size_t serial_number_size bool int32_t int32_t int32_t int32_t k4a_color_control_mode_t default_mode value const const k4a_calibration_t calibration char size_t
Definition: K4aPlugin.cpp:724
T * GetDataPtr()
Definition: Tensor.h:1074
void Insert(const void *input_keys, const std::vector< const void *> &input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
Definition: StdGPUHashBackend.h:375
__device__ void * GetValuePtr(buf_index_t ptr, int value_idx=0)
Definition: CUDAHashBackendBufferAccessor.h:110
Device GetDevice() const
Returns the device on which memory is allocated.
Definition: StdGPUHashBackend.h:109
#define LogError(...)
Definition: Logging.h:72
std::vector< int64_t > BucketSizes() const override
Get the number of entries per bucket.
Definition: StdGPUHashBackend.h:311
int64_t Size() const override
Get the size (number of valid entries) of the hash map.
Definition: StdGPUHashBackend.h:198