latest/cpp_api/_std_g_p_u_hash_backend_8h_source.html

// ----------------------------------------------------------------------------

// -                        Open3D: www.open3d.org                            -

// ----------------------------------------------------------------------------

// Copyright (c) 2018-2024 www.open3d.org

// SPDX-License-Identifier: MIT

// ----------------------------------------------------------------------------


#pragma once


#include <stdgpu/memory.h>

#include <stdgpu/utility.h>

#include <thrust/device_vector.h>

#include <thrust/transform.h>


#include <stdgpu/unordered_map.cuh>

#include <type_traits>


#include "open3d/core/CUDAUtils.h"

#include "open3d/core/StdAllocator.h"

#include "open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h"

#include "open3d/core/hashmap/DeviceHashBackend.h"

#include "open3d/core/hashmap/Dispatch.h"


namespace open3d {

namespace core {


/// Allocator adapter that wraps a StdAllocator<T> and keeps stdgpu's memory
/// bookkeeping in sync with it.
///
/// Every block obtained from the wrapped allocator is registered with stdgpu
/// as dynamic_memory_type::device, and deregistered again right before it is
/// handed back to the wrapped allocator.
template <typename T>
class StdGPUAllocator {
public:
    /// T.
    using value_type = T;

    /// Default constructor.
    StdGPUAllocator() = default;

    /// Constructor from device.
    explicit StdGPUAllocator(int device_id) : std_allocator_(device_id) {}

    /// Default copy constructor.
    StdGPUAllocator(const StdGPUAllocator&) = default;

    /// Default copy assignment operator.
    StdGPUAllocator& operator=(const StdGPUAllocator&) = default;

    /// Default move constructor.
    StdGPUAllocator(StdGPUAllocator&&) = default;

    /// Default move assignment operator.
    StdGPUAllocator& operator=(StdGPUAllocator&&) = default;

    /// Rebind copy constructor.
    template <typename U>
    StdGPUAllocator(const StdGPUAllocator<U>& other)
        : std_allocator_(other.std_allocator_) {}

    /// Allocates memory of size \p n and registers it with stdgpu.
    T* allocate(std::size_t n) {
        T* p = std_allocator_.allocate(n);
        stdgpu::register_memory(p, n, stdgpu::dynamic_memory_type::device);
        return p;
    }

    /// Deregisters the memory at \p p of size \p n from stdgpu, then
    /// deallocates it.
    void deallocate(T* p, std::size_t n) {
        // Deregister first so stdgpu never tracks a block that has already
        // been released.
        stdgpu::deregister_memory(p, n, stdgpu::dynamic_memory_type::device);
        std_allocator_.deallocate(p, n);
    }

    /// Returns true if the instances are equal, false otherwise.
    /// Const-qualified so that const allocator instances can be compared.
    /// NOTE(review): relies on StdAllocator::operator== being callable on a
    /// const object — confirm against StdAllocator.h.
    bool operator==(const StdGPUAllocator& other) const {
        return std_allocator_ == other.std_allocator_;
    }

    /// Returns true if the instances are not equal, false otherwise.
    bool operator!=(const StdGPUAllocator& other) const {
        return !operator==(other);
    }

private:
    // Allow access in rebind constructor.
    template <typename T2>
    friend class StdGPUAllocator;

    // Allocator that performs the actual allocation and deallocation.
    StdAllocator<T> std_allocator_;
};


// These typedefs must be defined outside of StdGPUHashBackend to make them

// accessible in raw CUDA kernels.

// Allocator type handed to the internal stdgpu map: a StdGPUAllocator over
// the pair type (const Key, buf_index_t).
template <typename Key>

using InternalStdGPUHashBackendAllocator =

        StdGPUAllocator<stdgpu::pair<const Key, buf_index_t>>;


// The stdgpu map type used internally by StdGPUHashBackend. It maps a Key to a
// buf_index_t, using the caller-supplied Hash and Eq functors and the
// StdGPUAllocator-based allocator alias for its storage.
template <typename Key, typename Hash, typename Eq>

using InternalStdGPUHashBackend =

        stdgpu::unordered_map<Key,

                              buf_index_t,

                              Hash,

                              Eq,

                              InternalStdGPUHashBackendAllocator<Key>>;


/// DeviceHashBackend implementation built on stdgpu::unordered_map.
///
/// The stdgpu map (impl_) associates each key with a buf_index_t. The kernels
/// in this file use buffer_accessor_ to allocate and free those indices and to
/// reach the key/value storage behind them.
template <typename Key, typename Hash, typename Eq>


class StdGPUHashBackend : public DeviceHashBackend {

public:

    /// Forwards all arguments to DeviceHashBackend and then allocates the
    /// backend with \p init_capacity.
    StdGPUHashBackend(int64_t init_capacity,

                      int64_t key_dsize,

                      const std::vector<int64_t>& value_dsizes,

                      const Device& device);

    /// Releases the backend through Free().
    ~StdGPUHashBackend();


    /// The definition in this file performs no reservation (the body only
    /// scopes the device).
    void Reserve(int64_t capacity) override;


    /// Parallel insert contiguous arrays of keys and values.
    void Insert(const void* input_keys,

                const std::vector<const void*>& input_values_soa,

                buf_index_t* output_buf_indices,

                bool* output_masks,

                int64_t count) override;


    /// Parallel find a contiguous array of keys.
    void Find(const void* input_keys,

              buf_index_t* output_buf_indices,

              bool* output_masks,

              int64_t count) override;


    /// Parallel erase a contiguous array of keys.
    void Erase(const void* input_keys,

               bool* output_masks,

               int64_t count) override;


    /// Parallel collect all iterators in the hash table.
    int64_t GetActiveIndices(buf_index_t* output_indices) override;


    /// Clear stored map without reallocating memory.
    void Clear() override;


    /// Get the size (number of valid entries) of the hash map.
    int64_t Size() const override;


    /// Get the number of buckets of the hash map.
    int64_t GetBucketCount() const override;

    /// Get the number of entries per bucket. The definition in this file only
    /// reports "Unimplemented" through utility::LogError.
    std::vector<int64_t> BucketSizes() const override;

    /// Get the current load factor, defined as size / bucket count.
    float LoadFactor() const override;


    /// Returns impl_ by value.
    InternalStdGPUHashBackend<Key, Hash, Eq> GetImpl() const { return impl_; }


    /// Creates the key/value buffer, sets up the buffer accessor and creates
    /// the stdgpu map for \p capacity entries.
    void Allocate(int64_t capacity) override;

    /// Shuts down the buffer accessor and destroys the stdgpu map.
    void Free() override;


protected:

    // Held by value (not as a reference or pointer). Per the original note,
    // stdgpu implicitly handles the structure itself as a pointer, so value
    // copies (GetImpl(), kernel arguments) presumably refer to the same
    // underlying map — NOTE(review): confirm against the stdgpu documentation.

    InternalStdGPUHashBackend<Key, Hash, Eq> impl_;


    // Accessor over this->buffer_; configured in Allocate() and passed by
    // value to the kernels.
    CUDAHashBackendBufferAccessor buffer_accessor_;

};


/// Constructs the backend: the base class receives all arguments unchanged,
/// then storage for \p init_capacity entries is allocated on the backend's
/// device.
template <typename Key, typename Hash, typename Eq>
StdGPUHashBackend<Key, Hash, Eq>::StdGPUHashBackend(
        int64_t init_capacity,
        int64_t key_dsize,
        const std::vector<int64_t>& value_dsizes,
        const Device& device)
    : DeviceHashBackend(init_capacity, key_dsize, value_dsizes, device) {
    // Make the backend's device current for the duration of the allocation.
    CUDAScopedDevice device_guard(this->device_);

    Allocate(init_capacity);
}


/// Destroys the backend by calling Free() with the backend's device scoped as
/// current.
template <typename Key, typename Hash, typename Eq>
StdGPUHashBackend<Key, Hash, Eq>::~StdGPUHashBackend() {
    CUDAScopedDevice device_guard(this->device_);

    Free();
}


/// Get the size (number of valid entries) of the hash map, as reported by the
/// stdgpu map.
template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::Size() const {
    CUDAScopedDevice device_guard(this->device_);

    // Query the map while the backend's device is current.
    const auto n_entries = impl_.size();
    return n_entries;
}


// Need an explicit kernel for non-const access to map
//
// Lookup kernel, one thread per key. Thread `tid` looks up input_keys[tid] in
// the map and writes:
//   output_masks[tid]       - true if the key was found, false otherwise;
//   output_buf_indices[tid] - the value stored for the key, or 0 if not found.
// Threads with tid >= count do nothing. buffer_accessor is not used here.
template <typename Key, typename Hash, typename Eq>
__global__ void STDGPUFindKernel(InternalStdGPUHashBackend<Key, Hash, Eq> map,
                                 CUDAHashBackendBufferAccessor buffer_accessor,
                                 const Key* input_keys,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count) {
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (!(tid < count)) return;

    Key query = input_keys[tid];
    auto entry = map.find(query);

    if (entry != map.end()) {
        // Hit: report the stored buffer index.
        output_masks[tid] = true;
        output_buf_indices[tid] = entry->second;
    } else {
        // Miss: index 0 is only a placeholder; the mask marks it as invalid.
        output_masks[tid] = false;
        output_buf_indices[tid] = 0;
    }
}


/// Parallel find a contiguous array of keys.
///
/// Launches STDGPUFindKernel with 128 threads per block on the current stream
/// (core::cuda::GetStream()) and synchronizes the device before returning.
///
/// \param input_keys Array of \p count keys, reinterpreted as const Key*. It
/// is dereferenced inside the kernel.
/// \param output_buf_indices Receives, per key, the stored buffer index, or 0
/// when the key is not found.
/// \param output_masks Receives, per key, whether the key was found.
/// \param count Number of keys. When it is not positive the function returns
/// without launching anything and leaves the outputs untouched.
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
                                            buf_index_t* output_buf_indices,
                                            bool* output_masks,
                                            int64_t count) {
    CUDAScopedDevice scoped_device(this->device_);

    // An empty query would give a grid of zero blocks, which is an invalid
    // kernel launch configuration. There is nothing to do, so return early.
    if (count <= 0) return;

    const uint32_t threads = 128;
    // Ceil-div so that a partially filled last block still covers the tail.
    const uint32_t blocks =
            static_cast<uint32_t>((count + threads - 1) / threads);

    STDGPUFindKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
            impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
            output_buf_indices, output_masks, count);
    cuda::Synchronize(this->device_);
}


// Need an explicit kernel for non-const access to map
//
// Erase kernel, one thread per key. Thread `tid` looks up input_keys[tid]:
//   - not found: output_masks[tid] = false and output_buf_indices[tid] = 0;
//   - found: output_buf_indices[tid] receives the stored buffer index, the key
//     is erased from the map, and output_masks[tid] receives the result of
//     that erase. The buffer index is released through
//     buffer_accessor.DeviceFree only when the erase succeeded.
// Threads with tid >= count do nothing.
template <typename Key, typename Hash, typename Eq>
__global__ void STDGPUEraseKernel(InternalStdGPUHashBackend<Key, Hash, Eq> map,
                                  CUDAHashBackendBufferAccessor buffer_accessor,
                                  const Key* input_keys,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count) {
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (!(tid < count)) return;

    Key target = input_keys[tid];
    auto entry = map.find(target);

    // Key is absent: nothing to erase.
    if (entry == map.end()) {
        output_masks[tid] = false;
        output_buf_indices[tid] = 0;
        return;
    }

    // Read the buffer index before the entry is removed from the map.
    const buf_index_t buf_index = entry->second;
    output_buf_indices[tid] = buf_index;

    // Only the thread whose erase succeeded may release the buffer slot.
    const bool erased = map.erase(target);
    output_masks[tid] = erased;
    if (erased) {
        buffer_accessor.DeviceFree(buf_index);
    }
}


/// Parallel erase a contiguous array of keys.
///
/// Allocates a temporary Int32 tensor of \p count elements on the backend's
/// device to hold the per-key buffer indices, launches STDGPUEraseKernel with
/// 128 threads per block on the current stream (core::cuda::GetStream()), and
/// synchronizes the device before returning.
///
/// \param input_keys Array of \p count keys, reinterpreted as const Key*. It
/// is dereferenced inside the kernel.
/// \param output_masks Receives, per key, whether the key was erased.
/// \param count Number of keys. When it is not positive the function returns
/// without allocating or launching anything.
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Erase(const void* input_keys,
                                             bool* output_masks,
                                             int64_t count) {
    CUDAScopedDevice scoped_device(this->device_);

    // An empty request would give a grid of zero blocks, which is an invalid
    // kernel launch configuration. There is nothing to do, so return early.
    if (count <= 0) return;

    const uint32_t threads = 128;
    // Ceil-div so that a partially filled last block still covers the tail.
    const uint32_t blocks =
            static_cast<uint32_t>((count + threads - 1) / threads);

    // Scratch storage for the buffer index of each key; the kernel needs it to
    // free the slot after the key has been removed from the map.
    core::Tensor toutput_buf_indices =
            core::Tensor({count}, core::Int32, this->device_);
    buf_index_t* output_buf_indices =
            static_cast<buf_index_t*>(toutput_buf_indices.GetDataPtr());

    STDGPUEraseKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
            impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
            output_buf_indices, output_masks, count);
    cuda::Synchronize(this->device_);
}


/// Unary functor that returns the `second` member (the buffer index) of a
/// (Key, buf_index_t) pair. Used as the transform operation in
/// GetActiveIndices().
template <typename Key>
struct ValueExtractor {
    OPEN3D_HOST_DEVICE buf_index_t
    operator()(const stdgpu::pair<Key, buf_index_t>& x) const {
        const buf_index_t buf_index = x.second;
        return buf_index;
    }
};


/// Parallel collect all iterators in the hash table.
///
/// Writes the buffer index of every entry in the map's device range to
/// \p output_indices and returns the map's size.
template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::GetActiveIndices(
        buf_index_t* output_indices) {
    CUDAScopedDevice device_guard(this->device_);

    // Range over the entries currently stored in the map.
    auto entries = impl_.device_range();

    // Project each (key, buffer index) pair onto its buffer index.
    ValueExtractor<Key> extract_index{};
    thrust::transform(entries.begin(), entries.end(), output_indices,
                      extract_index);

    return impl_.size();
}


/// Clear stored map without reallocating memory: empties the stdgpu map and
/// then resets the heap of the key/value buffer.
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Clear() {
    CUDAScopedDevice device_guard(this->device_);

    // Drop all map entries first, then reset the buffer heap.
    impl_.clear();
    this->buffer_->ResetHeap();
}


/// Reserve is a no-op for this backend: apart from scoping the backend's
/// device, the body does nothing and \p capacity is not used.
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Reserve(int64_t capacity) {
    CUDAScopedDevice device_guard(this->device_);
}


/// Get the number of buckets of the hash map, as reported by the stdgpu map.
template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::GetBucketCount() const {
    CUDAScopedDevice device_guard(this->device_);

    const auto n_buckets = impl_.bucket_count();
    return n_buckets;
}


/// Get the number of entries per bucket.
///
/// Not implemented for this backend: the body only reports "Unimplemented"
/// through utility::LogError and contains no return statement.
template <typename Key, typename Hash, typename Eq>
std::vector<int64_t> StdGPUHashBackend<Key, Hash, Eq>::BucketSizes() const {
    CUDAScopedDevice device_guard(this->device_);

    utility::LogError("Unimplemented");
}


/// Get the current load factor, defined as size / bucket count, as reported
/// by the stdgpu map.
template <typename Key, typename Hash, typename Eq>
float StdGPUHashBackend<Key, Hash, Eq>::LoadFactor() const {
    CUDAScopedDevice device_guard(this->device_);

    const float load = impl_.load_factor();
    return load;
}


// Need an explicit kernel for non-const access to map
//
// Insert kernel, one thread per key. Thread `tid` tries to insert
// input_keys[tid] into the map:
//   - on failure, output_masks[tid] = false and output_buf_indices[tid] = 0;
//   - on success, a buffer slot is allocated through buffer_accessor, the key
//     and each of the n_values values are copied into that slot, the map entry
//     is updated to the slot's index, and output_masks[tid] = true with
//     output_buf_indices[tid] = that index.
// Values are copied in units of block_t: for value array j, the source for
// this thread starts at block offset value_blocks_per_element_[j] * tid within
// input_values_soa[j]. Threads with tid >= count do nothing.
template <typename Key, typename Hash, typename Eq, typename block_t>
__global__ void STDGPUInsertKernel(
        InternalStdGPUHashBackend<Key, Hash, Eq> map,
        CUDAHashBackendBufferAccessor buffer_accessor,
        const Key* input_keys,
        const void* const* input_values_soa,
        buf_index_t* output_buf_indices,
        bool* output_masks,
        int64_t count,
        int64_t n_values) {
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (!(tid < count)) return;

    Key key = input_keys[tid];

    // Default outputs, kept as-is when the insertion does not succeed.
    output_buf_indices[tid] = 0;
    output_masks[tid] = false;

    // First apply 'try insert' with a dummy index
    auto inserted = map.emplace(key, 0);
    if (!inserted.second) return;

    // The insertion succeeded, so reserve a buffer slot for this entry.
    buf_index_t slot = buffer_accessor.DeviceAllocate();

    // Copy templated key to buffer (duplicate)
    // TODO: hack stdgpu inside and take out the buffer directly
    *static_cast<Key*>(buffer_accessor.GetKeyPtr(slot)) = key;

    // Copy/reset non-templated value in buffer
    for (int j = 0; j < n_values; ++j) {
        const int64_t n_blocks = buffer_accessor.value_blocks_per_element_[j];

        const block_t* src =
                static_cast<const block_t*>(input_values_soa[j]) +
                n_blocks * tid;
        block_t* dst =
                static_cast<block_t*>(buffer_accessor.GetValuePtr(slot, j));

        for (int b = 0; b < n_blocks; ++b) {
            dst[b] = src[b];
        }
    }

    // Update from the dummy index
    inserted.first->second = slot;

    // Write to return variables
    output_buf_indices[tid] = slot;
    output_masks[tid] = true;
}


/// Parallel insert contiguous arrays of keys and values.
///
/// Copies the table of value-array pointers into a thrust::device_vector so
/// the kernel can index it, dispatches on buffer_accessor_.common_block_size_
/// to choose the copy unit block_t, launches STDGPUInsertKernel with 128
/// threads per block on the current stream (core::cuda::GetStream()), and
/// synchronizes the device before returning.
///
/// \param input_keys Array of \p count keys, reinterpreted as const Key*. It
/// is dereferenced inside the kernel.
/// \param input_values_soa One pointer per value array; each pointer is
/// dereferenced inside the kernel.
/// \param output_buf_indices Receives, per key, the allocated buffer index,
/// or 0 when the key was not inserted.
/// \param output_masks Receives, per key, whether the key was inserted.
/// \param count Number of keys. When it is not positive the function returns
/// without copying or launching anything.
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Insert(
        const void* input_keys,
        const std::vector<const void*>& input_values_soa,
        buf_index_t* output_buf_indices,
        bool* output_masks,
        int64_t count) {
    CUDAScopedDevice scoped_device(this->device_);

    // An empty batch would give a grid of zero blocks, which is an invalid
    // kernel launch configuration. There is nothing to do, so return early
    // (this also skips the pointer-table upload below).
    if (count <= 0) return;

    const uint32_t threads = 128;
    // Ceil-div so that a partially filled last block still covers the tail.
    const uint32_t blocks =
            static_cast<uint32_t>((count + threads - 1) / threads);

    // The pointer table lives in a host std::vector; the kernel needs a copy
    // it can index from device code.
    thrust::device_vector<const void*> input_values_soa_device(
            input_values_soa.begin(), input_values_soa.end());

    int64_t n_values = input_values_soa.size();
    const void* const* ptr_input_values_soa =
            thrust::raw_pointer_cast(input_values_soa_device.data());

    DISPATCH_DIVISOR_SIZE_TO_BLOCK_T(
            buffer_accessor_.common_block_size_, [&]() {
                STDGPUInsertKernel<Key, Hash, Eq, block_t>
                        <<<blocks, threads, 0, core::cuda::GetStream()>>>(
                                impl_, buffer_accessor_,
                                static_cast<const Key*>(input_keys),
                                ptr_input_values_soa, output_buf_indices,
                                output_masks, count, n_values);
            });
    // Synchronize before input_values_soa_device is destroyed at scope exit.
    cuda::Synchronize(this->device_);
}


/// Allocates all storage for \p capacity entries: records the capacity,
/// creates the key/value buffer, points the buffer accessor at it, and creates
/// the stdgpu map with an allocator bound to this backend's device ID.
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Allocate(int64_t capacity) {
    using MapType = InternalStdGPUHashBackend<Key, Hash, Eq>;
    using AllocatorType = InternalStdGPUHashBackendAllocator<Key>;

    CUDAScopedDevice device_guard(this->device_);

    this->capacity_ = capacity;

    // Allocate buffer for key values.
    this->buffer_ = std::make_shared<HashBackendBuffer>(
            this->capacity_, this->key_dsize_, this->value_dsizes_,
            this->device_);
    buffer_accessor_.Setup(*this->buffer_);

    // stdgpu initializes on the default stream. Set the current stream to
    // ensure correct behavior.
    {
        CUDAScopedStream default_stream_guard(cuda::GetDefaultStream());

        impl_ = MapType::createDeviceObject(
                this->capacity_, AllocatorType(this->device_.GetID()));

        // Wait for the map creation to finish before the stream scope ends.
        cuda::Synchronize(this->device_);
    }
}


/// Releases the backend's resources: shuts down the buffer accessor and
/// destroys the stdgpu map. The key/value buffer itself is owned by a smart
/// pointer and is not released here.
template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Free() {
    using MapType = InternalStdGPUHashBackend<Key, Hash, Eq>;

    CUDAScopedDevice device_guard(this->device_);

    // Buffer is automatically handled by the smart pointer.
    buffer_accessor_.Shutdown(this->device_);

    // stdgpu initializes on the default stream. Set the current stream to
    // ensure correct behavior.
    {
        CUDAScopedStream default_stream_guard(cuda::GetDefaultStream());

        MapType::destroyDeviceObject(impl_);
    }
}


}  // namespace core

}  // namespace open3d

CUDAHashBackendBufferAccessor.h

CUDAUtils.h
Common CUDA utilities.

OPEN3D_HOST_DEVICE
#define OPEN3D_HOST_DEVICE
Definition CUDAUtils.h:43

DeviceHashBackend.h

StdAllocator.h

open3d::core::CUDAHashBackendBufferAccessor
Definition CUDAHashBackendBufferAccessor.h:24

open3d::core::CUDAHashBackendBufferAccessor::value_blocks_per_element_
int64_t * value_blocks_per_element_
Definition CUDAHashBackendBufferAccessor.h:108

open3d::core::CUDAHashBackendBufferAccessor::GetValuePtr
__device__ void * GetValuePtr(buf_index_t ptr, int value_idx=0)
Definition CUDAHashBackendBufferAccessor.h:91

open3d::core::CUDAHashBackendBufferAccessor::DeviceAllocate
__device__ buf_index_t DeviceAllocate()
Definition CUDAHashBackendBufferAccessor.h:79

open3d::core::CUDAHashBackendBufferAccessor::GetKeyPtr
__device__ void * GetKeyPtr(buf_index_t ptr)
Definition CUDAHashBackendBufferAccessor.h:88

open3d::core::CUDAHashBackendBufferAccessor::DeviceFree
__device__ void DeviceFree(buf_index_t ptr)
Definition CUDAHashBackendBufferAccessor.h:83

open3d::core::CUDAScopedDevice
When CUDA is not enabled, this is a dummy class.
Definition CUDAUtils.h:213

open3d::core::DeviceHashBackend
Definition DeviceHashBackend.h:20

open3d::core::DeviceHashBackend::device_
Device device_
Definition DeviceHashBackend.h:103

open3d::core::Device
Definition Device.h:18

open3d::core::StdAllocator
Definition StdAllocator.h:23

open3d::core::StdGPUAllocator
Definition StdGPUHashBackend.h:33

open3d::core::StdGPUAllocator::allocate
T * allocate(std::size_t n)
Allocates memory of size n.
Definition StdGPUHashBackend.h:62

open3d::core::StdGPUAllocator::StdGPUAllocator
StdGPUAllocator()=default
Default constructor.

open3d::core::StdGPUAllocator::operator=
StdGPUAllocator & operator=(const StdGPUAllocator &)=default
Default copy assignment operator.

open3d::core::StdGPUAllocator::StdGPUAllocator
StdGPUAllocator(int device_id)
Constructor from device.
Definition StdGPUHashBackend.h:42

open3d::core::StdGPUAllocator::StdGPUAllocator
StdGPUAllocator(const StdGPUAllocator &)=default
Default copy constructor.

open3d::core::StdGPUAllocator::deallocate
void deallocate(T *p, std::size_t n)
Deallocates memory from pointer p of size n.
Definition StdGPUHashBackend.h:69

open3d::core::StdGPUAllocator::operator==
bool operator==(const StdGPUAllocator &other)
Returns true if the instances are equal, false otherwise.
Definition StdGPUHashBackend.h:75

open3d::core::StdGPUAllocator::operator=
StdGPUAllocator & operator=(StdGPUAllocator &&)=default
Default move assignment operator.

open3d::core::StdGPUAllocator::StdGPUAllocator
StdGPUAllocator(StdGPUAllocator &&)=default
Default move constructor.

open3d::core::StdGPUAllocator::value_type
T value_type
T.
Definition StdGPUHashBackend.h:36

open3d::core::StdGPUAllocator::operator!=
bool operator!=(const StdGPUAllocator &other)
Returns true if the instances are not equal, false otherwise.
Definition StdGPUHashBackend.h:80

open3d::core::StdGPUAllocator::StdGPUAllocator
StdGPUAllocator(const StdGPUAllocator< U > &other)
Rebind copy constructor.
Definition StdGPUHashBackend.h:58

open3d::core::StdGPUHashBackend
Definition StdGPUHashBackend.h:105

open3d::core::StdGPUHashBackend::StdGPUHashBackend
StdGPUHashBackend(int64_t init_capacity, int64_t key_dsize, const std::vector< int64_t > &value_dsizes, const Device &device)
Definition StdGPUHashBackend.h:154

open3d::core::StdGPUHashBackend::Erase
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
Definition StdGPUHashBackend.h:235

open3d::core::StdGPUHashBackend::~StdGPUHashBackend
~StdGPUHashBackend()
Definition StdGPUHashBackend.h:165

open3d::core::StdGPUHashBackend::LoadFactor
float LoadFactor() const override
Get the current load factor, defined as size / bucket count.
Definition StdGPUHashBackend.h:298

open3d::core::StdGPUHashBackend::GetImpl
InternalStdGPUHashBackend< Key, Hash, Eq > GetImpl() const
Definition StdGPUHashBackend.h:140

open3d::core::StdGPUHashBackend::Find
void Find(const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
Definition StdGPUHashBackend.h:195

open3d::core::StdGPUHashBackend::Insert
void Insert(const void *input_keys, const std::vector< const void * > &input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
Definition StdGPUHashBackend.h:358

open3d::core::StdGPUHashBackend::BucketSizes
std::vector< int64_t > BucketSizes() const override
Get the number of entries per bucket.
Definition StdGPUHashBackend.h:292

open3d::core::StdGPUHashBackend::impl_
InternalStdGPUHashBackend< Key, Hash, Eq > impl_
Definition StdGPUHashBackend.h:148

open3d::core::StdGPUHashBackend::Reserve
void Reserve(int64_t capacity) override
Definition StdGPUHashBackend.h:281

open3d::core::StdGPUHashBackend::Allocate
void Allocate(int64_t capacity) override
Definition StdGPUHashBackend.h:388

open3d::core::StdGPUHashBackend::Free
void Free() override
Definition StdGPUHashBackend.h:411

open3d::core::StdGPUHashBackend::GetBucketCount
int64_t GetBucketCount() const override
Get the number of buckets of the hash map.
Definition StdGPUHashBackend.h:286

open3d::core::StdGPUHashBackend::GetActiveIndices
int64_t GetActiveIndices(buf_index_t *output_indices) override
Parallel collect all iterators in the hash table.
Definition StdGPUHashBackend.h:262

open3d::core::StdGPUHashBackend::Size
int64_t Size() const override
Get the size (number of valid entries) of the hash map.
Definition StdGPUHashBackend.h:171

open3d::core::StdGPUHashBackend::buffer_accessor_
CUDAHashBackendBufferAccessor buffer_accessor_
Definition StdGPUHashBackend.h:150

open3d::core::StdGPUHashBackend::Clear
void Clear() override
Clear stored map without reallocating memory.
Definition StdGPUHashBackend.h:274

open3d::core::Tensor
Definition Tensor.h:32

open3d::core::Tensor::GetDataPtr
T * GetDataPtr()
Definition Tensor.h:1201

Dispatch.h

count
int count
Definition FilePCD.cpp:43

open3d::core::cuda::Synchronize
void Synchronize()
Definition CUDAUtils.cpp:58

open3d::core::STDGPUFindKernel
__global__ void STDGPUFindKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition StdGPUHashBackend.h:178

open3d::core::buf_index_t
uint32_t buf_index_t
Definition HashBackendBuffer.h:49

open3d::core::STDGPUEraseKernel
__global__ void STDGPUEraseKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition StdGPUHashBackend.h:211

open3d::core::Int32
const Dtype Int32
Definition Dtype.cpp:46

open3d::core::InternalStdGPUHashBackend
stdgpu::unordered_map< Key, buf_index_t, Hash, Eq, InternalStdGPUHashBackendAllocator< Key > > InternalStdGPUHashBackend
Definition StdGPUHashBackend.h:102

open3d::core::STDGPUInsertKernel
__global__ void STDGPUInsertKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, const void *const *input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count, int64_t n_values)
Definition StdGPUHashBackend.h:305

open3d
Definition PinholeCameraIntrinsic.cpp:16

open3d::core::ValueExtractor
Definition StdGPUHashBackend.h:254

open3d::core::ValueExtractor::operator()
OPEN3D_HOST_DEVICE buf_index_t operator()(const stdgpu::pair< Key, buf_index_t > &x) const
Definition StdGPUHashBackend.h:256