Open3D (C++ API)  0.18.0+5c982c7
SlabNodeManager.h
// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

// Copyright 2019 Saman Ashkiani
// Rewritten by Wei Dong 2019 - 2020
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing permissions
// and limitations under the License.

#pragma once
#include <thrust/device_vector.h>

#include <memory>

#include "open3d/core/CUDAUtils.h"
#include "open3d/core/MemoryManager.h"
#include "open3d/core/hashmap/CUDA/SlabMacros.h"
#include "open3d/core/hashmap/HashBackendBuffer.h"
#include "open3d/utility/Random.h"

namespace open3d {
namespace core {

/// Internal hash table node: a slab stores 31 key-value pair pointers and one
/// pointer to the next slab.
class Slab {
public:
    /// Internal pointers to the key-value pairs; each can be converted to a
    /// real pointer by the buffer manager.
    buf_index_t kv_pair_ptrs[kWarpSize - 1];
    /// An internal ptr managed by InternalNodeManager.
    buf_index_t next_slab_ptr;
};

class SlabNodeManagerImpl {
public:
    SlabNodeManagerImpl()
        : super_blocks_(nullptr),
          hash_coef_(0),
          num_attempts_(0),
          memory_block_index_(0),
          super_block_index_(0) {}

    __device__ __forceinline__ uint32_t* get_unit_ptr_from_slab(
            const buf_index_t& next_slab_ptr, const uint32_t& lane_id) {
        return super_blocks_ + addressDecoder(next_slab_ptr) + lane_id;
    }
    __device__ __forceinline__ uint32_t* get_ptr_for_bitmap(
            const uint32_t super_block_idx, const uint32_t bitmap_idx) {
        return super_blocks_ + super_block_idx * kUIntsPerSuperBlock +
               bitmap_idx;
    }

    // Objective: each warp selects its own memory block (its warp allocator).
    __device__ void Init(uint32_t& tid, uint32_t& lane_id) {
        // Hash the warp id to the memory block to be used.
        createMemBlockIndex(tid >> 5);

        // Load the bitmap of the assigned memory block.
        memory_block_bitmap_ =
                super_blocks_[super_block_index_ * kUIntsPerSuperBlock +
                              memory_block_index_ * kSlabsPerBlock + lane_id];
    }

    __device__ uint32_t WarpAllocate(const uint32_t& lane_id) {
        // Try to allocate a new memory unit within the current memory block.
        // If the attempt returns kNotFoundFlag (0xFFFFFFFF), no empty memory
        // unit was found; a new memory block is chosen and the attempt is
        // repeated.
        // Layout of the allocated result:
        //   5 bits: super block index
        //  17 bits: memory block index
        //   5 bits: memory unit index (hi bits of the 10-bit index)
        //   5 bits: memory unit index (lo bits of the 10-bit index)
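        // Worked illustration (an assumed example value; the bit positions
        // follow from the masks in getSuperBlockIndex / getMemBlockIndex /
        // getMemUnitIndex below): allocated_result = 0x08000423 decodes to
        //   super block index  = 0x08000423 >> 27             = 1
        //   memory block index = (0x08000423 >> 10) & 0x1FFFF = 1
        //   memory unit index  = 0x08000423 & 0x3FF           = 35
        //                        (lane 35 >> 5 = 1, bit 35 & 0x1F = 3)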
        int empty_lane = -1;
        uint32_t free_lane;
        uint32_t read_bitmap = memory_block_bitmap_;
        uint32_t allocated_result = kNotFoundFlag;
        // This works as long as fewer than 31 bits are used in
        // allocated_result, i.e., 32 super blocks and at most 64k blocks per
        // super block.

        while (allocated_result == kNotFoundFlag) {
            empty_lane = __ffs(~memory_block_bitmap_) - 1;
            free_lane = __ballot_sync(kSyncLanesMask, empty_lane >= 0);
            if (free_lane == 0) {
                // All bitmaps are full: need to be rehashed again.
                updateMemBlockIndex((threadIdx.x + blockIdx.x * blockDim.x) >>
                                    5);
                read_bitmap = memory_block_bitmap_;
                continue;
            }
            uint32_t src_lane = __ffs(free_lane) - 1;
            if (src_lane == lane_id) {
                read_bitmap = atomicCAS(
                        super_blocks_ +
                                super_block_index_ * kUIntsPerSuperBlock +
                                memory_block_index_ * kSlabsPerBlock + lane_id,
                        memory_block_bitmap_,
                        memory_block_bitmap_ | (1 << empty_lane));
                if (read_bitmap == memory_block_bitmap_) {
                    // Successful attempt.
                    memory_block_bitmap_ |= (1 << empty_lane);
                    allocated_result =
                            (super_block_index_ << kSuperBlockMaskBits) |
                            (memory_block_index_ << kBlockMaskBits) |
                            (lane_id << kSlabMaskBits) | empty_lane;
                } else {
                    // Not successful: updating the current bitmap.
                    memory_block_bitmap_ = read_bitmap;
                }
            }
            // Asking for the allocated result.
            allocated_result =
                    __shfl_sync(kSyncLanesMask, allocated_result, src_lane);
        }
        return allocated_result;
    }

    // Frees a recently allocated memory unit; called by a single thread.
    // Since the unit is untouched, its contents do not need to be reset.
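    // Continuing the assumed example above: freeing ptr = 0x08000423 clears
    // bit 3 of the bitmap word for lane 1 in (super block 1, memory block 1).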
    __device__ void FreeUntouched(buf_index_t ptr) {
        atomicAnd(super_blocks_ +
                          getSuperBlockIndex(ptr) * kUIntsPerSuperBlock +
                          getMemBlockIndex(ptr) * kSlabsPerBlock +
                          (getMemUnitIndex(ptr) >> 5),
                  ~(1 << (getMemUnitIndex(ptr) & 0x1F)));
    }

private:
    __device__ __host__ __forceinline__ uint32_t
    getSuperBlockIndex(buf_index_t address) const {
        return address >> kSuperBlockMaskBits;
    }
    __device__ __host__ __forceinline__ uint32_t
    getMemBlockIndex(buf_index_t address) const {
        return ((address >> kBlockMaskBits) & 0x1FFFF);
    }
    __device__ __host__ __forceinline__ buf_index_t
    getMemBlockAddress(buf_index_t address) const {
        return (kBitmapsPerSuperBlock +
                getMemBlockIndex(address) * kUIntsPerBlock);
    }
    __device__ __host__ __forceinline__ uint32_t
    getMemUnitIndex(buf_index_t address) const {
        return address & 0x3FF;
    }
    __device__ __host__ __forceinline__ buf_index_t
    getMemUnitAddress(buf_index_t address) {
        return getMemUnitIndex(address) * kWarpSize;
    }

    // Called at the beginning of the kernel.
    __device__ void createMemBlockIndex(uint32_t global_warp_id) {
        super_block_index_ = global_warp_id % kSuperBlocks;
        memory_block_index_ = (hash_coef_ * global_warp_id) >>
                              (32 - kBlocksPerSuperBlockInBits);
    }
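    // Note (illustration): the multiply-shift above keeps only the top
    // kBlocksPerSuperBlockInBits bits of the 32-bit product, so each warp is
    // mapped to a pseudo-random memory block index in
    // [0, 2^kBlocksPerSuperBlockInBits), randomized by hash_coef_.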

    // Called when the allocator fails to find an empty unit to allocate.
    __device__ void updateMemBlockIndex(uint32_t global_warp_id) {
        num_attempts_++;
        super_block_index_++;
        super_block_index_ =
                (super_block_index_ == kSuperBlocks) ? 0 : super_block_index_;
        memory_block_index_ =
                (hash_coef_ * (global_warp_id + num_attempts_)) >>
                (32 - kBlocksPerSuperBlockInBits);
        // Loading the assigned memory block.
        memory_block_bitmap_ =
                *((super_blocks_ + super_block_index_ * kUIntsPerSuperBlock) +
                  memory_block_index_ * kSlabsPerBlock + (threadIdx.x & 0x1F));
    }

    __host__ __device__ buf_index_t
    addressDecoder(buf_index_t address_ptr_index) {
        return getSuperBlockIndex(address_ptr_index) * kUIntsPerSuperBlock +
               getMemBlockAddress(address_ptr_index) +
               getMemUnitIndex(address_ptr_index) * kWarpSize;
    }

    __host__ __device__ void print_address(buf_index_t address_ptr_index) {
        printf("Super block Index: %d, Memory block index: %d, Memory unit "
               "index: %d\n",
               getSuperBlockIndex(address_ptr_index),
               getMemBlockIndex(address_ptr_index),
               getMemUnitIndex(address_ptr_index));
    }

public:
    /// A pointer to each super-block.
    uint32_t* super_blocks_;
    /// hash_coef (register): used as (16 bits, 16 bits) for hashing.
    uint32_t hash_coef_;  // A random 32-bit coefficient.

private:
    uint32_t num_attempts_;
    uint32_t memory_block_index_;
    uint32_t memory_block_bitmap_;
    uint32_t super_block_index_;
};

__global__ void CountSlabsPerSuperblockKernel(SlabNodeManagerImpl impl,
                                              uint32_t* slabs_per_superblock);

class SlabNodeManager {
public:
    SlabNodeManager(const Device& device) : device_(device) {
        /// Random coefficient for the allocator's hash function.
        impl_.hash_coef_ = utility::random::RandUint32();

        /// All super blocks are stored in a single array.
        impl_.super_blocks_ = static_cast<uint32_t*>(MemoryManager::Malloc(
                kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t),
                device_));
        Reset();
    }

    ~SlabNodeManager() { MemoryManager::Free(impl_.super_blocks_, device_); }

    void Reset() {
        OPEN3D_CUDA_CHECK(cudaMemset(
                impl_.super_blocks_, 0xFF,
                kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t)));

        for (uint32_t i = 0; i < kSuperBlocks; i++) {
            // Set the bitmaps to zero (all slabs free).
            OPEN3D_CUDA_CHECK(cudaMemset(
                    impl_.super_blocks_ + i * kUIntsPerSuperBlock, 0x00,
                    kBlocksPerSuperBlock * kSlabsPerBlock * sizeof(uint32_t)));
        }
        cuda::Synchronize();
        OPEN3D_CUDA_CHECK(cudaGetLastError());
    }

    std::vector<int> CountSlabsPerSuperblock() {
        const uint32_t num_super_blocks = kSuperBlocks;

        thrust::device_vector<uint32_t> slabs_per_superblock(kSuperBlocks);
        thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(),
                     0);

        // Count the total number of allocated memory units.
        int num_mem_units = kBlocksPerSuperBlock * 32;
        int num_cuda_blocks =
                (num_mem_units + kThreadsPerBlock - 1) / kThreadsPerBlock;
        CountSlabsPerSuperblockKernel<<<num_cuda_blocks, kThreadsPerBlock, 0,
                                        core::cuda::GetStream()>>>(
                impl_, thrust::raw_pointer_cast(slabs_per_superblock.data()));
        cuda::Synchronize();
        OPEN3D_CUDA_CHECK(cudaGetLastError());

        std::vector<int> result(num_super_blocks);
        thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
                     result.begin());

        return result;
    }

public:
    SlabNodeManagerImpl impl_;
    Device device_;
};
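
// Minimal host-side usage sketch (illustration only, not part of the API
// surface; assumes a CUDA-enabled build and an available "CUDA:0" device):
//
//   SlabNodeManager node_manager(Device("CUDA:0"));
//   // node_manager.impl_ is passed by value to device kernels, which call
//   // Init(), WarpAllocate() and FreeUntouched() per warp.
//   std::vector<int> slab_counts = node_manager.CountSlabsPerSuperblock();
//   node_manager.Reset();  // Re-initialize all bitmaps to the empty state.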

}  // namespace core
}  // namespace open3d