0.13.0/cpp_api/_slab_node_manager_8h_source.html

 // ----------------------------------------------------------------------------
 // -                        Open3D: www.open3d.org                            -
 // ----------------------------------------------------------------------------
 // The MIT License (MIT)
 //
 // Copyright (c) 2018 www.open3d.org
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 // IN THE SOFTWARE.
 // ----------------------------------------------------------------------------

 // Copyright 2019 Saman Ashkiani
 // Rewritten by Wei Dong 2019 - 2020
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 // implied. See the License for the specific language governing permissions
 // and limitations under the License.

 #pragma once

 #include <thrust/device_vector.h>

 #include <cassert>
 #include <memory>
 #include <random>

 #include "open3d/core/CUDAUtils.h"
 #include "open3d/core/MemoryManager.h"
 #include "open3d/core/hashmap/CUDA/SlabMacros.h"
 #include "open3d/core/hashmap/HashmapBuffer.h"

 namespace open3d {
 namespace core {

 /// One slab record: kWarpSize addr_t entries in total, i.e. kWarpSize - 1
 /// entries in kv_pair_ptrs followed by a single next_slab_ptr.
 /// NOTE(review): the member order presumably matches a one-entry-per-lane
 /// layout used by the warp-level hashmap code -- confirm before reordering.
 class Slab {
 public:
     /// Per-slot entries; presumably addresses of key-value pairs (inferred
     /// from the name -- verify against the hashmap implementation).
     addr_t kv_pair_ptrs[kWarpSize - 1];
     /// An internal ptr managed by InternalNodeManager.
     addr_t next_slab_ptr;
 };

 /// Device-side state of the slab allocator.
 ///
 /// The object holds a raw pointer to the super-block storage plus a small
 /// amount of per-thread bookkeeping (which super block / memory block the
 /// calling thread currently allocates from, and a cached copy of one bitmap
 /// word of that memory block). It does not own super_blocks_.
 ///
 /// Encoded address layout, as implied by the masks used in the getters below:
 ///   - super block index : bits above kSuperBlockMaskBits
 ///   - memory block index: 17 bits (mask 0x1FFFF) above kBlockMaskBits
 ///   - memory unit index : low 10 bits (mask 0x3FF); the upper 5 of those
 ///     select the bitmap word, the lower 5 select the bit inside that word.
 class SlabNodeManagerImpl {
 public:
     /// Creates an empty manager. Every member is given a defined value so
     /// that copying the object (e.g. as a kernel argument) never reads an
     /// uninitialized field.
     SlabNodeManagerImpl()
         : super_blocks_(nullptr),
           hash_coef_(0),
           num_attempts_(0),
           memory_block_index_(0),
           memory_block_bitmap_(0),
           super_block_index_(0) {}

     /// Returns a pointer to the \p lane_id -th word of the memory unit that
     /// the encoded address \p next_slab_ptr refers to.
     __device__ __forceinline__ uint32_t* get_unit_ptr_from_slab(
             const addr_t& next_slab_ptr, const uint32_t& lane_id) {
         return super_blocks_ + addressDecoder(next_slab_ptr) + lane_id;
     }

     /// Returns a pointer to bitmap word \p bitmap_idx of super block
     /// \p super_block_idx (bitmap words are addressed from the start of the
     /// super block).
     __device__ __forceinline__ uint32_t* get_ptr_for_bitmap(
             const uint32_t super_block_idx, const uint32_t bitmap_idx) {
         return super_blocks_ + super_block_idx * kUIntsPerSuperBlock +
                bitmap_idx;
     }

     /// Selects the memory block this thread's warp allocates from and caches
     /// this lane's bitmap word of that block.
     ///
     /// \param tid Global thread id; `tid >> 5` is used as the warp id.
     /// \param lane_id Index of the bitmap word (within the memory block) that
     /// this thread caches.
     __device__ void Init(uint32_t& tid, uint32_t& lane_id) {
         // Hash the warp id to pick a (super block, memory block) pair.
         createMemBlockIndex(tid >> 5);

         // Cache this lane's bitmap word of the selected memory block.
         memory_block_bitmap_ =
                 super_blocks_[super_block_index_ * kUIntsPerSuperBlock +
                               memory_block_index_ * kSlabsPerBlock + lane_id];
     }

     /// Allocates one memory unit for the calling warp and returns its encoded
     /// address (same value on every participating lane).
     ///
     /// Each lane looks for a clear bit in its cached bitmap word. The lowest
     /// lane that has one tries to claim it with atomicCAS; the outcome is
     /// then broadcast from that lane with __shfl_sync. If no lane has a free
     /// bit, a different memory block is selected and the search repeats.
     ///
     /// The loop has no retry limit: it only exits once a unit was claimed.
     /// NOTE(review): uses __ballot_sync/__shfl_sync with kSyncLanesMask, so
     /// all lanes named by that mask must call this together -- confirm the
     /// mask value in SlabMacros.h.
     __device__ uint32_t WarpAllocate(const uint32_t& lane_id) {
         uint32_t allocated_result = kNotFoundFlag;

         while (allocated_result == kNotFoundFlag) {
             // Index of the first clear bit in this lane's cached bitmap
             // word, or -1 when the word is completely set.
             const int empty_lane = __ffs(~memory_block_bitmap_) - 1;
             const uint32_t free_lane =
                     __ballot_sync(kSyncLanesMask, empty_lane >= 0);
             if (free_lane == 0) {
                 // Every lane's bitmap word is full: move to another block.
                 updateMemBlockIndex((threadIdx.x + blockIdx.x * blockDim.x) >>
                                     5);
                 continue;
             }

             // Lowest lane that reported a free bit performs the attempt.
             const uint32_t src_lane = __ffs(free_lane) - 1;
             if (src_lane == lane_id) {
                 // Unsigned literal: empty_lane may be 31, and shifting a
                 // signed 1 into the sign bit is not portable.
                 const uint32_t unit_bit = 1u << empty_lane;
                 const uint32_t read_bitmap = atomicCAS(
                         super_blocks_ +
                                 super_block_index_ * kUIntsPerSuperBlock +
                                 memory_block_index_ * kSlabsPerBlock + lane_id,
                         memory_block_bitmap_,
                         memory_block_bitmap_ | unit_bit);
                 if (read_bitmap == memory_block_bitmap_) {
                     // The bit was claimed: update the cache and encode the
                     // address of the claimed unit.
                     memory_block_bitmap_ |= unit_bit;
                     allocated_result =
                             (super_block_index_ << kSuperBlockMaskBits) |
                             (memory_block_index_ << kBlockMaskBits) |
                             (lane_id << kSlabMaskBits) |
                             static_cast<uint32_t>(empty_lane);
                 } else {
                     // The word changed underneath us: refresh the cache and
                     // let the loop retry.
                     memory_block_bitmap_ = read_bitmap;
                 }
             }

             // Every lane picks up the result (possibly still kNotFoundFlag)
             // from the lane that made the attempt.
             allocated_result =
                     __shfl_sync(kSyncLanesMask, allocated_result, src_lane);
         }
         return allocated_result;
     }

     /// Releases the memory unit at encoded address \p ptr by clearing its
     /// bit in the corresponding bitmap word. Only the bitmap is modified;
     /// the unit's contents are left as they are, which is why this is meant
     /// for units that were allocated but never written.
     __device__ void FreeUntouched(addr_t ptr) {
         const uint32_t unit_index = getMemUnitIndex(ptr);
         atomicAnd(super_blocks_ +
                           getSuperBlockIndex(ptr) * kUIntsPerSuperBlock +
                           getMemBlockIndex(ptr) * kSlabsPerBlock +
                           (unit_index >> 5),
                   ~(1u << (unit_index & 0x1F)));
     }

 private:
     /// Extracts the super block index from an encoded address.
     __device__ __host__ __forceinline__ uint32_t
     getSuperBlockIndex(addr_t address) const {
         return address >> kSuperBlockMaskBits;
     }

     /// Extracts the 17-bit memory block index from an encoded address.
     __device__ __host__ __forceinline__ uint32_t
     getMemBlockIndex(addr_t address) const {
         return ((address >> kBlockMaskBits) & 0x1FFFF);
     }

     /// Offset (in uint32_t words, relative to the start of the super block)
     /// of the memory block named by \p address; memory blocks start after
     /// the kBitmapsPerSuperBlock leading words.
     __device__ __host__ __forceinline__ addr_t
     getMemBlockAddress(addr_t address) const {
         return (kBitmapsPerSuperBlock +
                 getMemBlockIndex(address) * kUIntsPerBlock);
     }

     /// Extracts the 10-bit memory unit index from an encoded address.
     __device__ __host__ __forceinline__ uint32_t
     getMemUnitIndex(addr_t address) const {
         return address & 0x3FF;
     }

     /// Offset (in uint32_t words, relative to the start of its memory block)
     /// of the memory unit named by \p address.
     __device__ __host__ __forceinline__ addr_t
     getMemUnitAddress(addr_t address) const {
         return getMemUnitIndex(address) * kWarpSize;
     }

     /// Picks the initial (super block, memory block) pair for a warp.
     /// Called at the beginning of the kernel (via Init).
     __device__ void createMemBlockIndex(uint32_t global_warp_id) {
         super_block_index_ = global_warp_id % kSuperBlocks;
         memory_block_index_ = (hash_coef_ * global_warp_id) >>
                               (32 - kBlocksPerSuperBlockInBits);
     }

     /// Moves to the next super block (wrapping at kSuperBlocks), re-hashes
     /// the memory block index and reloads this thread's cached bitmap word.
     /// Called when the allocator fails to find an empty unit.
     __device__ void updateMemBlockIndex(uint32_t global_warp_id) {
         num_attempts_++;
         super_block_index_++;
         super_block_index_ =
                 (super_block_index_ == kSuperBlocks) ? 0 : super_block_index_;
         memory_block_index_ = (hash_coef_ * (global_warp_id + num_attempts_)) >>
                               (32 - kBlocksPerSuperBlockInBits);
         // Reload the bitmap word for the newly selected memory block; the
         // word index is the thread's position within its warp.
         memory_block_bitmap_ =
                 *((super_blocks_ + super_block_index_ * kUIntsPerSuperBlock) +
                   memory_block_index_ * kSlabsPerBlock + (threadIdx.x & 0x1f));
     }

     /// Converts an encoded address into an offset (in uint32_t words) from
     /// super_blocks_.
     __host__ __device__ addr_t addressDecoder(addr_t address_ptr_index) const {
         return getSuperBlockIndex(address_ptr_index) * kUIntsPerSuperBlock +
                getMemBlockAddress(address_ptr_index) +
                getMemUnitAddress(address_ptr_index);
     }

     /// Debug helper: prints the three components of an encoded address.
     __host__ __device__ void print_address(addr_t address_ptr_index) const {
         // The components are uint32_t, hence %u.
         printf("Super block Index: %u, Memory block index: %u, Memory unit "
                "index: "
                "%u\n",
                getSuperBlockIndex(address_ptr_index),
                getMemBlockIndex(address_ptr_index),
                getMemUnitIndex(address_ptr_index));
     }

 public:
     /// A pointer to each super-block.
     uint32_t* super_blocks_;
     /// hash_coef (register): used as (16 bits, 16 bits) for hashing.
     uint32_t hash_coef_;  // A random 32-bit.

 private:
     /// Number of times this thread had to switch memory blocks.
     uint32_t num_attempts_;
     /// Memory block currently used for allocation.
     uint32_t memory_block_index_;
     /// Cached copy of this thread's bitmap word of the current memory block.
     uint32_t memory_block_bitmap_;
     /// Super block currently used for allocation.
     uint32_t super_block_index_;
 };

 /// Kernel launched by SlabNodeManager::CountSlabsPerSuperblock(). Only the
 /// declaration is visible in this header.
 ///
 /// \param impl Allocator state, passed by value.
 /// \param slabs_per_superblock Output buffer. The caller in this header
 /// passes a device buffer of kSuperBlocks zero-filled uint32_t counters.
 /// NOTE(review): presumably one counter per super block is accumulated --
 /// confirm against the kernel definition.
 __global__ void CountSlabsPerSuperblockKernel(SlabNodeManagerImpl impl,
                                               uint32_t* slabs_per_superblock);

 /// Host-side owner of the slab allocator storage.
 ///
 /// Allocates the super-block buffer on construction, releases it on
 /// destruction, and exposes the device-side state through impl_.
 /// The class owns impl_.super_blocks_, so it is not copyable: a copy would
 /// free the same buffer twice.
 class SlabNodeManager {
 public:
     /// Allocates kUIntsPerSuperBlock * kSuperBlocks uint32_t words on
     /// \p device, picks a random hash coefficient and resets the storage.
     SlabNodeManager(const Device& device) : device_(device) {
         // Random coefficient for the memory block hash.
         std::mt19937 rng(time(0));
         impl_.hash_coef_ = rng();

         impl_.super_blocks_ = static_cast<uint32_t*>(MemoryManager::Malloc(
                 kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t),
                 device_));
         Reset();
     }

     // The destructor frees impl_.super_blocks_; copying would double-free.
     SlabNodeManager(const SlabNodeManager&) = delete;
     SlabNodeManager& operator=(const SlabNodeManager&) = delete;

     ~SlabNodeManager() { MemoryManager::Free(impl_.super_blocks_, device_); }

     /// Puts the storage back into its initial state: every word is set to
     /// 0xFFFFFFFF, then the leading kBlocksPerSuperBlock * kSlabsPerBlock
     /// words (the bitmaps) of each super block are cleared to zero.
     void Reset() {
         const size_t total_bytes =
                 kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t);
         const size_t bitmap_bytes =
                 kBlocksPerSuperBlock * kSlabsPerBlock * sizeof(uint32_t);

         OPEN3D_CUDA_CHECK(cudaMemset(impl_.super_blocks_, 0xFF, total_bytes));

         for (uint32_t i = 0; i < kSuperBlocks; i++) {
             // Clear the bitmaps of super block i: all units are free.
             OPEN3D_CUDA_CHECK(cudaMemset(
                     impl_.super_blocks_ + i * kUIntsPerSuperBlock, 0x00,
                     bitmap_bytes));
         }
         OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
         OPEN3D_CUDA_CHECK(cudaGetLastError());
     }

     /// Runs CountSlabsPerSuperblockKernel and returns its per-super-block
     /// counters as a host vector with kSuperBlocks entries.
     std::vector<int> CountSlabsPerSuperblock() {
         // One zero-initialized counter per super block.
         thrust::device_vector<uint32_t> slabs_per_superblock(kSuperBlocks, 0);

         // One thread per bitmap word of a super block.
         int num_mem_units = kBlocksPerSuperBlock * 32;
         int num_cuda_blocks =
                 (num_mem_units + kThreadsPerBlock - 1) / kThreadsPerBlock;
         CountSlabsPerSuperblockKernel<<<num_cuda_blocks, kThreadsPerBlock>>>(
                 impl_, thrust::raw_pointer_cast(slabs_per_superblock.data()));
         // Launch-configuration errors show up here; execution errors are
         // reported by the synchronize that follows.
         OPEN3D_CUDA_CHECK(cudaGetLastError());
         OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());

         std::vector<int> result(kSuperBlocks);
         thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
                      result.begin());

         return result;
     }

 public:
     /// Device-side allocator state; passed by value to kernels.
     SlabNodeManagerImpl impl_;
     /// Device on which impl_.super_blocks_ was allocated.
     Device device_;
 };
 }  // namespace core
 }  // namespace open3d
open3d::core::SlabNodeManagerImpl::hash_coef_
uint32_t hash_coef_
hash_coef (register): used as (16 bits, 16 bits) for hashing.
Definition: SlabNodeManager.h:225

open3d::io::k4a_plugin::uint32_t
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:557

open3d::core::MemoryManager::Free
static void Free(void *ptr, const Device &device)
Definition: MemoryManager.cpp:44

open3d::core::SlabNodeManager::device_
Device device_
Definition: SlabNodeManager.h:295

OPEN3D_CUDA_CHECK
#define OPEN3D_CUDA_CHECK(err)
Definition: CUDAUtils.h:59

open3d::core::SlabNodeManager
Definition: SlabNodeManager.h:238

open3d::core::MemoryManager::Malloc
static void * Malloc(size_t byte_size, const Device &device)
Definition: MemoryManager.cpp:40

open3d::core::SlabNodeManagerImpl::get_unit_ptr_from_slab
__device__ __forceinline__ uint32_t * get_unit_ptr_from_slab(const addr_t &next_slab_ptr, const uint32_t &lane_id)
Definition: SlabNodeManager.h:77

open3d::core::CountSlabsPerSuperblockKernel
__global__ void CountSlabsPerSuperblockKernel(SlabNodeManagerImpl impl, uint32_t *slabs_per_superblock)

open3d::core::Slab::next_slab_ptr
addr_t next_slab_ptr
An internal ptr managed by InternalNodeManager.
Definition: SlabNodeManager.h:65

open3d::core::SlabNodeManagerImpl::Init
__device__ void Init(uint32_t &tid, uint32_t &lane_id)
Definition: SlabNodeManager.h:88

open3d::core::SlabNodeManager::SlabNodeManager
SlabNodeManager(const Device &device)
Definition: SlabNodeManager.h:240

open3d::core::SlabNodeManagerImpl::get_ptr_for_bitmap
__device__ __forceinline__ uint32_t * get_ptr_for_bitmap(const uint32_t super_block_idx, const uint32_t bitmap_idx)
Definition: SlabNodeManager.h:81

open3d::core::Device
Definition: Device.h:39

open3d::core::SlabNodeManagerImpl::super_blocks_
uint32_t * super_blocks_
A pointer to each super-block.
Definition: SlabNodeManager.h:223

MemoryManager.h

open3d::core::SlabNodeManagerImpl
Definition: SlabNodeManager.h:68

open3d
Definition: PinholeCameraIntrinsic.cpp:35

open3d::core::addr_t
uint32_t addr_t
Definition: HashmapBuffer.h:58

open3d::core::SlabNodeManagerImpl::SlabNodeManagerImpl
SlabNodeManagerImpl()
Definition: SlabNodeManager.h:70

open3d::core::SlabNodeManager::CountSlabsPerSuperblock
std::vector< int > CountSlabsPerSuperblock()
Definition: SlabNodeManager.h:270

HashmapBuffer.h

open3d::core::SlabNodeManager::impl_
SlabNodeManagerImpl impl_
Definition: SlabNodeManager.h:294

open3d::core::SlabNodeManagerImpl::FreeUntouched
__device__ void FreeUntouched(addr_t ptr)
Definition: SlabNodeManager.h:154

open3d::core::Slab
Definition: SlabNodeManager.h:59

CUDAUtils.h
Common CUDA utilities.

open3d::core::SlabNodeManagerImpl::WarpAllocate
__device__ uint32_t WarpAllocate(const uint32_t &lane_id)
Definition: SlabNodeManager.h:98

open3d::core::Slab::kv_pair_ptrs
addr_t kv_pair_ptrs[kWarpSize - 1]
Definition: SlabNodeManager.h:63

SlabMacros.h

open3d::core::SlabNodeManager::~SlabNodeManager
~SlabNodeManager()
Definition: SlabNodeManager.h:253

open3d::core::SlabNodeManager::Reset
void Reset()
Definition: SlabNodeManager.h:255