0.13.0/cpp_api/kernel_2_fill_in_linear_system_impl_8h_source.html

 // ----------------------------------------------------------------------------
 // -                        Open3D: www.open3d.org                            -
 // ----------------------------------------------------------------------------
 // The MIT License (MIT)
 //
 // Copyright (c) 2018 www.open3d.org
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 // IN THE SOFTWARE.
 // ----------------------------------------------------------------------------

 #include "open3d/t/geometry/kernel/GeometryIndexer.h"
 #include "open3d/t/pipelines/kernel/FillInLinearSystem.h"

 #if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
 #include "open3d/t/pipelines/kernel/SVD3x3CUDA.cuh"
 #else
 #include "open3d/t/pipelines/kernel/SVD3x3CPU.h"
 #endif

 namespace open3d {
 namespace t {
 namespace pipelines {
 namespace kernel {
 #if defined(__CUDACC__)
 void FillInRigidAlignmentTermCUDA
 #else
 void FillInRigidAlignmentTermCPU
 #endif
         (core::Tensor &AtA,
          core::Tensor &Atb,
          core::Tensor &residual,
          const core::Tensor &Ti_ps,
          const core::Tensor &Tj_qs,
          const core::Tensor &Ri_normal_ps,
          int i,
          int j,
          float threshold) {

     core::Device device = AtA.GetDevice();
     int64_t n = Ti_ps.GetLength();
     if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) {
         utility::LogError(
                 "Unable to setup linear system: input length mismatch.");
     }

     // First fill in a small 12 x 12 linear system
     core::Tensor AtA_local =
             core::Tensor::Zeros({12, 12}, core::Dtype::Float32, device);
     core::Tensor Atb_local =
             core::Tensor::Zeros({12}, core::Dtype::Float32, device);

     float *AtA_local_ptr = static_cast<float *>(AtA_local.GetDataPtr());
     float *Atb_local_ptr = static_cast<float *>(Atb_local.GetDataPtr());
     float *residual_ptr = static_cast<float *>(residual.GetDataPtr());

     const float *Ti_ps_ptr = static_cast<const float *>(Ti_ps.GetDataPtr());
     const float *Tj_qs_ptr = static_cast<const float *>(Tj_qs.GetDataPtr());
     const float *Ri_normal_ps_ptr =
             static_cast<const float *>(Ri_normal_ps.GetDataPtr());

 #if defined(__CUDACC__)
     core::kernel::CUDALauncher launcher;
 #else
     core::kernel::CPULauncher launcher;
 #endif
     launcher.LaunchGeneralKernel(n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
         const float *p_prime = Ti_ps_ptr + 3 * workload_idx;
         const float *q_prime = Tj_qs_ptr + 3 * workload_idx;
         const float *normal_p_prime = Ri_normal_ps_ptr + 3 * workload_idx;

         float r = (p_prime[0] - q_prime[0]) * normal_p_prime[0] +
                   (p_prime[1] - q_prime[1]) * normal_p_prime[1] +
                   (p_prime[2] - q_prime[2]) * normal_p_prime[2];
         if (abs(r) > threshold) return;

         float J_ij[12];
         J_ij[0] = -q_prime[2] * normal_p_prime[1] +
                   q_prime[1] * normal_p_prime[2];
         J_ij[1] =
                 q_prime[2] * normal_p_prime[0] - q_prime[0] * normal_p_prime[2];
         J_ij[2] = -q_prime[1] * normal_p_prime[0] +
                   q_prime[0] * normal_p_prime[1];
         J_ij[3] = normal_p_prime[0];
         J_ij[4] = normal_p_prime[1];
         J_ij[5] = normal_p_prime[2];
         for (int k = 0; k < 6; ++k) {
             J_ij[k + 6] = -J_ij[k];
         }

         // Not optimized; Switch to reduction if necessary.
 #if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
         for (int i_local = 0; i_local < 12; ++i_local) {
             for (int j_local = 0; j_local < 12; ++j_local) {
                 atomicAdd(&AtA_local_ptr[i_local * 12 + j_local],
                           J_ij[i_local] * J_ij[j_local]);
             }
             atomicAdd(&Atb_local_ptr[i_local], J_ij[i_local] * r);
         }
         atomicAdd(residual_ptr, r * r);
 #else
 #pragma omp critical
         {
             for (int i_local = 0; i_local < 12; ++i_local) {
                 for (int j_local = 0; j_local < 12; ++j_local) {
                     AtA_local_ptr[i_local * 12 + j_local]
                       += J_ij[i_local] * J_ij[j_local];
                  }
                  Atb_local_ptr[i_local] += J_ij[i_local] * r;
             }
             *residual_ptr += r * r;
         }
 #endif
     });

     // Then fill-in the large linear system
     std::vector<int64_t> indices_vec(12);
     for (int k = 0; k < 6; ++k) {
         indices_vec[k] = i * 6 + k;
         indices_vec[k + 6] = j * 6 + k;
     }

     std::vector<int64_t> indices_i_vec;
     std::vector<int64_t> indices_j_vec;
     for (int local_i = 0; local_i < 12; ++local_i) {
         for (int local_j = 0; local_j < 12; ++local_j) {
             indices_i_vec.push_back(indices_vec[local_i]);
             indices_j_vec.push_back(indices_vec[local_j]);
         }
     }

     core::Tensor indices(indices_vec, {12}, core::Dtype::Int64, device);
     core::Tensor indices_i(indices_i_vec, {12 * 12}, core::Dtype::Int64,
                            device);
     core::Tensor indices_j(indices_j_vec, {12 * 12}, core::Dtype::Int64,
                            device);

     core::Tensor AtA_sub = AtA.IndexGet({indices_i, indices_j});
     AtA.IndexSet({indices_i, indices_j}, AtA_sub + AtA_local.View({12 * 12}));

     core::Tensor Atb_sub = Atb.IndexGet({indices});
     Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1}));
 }

 #if defined(__CUDACC__)
 void FillInSLACAlignmentTermCUDA
 #else
 void FillInSLACAlignmentTermCPU
 #endif
         (core::Tensor &AtA,
          core::Tensor &Atb,
          core::Tensor &residual,
          const core::Tensor &Ti_Cps,
          const core::Tensor &Tj_Cqs,
          const core::Tensor &Cnormal_ps,
          const core::Tensor &Ri_Cnormal_ps,
          const core::Tensor &RjT_Ri_Cnormal_ps,
          const core::Tensor &cgrid_idx_ps,
          const core::Tensor &cgrid_idx_qs,
          const core::Tensor &cgrid_ratio_qs,
          const core::Tensor &cgrid_ratio_ps,
          int i,
          int j,
          int n_frags,
          float threshold) {
     int64_t n = Ti_Cps.GetLength();
     if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n ||
         Ri_Cnormal_ps.GetLength() != n || RjT_Ri_Cnormal_ps.GetLength() != n ||
         cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n ||
         cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) {
         utility::LogError(
                 "Unable to setup linear system: input length mismatch.");
     }

     int n_vars = Atb.GetLength();
     float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
     float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
     float *residual_ptr = static_cast<float *>(residual.GetDataPtr());

     // Geometric properties
     const float *Ti_Cps_ptr = static_cast<const float *>(Ti_Cps.GetDataPtr());
     const float *Tj_Cqs_ptr = static_cast<const float *>(Tj_Cqs.GetDataPtr());
     const float *Cnormal_ps_ptr =
             static_cast<const float *>(Cnormal_ps.GetDataPtr());
     const float *Ri_Cnormal_ps_ptr =
             static_cast<const float *>(Ri_Cnormal_ps.GetDataPtr());
     const float *RjT_Ri_Cnormal_ps_ptr =
             static_cast<const float *>(RjT_Ri_Cnormal_ps.GetDataPtr());

     // Association properties
     const int *cgrid_idx_ps_ptr =
             static_cast<const int *>(cgrid_idx_ps.GetDataPtr());
     const int *cgrid_idx_qs_ptr =
             static_cast<const int *>(cgrid_idx_qs.GetDataPtr());
     const float *cgrid_ratio_ps_ptr =
             static_cast<const float *>(cgrid_ratio_ps.GetDataPtr());
     const float *cgrid_ratio_qs_ptr =
             static_cast<const float *>(cgrid_ratio_qs.GetDataPtr());

 #if defined(__CUDACC__)
     core::kernel::CUDALauncher launcher;
 #else
     core::kernel::CPULauncher launcher;
 #endif
     launcher.LaunchGeneralKernel(n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
         const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx;
         const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx;
         const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx;
         const float *Ri_Cnormal_p = Ri_Cnormal_ps_ptr + 3 * workload_idx;
         const float *RjTRi_Cnormal_p = RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx;

         const int *cgrid_idx_p = cgrid_idx_ps_ptr + 8 * workload_idx;
         const int *cgrid_idx_q = cgrid_idx_qs_ptr + 8 * workload_idx;
         const float *cgrid_ratio_p = cgrid_ratio_ps_ptr + 8 * workload_idx;
         const float *cgrid_ratio_q = cgrid_ratio_qs_ptr + 8 * workload_idx;

         float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] +
                   (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] +
                   (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2];
         if (abs(r) > threshold) return;

         // Now we fill in a 60 x 60 sub-matrix: 2 x (6 + 8 x 3)
         float J[60];
         int idx[60];

         // Jacobian w.r.t. Ti: 0-6
         J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + Tj_Cq[1] * Ri_Cnormal_p[2];
         J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - Tj_Cq[0] * Ri_Cnormal_p[2];
         J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + Tj_Cq[0] * Ri_Cnormal_p[1];
         J[3] = Ri_Cnormal_p[0];
         J[4] = Ri_Cnormal_p[1];
         J[5] = Ri_Cnormal_p[2];

         // Jacobian w.r.t. Tj: 6-12
         for (int k = 0; k < 6; ++k) {
             J[k + 6] = -J[k];

             idx[k + 0] = 6 * i + k;
             idx[k + 6] = 6 * j + k;
         }

         // Jacobian w.r.t. C over p: 12-36
         for (int k = 0; k < 8; ++k) {
             J[12 + k * 3 + 0] = cgrid_ratio_p[k] * Cnormal_p[0];
             J[12 + k * 3 + 1] = cgrid_ratio_p[k] * Cnormal_p[1];
             J[12 + k * 3 + 2] = cgrid_ratio_p[k] * Cnormal_p[2];

             idx[12 + k * 3 + 0] = 6 * n_frags + cgrid_idx_p[k] * 3 + 0;
             idx[12 + k * 3 + 1] = 6 * n_frags + cgrid_idx_p[k] * 3 + 1;
             idx[12 + k * 3 + 2] = 6 * n_frags + cgrid_idx_p[k] * 3 + 2;
         }

         // Jacobian w.r.t. C over q: 36-60
         for (int k = 0; k < 8; ++k) {
             J[36 + k * 3 + 0] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0];
             J[36 + k * 3 + 1] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1];
             J[36 + k * 3 + 2] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2];

             idx[36 + k * 3 + 0] = 6 * n_frags + cgrid_idx_q[k] * 3 + 0;
             idx[36 + k * 3 + 1] = 6 * n_frags + cgrid_idx_q[k] * 3 + 1;
             idx[36 + k * 3 + 2] = 6 * n_frags + cgrid_idx_q[k] * 3 + 2;
         }

         // Not optimized; Switch to reduction if necessary.
 #if defined(__CUDACC__)
         for (int ki = 0; ki < 60; ++ki) {
             for (int kj = 0; kj < 60; ++kj) {
                 float AtA_ij = J[ki] * J[kj];
                 int ij = idx[ki] * n_vars + idx[kj];
                 atomicAdd(AtA_ptr + ij, AtA_ij);
             }
             float Atb_i = J[ki] * r;
             atomicAdd(Atb_ptr + idx[ki], Atb_i);
         }
         atomicAdd(residual_ptr, r * r);
 #else
 #pragma omp critical
         {
             for (int ki = 0; ki < 60; ++ki) {
                 for (int kj = 0; kj < 60; ++kj) {
                     AtA_ptr[idx[ki] * n_vars + idx[kj]]
                       += J[ki] * J[kj];
                  }
                  Atb_ptr[idx[ki]] += J[ki] * r;
             }
             *residual_ptr += r * r;
         }
 #endif
     });
 }

 inline OPEN3D_HOST_DEVICE void matmul3x3_3x1(float m00,
                                              float m01,
                                              float m02,
                                              float m10,
                                              float m11,
                                              float m12,
                                              float m20,
                                              float m21,
                                              float m22,
                                              float v0,
                                              float v1,
                                              float v2,
                                              float &o0,
                                              float &o1,
                                              float &o2) {
     o0 = m00 * v0 + m01 * v1 + m02 * v2;
     o1 = m10 * v0 + m11 * v1 + m12 * v2;
     o2 = m20 * v0 + m21 * v1 + m22 * v2;
 }

 inline OPEN3D_HOST_DEVICE void matmul3x3_3x3(float a00,
                                              float a01,
                                              float a02,
                                              float a10,
                                              float a11,
                                              float a12,
                                              float a20,
                                              float a21,
                                              float a22,
                                              float b00,
                                              float b01,
                                              float b02,
                                              float b10,
                                              float b11,
                                              float b12,
                                              float b20,
                                              float b21,
                                              float b22,
                                              float &c00,
                                              float &c01,
                                              float &c02,
                                              float &c10,
                                              float &c11,
                                              float &c12,
                                              float &c20,
                                              float &c21,
                                              float &c22) {
     matmul3x3_3x1(a00, a01, a02, a10, a11, a12, a20, a21, a22, b00, b10, b20,
                   c00, c10, c20);
     matmul3x3_3x1(a00, a01, a02, a10, a11, a12, a20, a21, a22, b01, b11, b21,
                   c01, c11, c21);
     matmul3x3_3x1(a00, a01, a02, a10, a11, a12, a20, a21, a22, b02, b12, b22,
                   c02, c12, c22);
 }

 inline OPEN3D_HOST_DEVICE float det3x3(float m00,
                                        float m01,
                                        float m02,
                                        float m10,
                                        float m11,
                                        float m12,
                                        float m20,
                                        float m21,
                                        float m22) {
     return m00 * (m11 * m22 - m12 * m21) - m10 * (m01 * m22 - m02 - m21) +
            m20 * (m01 * m12 - m02 * m11);
 }

 #if defined(__CUDACC__)
 void FillInSLACRegularizerTermCUDA
 #else
 void FillInSLACRegularizerTermCPU
 #endif
         (core::Tensor &AtA,
          core::Tensor &Atb,
          core::Tensor &residual,
          const core::Tensor &grid_idx,
          const core::Tensor &grid_nbs_idx,
          const core::Tensor &grid_nbs_mask,
          const core::Tensor &positions_init,
          const core::Tensor &positions_curr,
          float weight,
          int n_frags,
          int anchor_idx) {

     int64_t n = grid_idx.GetLength();
     int64_t n_vars = Atb.GetLength();

     float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
     float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
     float *residual_ptr = static_cast<float *>(residual.GetDataPtr());

     const int *grid_idx_ptr = static_cast<const int *>(grid_idx.GetDataPtr());
     const int *grid_nbs_idx_ptr =
             static_cast<const int *>(grid_nbs_idx.GetDataPtr());
     const bool *grid_nbs_mask_ptr =
             static_cast<const bool *>(grid_nbs_mask.GetDataPtr());

     const float *positions_init_ptr =
             static_cast<const float *>(positions_init.GetDataPtr());
     const float *positions_curr_ptr =
             static_cast<const float *>(positions_curr.GetDataPtr());

 #if defined(__CUDACC__)
     core::kernel::CUDALauncher launcher;
 #else
     core::kernel::CPULauncher launcher;
 #endif
     launcher.LaunchGeneralKernel(n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
         // Enumerate 6 neighbors
         int idx_i = grid_idx_ptr[workload_idx];

         const int *idx_nbs = grid_nbs_idx_ptr + 6 * workload_idx;
         const bool *mask_nbs = grid_nbs_mask_ptr + 6 * workload_idx;

         // Build a 3x3 linear system to compute the local R
         float cov[3][3] = {{0}};
         float U[3][3], V[3][3], S[3];

         int cnt = 0;
         for (int k = 0; k < 6; ++k) {
             bool mask_k = mask_nbs[k];
             if (!mask_k) continue;

             int idx_k = idx_nbs[k];

             // Now build linear systems
             float diff_ik_init[3] = {positions_init_ptr[idx_i * 3 + 0] -
                                              positions_init_ptr[idx_k * 3 + 0],
                                      positions_init_ptr[idx_i * 3 + 1] -
                                              positions_init_ptr[idx_k * 3 + 1],
                                      positions_init_ptr[idx_i * 3 + 2] -
                                              positions_init_ptr[idx_k * 3 + 2]};
             float diff_ik_curr[3] = {positions_curr_ptr[idx_i * 3 + 0] -
                                              positions_curr_ptr[idx_k * 3 + 0],
                                      positions_curr_ptr[idx_i * 3 + 1] -
                                              positions_curr_ptr[idx_k * 3 + 1],
                                      positions_curr_ptr[idx_i * 3 + 2] -
                                              positions_curr_ptr[idx_k * 3 + 2]};

             // Build linear system by computing XY^T when formulating Y = RX
             // Y: curr
             // X: init
             for (int i = 0; i < 3; ++i) {
                 for (int j = 0; j < 3; ++j) {
                     cov[i][j] += diff_ik_init[i] * diff_ik_curr[j];
                 }
             }
             ++cnt;
         }

         if (cnt < 3) {
             return;
         }

         // clang-format off
         svd(cov[0][0], cov[0][1], cov[0][2],
             cov[1][0], cov[1][1], cov[1][2],
             cov[2][0], cov[2][1], cov[2][2],
             U[0][0], U[0][1], U[0][2],
             U[1][0], U[1][1], U[1][2],
             U[2][0], U[2][1], U[2][2],
             S[0], S[1], S[2],
             V[0][0], V[0][1], V[0][2],
             V[1][0], V[1][1], V[1][2],
             V[2][0], V[2][1], V[2][2]);

         // TODO: det3x3 and matmul3x3
         float R[3][3];

         // clang-format off
         matmul3x3_3x3(V[0][0], V[0][1], V[0][2],
                       V[1][0], V[1][1], V[1][2],
                       V[2][0], V[2][1], V[2][2],
                       U[0][0], U[1][0], U[2][0],
                       U[0][1], U[1][1], U[2][1],
                       U[0][2], U[1][2], U[2][2],
                       R[0][0], R[0][1], R[0][2],
                       R[1][0], R[1][1], R[1][2],
                       R[2][0], R[2][1], R[2][2]);

         float d = det3x3(R[0][0], R[0][1], R[0][2],
                          R[1][0], R[1][1], R[1][2],
                          R[2][0], R[2][1], R[2][2]);
         // clang-format on

         if (d < 0) {
             // clang-format off
             matmul3x3_3x3(V[0][0], V[0][1], V[0][2],
                           V[1][0], V[1][1], V[1][2],
                           V[2][0], V[2][1], V[2][2],
                           U[0][0], U[1][0], U[2][0],
                           U[0][1], U[1][1], U[2][1],
                           -U[0][2], -U[1][2], -U[2][2],
                           R[0][0], R[0][1], R[0][2],
                           R[1][0], R[1][1], R[1][2],
                           R[2][0], R[2][1], R[2][2]);
             // clang-format on
         }

         // Now we have R, we build Hessian and residuals
         // But first, we need to anchor a point
         if (idx_i == anchor_idx) {
             R[0][0] = R[1][1] = R[2][2] = 1;
             R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = R[2][1] = 0;
         }
         for (int k = 0; k < 6; ++k) {
             bool mask_k = mask_nbs[k];

             if (mask_k) {
                 int idx_k = idx_nbs[k];

                 float diff_ik_init[3] = {
                         positions_init_ptr[idx_i * 3 + 0] -
                                 positions_init_ptr[idx_k * 3 + 0],
                         positions_init_ptr[idx_i * 3 + 1] -
                                 positions_init_ptr[idx_k * 3 + 1],
                         positions_init_ptr[idx_i * 3 + 2] -
                                 positions_init_ptr[idx_k * 3 + 2]};
                 float diff_ik_curr[3] = {
                         positions_curr_ptr[idx_i * 3 + 0] -
                                 positions_curr_ptr[idx_k * 3 + 0],
                         positions_curr_ptr[idx_i * 3 + 1] -
                                 positions_curr_ptr[idx_k * 3 + 1],
                         positions_curr_ptr[idx_i * 3 + 2] -
                                 positions_curr_ptr[idx_k * 3 + 2]};
                 float R_diff_ik_curr[3];

                 // clang-format off
                 matmul3x3_3x1(R[0][0], R[0][1], R[0][2],
                               R[1][0], R[1][1], R[1][2],
                               R[2][0], R[2][1], R[2][2],
                               diff_ik_init[0],
                               diff_ik_init[1],
                               diff_ik_init[2],
                               R_diff_ik_curr[0],
                               R_diff_ik_curr[1],
                               R_diff_ik_curr[2]);
                 // clang-format on

                 float local_r[3];
                 local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0];
                 local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1];
                 local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2];

                 int offset_idx_i = 3 * idx_i + 6 * n_frags;
                 int offset_idx_k = 3 * idx_k + 6 * n_frags;

 #if defined(__CUDACC__)
                 // Update residual
                 atomicAdd(residual_ptr, weight * (local_r[0] * local_r[0] +
                                                   local_r[1] * local_r[1] +
                                                   local_r[2] * local_r[2]));

                 for (int axis = 0; axis < 3; ++axis) {
                     // Update AtA: 2x2
                     atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
                                        offset_idx_i + axis],
                               weight);
                     atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
                                        offset_idx_k + axis],
                               weight);
                     atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
                                        offset_idx_k + axis],
                               -weight);
                     atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
                                        offset_idx_i + axis],
                               -weight);

                     // Update Atb: 2x1
                     atomicAdd(&Atb_ptr[offset_idx_i + axis],
                               +weight * local_r[axis]);
                     atomicAdd(&Atb_ptr[offset_idx_k + axis],
                               -weight * local_r[axis]);
                 }
 #else
 #pragma omp critical
                 {
                     // Update residual
                     *residual_ptr += weight * (local_r[0] * local_r[0] +
                                                local_r[1] * local_r[1] +
                                                local_r[2] * local_r[2]);

                     for (int axis = 0; axis < 3; ++axis) {
                         // Update AtA: 2x2
                         AtA_ptr[(offset_idx_i + axis) * n_vars +
                                  offset_idx_i + axis] += weight;
                         AtA_ptr[(offset_idx_k + axis) * n_vars +
                                  offset_idx_k + axis] += weight;

                         AtA_ptr[(offset_idx_i + axis) * n_vars +
                                  offset_idx_k + axis] -= weight;
                         AtA_ptr[(offset_idx_k + axis) * n_vars +
                                  offset_idx_i + axis] -= weight;

                         // Update Atb: 2x1
                         Atb_ptr[offset_idx_i + axis] += weight * local_r[axis];
                         Atb_ptr[offset_idx_k + axis] -= weight * local_r[axis];
                     }
                 }
 #endif
             }
         }
     });
 }
 }  // namespace kernel
 }  // namespace pipelines
 }  // namespace t
 }  // namespace open3d
open3d::t::pipelines::kernel::FillInSLACRegularizerTermCPU
void FillInSLACRegularizerTermCPU(core::Tensor &AtA, core::Tensor &Atb, core::Tensor &residual, const core::Tensor &grid_idx, const core::Tensor &grid_nbs_idx, const core::Tensor &grid_nbs_mask, const core::Tensor &positions_init, const core::Tensor &positions_curr, float weight, int n, int anchor_idx)
Definition: FillInLinearSystemImpl.h:380

open3d::core::kernel::CPULauncher
Definition: CPULauncher.h:42

open3d::t::pipelines::kernel::FillInRigidAlignmentTermCPU
void FillInRigidAlignmentTermCPU(core::Tensor &AtA, core::Tensor &Atb, core::Tensor &residual, const core::Tensor &Ti_qs, const core::Tensor &Tj_qs, const core::Tensor &Ri_normal_ps, int i, int j, float threshold)
Definition: FillInLinearSystemImpl.h:45

svd
void svd(float a11, float a12, float a13, float a21, float a22, float a23, float a31, float a32, float a33, float &u11, float &u12, float &u13, float &u21, float &u22, float &u23, float &u31, float &u32, float &u33, float &s11, float &s22, float &s33, float &v11, float &v12, float &v13, float &v21, float &v22, float &v23, float &v31, float &v32, float &v33)
Definition: SVD3x3CPU.h:43

open3d::core::Tensor::GetDevice
Device GetDevice() const
Definition: Tensor.cpp:1098

LogError
#define LogError(...)
Definition: Console.h:79

open3d::core::Tensor::View
Tensor View(const SizeVector &dst_shape) const
Definition: Tensor.cpp:522

OPEN3D_DEVICE
#define OPEN3D_DEVICE
Definition: CUDAUtils.h:57

open3d::core::Tensor::IndexGet
Tensor IndexGet(const std::vector< Tensor > &index_tensors) const
Advanced indexing getter.
Definition: Tensor.cpp:704

OPEN3D_HOST_DEVICE
#define OPEN3D_HOST_DEVICE
Definition: CUDAUtils.h:56

open3d::core::Tensor::IndexSet
void IndexSet(const std::vector< Tensor > &index_tensors, const Tensor &src_tensor)
Advanced indexing getter.
Definition: Tensor.cpp:713

open3d::core::Device
Definition: Device.h:39

open3d::core::Tensor::Zeros
static Tensor Zeros(const SizeVector &shape, Dtype dtype, const Device &device=Device("CPU:0"))
Create a tensor fill with zeros.
Definition: Tensor.cpp:240

open3d::core::Dtype::Float32
static const Dtype Float32
Definition: Dtype.h:42

open3d::core::kernel::CPULauncher::LaunchGeneralKernel
static void LaunchGeneralKernel(int64_t n, func_t element_kernel)
General kernels with non-conventional indexers.
Definition: CPULauncher.h:176

GeometryIndexer.h

open3d::core::Dtype::Int64
static const Dtype Int64
Definition: Dtype.h:47

open3d
Definition: PinholeCameraIntrinsic.cpp:35

open3d::core::Tensor
Definition: Tensor.h:50

SVD3x3CPU.h

open3d::t::pipelines::kernel::FillInSLACAlignmentTermCPU
void FillInSLACAlignmentTermCPU(core::Tensor &AtA, core::Tensor &Atb, core::Tensor &residual, const core::Tensor &Ti_qs, const core::Tensor &Tj_qs, const core::Tensor &normal_ps, const core::Tensor &Ri_normal_ps, const core::Tensor &RjT_Ri_normal_ps, const core::Tensor &cgrid_idx_ps, const core::Tensor &cgrid_idx_qs, const core::Tensor &cgrid_ratio_qs, const core::Tensor &cgrid_ratio_ps, int i, int j, int n, float threshold)
Definition: FillInLinearSystemImpl.h:165

open3d::core::Tensor::GetDataPtr
T * GetDataPtr()
Definition: Tensor.h:1005

open3d::t::pipelines::kernel::det3x3
OPEN3D_HOST_DEVICE float det3x3(float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, float m22)
Definition: FillInLinearSystemImpl.h:362

open3d::t::pipelines::kernel::matmul3x3_3x1
OPEN3D_HOST_DEVICE void matmul3x3_3x1(float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, float m22, float v0, float v1, float v2, float &o0, float &o1, float &o2)
Definition: FillInLinearSystemImpl.h:307

SVD3x3CUDA.cuh

FillInLinearSystem.h

open3d::t::pipelines::kernel::matmul3x3_3x3
OPEN3D_HOST_DEVICE void matmul3x3_3x3(float a00, float a01, float a02, float a10, float a11, float a12, float a20, float a21, float a22, float b00, float b01, float b02, float b10, float b11, float b12, float b20, float b21, float b22, float &c00, float &c01, float &c02, float &c10, float &c11, float &c12, float &c20, float &c21, float &c22)
Definition: FillInLinearSystemImpl.h:327