#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
// CUDA build: the CUDA launcher and device SVD headers are included here.
#else
// CPU build: CPULauncher.h and SVD3x3CPU.h are included here.
#endif

#if defined(__CUDACC__)
void FillInRigidAlignmentTermCUDA
#else
void FillInRigidAlignmentTermCPU
#endif
        (core::Tensor &AtA,
         core::Tensor &Atb,
         core::Tensor &residual,
         const core::Tensor &Ti_ps,
         const core::Tensor &Tj_qs,
         const core::Tensor &Ri_normal_ps,
         int i,
         int j,
         float threshold) {
    core::Device device = AtA.GetDevice();

    int64_t n = Ti_ps.GetLength();
    if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) {
        utility::LogError(
                "Unable to setup linear system: input length mismatch.");
    }
    // Fill in a dense 12 x 12 sub-system first; it is scattered into the
    // global system afterwards.
    core::Tensor AtA_local =
            core::Tensor::Zeros({12, 12}, core::Dtype::Float32, device);
    core::Tensor Atb_local =
            core::Tensor::Zeros({12}, core::Dtype::Float32, device);

    float *AtA_local_ptr = static_cast<float *>(AtA_local.GetDataPtr());
    float *Atb_local_ptr = static_cast<float *>(Atb_local.GetDataPtr());
    float *residual_ptr = static_cast<float *>(residual.GetDataPtr());

    const float *Ti_ps_ptr = static_cast<const float *>(Ti_ps.GetDataPtr());
    const float *Tj_qs_ptr = static_cast<const float *>(Tj_qs.GetDataPtr());
    const float *Ri_normal_ps_ptr =
            static_cast<const float *>(Ri_normal_ps.GetDataPtr());
#if defined(__CUDACC__)
    core::kernel::CUDALauncher launcher;
#else
    core::kernel::CPULauncher launcher;
#endif
    launcher.LaunchGeneralKernel(n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
        const float *p_prime = Ti_ps_ptr + 3 * workload_idx;
        const float *q_prime = Tj_qs_ptr + 3 * workload_idx;
        const float *normal_p_prime = Ri_normal_ps_ptr + 3 * workload_idx;

        // Point-to-plane residual r = (p' - q') . n'.
        float r = (p_prime[0] - q_prime[0]) * normal_p_prime[0] +
                  (p_prime[1] - q_prime[1]) * normal_p_prime[1] +
                  (p_prime[2] - q_prime[2]) * normal_p_prime[2];
        if (abs(r) > threshold) return;

        float J_ij[12];
        // Jacobian w.r.t. pose i: [q' x n', n']; pose j gets the negative.
        J_ij[0] = -q_prime[2] * normal_p_prime[1] +
                  q_prime[1] * normal_p_prime[2];
        J_ij[1] = q_prime[2] * normal_p_prime[0] -
                  q_prime[0] * normal_p_prime[2];
        J_ij[2] = -q_prime[1] * normal_p_prime[0] +
                  q_prime[0] * normal_p_prime[1];
        J_ij[3] = normal_p_prime[0];
        J_ij[4] = normal_p_prime[1];
        J_ij[5] = normal_p_prime[2];
        for (int k = 0; k < 6; ++k) {
            J_ij[k + 6] = -J_ij[k];
        }
#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
        for (int i_local = 0; i_local < 12; ++i_local) {
            for (int j_local = 0; j_local < 12; ++j_local) {
                atomicAdd(&AtA_local_ptr[i_local * 12 + j_local],
                          J_ij[i_local] * J_ij[j_local]);
            }
            atomicAdd(&Atb_local_ptr[i_local], J_ij[i_local] * r);
        }
        atomicAdd(residual_ptr, r * r);
#else
#pragma omp critical
        {
            for (int i_local = 0; i_local < 12; ++i_local) {
                for (int j_local = 0; j_local < 12; ++j_local) {
                    AtA_local_ptr[i_local * 12 + j_local] +=
                            J_ij[i_local] * J_ij[j_local];
                }
                Atb_local_ptr[i_local] += J_ij[i_local] * r;
            }
            *residual_ptr += r * r;
        }
#endif
    });
    // Then scatter the local system into the global one: the 12 variables
    // map to rows/columns [6i, 6i + 6) and [6j, 6j + 6).
    std::vector<int64_t> indices_vec(12);
    for (int k = 0; k < 6; ++k) {
        indices_vec[k] = i * 6 + k;
        indices_vec[k + 6] = j * 6 + k;
    }
    std::vector<int64_t> indices_i_vec;
    std::vector<int64_t> indices_j_vec;
    for (int local_i = 0; local_i < 12; ++local_i) {
        for (int local_j = 0; local_j < 12; ++local_j) {
            indices_i_vec.push_back(indices_vec[local_i]);
            indices_j_vec.push_back(indices_vec[local_j]);
        }
    }

    core::Tensor indices(indices_vec, {12}, core::Dtype::Int64, device);
    core::Tensor indices_i(indices_i_vec, {12 * 12}, core::Dtype::Int64,
                           device);
    core::Tensor indices_j(indices_j_vec, {12 * 12}, core::Dtype::Int64,
                           device);

    // Read-modify-write the affected entries with advanced indexing.
    core::Tensor AtA_sub = AtA.IndexGet({indices_i, indices_j});
    AtA.IndexSet({indices_i, indices_j}, AtA_sub + AtA_local.View({12 * 12}));

    core::Tensor Atb_sub = Atb.IndexGet({indices});
    Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1}));
}
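// Usage sketch (illustrative, not part of the original header). Assuming a
// global system over n_frags fragment poses with 6 DoF each, a caller could
// set up the tensors as below; names and sizes here are hypothetical.
//
//   core::Device device("CPU:0");
//   core::Tensor AtA = core::Tensor::Zeros(
//           {6 * n_frags, 6 * n_frags}, core::Dtype::Float32, device);
//   core::Tensor Atb = core::Tensor::Zeros({6 * n_frags, 1},
//                                          core::Dtype::Float32, device);
//   core::Tensor residual = core::Tensor::Zeros({1}, core::Dtype::Float32,
//                                               device);
//   // Ti_ps, Tj_qs, Ri_normal_ps: (n, 3) Float32 buffers of transformed
//   // source points, target points, and rotated source normals.
//   FillInRigidAlignmentTermCPU(AtA, Atb, residual, Ti_ps, Tj_qs,
//                               Ri_normal_ps, i, j, /*threshold=*/0.07f);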
#if defined(__CUDACC__)
void FillInSLACAlignmentTermCUDA
#else
void FillInSLACAlignmentTermCPU
#endif
        (core::Tensor &AtA,
         core::Tensor &Atb,
         core::Tensor &residual,
         const core::Tensor &Ti_Cps,
         const core::Tensor &Tj_Cqs,
         const core::Tensor &Cnormal_ps,
         const core::Tensor &Ri_Cnormal_ps,
         const core::Tensor &RjT_Ri_Cnormal_ps,
         const core::Tensor &cgrid_idx_ps,
         const core::Tensor &cgrid_idx_qs,
         const core::Tensor &cgrid_ratio_ps,
         const core::Tensor &cgrid_ratio_qs,
         int i,
         int j,
         int n_frags,
         float threshold) {
    int64_t n = Ti_Cps.GetLength();
    if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n ||
        Ri_Cnormal_ps.GetLength() != n || RjT_Ri_Cnormal_ps.GetLength() != n ||
        cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n ||
        cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) {
        utility::LogError(
                "Unable to setup linear system: input length mismatch.");
    }
    int n_vars = Atb.GetLength();
    float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
    float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
    float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
    // Geometric properties of the correspondences.
    const float *Ti_Cps_ptr = static_cast<const float *>(Ti_Cps.GetDataPtr());
    const float *Tj_Cqs_ptr = static_cast<const float *>(Tj_Cqs.GetDataPtr());
    const float *Cnormal_ps_ptr =
            static_cast<const float *>(Cnormal_ps.GetDataPtr());
    const float *Ri_Cnormal_ps_ptr =
            static_cast<const float *>(Ri_Cnormal_ps.GetDataPtr());
    const float *RjT_Ri_Cnormal_ps_ptr =
            static_cast<const float *>(RjT_Ri_Cnormal_ps.GetDataPtr());

    // Control-grid interpolation: 8 neighbor indices and ratios per point.
    const int *cgrid_idx_ps_ptr =
            static_cast<const int *>(cgrid_idx_ps.GetDataPtr());
    const int *cgrid_idx_qs_ptr =
            static_cast<const int *>(cgrid_idx_qs.GetDataPtr());
    const float *cgrid_ratio_ps_ptr =
            static_cast<const float *>(cgrid_ratio_ps.GetDataPtr());
    const float *cgrid_ratio_qs_ptr =
            static_cast<const float *>(cgrid_ratio_qs.GetDataPtr());
#if defined(__CUDACC__)
    core::kernel::CUDALauncher launcher;
#else
    core::kernel::CPULauncher launcher;
#endif
    launcher.LaunchGeneralKernel(n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
        const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx;
        const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx;
        const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx;
        const float *Ri_Cnormal_p = Ri_Cnormal_ps_ptr + 3 * workload_idx;
        const float *RjTRi_Cnormal_p = RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx;

        const int *cgrid_idx_p = cgrid_idx_ps_ptr + 8 * workload_idx;
        const int *cgrid_idx_q = cgrid_idx_qs_ptr + 8 * workload_idx;
        const float *cgrid_ratio_p = cgrid_ratio_ps_ptr + 8 * workload_idx;
        const float *cgrid_ratio_q = cgrid_ratio_qs_ptr + 8 * workload_idx;

        float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] +
                  (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] +
                  (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2];
        if (abs(r) > threshold) return;

        // 60 unknowns per correspondence: 12 pose parameters (2 poses x 6)
        // plus 48 control-grid parameters (2 x 8 vertices x 3 coordinates).
        float J[60];
        int idx[60];

        // Jacobian w.r.t. the two poses, same pattern as the rigid term.
        J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + Tj_Cq[1] * Ri_Cnormal_p[2];
        J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - Tj_Cq[0] * Ri_Cnormal_p[2];
        J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + Tj_Cq[0] * Ri_Cnormal_p[1];
        J[3] = Ri_Cnormal_p[0];
        J[4] = Ri_Cnormal_p[1];
        J[5] = Ri_Cnormal_p[2];
        for (int k = 0; k < 6; ++k) {
            J[k + 6] = -J[k];
            idx[k + 0] = 6 * i + k;
            idx[k + 6] = 6 * j + k;
        }
        // Jacobian w.r.t. the 8 control-grid vertices interpolating p.
        for (int k = 0; k < 8; ++k) {
            J[12 + k * 3 + 0] = cgrid_ratio_p[k] * Cnormal_p[0];
            J[12 + k * 3 + 1] = cgrid_ratio_p[k] * Cnormal_p[1];
            J[12 + k * 3 + 2] = cgrid_ratio_p[k] * Cnormal_p[2];

            idx[12 + k * 3 + 0] = 6 * n_frags + cgrid_idx_p[k] * 3 + 0;
            idx[12 + k * 3 + 1] = 6 * n_frags + cgrid_idx_p[k] * 3 + 1;
            idx[12 + k * 3 + 2] = 6 * n_frags + cgrid_idx_p[k] * 3 + 2;
        }
        // Jacobian w.r.t. the 8 control-grid vertices interpolating q.
        for (int k = 0; k < 8; ++k) {
            J[36 + k * 3 + 0] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0];
            J[36 + k * 3 + 1] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1];
            J[36 + k * 3 + 2] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2];

            idx[36 + k * 3 + 0] = 6 * n_frags + cgrid_idx_q[k] * 3 + 0;
            idx[36 + k * 3 + 1] = 6 * n_frags + cgrid_idx_q[k] * 3 + 1;
            idx[36 + k * 3 + 2] = 6 * n_frags + cgrid_idx_q[k] * 3 + 2;
        }
#if defined(__CUDACC__)
        for (int ki = 0; ki < 60; ++ki) {
            for (int kj = 0; kj < 60; ++kj) {
                float AtA_ij = J[ki] * J[kj];
                int ij = idx[ki] * n_vars + idx[kj];
                atomicAdd(AtA_ptr + ij, AtA_ij);
            }
            float Atb_i = J[ki] * r;
            atomicAdd(Atb_ptr + idx[ki], Atb_i);
        }
        atomicAdd(residual_ptr, r * r);
#else
#pragma omp critical
        {
            for (int ki = 0; ki < 60; ++ki) {
                for (int kj = 0; kj < 60; ++kj) {
                    AtA_ptr[idx[ki] * n_vars + idx[kj]] += J[ki] * J[kj];
                }
                Atb_ptr[idx[ki]] += J[ki] * r;
            }
            *residual_ptr += r * r;
        }
#endif
    });
}
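// Layout note (illustrative addition, not in the original header): the global
// unknown vector stacks 6 pose parameters per fragment first, followed by 3
// coordinates per control-grid vertex. Hypothetical helpers mirroring the
// idx[] arithmetic above:
//
//   inline int64_t PoseVarIndex(int frag, int dof) {   // dof in [0, 6)
//       return 6 * frag + dof;
//   }
//   inline int64_t GridVarIndex(int n_frags, int vertex, int axis) {
//       return 6 * n_frags + 3 * vertex + axis;        // axis in [0, 3)
//   }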
// Multiply a 3x3 matrix (row-major scalars) by a 3x1 vector.
OPEN3D_HOST_DEVICE void matmul3x3_3x1(float m00, float m01, float m02,
                                      float m10, float m11, float m12,
                                      float m20, float m21, float m22,
                                      float v0, float v1, float v2,
                                      float &o0, float &o1, float &o2) {
    o0 = m00 * v0 + m01 * v1 + m02 * v2;
    o1 = m10 * v0 + m11 * v1 + m12 * v2;
    o2 = m20 * v0 + m21 * v1 + m22 * v2;
}
// Multiply two 3x3 matrices, computing C = A * B one column of B at a time.
OPEN3D_HOST_DEVICE void matmul3x3_3x3(float a00, float a01, float a02,
                                      float a10, float a11, float a12,
                                      float a20, float a21, float a22,
                                      float b00, float b01, float b02,
                                      float b10, float b11, float b12,
                                      float b20, float b21, float b22,
                                      float &c00, float &c01, float &c02,
                                      float &c10, float &c11, float &c12,
                                      float &c20, float &c21, float &c22) {
    matmul3x3_3x1(a00, a01, a02, a10, a11, a12, a20, a21, a22, b00, b10, b20,
                  c00, c10, c20);
    matmul3x3_3x1(a00, a01, a02, a10, a11, a12, a20, a21, a22, b01, b11, b21,
                  c01, c11, c21);
    matmul3x3_3x1(a00, a01, a02, a10, a11, a12, a20, a21, a22, b02, b12, b22,
                  c02, c12, c22);
}
// Determinant of a 3x3 matrix, expanded along the first column.
OPEN3D_HOST_DEVICE float det3x3(float m00, float m01, float m02,
                                float m10, float m11, float m12,
                                float m20, float m21, float m22) {
    return m00 * (m11 * m22 - m12 * m21) - m10 * (m01 * m22 - m02 * m21) +
           m20 * (m01 * m12 - m02 * m11);
}
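// Sanity check (illustrative): a proper rotation has determinant +1, a
// reflection -1. For example, det3x3 of diag(1, 1, -1) is -1; this is the
// case the regularizer below repairs by flipping the last column of U.
//
//   float d = det3x3(1, 0, 0,
//                    0, 1, 0,
//                    0, 0, -1);  // d == -1.0f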
#if defined(__CUDACC__)
void FillInSLACRegularizerTermCUDA
#else
void FillInSLACRegularizerTermCPU
#endif
        (core::Tensor &AtA,
         core::Tensor &Atb,
         core::Tensor &residual,
         const core::Tensor &grid_idx,
         const core::Tensor &grid_nbs_idx,
         const core::Tensor &grid_nbs_mask,
         const core::Tensor &positions_init,
         const core::Tensor &positions_curr,
         float weight,
         int n_frags,
         int anchor_idx) {
    int64_t n = grid_idx.GetLength();
    int64_t n_vars = Atb.GetLength();
    float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
    float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
    float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
    const int *grid_idx_ptr = static_cast<const int *>(grid_idx.GetDataPtr());
    const int *grid_nbs_idx_ptr =
            static_cast<const int *>(grid_nbs_idx.GetDataPtr());
    const bool *grid_nbs_mask_ptr =
            static_cast<const bool *>(grid_nbs_mask.GetDataPtr());

    const float *positions_init_ptr =
            static_cast<const float *>(positions_init.GetDataPtr());
    const float *positions_curr_ptr =
            static_cast<const float *>(positions_curr.GetDataPtr());
#if defined(__CUDACC__)
    core::kernel::CUDALauncher launcher;
#else
    core::kernel::CPULauncher launcher;
#endif
    launcher.LaunchGeneralKernel(n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
        // For each control-grid vertex, estimate the locally rigid rotation
        // of its 1-ring, then penalize deviations from it
        // (as-rigid-as-possible regularization).
        int idx_i = grid_idx_ptr[workload_idx];

        const int *idx_nbs = grid_nbs_idx_ptr + 6 * workload_idx;
        const bool *mask_nbs = grid_nbs_mask_ptr + 6 * workload_idx;

        float cov[3][3] = {{0}};
        float U[3][3], V[3][3], S[3];
        // Accumulate the covariance of the edges to the (up to) 6 neighbors.
        for (int k = 0; k < 6; ++k) {
            bool mask_k = mask_nbs[k];
            if (!mask_k) continue;

            int idx_k = idx_nbs[k];

            float diff_ik_init[3] = {positions_init_ptr[idx_i * 3 + 0] -
                                             positions_init_ptr[idx_k * 3 + 0],
                                     positions_init_ptr[idx_i * 3 + 1] -
                                             positions_init_ptr[idx_k * 3 + 1],
                                     positions_init_ptr[idx_i * 3 + 2] -
                                             positions_init_ptr[idx_k * 3 + 2]};
            float diff_ik_curr[3] = {positions_curr_ptr[idx_i * 3 + 0] -
                                             positions_curr_ptr[idx_k * 3 + 0],
                                     positions_curr_ptr[idx_i * 3 + 1] -
                                             positions_curr_ptr[idx_k * 3 + 1],
                                     positions_curr_ptr[idx_i * 3 + 2] -
                                             positions_curr_ptr[idx_k * 3 + 2]};

            for (int i = 0; i < 3; ++i) {
                for (int j = 0; j < 3; ++j) {
                    cov[i][j] += diff_ik_init[i] * diff_ik_curr[j];
                }
            }
        }
        svd(cov[0][0], cov[0][1], cov[0][2],
            cov[1][0], cov[1][1], cov[1][2],
            cov[2][0], cov[2][1], cov[2][2],
            U[0][0], U[0][1], U[0][2],
            U[1][0], U[1][1], U[1][2],
            U[2][0], U[2][1], U[2][2],
            S[0], S[1], S[2],
            V[0][0], V[0][1], V[0][2],
            V[1][0], V[1][1], V[1][2],
            V[2][0], V[2][1], V[2][2]);
        // Closest rotation (orthogonal Procrustes): R = V * U^T.
        float R[3][3];
        matmul3x3_3x3(V[0][0], V[0][1], V[0][2],
                      V[1][0], V[1][1], V[1][2],
                      V[2][0], V[2][1], V[2][2],
                      U[0][0], U[1][0], U[2][0],
                      U[0][1], U[1][1], U[2][1],
                      U[0][2], U[1][2], U[2][2],
                      R[0][0], R[0][1], R[0][2],
                      R[1][0], R[1][1], R[1][2],
                      R[2][0], R[2][1], R[2][2]);
        float d = det3x3(R[0][0], R[0][1], R[0][2],
                         R[1][0], R[1][1], R[1][2],
                         R[2][0], R[2][1], R[2][2]);
        if (d < 0) {
            // R is a reflection; flip the sign of U's last column to obtain
            // a proper rotation.
            matmul3x3_3x3(V[0][0], V[0][1], V[0][2],
                          V[1][0], V[1][1], V[1][2],
                          V[2][0], V[2][1], V[2][2],
                          U[0][0], U[1][0], U[2][0],
                          U[0][1], U[1][1], U[2][1],
                          -U[0][2], -U[1][2], -U[2][2],
                          R[0][0], R[0][1], R[0][2],
                          R[1][0], R[1][1], R[1][2],
                          R[2][0], R[2][1], R[2][2]);
        }
        // The anchor vertex is pinned with an identity rotation.
        if (idx_i == anchor_idx) {
            R[0][0] = R[1][1] = R[2][2] = 1;
            R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = R[2][1] = 0;
        }
        // Fill in the linear system with the estimated rotation.
        for (int k = 0; k < 6; ++k) {
            bool mask_k = mask_nbs[k];
            if (!mask_k) continue;

            int idx_k = idx_nbs[k];
            float diff_ik_init[3] = {positions_init_ptr[idx_i * 3 + 0] -
                                             positions_init_ptr[idx_k * 3 + 0],
                                     positions_init_ptr[idx_i * 3 + 1] -
                                             positions_init_ptr[idx_k * 3 + 1],
                                     positions_init_ptr[idx_i * 3 + 2] -
                                             positions_init_ptr[idx_k * 3 + 2]};
            float diff_ik_curr[3] = {positions_curr_ptr[idx_i * 3 + 0] -
                                             positions_curr_ptr[idx_k * 3 + 0],
                                     positions_curr_ptr[idx_i * 3 + 1] -
                                             positions_curr_ptr[idx_k * 3 + 1],
                                     positions_curr_ptr[idx_i * 3 + 2] -
                                             positions_curr_ptr[idx_k * 3 + 2]};
            float R_diff_ik_curr[3];
            matmul3x3_3x1(R[0][0], R[0][1], R[0][2],
                          R[1][0], R[1][1], R[1][2],
                          R[2][0], R[2][1], R[2][2],
                          diff_ik_init[0], diff_ik_init[1], diff_ik_init[2],
                          R_diff_ik_curr[0], R_diff_ik_curr[1],
                          R_diff_ik_curr[2]);

            // Residual: current edge minus the rotated initial edge.
            float local_r[3];
            local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0];
            local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1];
            local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2];
            int offset_idx_i = 3 * idx_i + 6 * n_frags;
            int offset_idx_k = 3 * idx_k + 6 * n_frags;
#if defined(__CUDACC__)
            atomicAdd(residual_ptr, weight * (local_r[0] * local_r[0] +
                                              local_r[1] * local_r[1] +
                                              local_r[2] * local_r[2]));
            for (int axis = 0; axis < 3; ++axis) {
                // Update the four 3x3 diagonal blocks of J^T J.
                atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
                                   offset_idx_i + axis],
                          weight);
                atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
                                   offset_idx_k + axis],
                          weight);
                atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
                                   offset_idx_k + axis],
                          -weight);
                atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
                                   offset_idx_i + axis],
                          -weight);

                atomicAdd(&Atb_ptr[offset_idx_i + axis],
                          +weight * local_r[axis]);
                atomicAdd(&Atb_ptr[offset_idx_k + axis],
                          -weight * local_r[axis]);
            }
#else
#pragma omp critical
            {
                *residual_ptr += weight * (local_r[0] * local_r[0] +
                                           local_r[1] * local_r[1] +
                                           local_r[2] * local_r[2]);
                for (int axis = 0; axis < 3; ++axis) {
                    AtA_ptr[(offset_idx_i + axis) * n_vars +
                            offset_idx_i + axis] += weight;
                    AtA_ptr[(offset_idx_k + axis) * n_vars +
                            offset_idx_k + axis] += weight;
                    AtA_ptr[(offset_idx_i + axis) * n_vars +
                            offset_idx_k + axis] -= weight;
                    AtA_ptr[(offset_idx_k + axis) * n_vars +
                            offset_idx_i + axis] -= weight;

                    Atb_ptr[offset_idx_i + axis] += weight * local_r[axis];
                    Atb_ptr[offset_idx_k + axis] -= weight * local_r[axis];
                }
            }
#endif
        }
    });
}
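// Usage sketch (illustrative, not part of the original header): the
// regularizer adds one as-rigid-as-possible term per control-grid vertex.
// Tensor shapes below are assumptions inferred from the pointer indexing
// above (6 neighbors per vertex, 3 coordinates per position):
//
//   grid_idx:       (m,)    Int32, vertex index per workload
//   grid_nbs_idx:   (m, 6)  Int32, neighbor vertex indices
//   grid_nbs_mask:  (m, 6)  Bool, validity of each neighbor slot
//   positions_init: (v, 3)  Float32, rest positions of the control grid
//   positions_curr: (v, 3)  Float32, current (deformed) positions
//
//   FillInSLACRegularizerTermCPU(AtA, Atb, residual, grid_idx, grid_nbs_idx,
//                                grid_nbs_mask, positions_init,
//                                positions_curr, /*weight=*/1.0f, n_frags,
//                                /*anchor_idx=*/0);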