#if defined(__CUDACC__)
void FillInRigidAlignmentTermCUDA
#else
void FillInRigidAlignmentTermCPU
#endif
        (core::Tensor &AtA,
         core::Tensor &Atb,
         core::Tensor &residual,
         const core::Tensor &Ti_ps,
         const core::Tensor &Tj_qs,
         const core::Tensor &Ri_normal_ps,
         int i,
         int j,
         float threshold) {
    core::Device device = AtA.GetDevice();
    int64_t n = Ti_ps.GetLength();
    if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) {
        utility::LogError(
                "Unable to setup linear system: input length mismatch.");
    }

    // First accumulate this term into a local 12 x 12 system, then scatter
    // it into the global system at the end.
    core::Tensor AtA_local =
            core::Tensor::Zeros({12, 12}, core::Float32, device);
    core::Tensor Atb_local = core::Tensor::Zeros({12}, core::Float32, device);
    float *AtA_local_ptr = static_cast<float *>(AtA_local.GetDataPtr());
    float *Atb_local_ptr = static_cast<float *>(Atb_local.GetDataPtr());
    float *residual_ptr = static_cast<float *>(residual.GetDataPtr());

    const float *Ti_ps_ptr = static_cast<const float *>(Ti_ps.GetDataPtr());
    const float *Tj_qs_ptr = static_cast<const float *>(Tj_qs.GetDataPtr());
    const float *Ri_normal_ps_ptr =
            static_cast<const float *>(Ri_normal_ps.GetDataPtr());

    core::ParallelFor(device, n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
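        // Exposition (not in the upstream source): ParallelFor dispatches
        // this lambda once per correspondence -- as a CUDA kernel when this
        // translation unit is compiled by nvcc, and as a parallel CPU loop
        // otherwise. The OPEN3D_DEVICE macro supplies the __device__
        // qualifier in the CUDA build.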
        const float *p_prime = Ti_ps_ptr + 3 * workload_idx;
        const float *q_prime = Tj_qs_ptr + 3 * workload_idx;
        const float *normal_p_prime = Ri_normal_ps_ptr + 3 * workload_idx;

        float r = (p_prime[0] - q_prime[0]) * normal_p_prime[0] +
                  (p_prime[1] - q_prime[1]) * normal_p_prime[1] +
                  (p_prime[2] - q_prime[2]) * normal_p_prime[2];
        if (abs(r) > threshold) return;
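        // Exposition (not in the upstream source): r is the point-to-plane
        // residual (p' - q') . n'. Perturbing pose i by a small twist
        // (omega, t) moves p' by omega x p' + t, so
        //     dr/d(omega) = p' x n'   and   dr/d(t) = n'.
        // The code below evaluates the cross product at q' instead of p',
        // which agrees to first order near convergence where p' ~ q'; the
        // frame-j block is the negation of the frame-i block.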
        float J_ij[12];
        J_ij[0] = -q_prime[2] * normal_p_prime[1] +
                  q_prime[1] * normal_p_prime[2];
        J_ij[1] = q_prime[2] * normal_p_prime[0] -
                  q_prime[0] * normal_p_prime[2];
        J_ij[2] = -q_prime[1] * normal_p_prime[0] +
                  q_prime[0] * normal_p_prime[1];
        J_ij[3] = normal_p_prime[0];
        J_ij[4] = normal_p_prime[1];
        J_ij[5] = normal_p_prime[2];
        for (int k = 0; k < 6; ++k) {
            J_ij[k + 6] = -J_ij[k];
        }
#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
        for (int i_local = 0; i_local < 12; ++i_local) {
            for (int j_local = 0; j_local < 12; ++j_local) {
                atomicAdd(&AtA_local_ptr[i_local * 12 + j_local],
                          J_ij[i_local] * J_ij[j_local]);
            }
            atomicAdd(&Atb_local_ptr[i_local], J_ij[i_local] * r);
        }
        atomicAdd(residual_ptr, r * r);
#else
#pragma omp critical(FillInRigidAlignmentTermCPU)
        {
            for (int i_local = 0; i_local < 12; ++i_local) {
                for (int j_local = 0; j_local < 12; ++j_local) {
                    AtA_local_ptr[i_local * 12 + j_local] +=
                            J_ij[i_local] * J_ij[j_local];
                }
                Atb_local_ptr[i_local] += J_ij[i_local] * r;
            }
            *residual_ptr += r * r;
        }
#endif
    });
    // Then scatter the local system into the global linear system.
    std::vector<int64_t> indices_vec(12);
    for (int k = 0; k < 6; ++k) {
        indices_vec[k] = i * 6 + k;
        indices_vec[k + 6] = j * 6 + k;
    }

    std::vector<int64_t> indices_i_vec;
    std::vector<int64_t> indices_j_vec;
    for (int local_i = 0; local_i < 12; ++local_i) {
        for (int local_j = 0; local_j < 12; ++local_j) {
            indices_i_vec.push_back(indices_vec[local_i]);
            indices_j_vec.push_back(indices_vec[local_j]);
        }
    }

    core::Tensor indices(indices_vec, {12}, core::Int64, device);
    core::Tensor indices_i(indices_i_vec, {12 * 12}, core::Int64, device);
    core::Tensor indices_j(indices_j_vec, {12 * 12}, core::Int64, device);

    core::Tensor AtA_sub = AtA.IndexGet({indices_i, indices_j});
    AtA.IndexSet({indices_i, indices_j}, AtA_sub + AtA_local.View({12 * 12}));

    core::Tensor Atb_sub = Atb.IndexGet({indices});
    Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1}));
}
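// Exposition (not in the upstream source): the SLAC alignment term below
// generalizes the rigid term. Points and normals are pre-warped by the
// control grid C (trilinear interpolation over 8 corners), so each
// correspondence touches 60 unknowns: 2 poses x 6 DoF plus 2 x 8 grid
// corners x 3 coordinates. J and idx hold the Jacobian values and their
// global column indices for this sparse row.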
#if defined(__CUDACC__)
void FillInSLACAlignmentTermCUDA
#else
void FillInSLACAlignmentTermCPU
#endif
        (core::Tensor &AtA,
         core::Tensor &Atb,
         core::Tensor &residual,
         const core::Tensor &Ti_Cps,
         const core::Tensor &Tj_Cqs,
         const core::Tensor &Cnormal_ps,
         const core::Tensor &Ri_Cnormal_ps,
         const core::Tensor &RjT_Ri_Cnormal_ps,
         const core::Tensor &cgrid_idx_ps,
         const core::Tensor &cgrid_idx_qs,
         const core::Tensor &cgrid_ratio_ps,
         const core::Tensor &cgrid_ratio_qs,
         int i,
         int j,
         int n_frags,
         float threshold) {
    int64_t n = Ti_Cps.GetLength();
    if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n ||
        Ri_Cnormal_ps.GetLength() != n || RjT_Ri_Cnormal_ps.GetLength() != n ||
        cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n ||
        cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) {
        utility::LogError(
                "Unable to setup linear system: input length mismatch.");
    }
    int n_vars = Atb.GetLength();
    float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
    float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
    float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
    // Geometric properties.
    const float *Ti_Cps_ptr = static_cast<const float *>(Ti_Cps.GetDataPtr());
    const float *Tj_Cqs_ptr = static_cast<const float *>(Tj_Cqs.GetDataPtr());
    const float *Cnormal_ps_ptr =
            static_cast<const float *>(Cnormal_ps.GetDataPtr());
    const float *Ri_Cnormal_ps_ptr =
            static_cast<const float *>(Ri_Cnormal_ps.GetDataPtr());
    const float *RjT_Ri_Cnormal_ps_ptr =
            static_cast<const float *>(RjT_Ri_Cnormal_ps.GetDataPtr());
    // Control-grid association properties.
    const int *cgrid_idx_ps_ptr =
            static_cast<const int *>(cgrid_idx_ps.GetDataPtr());
    const int *cgrid_idx_qs_ptr =
            static_cast<const int *>(cgrid_idx_qs.GetDataPtr());
    const float *cgrid_ratio_ps_ptr =
            static_cast<const float *>(cgrid_ratio_ps.GetDataPtr());
    const float *cgrid_ratio_qs_ptr =
            static_cast<const float *>(cgrid_ratio_qs.GetDataPtr());

    core::ParallelFor(AtA.GetDevice(), n, [=] OPEN3D_DEVICE(
                                                  int64_t workload_idx) {
        const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx;
        const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx;
        const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx;
        const float *Ri_Cnormal_p = Ri_Cnormal_ps_ptr + 3 * workload_idx;
        const float *RjTRi_Cnormal_p =
                RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx;

        const int *cgrid_idx_p = cgrid_idx_ps_ptr + 8 * workload_idx;
        const int *cgrid_idx_q = cgrid_idx_qs_ptr + 8 * workload_idx;
        const float *cgrid_ratio_p = cgrid_ratio_ps_ptr + 8 * workload_idx;
        const float *cgrid_ratio_q = cgrid_ratio_qs_ptr + 8 * workload_idx;

        float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] +
                  (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] +
                  (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2];
        if (abs(r) > threshold) return;
        // 60 Jacobian entries and their global column indices per
        // correspondence.
        float J[60];
        int idx[60];

        // Jacobian w.r.t. the pose of frame i (entries 0-5); the entries for
        // frame j (6-11) are the negation.
        J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + Tj_Cq[1] * Ri_Cnormal_p[2];
        J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - Tj_Cq[0] * Ri_Cnormal_p[2];
        J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + Tj_Cq[0] * Ri_Cnormal_p[1];
        J[3] = Ri_Cnormal_p[0];
        J[4] = Ri_Cnormal_p[1];
        J[5] = Ri_Cnormal_p[2];
        for (int k = 0; k < 6; ++k) {
            J[k + 6] = -J[k];

            idx[k + 0] = 6 * i + k;
            idx[k + 6] = 6 * j + k;
        }
        // Jacobian w.r.t. the 8 control-grid corners interpolating p.
        for (int k = 0; k < 8; ++k) {
            J[12 + k * 3 + 0] = cgrid_ratio_p[k] * Cnormal_p[0];
            J[12 + k * 3 + 1] = cgrid_ratio_p[k] * Cnormal_p[1];
            J[12 + k * 3 + 2] = cgrid_ratio_p[k] * Cnormal_p[2];

            idx[12 + k * 3 + 0] = 6 * n_frags + cgrid_idx_p[k] * 3 + 0;
            idx[12 + k * 3 + 1] = 6 * n_frags + cgrid_idx_p[k] * 3 + 1;
            idx[12 + k * 3 + 2] = 6 * n_frags + cgrid_idx_p[k] * 3 + 2;
        }
        // Jacobian w.r.t. the 8 control-grid corners interpolating q.
        for (int k = 0; k < 8; ++k) {
            J[36 + k * 3 + 0] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0];
            J[36 + k * 3 + 1] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1];
            J[36 + k * 3 + 2] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2];

            idx[36 + k * 3 + 0] = 6 * n_frags + cgrid_idx_q[k] * 3 + 0;
            idx[36 + k * 3 + 1] = 6 * n_frags + cgrid_idx_q[k] * 3 + 1;
            idx[36 + k * 3 + 2] = 6 * n_frags + cgrid_idx_q[k] * 3 + 2;
        }
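        // Exposition (not in the upstream source): the global variable
        // layout is [6 * n_frags pose DoF | 3 values per control-grid
        // corner], so corner c maps to columns 6 * n_frags + 3 * c + {0, 1,
        // 2}, as the idx formulas above encode.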
#if defined(__CUDACC__)
        for (int ki = 0; ki < 60; ++ki) {
            for (int kj = 0; kj < 60; ++kj) {
                float AtA_ij = J[ki] * J[kj];
                int ij = idx[ki] * n_vars + idx[kj];
                atomicAdd(AtA_ptr + ij, AtA_ij);
            }
            float Atb_i = J[ki] * r;
            atomicAdd(Atb_ptr + idx[ki], Atb_i);
        }
        atomicAdd(residual_ptr, r * r);
#else
#pragma omp critical(FillInSLACAlignmentTermCPU)
        {
            for (int ki = 0; ki < 60; ++ki) {
                for (int kj = 0; kj < 60; ++kj) {
                    AtA_ptr[idx[ki] * n_vars + idx[kj]] += J[ki] * J[kj];
                }
                Atb_ptr[idx[ki]] += J[ki] * r;
            }
            *residual_ptr += r * r;
        }
#endif
    });
}
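// Exposition (not in the upstream source): the regularizer below keeps the
// control grid locally rigid. For each grid point it estimates the best-fit
// rotation R between its initial and current 6-neighborhoods (Kabsch, via a
// 3 x 3 SVD), then penalizes
//     weight * || (x_curr - x'_curr) - R (x_init - x'_init) ||^2
// summed over the valid neighbors x'.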
#if defined(__CUDACC__)
void FillInSLACRegularizerTermCUDA
#else
void FillInSLACRegularizerTermCPU
#endif
        (core::Tensor &AtA,
         core::Tensor &Atb,
         core::Tensor &residual,
         const core::Tensor &grid_idx,
         const core::Tensor &grid_nbs_idx,
         const core::Tensor &grid_nbs_mask,
         const core::Tensor &positions_init,
         const core::Tensor &positions_curr,
         float weight,
         int n_frags,
         int anchor_idx) {
    int64_t n = grid_idx.GetLength();
    int64_t n_vars = Atb.GetLength();
    float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
    float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
    float *residual_ptr = static_cast<float *>(residual.GetDataPtr());
    const int *grid_idx_ptr = static_cast<const int *>(grid_idx.GetDataPtr());
    const int *grid_nbs_idx_ptr =
            static_cast<const int *>(grid_nbs_idx.GetDataPtr());
    const bool *grid_nbs_mask_ptr =
            static_cast<const bool *>(grid_nbs_mask.GetDataPtr());

    const float *positions_init_ptr =
            static_cast<const float *>(positions_init.GetDataPtr());
    const float *positions_curr_ptr =
            static_cast<const float *>(positions_curr.GetDataPtr());

    core::ParallelFor(AtA.GetDevice(), n, [=] OPEN3D_DEVICE(
                                                  int64_t workload_idx) {
        // Enumerate the 6 neighbors of each control grid point.
        int idx_i = grid_idx_ptr[workload_idx];

        const int *idx_nbs = grid_nbs_idx_ptr + 6 * workload_idx;
        const bool *mask_nbs = grid_nbs_mask_ptr + 6 * workload_idx;

        float cov[3][3] = {{0}};
        float U[3][3], V[3][3], S[3];
        for (int k = 0; k < 6; ++k) {
            bool mask_k = mask_nbs[k];
            if (!mask_k) continue;

            int idx_k = idx_nbs[k];

            // Edge vectors i -> k in the initial and current grids.
            float diff_ik_init[3] = {
                    positions_init_ptr[idx_i * 3 + 0] -
                            positions_init_ptr[idx_k * 3 + 0],
                    positions_init_ptr[idx_i * 3 + 1] -
                            positions_init_ptr[idx_k * 3 + 1],
                    positions_init_ptr[idx_i * 3 + 2] -
                            positions_init_ptr[idx_k * 3 + 2]};
            float diff_ik_curr[3] = {
                    positions_curr_ptr[idx_i * 3 + 0] -
                            positions_curr_ptr[idx_k * 3 + 0],
                    positions_curr_ptr[idx_i * 3 + 1] -
                            positions_curr_ptr[idx_k * 3 + 1],
                    positions_curr_ptr[idx_i * 3 + 2] -
                            positions_curr_ptr[idx_k * 3 + 2]};

            // Accumulate X Y^T for the fit Y = R X (X: init, Y: curr).
            for (int i = 0; i < 3; ++i) {
                for (int j = 0; j < 3; ++j) {
                    cov[i][j] += diff_ik_init[i] * diff_ik_curr[j];
                }
            }
        }

        // Kabsch: with cov = U S V^T, the best-fit rotation is R = V U^T
        // (U is transposed in place before the product).
        core::linalg::kernel::svd3x3(*cov, *U, S, *V);

        float R[3][3];
        core::linalg::kernel::transpose3x3_(*U);
        core::linalg::kernel::matmul3x3_3x3(*V, *U, *R);

        // Handle the reflection case (det(R) < 0) by flipping the sign of
        // the last singular vector.
        if (core::linalg::kernel::det3x3(*R) < 0) {
            U[2][0] = -U[2][0];
            U[2][1] = -U[2][1];
            U[2][2] = -U[2][2];
            core::linalg::kernel::matmul3x3_3x3(*V, *U, *R);
        }
        // The anchor grid point is pinned to the identity rotation.
        if (idx_i == anchor_idx) {
            R[0][0] = R[1][1] = R[2][2] = 1;
            R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = R[2][1] = 0;
        }

        for (int k = 0; k < 6; ++k) {
            bool mask_k = mask_nbs[k];
            if (!mask_k) continue;

            int idx_k = idx_nbs[k];
            float diff_ik_init[3] = {
                    positions_init_ptr[idx_i * 3 + 0] -
                            positions_init_ptr[idx_k * 3 + 0],
                    positions_init_ptr[idx_i * 3 + 1] -
                            positions_init_ptr[idx_k * 3 + 1],
                    positions_init_ptr[idx_i * 3 + 2] -
                            positions_init_ptr[idx_k * 3 + 2]};
            float diff_ik_curr[3] = {
                    positions_curr_ptr[idx_i * 3 + 0] -
                            positions_curr_ptr[idx_k * 3 + 0],
                    positions_curr_ptr[idx_i * 3 + 1] -
                            positions_curr_ptr[idx_k * 3 + 1],
                    positions_curr_ptr[idx_i * 3 + 2] -
                            positions_curr_ptr[idx_k * 3 + 2]};
            float R_diff_ik_curr[3];

            core::linalg::kernel::matmul3x3_3x1(*R, diff_ik_init,
                                                R_diff_ik_curr);

            // Residual of the local rigidity fit for this edge.
            float local_r[3];
            local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0];
            local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1];
            local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2];

            int offset_idx_i = 3 * idx_i + 6 * n_frags;
            int offset_idx_k = 3 * idx_k + 6 * n_frags;
#if defined(__CUDACC__)
            atomicAdd(residual_ptr, weight * (local_r[0] * local_r[0] +
                                              local_r[1] * local_r[1] +
                                              local_r[2] * local_r[2]));
            for (int axis = 0; axis < 3; ++axis) {
                // Update the 2 x 2 block of AtA coupling points i and k.
                atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
                                   offset_idx_i + axis],
                          weight);
                atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
                                   offset_idx_k + axis],
                          weight);
                atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
                                   offset_idx_k + axis],
                          -weight);
                atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
                                   offset_idx_i + axis],
                          -weight);

                atomicAdd(&Atb_ptr[offset_idx_i + axis],
                          +weight * local_r[axis]);
                atomicAdd(&Atb_ptr[offset_idx_k + axis],
                          -weight * local_r[axis]);
            }
#else
#pragma omp critical(FillInSLACRegularizerTermCPU)
            {
                *residual_ptr += weight * (local_r[0] * local_r[0] +
                                           local_r[1] * local_r[1] +
                                           local_r[2] * local_r[2]);

                for (int axis = 0; axis < 3; ++axis) {
                    AtA_ptr[(offset_idx_i + axis) * n_vars + offset_idx_i +
                            axis] += weight;
                    AtA_ptr[(offset_idx_k + axis) * n_vars + offset_idx_k +
                            axis] += weight;
                    AtA_ptr[(offset_idx_i + axis) * n_vars + offset_idx_k +
                            axis] -= weight;
                    AtA_ptr[(offset_idx_k + axis) * n_vars + offset_idx_i +
                            axis] -= weight;

                    Atb_ptr[offset_idx_i + axis] += weight * local_r[axis];
                    Atb_ptr[offset_idx_k + axis] -= weight * local_r[axis];
                }
            }
#endif
        }
    });
}
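// Usage sketch (exposition, not in the upstream source): once all three
// terms have been filled in, a Gauss-Newton update solves AtA * delta = Atb,
// e.g. with something like
//     core::Tensor delta = AtA.Solve(Atb);
// assuming core::Tensor::Solve is available; the actual solve and update
// strategy live in the SLAC pipeline code that drives these kernels.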