Open3D (C++ API) 0.13.0
FillInLinearSystemImpl.h
// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// The MIT License (MIT)
//
// Copyright (c) 2018 www.open3d.org
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// ----------------------------------------------------------------------------

#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
#include "open3d/core/kernel/CUDALauncher.cuh"
#else
#include "open3d/core/kernel/CPULauncher.h"
#endif

namespace open3d {
namespace t {
namespace pipelines {
namespace kernel {
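
// Assembles the rigid point-to-plane alignment term between fragment pair
// (i, j): for each correspondence, the residual is
//   r = (Ti p - Tj q) . (Ri n_p),
// linearized w.r.t. the two 6-DoF fragment poses (3 rotation + 3 translation
// parameters each), giving a 12-entry local Jacobian. Correspondences with
// |r| > threshold are rejected as outliers.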
#if defined(__CUDACC__)
void FillInRigidAlignmentTermCUDA
#else
void FillInRigidAlignmentTermCPU
#endif
        (core::Tensor &AtA,
         core::Tensor &Atb,
         core::Tensor &residual,
         const core::Tensor &Ti_ps,
         const core::Tensor &Tj_qs,
         const core::Tensor &Ri_normal_ps,
         int i,
         int j,
         float threshold) {

    core::Device device = AtA.GetDevice();
    int64_t n = Ti_ps.GetLength();
    if (Tj_qs.GetLength() != n || Ri_normal_ps.GetLength() != n) {
        utility::LogError(
                "Unable to setup linear system: input length mismatch.");
    }

    // First fill in a small 12 x 12 linear system
    core::Tensor AtA_local =
            core::Tensor::Zeros({12, 12}, core::Dtype::Float32, device);
    core::Tensor Atb_local =
            core::Tensor::Zeros({12}, core::Dtype::Float32, device);

    float *AtA_local_ptr = static_cast<float *>(AtA_local.GetDataPtr());
    float *Atb_local_ptr = static_cast<float *>(Atb_local.GetDataPtr());
    float *residual_ptr = static_cast<float *>(residual.GetDataPtr());

    const float *Ti_ps_ptr = static_cast<const float *>(Ti_ps.GetDataPtr());
    const float *Tj_qs_ptr = static_cast<const float *>(Tj_qs.GetDataPtr());
    const float *Ri_normal_ps_ptr =
            static_cast<const float *>(Ri_normal_ps.GetDataPtr());

#if defined(__CUDACC__)
    core::kernel::CUDALauncher launcher;
#else
    core::kernel::CPULauncher launcher;
#endif
    launcher.LaunchGeneralKernel(n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
        const float *p_prime = Ti_ps_ptr + 3 * workload_idx;
        const float *q_prime = Tj_qs_ptr + 3 * workload_idx;
        const float *normal_p_prime = Ri_normal_ps_ptr + 3 * workload_idx;

        float r = (p_prime[0] - q_prime[0]) * normal_p_prime[0] +
                  (p_prime[1] - q_prime[1]) * normal_p_prime[1] +
                  (p_prime[2] - q_prime[2]) * normal_p_prime[2];
        if (abs(r) > threshold) return;

        float J_ij[12];
        J_ij[0] = -q_prime[2] * normal_p_prime[1] +
                  q_prime[1] * normal_p_prime[2];
        J_ij[1] =
                q_prime[2] * normal_p_prime[0] - q_prime[0] * normal_p_prime[2];
        J_ij[2] = -q_prime[1] * normal_p_prime[0] +
                  q_prime[0] * normal_p_prime[1];
        J_ij[3] = normal_p_prime[0];
        J_ij[4] = normal_p_prime[1];
        J_ij[5] = normal_p_prime[2];
        for (int k = 0; k < 6; ++k) {
            J_ij[k + 6] = -J_ij[k];
        }

        // Not optimized; switch to a reduction if necessary.
#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
        for (int i_local = 0; i_local < 12; ++i_local) {
            for (int j_local = 0; j_local < 12; ++j_local) {
                atomicAdd(&AtA_local_ptr[i_local * 12 + j_local],
                          J_ij[i_local] * J_ij[j_local]);
            }
            atomicAdd(&Atb_local_ptr[i_local], J_ij[i_local] * r);
        }
        atomicAdd(residual_ptr, r * r);
#else
#pragma omp critical
        {
            for (int i_local = 0; i_local < 12; ++i_local) {
                for (int j_local = 0; j_local < 12; ++j_local) {
                    AtA_local_ptr[i_local * 12 + j_local] +=
                            J_ij[i_local] * J_ij[j_local];
                }
                Atb_local_ptr[i_local] += J_ij[i_local] * r;
            }
            *residual_ptr += r * r;
        }
#endif
    });

    // Then fill in the large linear system
    std::vector<int64_t> indices_vec(12);
    for (int k = 0; k < 6; ++k) {
        indices_vec[k] = i * 6 + k;
        indices_vec[k + 6] = j * 6 + k;
    }

    std::vector<int64_t> indices_i_vec;
    std::vector<int64_t> indices_j_vec;
    for (int local_i = 0; local_i < 12; ++local_i) {
        for (int local_j = 0; local_j < 12; ++local_j) {
            indices_i_vec.push_back(indices_vec[local_i]);
            indices_j_vec.push_back(indices_vec[local_j]);
        }
    }

    core::Tensor indices(indices_vec, {12}, core::Dtype::Int64, device);
    core::Tensor indices_i(indices_i_vec, {12 * 12}, core::Dtype::Int64,
                           device);
    core::Tensor indices_j(indices_j_vec, {12 * 12}, core::Dtype::Int64,
                           device);

    core::Tensor AtA_sub = AtA.IndexGet({indices_i, indices_j});
    AtA.IndexSet({indices_i, indices_j}, AtA_sub + AtA_local.View({12 * 12}));

    core::Tensor Atb_sub = Atb.IndexGet({indices});
    Atb.IndexSet({indices}, Atb_sub + Atb_local.View({12, 1}));
}
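
// Example (editorial sketch, not part of the original header): the indexing
// above assumes a global system with 6 variables per fragment, so a caller
// might allocate and fill it roughly as follows. Shapes are inferred from
// this file; `n_frags` and the input tensors are placeholders.
//
//   core::Device device("CPU:0");
//   core::Tensor AtA = core::Tensor::Zeros(
//           {6 * n_frags, 6 * n_frags}, core::Dtype::Float32, device);
//   core::Tensor Atb = core::Tensor::Zeros({6 * n_frags, 1},
//                                          core::Dtype::Float32, device);
//   core::Tensor residual =
//           core::Tensor::Zeros({1}, core::Dtype::Float32, device);
//   FillInRigidAlignmentTermCPU(AtA, Atb, residual, Ti_ps, Tj_qs,
//                               Ri_normal_ps, i, j, /*threshold=*/0.07f);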
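
// Assembles the SLAC alignment term. Each correspondence touches 60 unknowns,
// 2 x (6 + 8 x 3): the 6-DoF poses of fragments i and j, plus the 8
// control-grid corners (3 coordinates each) that deform p and q. As the index
// computations below show, the global layout places the 6 * n_frags pose
// variables first, followed by 3 variables per control-grid vertex.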
#if defined(__CUDACC__)
void FillInSLACAlignmentTermCUDA
#else
void FillInSLACAlignmentTermCPU
#endif
        (core::Tensor &AtA,
         core::Tensor &Atb,
         core::Tensor &residual,
         const core::Tensor &Ti_Cps,
         const core::Tensor &Tj_Cqs,
         const core::Tensor &Cnormal_ps,
         const core::Tensor &Ri_Cnormal_ps,
         const core::Tensor &RjT_Ri_Cnormal_ps,
         const core::Tensor &cgrid_idx_ps,
         const core::Tensor &cgrid_idx_qs,
         const core::Tensor &cgrid_ratio_qs,
         const core::Tensor &cgrid_ratio_ps,
         int i,
         int j,
         int n_frags,
         float threshold) {
    int64_t n = Ti_Cps.GetLength();
    if (Tj_Cqs.GetLength() != n || Cnormal_ps.GetLength() != n ||
        Ri_Cnormal_ps.GetLength() != n || RjT_Ri_Cnormal_ps.GetLength() != n ||
        cgrid_idx_ps.GetLength() != n || cgrid_ratio_ps.GetLength() != n ||
        cgrid_idx_qs.GetLength() != n || cgrid_ratio_qs.GetLength() != n) {
        utility::LogError(
                "Unable to setup linear system: input length mismatch.");
    }

    int n_vars = Atb.GetLength();
    float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
    float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
    float *residual_ptr = static_cast<float *>(residual.GetDataPtr());

    // Geometric properties
    const float *Ti_Cps_ptr = static_cast<const float *>(Ti_Cps.GetDataPtr());
    const float *Tj_Cqs_ptr = static_cast<const float *>(Tj_Cqs.GetDataPtr());
    const float *Cnormal_ps_ptr =
            static_cast<const float *>(Cnormal_ps.GetDataPtr());
    const float *Ri_Cnormal_ps_ptr =
            static_cast<const float *>(Ri_Cnormal_ps.GetDataPtr());
    const float *RjT_Ri_Cnormal_ps_ptr =
            static_cast<const float *>(RjT_Ri_Cnormal_ps.GetDataPtr());

    // Association properties
    const int *cgrid_idx_ps_ptr =
            static_cast<const int *>(cgrid_idx_ps.GetDataPtr());
    const int *cgrid_idx_qs_ptr =
            static_cast<const int *>(cgrid_idx_qs.GetDataPtr());
    const float *cgrid_ratio_ps_ptr =
            static_cast<const float *>(cgrid_ratio_ps.GetDataPtr());
    const float *cgrid_ratio_qs_ptr =
            static_cast<const float *>(cgrid_ratio_qs.GetDataPtr());

#if defined(__CUDACC__)
    core::kernel::CUDALauncher launcher;
#else
    core::kernel::CPULauncher launcher;
#endif
    launcher.LaunchGeneralKernel(n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
        const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx;
        const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx;
        const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx;
        const float *Ri_Cnormal_p = Ri_Cnormal_ps_ptr + 3 * workload_idx;
        const float *RjTRi_Cnormal_p = RjT_Ri_Cnormal_ps_ptr + 3 * workload_idx;

        const int *cgrid_idx_p = cgrid_idx_ps_ptr + 8 * workload_idx;
        const int *cgrid_idx_q = cgrid_idx_qs_ptr + 8 * workload_idx;
        const float *cgrid_ratio_p = cgrid_ratio_ps_ptr + 8 * workload_idx;
        const float *cgrid_ratio_q = cgrid_ratio_qs_ptr + 8 * workload_idx;

        float r = (Ti_Cp[0] - Tj_Cq[0]) * Ri_Cnormal_p[0] +
                  (Ti_Cp[1] - Tj_Cq[1]) * Ri_Cnormal_p[1] +
                  (Ti_Cp[2] - Tj_Cq[2]) * Ri_Cnormal_p[2];
        if (abs(r) > threshold) return;

        // Now we fill in a 60 x 60 sub-matrix: 60 = 2 x (6 + 8 x 3)
        float J[60];
        int idx[60];

        // Jacobian w.r.t. Ti: 0-6
        J[0] = -Tj_Cq[2] * Ri_Cnormal_p[1] + Tj_Cq[1] * Ri_Cnormal_p[2];
        J[1] = Tj_Cq[2] * Ri_Cnormal_p[0] - Tj_Cq[0] * Ri_Cnormal_p[2];
        J[2] = -Tj_Cq[1] * Ri_Cnormal_p[0] + Tj_Cq[0] * Ri_Cnormal_p[1];
        J[3] = Ri_Cnormal_p[0];
        J[4] = Ri_Cnormal_p[1];
        J[5] = Ri_Cnormal_p[2];

        // Jacobian w.r.t. Tj: 6-12
        for (int k = 0; k < 6; ++k) {
            J[k + 6] = -J[k];

            idx[k + 0] = 6 * i + k;
            idx[k + 6] = 6 * j + k;
        }

        // Jacobian w.r.t. C over p: 12-36
        for (int k = 0; k < 8; ++k) {
            J[12 + k * 3 + 0] = cgrid_ratio_p[k] * Cnormal_p[0];
            J[12 + k * 3 + 1] = cgrid_ratio_p[k] * Cnormal_p[1];
            J[12 + k * 3 + 2] = cgrid_ratio_p[k] * Cnormal_p[2];

            idx[12 + k * 3 + 0] = 6 * n_frags + cgrid_idx_p[k] * 3 + 0;
            idx[12 + k * 3 + 1] = 6 * n_frags + cgrid_idx_p[k] * 3 + 1;
            idx[12 + k * 3 + 2] = 6 * n_frags + cgrid_idx_p[k] * 3 + 2;
        }

        // Jacobian w.r.t. C over q: 36-60
        for (int k = 0; k < 8; ++k) {
            J[36 + k * 3 + 0] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[0];
            J[36 + k * 3 + 1] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[1];
            J[36 + k * 3 + 2] = -cgrid_ratio_q[k] * RjTRi_Cnormal_p[2];

            idx[36 + k * 3 + 0] = 6 * n_frags + cgrid_idx_q[k] * 3 + 0;
            idx[36 + k * 3 + 1] = 6 * n_frags + cgrid_idx_q[k] * 3 + 1;
            idx[36 + k * 3 + 2] = 6 * n_frags + cgrid_idx_q[k] * 3 + 2;
        }

        // Not optimized; switch to a reduction if necessary.
#if defined(__CUDACC__)
        for (int ki = 0; ki < 60; ++ki) {
            for (int kj = 0; kj < 60; ++kj) {
                float AtA_ij = J[ki] * J[kj];
                int ij = idx[ki] * n_vars + idx[kj];
                atomicAdd(AtA_ptr + ij, AtA_ij);
            }
            float Atb_i = J[ki] * r;
            atomicAdd(Atb_ptr + idx[ki], Atb_i);
        }
        atomicAdd(residual_ptr, r * r);
#else
#pragma omp critical
        {
            for (int ki = 0; ki < 60; ++ki) {
                for (int kj = 0; kj < 60; ++kj) {
                    AtA_ptr[idx[ki] * n_vars + idx[kj]] += J[ki] * J[kj];
                }
                Atb_ptr[idx[ki]] += J[ki] * r;
            }
            *residual_ptr += r * r;
        }
#endif
    });
}
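
// Editorial note: the global column index used above for control-grid vertex
// g and axis a is 6 * n_frags + 3 * g + a, so AtA is expected to be a square
// {n_vars, n_vars} Float32 tensor with n_vars = 6 * n_frags plus 3 per
// control-grid vertex, and Atb an {n_vars, 1} vector.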

inline OPEN3D_HOST_DEVICE void matmul3x3_3x1(float m00,
                                             float m01,
                                             float m02,
                                             float m10,
                                             float m11,
                                             float m12,
                                             float m20,
                                             float m21,
                                             float m22,
                                             float v0,
                                             float v1,
                                             float v2,
                                             float &o0,
                                             float &o1,
                                             float &o2) {
    o0 = m00 * v0 + m01 * v1 + m02 * v2;
    o1 = m10 * v0 + m11 * v1 + m12 * v2;
    o2 = m20 * v0 + m21 * v1 + m22 * v2;
}

inline OPEN3D_HOST_DEVICE void matmul3x3_3x3(float a00,
                                             float a01,
                                             float a02,
                                             float a10,
                                             float a11,
                                             float a12,
                                             float a20,
                                             float a21,
                                             float a22,
                                             float b00,
                                             float b01,
                                             float b02,
                                             float b10,
                                             float b11,
                                             float b12,
                                             float b20,
                                             float b21,
                                             float b22,
                                             float &c00,
                                             float &c01,
                                             float &c02,
                                             float &c10,
                                             float &c11,
                                             float &c12,
                                             float &c20,
                                             float &c21,
                                             float &c22) {
    matmul3x3_3x1(a00, a01, a02, a10, a11, a12, a20, a21, a22, b00, b10, b20,
                  c00, c10, c20);
    matmul3x3_3x1(a00, a01, a02, a10, a11, a12, a20, a21, a22, b01, b11, b21,
                  c01, c11, c21);
    matmul3x3_3x1(a00, a01, a02, a10, a11, a12, a20, a21, a22, b02, b12, b22,
                  c02, c12, c22);
}
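
// The unrolled 3x3 helpers above and below are marked OPEN3D_HOST_DEVICE so
// the same scalar code compiles for both the CPU and CUDA kernel paths; the
// regularizer below uses them to compose the SVD factors into the Kabsch
// rotation R = V * U^T.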

inline OPEN3D_HOST_DEVICE float det3x3(float m00,
                                       float m01,
                                       float m02,
                                       float m10,
                                       float m11,
                                       float m12,
                                       float m20,
                                       float m21,
                                       float m22) {
    return m00 * (m11 * m22 - m12 * m21) - m10 * (m01 * m22 - m02 * m21) +
           m20 * (m01 * m12 - m02 * m11);
}
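
// det3x3 expands the determinant along the first column. The regularizer
// below uses it to detect a reflection in the Procrustes solution: when
// det(V * U^T) < 0, the third column of U is negated so that a proper
// rotation (det = +1) is produced.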
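
// Assembles the SLAC regularizer term. For each control-grid vertex i, a
// local rotation R is estimated from the covariance of the initial vs.
// current edge vectors to its (up to 6) neighbors, i.e. the Procrustes
// problem Y = RX solved via SVD (Kabsch). Each valid edge then contributes
// the residual r = (v_i' - v_k') - R (v_i - v_k) with weight `weight`;
// `anchor_idx` pins the gauge by forcing that vertex's rotation to identity.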
#if defined(__CUDACC__)
void FillInSLACRegularizerTermCUDA
#else
void FillInSLACRegularizerTermCPU
#endif
        (core::Tensor &AtA,
         core::Tensor &Atb,
         core::Tensor &residual,
         const core::Tensor &grid_idx,
         const core::Tensor &grid_nbs_idx,
         const core::Tensor &grid_nbs_mask,
         const core::Tensor &positions_init,
         const core::Tensor &positions_curr,
         float weight,
         int n_frags,
         int anchor_idx) {

    int64_t n = grid_idx.GetLength();
    int64_t n_vars = Atb.GetLength();

    float *AtA_ptr = static_cast<float *>(AtA.GetDataPtr());
    float *Atb_ptr = static_cast<float *>(Atb.GetDataPtr());
    float *residual_ptr = static_cast<float *>(residual.GetDataPtr());

    const int *grid_idx_ptr = static_cast<const int *>(grid_idx.GetDataPtr());
    const int *grid_nbs_idx_ptr =
            static_cast<const int *>(grid_nbs_idx.GetDataPtr());
    const bool *grid_nbs_mask_ptr =
            static_cast<const bool *>(grid_nbs_mask.GetDataPtr());

    const float *positions_init_ptr =
            static_cast<const float *>(positions_init.GetDataPtr());
    const float *positions_curr_ptr =
            static_cast<const float *>(positions_curr.GetDataPtr());

#if defined(__CUDACC__)
    core::kernel::CUDALauncher launcher;
#else
    core::kernel::CPULauncher launcher;
#endif
    launcher.LaunchGeneralKernel(n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
        // Enumerate 6 neighbors
        int idx_i = grid_idx_ptr[workload_idx];

        const int *idx_nbs = grid_nbs_idx_ptr + 6 * workload_idx;
        const bool *mask_nbs = grid_nbs_mask_ptr + 6 * workload_idx;

        // Build a 3x3 covariance to compute the local R
        float cov[3][3] = {{0}};
        float U[3][3], V[3][3], S[3];

        int cnt = 0;
        for (int k = 0; k < 6; ++k) {
            bool mask_k = mask_nbs[k];
            if (!mask_k) continue;

            int idx_k = idx_nbs[k];

            // Edge vectors from vertex i to neighbor k
            float diff_ik_init[3] = {positions_init_ptr[idx_i * 3 + 0] -
                                             positions_init_ptr[idx_k * 3 + 0],
                                     positions_init_ptr[idx_i * 3 + 1] -
                                             positions_init_ptr[idx_k * 3 + 1],
                                     positions_init_ptr[idx_i * 3 + 2] -
                                             positions_init_ptr[idx_k * 3 + 2]};
            float diff_ik_curr[3] = {positions_curr_ptr[idx_i * 3 + 0] -
                                             positions_curr_ptr[idx_k * 3 + 0],
                                     positions_curr_ptr[idx_i * 3 + 1] -
                                             positions_curr_ptr[idx_k * 3 + 1],
                                     positions_curr_ptr[idx_i * 3 + 2] -
                                             positions_curr_ptr[idx_k * 3 + 2]};

            // Accumulate the covariance XY^T for the Procrustes problem
            // Y = RX, where Y: curr, X: init
            for (int i = 0; i < 3; ++i) {
                for (int j = 0; j < 3; ++j) {
                    cov[i][j] += diff_ik_init[i] * diff_ik_curr[j];
                }
            }
            ++cnt;
        }

        if (cnt < 3) {
            return;
        }

        // clang-format off
        svd(cov[0][0], cov[0][1], cov[0][2],
            cov[1][0], cov[1][1], cov[1][2],
            cov[2][0], cov[2][1], cov[2][2],
            U[0][0], U[0][1], U[0][2],
            U[1][0], U[1][1], U[1][2],
            U[2][0], U[2][1], U[2][2],
            S[0], S[1], S[2],
            V[0][0], V[0][1], V[0][2],
            V[1][0], V[1][1], V[1][2],
            V[2][0], V[2][1], V[2][2]);
        // clang-format on

        // TODO: det3x3 and matmul3x3
        float R[3][3];

        // clang-format off
        matmul3x3_3x3(V[0][0], V[0][1], V[0][2],
                      V[1][0], V[1][1], V[1][2],
                      V[2][0], V[2][1], V[2][2],
                      U[0][0], U[1][0], U[2][0],
                      U[0][1], U[1][1], U[2][1],
                      U[0][2], U[1][2], U[2][2],
                      R[0][0], R[0][1], R[0][2],
                      R[1][0], R[1][1], R[1][2],
                      R[2][0], R[2][1], R[2][2]);

        float d = det3x3(R[0][0], R[0][1], R[0][2],
                         R[1][0], R[1][1], R[1][2],
                         R[2][0], R[2][1], R[2][2]);
        // clang-format on

        if (d < 0) {
            // clang-format off
            matmul3x3_3x3(V[0][0], V[0][1], V[0][2],
                          V[1][0], V[1][1], V[1][2],
                          V[2][0], V[2][1], V[2][2],
                          U[0][0], U[1][0], U[2][0],
                          U[0][1], U[1][1], U[2][1],
                          -U[0][2], -U[1][2], -U[2][2],
                          R[0][0], R[0][1], R[0][2],
                          R[1][0], R[1][1], R[1][2],
                          R[2][0], R[2][1], R[2][2]);
            // clang-format on
        }

        // Now that we have R, build the Hessian and residuals.
        // But first, anchor one vertex by pinning its rotation to identity.
        if (idx_i == anchor_idx) {
            R[0][0] = R[1][1] = R[2][2] = 1;
            R[0][1] = R[0][2] = R[1][0] = R[1][2] = R[2][0] = R[2][1] = 0;
        }
        for (int k = 0; k < 6; ++k) {
            bool mask_k = mask_nbs[k];

            if (mask_k) {
                int idx_k = idx_nbs[k];

                float diff_ik_init[3] = {
                        positions_init_ptr[idx_i * 3 + 0] -
                                positions_init_ptr[idx_k * 3 + 0],
                        positions_init_ptr[idx_i * 3 + 1] -
                                positions_init_ptr[idx_k * 3 + 1],
                        positions_init_ptr[idx_i * 3 + 2] -
                                positions_init_ptr[idx_k * 3 + 2]};
                float diff_ik_curr[3] = {
                        positions_curr_ptr[idx_i * 3 + 0] -
                                positions_curr_ptr[idx_k * 3 + 0],
                        positions_curr_ptr[idx_i * 3 + 1] -
                                positions_curr_ptr[idx_k * 3 + 1],
                        positions_curr_ptr[idx_i * 3 + 2] -
                                positions_curr_ptr[idx_k * 3 + 2]};
                float R_diff_ik_curr[3];

                // clang-format off
                matmul3x3_3x1(R[0][0], R[0][1], R[0][2],
                              R[1][0], R[1][1], R[1][2],
                              R[2][0], R[2][1], R[2][2],
                              diff_ik_init[0],
                              diff_ik_init[1],
                              diff_ik_init[2],
                              R_diff_ik_curr[0],
                              R_diff_ik_curr[1],
                              R_diff_ik_curr[2]);
                // clang-format on

                float local_r[3];
                local_r[0] = diff_ik_curr[0] - R_diff_ik_curr[0];
                local_r[1] = diff_ik_curr[1] - R_diff_ik_curr[1];
                local_r[2] = diff_ik_curr[2] - R_diff_ik_curr[2];

                int offset_idx_i = 3 * idx_i + 6 * n_frags;
                int offset_idx_k = 3 * idx_k + 6 * n_frags;

#if defined(__CUDACC__)
                // Update residual
                atomicAdd(residual_ptr, weight * (local_r[0] * local_r[0] +
                                                  local_r[1] * local_r[1] +
                                                  local_r[2] * local_r[2]));

                for (int axis = 0; axis < 3; ++axis) {
                    // Update AtA: 2x2
                    atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
                                       offset_idx_i + axis],
                              weight);
                    atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
                                       offset_idx_k + axis],
                              weight);
                    atomicAdd(&AtA_ptr[(offset_idx_i + axis) * n_vars +
                                       offset_idx_k + axis],
                              -weight);
                    atomicAdd(&AtA_ptr[(offset_idx_k + axis) * n_vars +
                                       offset_idx_i + axis],
                              -weight);

                    // Update Atb: 2x1
                    atomicAdd(&Atb_ptr[offset_idx_i + axis],
                              +weight * local_r[axis]);
                    atomicAdd(&Atb_ptr[offset_idx_k + axis],
                              -weight * local_r[axis]);
                }
#else
#pragma omp critical
                {
                    // Update residual
                    *residual_ptr += weight * (local_r[0] * local_r[0] +
                                               local_r[1] * local_r[1] +
                                               local_r[2] * local_r[2]);

                    for (int axis = 0; axis < 3; ++axis) {
                        // Update AtA: 2x2
                        AtA_ptr[(offset_idx_i + axis) * n_vars +
                                offset_idx_i + axis] += weight;
                        AtA_ptr[(offset_idx_k + axis) * n_vars +
                                offset_idx_k + axis] += weight;

                        AtA_ptr[(offset_idx_i + axis) * n_vars +
                                offset_idx_k + axis] -= weight;
                        AtA_ptr[(offset_idx_k + axis) * n_vars +
                                offset_idx_i + axis] -= weight;

                        // Update Atb: 2x1
                        Atb_ptr[offset_idx_i + axis] += weight * local_r[axis];
                        Atb_ptr[offset_idx_k + axis] -= weight * local_r[axis];
                    }
                }
#endif
            }
        }
    });
}
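
// Example (editorial sketch, not part of the original header): after all
// alignment and regularizer terms have been filled in, a caller might obtain
// the update step from the assembled normal equations via the core
// linear-algebra module, e.g.
//
//   core::Tensor delta = AtA.Solve(Atb);  // {n_vars, 1}
//
// assuming AtA is well-conditioned; damping the diagonal first is a common
// alternative.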

}  // namespace kernel
}  // namespace pipelines
}  // namespace t
}  // namespace open3d