Open3D (C++ API) 0.14.1
SVD3x3.h
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // The MIT License (MIT)
5 //
6 // Copyright (c) 2018-2021 www.open3d.org
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
14 //
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
17 //
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 // IN THE SOFTWARE.
25 // ----------------------------------------------------------------------------
26 /**************************************************************************
27 **
28 ** svd3
29 **
30 ** Quick singular value decomposition as described by:
31 ** A. McAdams, A. Selle, R. Tamstorf, J. Teran and E. Sifakis,
32 ** Computing the Singular Value Decomposition of 3x3 matrices
33 ** with minimal branching and elementary floating point operations,
34 ** University of Wisconsin - Madison technical report TR1690, May 2011
35 **
36 ** Implemented by: Kui Wu
37 ** kwu@cs.utah.edu
38 ** Modified by: Wei Dong and Rishabh Singh
39 **
40 ** May 2018
41 **
42 **************************************************************************/
43 
44 #pragma once
45 
46 #include <cmath>
47 
48 #include "open3d/core/CUDAUtils.h"
50 
51 #if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
52 #include <cuda.h>
53 #endif
54 
55 #include "math.h"
57 
58 #define gone 1065353216
59 #define gsine_pi_over_eight 1053028117
60 
61 #define gcosine_pi_over_eight 1064076127
62 #define gtiny_number 1.e-20
63 #define gfour_gamma_squared 5.8284273147583007813
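// These integer constants are IEEE-754 single-precision bit patterns used by
// the branchless mask tricks below: gone = 0x3F800000 is the bit pattern of
// 1.0f, and gsine_pi_over_eight / gcosine_pi_over_eight hold the bits of
// sin(pi/8) ~ 0.3826834f and cos(pi/8) ~ 0.9238795f.  gfour_gamma_squared is
// the single-precision rounding of 3 + 2*sqrt(2) = cot^2(pi/8), the threshold
// from the McAdams et al. report for clamping a Givens angle to pi/8.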
64 
65 #ifndef __CUDACC__
66 using std::abs;
67 using std::max;
68 using std::sqrt;
69 #endif
70 
71 #if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
72 #define __fadd_rn(x, y) __fadd_rn(x, y)
73 #define __fsub_rn(x, y) __fsub_rn(x, y)
74 #define __frsqrt_rn(x) __frsqrt_rn(x)
75 
76 #define __dadd_rn(x, y) __dadd_rn(x, y)
77 #define __dsub_rn(x, y) __dsub_rn(x, y)
78 #define __drsqrt_rn(x) __drcp_rn(__dsqrt_rn(x))
79 #else
80 
81 #define __fadd_rn(x, y) (x + y)
82 #define __fsub_rn(x, y) (x - y)
83 #define __frsqrt_rn(x) (1.0 / sqrt(x))
84 
85 #define __dadd_rn(x, y) (x + y)
86 #define __dsub_rn(x, y) (x - y)
87 #define __drsqrt_rn(x) (1.0 / sqrt(x))
88 
89 #define __add_rn(x, y) (x + y)
90 #define __sub_rn(x, y) (x - y)
91 #define __rsqrt_rn(x) (1.0 / sqrt(x))
92 #endif
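// In device code (the branch above) these names resolve to CUDA's
// round-to-nearest intrinsics; since CUDA has no double-precision rsqrt
// intrinsic, __drsqrt_rn is composed from __drcp_rn and __dsqrt_rn.  On the
// host (the branch just closed) the same macros fall back to plain
// arithmetic such as (x + y) and 1.0 / sqrt(x), so host and device results
// can differ in the last bits.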
93 
94 namespace open3d {
95 namespace core {
96 namespace linalg {
97 namespace kernel {
98 
99 template <typename scalar_t>
100 union un {
101  scalar_t f;
102  unsigned int ui;
103 };
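// Type-punning helper: the same storage read either as the scalar value (.f)
// or as raw bits (.ui), which lets comparisons become all-ones/all-zeros
// masks that are applied branchlessly with &, |, ^ below.  Note that .ui is
// a 32-bit unsigned int even when scalar_t is double, so in the double
// specialization the bit-level operations only touch the low word.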
104 
105 template <typename scalar_t>
106 OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3(const scalar_t *A_3x3,
107  scalar_t *U_3x3,
108  scalar_t *S_3x1,
109  scalar_t *V_3x3);
110 
111 template <>
112 OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3<double>(const double *A_3x3,
113  double *U_3x3,
114  double *S_3x1,
115  double *V_3x3) {
116  double gsmall_number = 1.e-12;
117 
118  un<double> Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
119  un<double> Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
120  un<double> Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
121  un<double> Sc, Ss, Sch, Ssh;
122  un<double> Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
123  un<double> Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
124  un<double> Sqvs, Sqvvx, Sqvvy, Sqvvz;
125 
126  Sa11.f = A_3x3[0];
127  Sa12.f = A_3x3[1];
128  Sa13.f = A_3x3[2];
129  Sa21.f = A_3x3[3];
130  Sa22.f = A_3x3[4];
131  Sa23.f = A_3x3[5];
132  Sa31.f = A_3x3[6];
133  Sa32.f = A_3x3[7];
134  Sa33.f = A_3x3[8];
135 
136  //###########################################################
137  // Compute normal equations matrix
138  //###########################################################
139 
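 // The block below forms S = A^T * A (the normal-equations matrix).  S is
 // symmetric, so only the six entries s11, s21, s31, s22, s32, s33 are
 // computed; its eigenvectors are the right singular vectors V of A and its
 // eigenvalues are the squared singular values.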
140  Ss11.f = Sa11.f * Sa11.f;
141  Stmp1.f = Sa21.f * Sa21.f;
142  Ss11.f = __dadd_rn(Stmp1.f, Ss11.f);
143  Stmp1.f = Sa31.f * Sa31.f;
144  Ss11.f = __dadd_rn(Stmp1.f, Ss11.f);
145 
146  Ss21.f = Sa12.f * Sa11.f;
147  Stmp1.f = Sa22.f * Sa21.f;
148  Ss21.f = __dadd_rn(Stmp1.f, Ss21.f);
149  Stmp1.f = Sa32.f * Sa31.f;
150  Ss21.f = __dadd_rn(Stmp1.f, Ss21.f);
151 
152  Ss31.f = Sa13.f * Sa11.f;
153  Stmp1.f = Sa23.f * Sa21.f;
154  Ss31.f = __dadd_rn(Stmp1.f, Ss31.f);
155  Stmp1.f = Sa33.f * Sa31.f;
156  Ss31.f = __dadd_rn(Stmp1.f, Ss31.f);
157 
158  Ss22.f = Sa12.f * Sa12.f;
159  Stmp1.f = Sa22.f * Sa22.f;
160  Ss22.f = __dadd_rn(Stmp1.f, Ss22.f);
161  Stmp1.f = Sa32.f * Sa32.f;
162  Ss22.f = __dadd_rn(Stmp1.f, Ss22.f);
163 
164  Ss32.f = Sa13.f * Sa12.f;
165  Stmp1.f = Sa23.f * Sa22.f;
166  Ss32.f = __dadd_rn(Stmp1.f, Ss32.f);
167  Stmp1.f = Sa33.f * Sa32.f;
168  Ss32.f = __dadd_rn(Stmp1.f, Ss32.f);
169 
170  Ss33.f = Sa13.f * Sa13.f;
171  Stmp1.f = Sa23.f * Sa23.f;
172  Ss33.f = __dadd_rn(Stmp1.f, Ss33.f);
173  Stmp1.f = Sa33.f * Sa33.f;
174  Ss33.f = __dadd_rn(Stmp1.f, Ss33.f);
175 
176  Sqvs.f = 1.f;
177  Sqvvx.f = 0.f;
178  Sqvvy.f = 0.f;
179  Sqvvz.f = 0.f;
180 
181  //###########################################################
182  // Solve symmetric eigenproblem using Jacobi iteration
183  //###########################################################
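 // Four fixed sweeps of cyclic Jacobi, one block per off-diagonal entry
 // (p,q) = (1,2), (2,3), (3,1).  Each block forms the approximate Givens
 // half-angle pair of the McAdams et al. report: sh ~ s_pq / 2 and
 // ch ~ s_pp - s_qq (with ch forced to 1 via the gone bit pattern when s_pq
 // is negligible), normalizes (ch, sh), and replaces it with the fixed pi/8
 // rotation whenever ch^2 <= gfour_gamma_squared * sh^2; the rotation that
 // is actually applied is then c = ch^2 - sh^2, s = 2 * sh * ch.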
184  for (int i = 0; i < 4; i++) {
185  Ssh.f = Ss21.f * 0.5f;
186  Stmp5.f = __dsub_rn(Ss11.f, Ss22.f);
187 
188  Stmp2.f = Ssh.f * Ssh.f;
189  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
190  Ssh.ui = Stmp1.ui & Ssh.ui;
191  Sch.ui = Stmp1.ui & Stmp5.ui;
192  Stmp2.ui = ~Stmp1.ui & gone;
193  Sch.ui = Sch.ui | Stmp2.ui;
194 
195  Stmp1.f = Ssh.f * Ssh.f;
196  Stmp2.f = Sch.f * Sch.f;
197  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
198  Stmp4.f = __drsqrt_rn(Stmp3.f);
199 
200  Ssh.f = Stmp4.f * Ssh.f;
201  Sch.f = Stmp4.f * Sch.f;
202  Stmp1.f = gfour_gamma_squared * Stmp1.f;
203  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
204 
205  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
206  Ssh.ui = ~Stmp1.ui & Ssh.ui;
207  Ssh.ui = Ssh.ui | Stmp2.ui;
208  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
209  Sch.ui = ~Stmp1.ui & Sch.ui;
210  Sch.ui = Sch.ui | Stmp2.ui;
211 
212  Stmp1.f = Ssh.f * Ssh.f;
213  Stmp2.f = Sch.f * Sch.f;
214  Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
215  Ss.f = Sch.f * Ssh.f;
216  Ss.f = __dadd_rn(Ss.f, Ss.f);
217 
218 #ifdef DEBUG_JACOBI_CONJUGATE
219  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
220  Sch.f);
221 #endif
222  //###########################################################
223  // Perform the actual Givens conjugation
224  //###########################################################
225 
226  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
227  Ss33.f = Ss33.f * Stmp3.f;
228  Ss31.f = Ss31.f * Stmp3.f;
229  Ss32.f = Ss32.f * Stmp3.f;
230  Ss33.f = Ss33.f * Stmp3.f;
231 
232  Stmp1.f = Ss.f * Ss31.f;
233  Stmp2.f = Ss.f * Ss32.f;
234  Ss31.f = Sc.f * Ss31.f;
235  Ss32.f = Sc.f * Ss32.f;
236  Ss31.f = __dadd_rn(Stmp2.f, Ss31.f);
237  Ss32.f = __dsub_rn(Ss32.f, Stmp1.f);
238 
239  Stmp2.f = Ss.f * Ss.f;
240  Stmp1.f = Ss22.f * Stmp2.f;
241  Stmp3.f = Ss11.f * Stmp2.f;
242  Stmp4.f = Sc.f * Sc.f;
243  Ss11.f = Ss11.f * Stmp4.f;
244  Ss22.f = Ss22.f * Stmp4.f;
245  Ss11.f = __dadd_rn(Ss11.f, Stmp1.f);
246  Ss22.f = __dadd_rn(Ss22.f, Stmp3.f);
247  Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
248  Stmp2.f = __dadd_rn(Ss21.f, Ss21.f);
249  Ss21.f = Ss21.f * Stmp4.f;
250  Stmp4.f = Sc.f * Ss.f;
251  Stmp2.f = Stmp2.f * Stmp4.f;
252  Stmp5.f = Stmp5.f * Stmp4.f;
253  Ss11.f = __dadd_rn(Ss11.f, Stmp2.f);
254  Ss21.f = __dsub_rn(Ss21.f, Stmp5.f);
255  Ss22.f = __dsub_rn(Ss22.f, Stmp2.f);
256 
257 #ifdef DEBUG_JACOBI_CONJUGATE
258  printf("%.20g\n", Ss11.f);
259  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
260  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
261 #endif
262 
263  //###########################################################
264  // Compute the cumulative rotation, in quaternion form
265  //###########################################################
266 
267  Stmp1.f = Ssh.f * Sqvvx.f;
268  Stmp2.f = Ssh.f * Sqvvy.f;
269  Stmp3.f = Ssh.f * Sqvvz.f;
270  Ssh.f = Ssh.f * Sqvs.f;
271 
272  Sqvs.f = Sch.f * Sqvs.f;
273  Sqvvx.f = Sch.f * Sqvvx.f;
274  Sqvvy.f = Sch.f * Sqvvy.f;
275  Sqvvz.f = Sch.f * Sqvvz.f;
276 
277  Sqvvz.f = __dadd_rn(Sqvvz.f, Ssh.f);
278  Sqvs.f = __dsub_rn(Sqvs.f, Stmp3.f);
279  Sqvvx.f = __dadd_rn(Sqvvx.f, Stmp2.f);
280  Sqvvy.f = __dsub_rn(Sqvvy.f, Stmp1.f);
281 
282 #ifdef DEBUG_JACOBI_CONJUGATE
283  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
284  Sqvs.f);
285 #endif
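 // The update above right-multiplies the accumulated rotation quaternion
 // (Sqvs; Sqvvx, Sqvvy, Sqvvz) by the half-angle quaternion (ch, 0, 0, sh),
 // i.e. a rotation about z for this (1,2) conjugation; the blocks labeled
 // (1->3) and (1 -> 2) below act on the (2,3) and (3,1) pairs, rotating
 // about x and y respectively.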
286 
287  //###########################################################
288  // (1->3)
289  //###########################################################
290  Ssh.f = Ss32.f * 0.5f;
291  Stmp5.f = __dsub_rn(Ss22.f, Ss33.f);
292 
293  Stmp2.f = Ssh.f * Ssh.f;
294  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
295  Ssh.ui = Stmp1.ui & Ssh.ui;
296  Sch.ui = Stmp1.ui & Stmp5.ui;
297  Stmp2.ui = ~Stmp1.ui & gone;
298  Sch.ui = Sch.ui | Stmp2.ui;
299 
300  Stmp1.f = Ssh.f * Ssh.f;
301  Stmp2.f = Sch.f * Sch.f;
302  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
303  Stmp4.f = __drsqrt_rn(Stmp3.f);
304 
305  Ssh.f = Stmp4.f * Ssh.f;
306  Sch.f = Stmp4.f * Sch.f;
307  Stmp1.f = gfour_gamma_squared * Stmp1.f;
308  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
309 
310  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
311  Ssh.ui = ~Stmp1.ui & Ssh.ui;
312  Ssh.ui = Ssh.ui | Stmp2.ui;
313  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
314  Sch.ui = ~Stmp1.ui & Sch.ui;
315  Sch.ui = Sch.ui | Stmp2.ui;
316 
317  Stmp1.f = Ssh.f * Ssh.f;
318  Stmp2.f = Sch.f * Sch.f;
319  Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
320  Ss.f = Sch.f * Ssh.f;
321  Ss.f = __dadd_rn(Ss.f, Ss.f);
322 
323 #ifdef DEBUG_JACOBI_CONJUGATE
324  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
325  Sch.f);
326 #endif
327 
328  //###########################################################
329  // Perform the actual Givens conjugation
330  //###########################################################
331 
332  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
333  Ss11.f = Ss11.f * Stmp3.f;
334  Ss21.f = Ss21.f * Stmp3.f;
335  Ss31.f = Ss31.f * Stmp3.f;
336  Ss11.f = Ss11.f * Stmp3.f;
337 
338  Stmp1.f = Ss.f * Ss21.f;
339  Stmp2.f = Ss.f * Ss31.f;
340  Ss21.f = Sc.f * Ss21.f;
341  Ss31.f = Sc.f * Ss31.f;
342  Ss21.f = __dadd_rn(Stmp2.f, Ss21.f);
343  Ss31.f = __dsub_rn(Ss31.f, Stmp1.f);
344 
345  Stmp2.f = Ss.f * Ss.f;
346  Stmp1.f = Ss33.f * Stmp2.f;
347  Stmp3.f = Ss22.f * Stmp2.f;
348  Stmp4.f = Sc.f * Sc.f;
349  Ss22.f = Ss22.f * Stmp4.f;
350  Ss33.f = Ss33.f * Stmp4.f;
351  Ss22.f = __dadd_rn(Ss22.f, Stmp1.f);
352  Ss33.f = __dadd_rn(Ss33.f, Stmp3.f);
353  Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
354  Stmp2.f = __dadd_rn(Ss32.f, Ss32.f);
355  Ss32.f = Ss32.f * Stmp4.f;
356  Stmp4.f = Sc.f * Ss.f;
357  Stmp2.f = Stmp2.f * Stmp4.f;
358  Stmp5.f = Stmp5.f * Stmp4.f;
359  Ss22.f = __dadd_rn(Ss22.f, Stmp2.f);
360  Ss32.f = __dsub_rn(Ss32.f, Stmp5.f);
361  Ss33.f = __dsub_rn(Ss33.f, Stmp2.f);
362 
363 #ifdef DEBUG_JACOBI_CONJUGATE
364  printf("%.20g\n", Ss11.f);
365  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
366  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
367 #endif
368 
369  //###########################################################
370  // Compute the cumulative rotation, in quaternion form
371  //###########################################################
372 
373  Stmp1.f = Ssh.f * Sqvvx.f;
374  Stmp2.f = Ssh.f * Sqvvy.f;
375  Stmp3.f = Ssh.f * Sqvvz.f;
376  Ssh.f = Ssh.f * Sqvs.f;
377 
378  Sqvs.f = Sch.f * Sqvs.f;
379  Sqvvx.f = Sch.f * Sqvvx.f;
380  Sqvvy.f = Sch.f * Sqvvy.f;
381  Sqvvz.f = Sch.f * Sqvvz.f;
382 
383  Sqvvx.f = __dadd_rn(Sqvvx.f, Ssh.f);
384  Sqvs.f = __dsub_rn(Sqvs.f, Stmp1.f);
385  Sqvvy.f = __dadd_rn(Sqvvy.f, Stmp3.f);
386  Sqvvz.f = __dsub_rn(Sqvvz.f, Stmp2.f);
387 
388 #ifdef DEBUG_JACOBI_CONJUGATE
389  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
390  Sqvs.f);
391 #endif
392 #if 1
393  //###########################################################
394  // 1 -> 2
395  //###########################################################
396 
397  Ssh.f = Ss31.f * 0.5f;
398  Stmp5.f = __dsub_rn(Ss33.f, Ss11.f);
399 
400  Stmp2.f = Ssh.f * Ssh.f;
401  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
402  Ssh.ui = Stmp1.ui & Ssh.ui;
403  Sch.ui = Stmp1.ui & Stmp5.ui;
404  Stmp2.ui = ~Stmp1.ui & gone;
405  Sch.ui = Sch.ui | Stmp2.ui;
406 
407  Stmp1.f = Ssh.f * Ssh.f;
408  Stmp2.f = Sch.f * Sch.f;
409  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
410  Stmp4.f = __drsqrt_rn(Stmp3.f);
411 
412  Ssh.f = Stmp4.f * Ssh.f;
413  Sch.f = Stmp4.f * Sch.f;
414  Stmp1.f = gfour_gamma_squared * Stmp1.f;
415  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
416 
417  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
418  Ssh.ui = ~Stmp1.ui & Ssh.ui;
419  Ssh.ui = Ssh.ui | Stmp2.ui;
420  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
421  Sch.ui = ~Stmp1.ui & Sch.ui;
422  Sch.ui = Sch.ui | Stmp2.ui;
423 
424  Stmp1.f = Ssh.f * Ssh.f;
425  Stmp2.f = Sch.f * Sch.f;
426  Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
427  Ss.f = Sch.f * Ssh.f;
428  Ss.f = __dadd_rn(Ss.f, Ss.f);
429 
430 #ifdef DEBUG_JACOBI_CONJUGATE
431  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
432  Sch.f);
433 #endif
434 
435  //###########################################################
436  // Perform the actual Givens conjugation
437  //###########################################################
438 
439  Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
440  Ss22.f = Ss22.f * Stmp3.f;
441  Ss32.f = Ss32.f * Stmp3.f;
442  Ss21.f = Ss21.f * Stmp3.f;
443  Ss22.f = Ss22.f * Stmp3.f;
444 
445  Stmp1.f = Ss.f * Ss32.f;
446  Stmp2.f = Ss.f * Ss21.f;
447  Ss32.f = Sc.f * Ss32.f;
448  Ss21.f = Sc.f * Ss21.f;
449  Ss32.f = __dadd_rn(Stmp2.f, Ss32.f);
450  Ss21.f = __dsub_rn(Ss21.f, Stmp1.f);
451 
452  Stmp2.f = Ss.f * Ss.f;
453  Stmp1.f = Ss11.f * Stmp2.f;
454  Stmp3.f = Ss33.f * Stmp2.f;
455  Stmp4.f = Sc.f * Sc.f;
456  Ss33.f = Ss33.f * Stmp4.f;
457  Ss11.f = Ss11.f * Stmp4.f;
458  Ss33.f = __dadd_rn(Ss33.f, Stmp1.f);
459  Ss11.f = __dadd_rn(Ss11.f, Stmp3.f);
460  Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
461  Stmp2.f = __dadd_rn(Ss31.f, Ss31.f);
462  Ss31.f = Ss31.f * Stmp4.f;
463  Stmp4.f = Sc.f * Ss.f;
464  Stmp2.f = Stmp2.f * Stmp4.f;
465  Stmp5.f = Stmp5.f * Stmp4.f;
466  Ss33.f = __dadd_rn(Ss33.f, Stmp2.f);
467  Ss31.f = __dsub_rn(Ss31.f, Stmp5.f);
468  Ss11.f = __dsub_rn(Ss11.f, Stmp2.f);
469 
470 #ifdef DEBUG_JACOBI_CONJUGATE
471  printf("%.20g\n", Ss11.f);
472  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
473  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
474 #endif
475 
476  //###########################################################
477  // Compute the cumulative rotation, in quaternion form
478  //###########################################################
479 
480  Stmp1.f = Ssh.f * Sqvvx.f;
481  Stmp2.f = Ssh.f * Sqvvy.f;
482  Stmp3.f = Ssh.f * Sqvvz.f;
483  Ssh.f = Ssh.f * Sqvs.f;
484 
485  Sqvs.f = Sch.f * Sqvs.f;
486  Sqvvx.f = Sch.f * Sqvvx.f;
487  Sqvvy.f = Sch.f * Sqvvy.f;
488  Sqvvz.f = Sch.f * Sqvvz.f;
489 
490  Sqvvy.f = __dadd_rn(Sqvvy.f, Ssh.f);
491  Sqvs.f = __dsub_rn(Sqvs.f, Stmp2.f);
492  Sqvvz.f = __dadd_rn(Sqvvz.f, Stmp1.f);
493  Sqvvx.f = __dsub_rn(Sqvvx.f, Stmp3.f);
494 #endif
495  }
496 
497  //###########################################################
498  // Normalize quaternion for matrix V
499  //###########################################################
500 
501  Stmp2.f = Sqvs.f * Sqvs.f;
502  Stmp1.f = Sqvvx.f * Sqvvx.f;
503  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
504  Stmp1.f = Sqvvy.f * Sqvvy.f;
505  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
506  Stmp1.f = Sqvvz.f * Sqvvz.f;
507  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
508 
509  Stmp1.f = __drsqrt_rn(Stmp2.f);
510  Stmp4.f = Stmp1.f * 0.5f;
511  Stmp3.f = Stmp1.f * Stmp4.f;
512  Stmp3.f = Stmp1.f * Stmp3.f;
513  Stmp3.f = Stmp2.f * Stmp3.f;
514  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
515  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
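 // The five lines above apply one Newton-Raphson refinement step to the
 // fast reciprocal square root:  y <- y * (3 - x * y^2) / 2  with
 // x = |q|^2, so the quaternion is normalized to (nearly) unit length.
 // The same refinement pattern reappears in the Givens computations below.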
516 
517  Sqvs.f = Sqvs.f * Stmp1.f;
518  Sqvvx.f = Sqvvx.f * Stmp1.f;
519  Sqvvy.f = Sqvvy.f * Stmp1.f;
520  Sqvvz.f = Sqvvz.f * Stmp1.f;
521 
522  //###########################################################
523  // Transform quaternion to matrix V
524  //###########################################################
525 
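 // Standard quaternion-to-rotation-matrix conversion: for q = (s; x, y, z),
 //   V = (s^2 - x^2 - y^2 - z^2) * I + 2 * (x,y,z)(x,y,z)^T + 2 * s * [(x,y,z)]x
 // expanded entry by entry below, e.g. v11 = s^2 + x^2 - y^2 - z^2.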
526  Stmp1.f = Sqvvx.f * Sqvvx.f;
527  Stmp2.f = Sqvvy.f * Sqvvy.f;
528  Stmp3.f = Sqvvz.f * Sqvvz.f;
529  Sv11.f = Sqvs.f * Sqvs.f;
530  Sv22.f = __dsub_rn(Sv11.f, Stmp1.f);
531  Sv33.f = __dsub_rn(Sv22.f, Stmp2.f);
532  Sv33.f = __dadd_rn(Sv33.f, Stmp3.f);
533  Sv22.f = __dadd_rn(Sv22.f, Stmp2.f);
534  Sv22.f = __dsub_rn(Sv22.f, Stmp3.f);
535  Sv11.f = __dadd_rn(Sv11.f, Stmp1.f);
536  Sv11.f = __dsub_rn(Sv11.f, Stmp2.f);
537  Sv11.f = __dsub_rn(Sv11.f, Stmp3.f);
538  Stmp1.f = __dadd_rn(Sqvvx.f, Sqvvx.f);
539  Stmp2.f = __dadd_rn(Sqvvy.f, Sqvvy.f);
540  Stmp3.f = __dadd_rn(Sqvvz.f, Sqvvz.f);
541  Sv32.f = Sqvs.f * Stmp1.f;
542  Sv13.f = Sqvs.f * Stmp2.f;
543  Sv21.f = Sqvs.f * Stmp3.f;
544  Stmp1.f = Sqvvy.f * Stmp1.f;
545  Stmp2.f = Sqvvz.f * Stmp2.f;
546  Stmp3.f = Sqvvx.f * Stmp3.f;
547  Sv12.f = __dsub_rn(Stmp1.f, Sv21.f);
548  Sv23.f = __dsub_rn(Stmp2.f, Sv32.f);
549  Sv31.f = __dsub_rn(Stmp3.f, Sv13.f);
550  Sv21.f = __dadd_rn(Stmp1.f, Sv21.f);
551  Sv32.f = __dadd_rn(Stmp2.f, Sv32.f);
552  Sv13.f = __dadd_rn(Stmp3.f, Sv13.f);
553 
554  //###########################################################
555  // Multiply (from the right) with V
556  //###########################################################
557 
558  Stmp2.f = Sa12.f;
559  Stmp3.f = Sa13.f;
560  Sa12.f = Sv12.f * Sa11.f;
561  Sa13.f = Sv13.f * Sa11.f;
562  Sa11.f = Sv11.f * Sa11.f;
563  Stmp1.f = Sv21.f * Stmp2.f;
564  Sa11.f = __dadd_rn(Sa11.f, Stmp1.f);
565  Stmp1.f = Sv31.f * Stmp3.f;
566  Sa11.f = __dadd_rn(Sa11.f, Stmp1.f);
567  Stmp1.f = Sv22.f * Stmp2.f;
568  Sa12.f = __dadd_rn(Sa12.f, Stmp1.f);
569  Stmp1.f = Sv32.f * Stmp3.f;
570  Sa12.f = __dadd_rn(Sa12.f, Stmp1.f);
571  Stmp1.f = Sv23.f * Stmp2.f;
572  Sa13.f = __dadd_rn(Sa13.f, Stmp1.f);
573  Stmp1.f = Sv33.f * Stmp3.f;
574  Sa13.f = __dadd_rn(Sa13.f, Stmp1.f);
575 
576  Stmp2.f = Sa22.f;
577  Stmp3.f = Sa23.f;
578  Sa22.f = Sv12.f * Sa21.f;
579  Sa23.f = Sv13.f * Sa21.f;
580  Sa21.f = Sv11.f * Sa21.f;
581  Stmp1.f = Sv21.f * Stmp2.f;
582  Sa21.f = __dadd_rn(Sa21.f, Stmp1.f);
583  Stmp1.f = Sv31.f * Stmp3.f;
584  Sa21.f = __dadd_rn(Sa21.f, Stmp1.f);
585  Stmp1.f = Sv22.f * Stmp2.f;
586  Sa22.f = __dadd_rn(Sa22.f, Stmp1.f);
587  Stmp1.f = Sv32.f * Stmp3.f;
588  Sa22.f = __dadd_rn(Sa22.f, Stmp1.f);
589  Stmp1.f = Sv23.f * Stmp2.f;
590  Sa23.f = __dadd_rn(Sa23.f, Stmp1.f);
591  Stmp1.f = Sv33.f * Stmp3.f;
592  Sa23.f = __dadd_rn(Sa23.f, Stmp1.f);
593 
594  Stmp2.f = Sa32.f;
595  Stmp3.f = Sa33.f;
596  Sa32.f = Sv12.f * Sa31.f;
597  Sa33.f = Sv13.f * Sa31.f;
598  Sa31.f = Sv11.f * Sa31.f;
599  Stmp1.f = Sv21.f * Stmp2.f;
600  Sa31.f = __dadd_rn(Sa31.f, Stmp1.f);
601  Stmp1.f = Sv31.f * Stmp3.f;
602  Sa31.f = __dadd_rn(Sa31.f, Stmp1.f);
603  Stmp1.f = Sv22.f * Stmp2.f;
604  Sa32.f = __dadd_rn(Sa32.f, Stmp1.f);
605  Stmp1.f = Sv32.f * Stmp3.f;
606  Sa32.f = __dadd_rn(Sa32.f, Stmp1.f);
607  Stmp1.f = Sv23.f * Stmp2.f;
608  Sa33.f = __dadd_rn(Sa33.f, Stmp1.f);
609  Stmp1.f = Sv33.f * Stmp3.f;
610  Sa33.f = __dadd_rn(Sa33.f, Stmp1.f);
611 
612  //###########################################################
613  // Permute columns such that the singular values are sorted
614  //###########################################################
615 
616  Stmp1.f = Sa11.f * Sa11.f;
617  Stmp4.f = Sa21.f * Sa21.f;
618  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
619  Stmp4.f = Sa31.f * Sa31.f;
620  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
621 
622  Stmp2.f = Sa12.f * Sa12.f;
623  Stmp4.f = Sa22.f * Sa22.f;
624  Stmp2.f = __dadd_rn(Stmp2.f, Stmp4.f);
625  Stmp4.f = Sa32.f * Sa32.f;
626  Stmp2.f = __dadd_rn(Stmp2.f, Stmp4.f);
627 
628  Stmp3.f = Sa13.f * Sa13.f;
629  Stmp4.f = Sa23.f * Sa23.f;
630  Stmp3.f = __dadd_rn(Stmp3.f, Stmp4.f);
631  Stmp4.f = Sa33.f * Sa33.f;
632  Stmp3.f = __dadd_rn(Stmp3.f, Stmp4.f);
633 
634  // Swap columns 1-2 if necessary
635 
636  Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
637  Stmp5.ui = Sa11.ui ^ Sa12.ui;
638  Stmp5.ui = Stmp5.ui & Stmp4.ui;
639  Sa11.ui = Sa11.ui ^ Stmp5.ui;
640  Sa12.ui = Sa12.ui ^ Stmp5.ui;
641 
642  Stmp5.ui = Sa21.ui ^ Sa22.ui;
643  Stmp5.ui = Stmp5.ui & Stmp4.ui;
644  Sa21.ui = Sa21.ui ^ Stmp5.ui;
645  Sa22.ui = Sa22.ui ^ Stmp5.ui;
646 
647  Stmp5.ui = Sa31.ui ^ Sa32.ui;
648  Stmp5.ui = Stmp5.ui & Stmp4.ui;
649  Sa31.ui = Sa31.ui ^ Stmp5.ui;
650  Sa32.ui = Sa32.ui ^ Stmp5.ui;
651 
652  Stmp5.ui = Sv11.ui ^ Sv12.ui;
653  Stmp5.ui = Stmp5.ui & Stmp4.ui;
654  Sv11.ui = Sv11.ui ^ Stmp5.ui;
655  Sv12.ui = Sv12.ui ^ Stmp5.ui;
656 
657  Stmp5.ui = Sv21.ui ^ Sv22.ui;
658  Stmp5.ui = Stmp5.ui & Stmp4.ui;
659  Sv21.ui = Sv21.ui ^ Stmp5.ui;
660  Sv22.ui = Sv22.ui ^ Stmp5.ui;
661 
662  Stmp5.ui = Sv31.ui ^ Sv32.ui;
663  Stmp5.ui = Stmp5.ui & Stmp4.ui;
664  Sv31.ui = Sv31.ui ^ Stmp5.ui;
665  Sv32.ui = Sv32.ui ^ Stmp5.ui;
666 
667  Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
668  Stmp5.ui = Stmp5.ui & Stmp4.ui;
669  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
670  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
671 
672  // If columns 1-2 have been swapped, negate 2nd column of A and V so that V
673  // is still a rotation
674 
675  Stmp5.f = -2.f;
676  Stmp5.ui = Stmp5.ui & Stmp4.ui;
677  Stmp4.f = 1.f;
678  Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
679 
680  Sa12.f = Sa12.f * Stmp4.f;
681  Sa22.f = Sa22.f * Stmp4.f;
682  Sa32.f = Sa32.f * Stmp4.f;
683 
684  Sv12.f = Sv12.f * Stmp4.f;
685  Sv22.f = Sv22.f * Stmp4.f;
686  Sv32.f = Sv32.f * Stmp4.f;
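 // The swap-and-negate above is branchless: Stmp4 starts as an all-ones mask
 // when the columns must be swapped, the XOR trick exchanges bit patterns
 // through it, and the sign flip multiplies by 1 + (-2 & mask), i.e. by -1
 // exactly when the mask is set.  A minimal standalone sketch of the idiom
 // (hypothetical names, float width so the mask covers the whole value):
 //
 //   un<float> a, b, t, mask;
 //   mask.ui = (a.f < b.f) ? 0xffffffff : 0;  // condition as a bit mask
 //   t.ui = a.ui ^ b.ui;                      // XOR of the two bit patterns
 //   t.ui = t.ui & mask.ui;                   // zeroed when no swap needed
 //   a.ui = a.ui ^ t.ui;                      // a and b exchanged iff mask set
 //   b.ui = b.ui ^ t.ui;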
687 
688  // Swap columns 1-3 if necessary
689 
690  Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
691  Stmp5.ui = Sa11.ui ^ Sa13.ui;
692  Stmp5.ui = Stmp5.ui & Stmp4.ui;
693  Sa11.ui = Sa11.ui ^ Stmp5.ui;
694  Sa13.ui = Sa13.ui ^ Stmp5.ui;
695 
696  Stmp5.ui = Sa21.ui ^ Sa23.ui;
697  Stmp5.ui = Stmp5.ui & Stmp4.ui;
698  Sa21.ui = Sa21.ui ^ Stmp5.ui;
699  Sa23.ui = Sa23.ui ^ Stmp5.ui;
700 
701  Stmp5.ui = Sa31.ui ^ Sa33.ui;
702  Stmp5.ui = Stmp5.ui & Stmp4.ui;
703  Sa31.ui = Sa31.ui ^ Stmp5.ui;
704  Sa33.ui = Sa33.ui ^ Stmp5.ui;
705 
706  Stmp5.ui = Sv11.ui ^ Sv13.ui;
707  Stmp5.ui = Stmp5.ui & Stmp4.ui;
708  Sv11.ui = Sv11.ui ^ Stmp5.ui;
709  Sv13.ui = Sv13.ui ^ Stmp5.ui;
710 
711  Stmp5.ui = Sv21.ui ^ Sv23.ui;
712  Stmp5.ui = Stmp5.ui & Stmp4.ui;
713  Sv21.ui = Sv21.ui ^ Stmp5.ui;
714  Sv23.ui = Sv23.ui ^ Stmp5.ui;
715 
716  Stmp5.ui = Sv31.ui ^ Sv33.ui;
717  Stmp5.ui = Stmp5.ui & Stmp4.ui;
718  Sv31.ui = Sv31.ui ^ Stmp5.ui;
719  Sv33.ui = Sv33.ui ^ Stmp5.ui;
720 
721  Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
722  Stmp5.ui = Stmp5.ui & Stmp4.ui;
723  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
724  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
725 
726  // If columns 1-3 have been swapped, negate 1st column of A and V so that V
727  // is still a rotation
728 
729  Stmp5.f = -2.f;
730  Stmp5.ui = Stmp5.ui & Stmp4.ui;
731  Stmp4.f = 1.f;
732  Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
733 
734  Sa11.f = Sa11.f * Stmp4.f;
735  Sa21.f = Sa21.f * Stmp4.f;
736  Sa31.f = Sa31.f * Stmp4.f;
737 
738  Sv11.f = Sv11.f * Stmp4.f;
739  Sv21.f = Sv21.f * Stmp4.f;
740  Sv31.f = Sv31.f * Stmp4.f;
741 
742  // Swap columns 2-3 if necessary
743 
744  Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
745  Stmp5.ui = Sa12.ui ^ Sa13.ui;
746  Stmp5.ui = Stmp5.ui & Stmp4.ui;
747  Sa12.ui = Sa12.ui ^ Stmp5.ui;
748  Sa13.ui = Sa13.ui ^ Stmp5.ui;
749 
750  Stmp5.ui = Sa22.ui ^ Sa23.ui;
751  Stmp5.ui = Stmp5.ui & Stmp4.ui;
752  Sa22.ui = Sa22.ui ^ Stmp5.ui;
753  Sa23.ui = Sa23.ui ^ Stmp5.ui;
754 
755  Stmp5.ui = Sa32.ui ^ Sa33.ui;
756  Stmp5.ui = Stmp5.ui & Stmp4.ui;
757  Sa32.ui = Sa32.ui ^ Stmp5.ui;
758  Sa33.ui = Sa33.ui ^ Stmp5.ui;
759 
760  Stmp5.ui = Sv12.ui ^ Sv13.ui;
761  Stmp5.ui = Stmp5.ui & Stmp4.ui;
762  Sv12.ui = Sv12.ui ^ Stmp5.ui;
763  Sv13.ui = Sv13.ui ^ Stmp5.ui;
764 
765  Stmp5.ui = Sv22.ui ^ Sv23.ui;
766  Stmp5.ui = Stmp5.ui & Stmp4.ui;
767  Sv22.ui = Sv22.ui ^ Stmp5.ui;
768  Sv23.ui = Sv23.ui ^ Stmp5.ui;
769 
770  Stmp5.ui = Sv32.ui ^ Sv33.ui;
771  Stmp5.ui = Stmp5.ui & Stmp4.ui;
772  Sv32.ui = Sv32.ui ^ Stmp5.ui;
773  Sv33.ui = Sv33.ui ^ Stmp5.ui;
774 
775  Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
776  Stmp5.ui = Stmp5.ui & Stmp4.ui;
777  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
778  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
779 
780  // If columns 2-3 have been swapped, negate 3rd column of A and V so that V
781  // is still a rotation
782 
783  Stmp5.f = -2.f;
784  Stmp5.ui = Stmp5.ui & Stmp4.ui;
785  Stmp4.f = 1.f;
786  Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
787 
788  Sa13.f = Sa13.f * Stmp4.f;
789  Sa23.f = Sa23.f * Stmp4.f;
790  Sa33.f = Sa33.f * Stmp4.f;
791 
792  Sv13.f = Sv13.f * Stmp4.f;
793  Sv23.f = Sv23.f * Stmp4.f;
794  Sv33.f = Sv33.f * Stmp4.f;
795 
796  //###########################################################
797  // Construct QR factorization of A*V (=U*D) using Givens rotations
798  //###########################################################
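 // B = A * V is, in exact arithmetic, U times the diagonal matrix of
 // singular values up to the entries b21, b31, b32, which three Givens
 // rotations eliminate in turn; the accumulated rotations form U and the
 // resulting diagonal holds the (sorted) singular values.  Each rotation
 // builds ch ~ |b_pp| + sqrt(b_pp^2 + b_qp^2) and sh ~ b_qp, swaps ch and sh
 // via bit masks when b_pp < 0, normalizes with the Newton-Raphson rsqrt
 // step, and forms c = ch^2 - sh^2, s = 2 * sh * ch.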
799 
800  Su11.f = 1.f;
801  Su12.f = 0.f;
802  Su13.f = 0.f;
803  Su21.f = 0.f;
804  Su22.f = 1.f;
805  Su23.f = 0.f;
806  Su31.f = 0.f;
807  Su32.f = 0.f;
808  Su33.f = 1.f;
809 
810  Ssh.f = Sa21.f * Sa21.f;
811  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
812  Ssh.ui = Ssh.ui & Sa21.ui;
813 
814  Stmp5.f = 0.f;
815  Sch.f = __dsub_rn(Stmp5.f, Sa11.f);
816  Sch.f = max(Sch.f, Sa11.f);
817  Sch.f = max(Sch.f, gsmall_number);
818  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
819 
820  Stmp1.f = Sch.f * Sch.f;
821  Stmp2.f = Ssh.f * Ssh.f;
822  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
823  Stmp1.f = __drsqrt_rn(Stmp2.f);
824 
825  Stmp4.f = Stmp1.f * 0.5f;
826  Stmp3.f = Stmp1.f * Stmp4.f;
827  Stmp3.f = Stmp1.f * Stmp3.f;
828  Stmp3.f = Stmp2.f * Stmp3.f;
829  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
830  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
831  Stmp1.f = Stmp1.f * Stmp2.f;
832 
833  Sch.f = __dadd_rn(Sch.f, Stmp1.f);
834 
835  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
836  Stmp2.ui = ~Stmp5.ui & Sch.ui;
837  Sch.ui = Stmp5.ui & Sch.ui;
838  Ssh.ui = Stmp5.ui & Ssh.ui;
839  Sch.ui = Sch.ui | Stmp1.ui;
840  Ssh.ui = Ssh.ui | Stmp2.ui;
841 
842  Stmp1.f = Sch.f * Sch.f;
843  Stmp2.f = Ssh.f * Ssh.f;
844  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
845  Stmp1.f = __drsqrt_rn(Stmp2.f);
846 
847  Stmp4.f = Stmp1.f * 0.5f;
848  Stmp3.f = Stmp1.f * Stmp4.f;
849  Stmp3.f = Stmp1.f * Stmp3.f;
850  Stmp3.f = Stmp2.f * Stmp3.f;
851  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
852  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
853 
854  Sch.f = Sch.f * Stmp1.f;
855  Ssh.f = Ssh.f * Stmp1.f;
856 
857  Sc.f = Sch.f * Sch.f;
858  Ss.f = Ssh.f * Ssh.f;
859  Sc.f = __dsub_rn(Sc.f, Ss.f);
860  Ss.f = Ssh.f * Sch.f;
861  Ss.f = __dadd_rn(Ss.f, Ss.f);
862 
863  //###########################################################
864  // Rotate matrix A
865  //###########################################################
866 
867  Stmp1.f = Ss.f * Sa11.f;
868  Stmp2.f = Ss.f * Sa21.f;
869  Sa11.f = Sc.f * Sa11.f;
870  Sa21.f = Sc.f * Sa21.f;
871  Sa11.f = __dadd_rn(Sa11.f, Stmp2.f);
872  Sa21.f = __dsub_rn(Sa21.f, Stmp1.f);
873 
874  Stmp1.f = Ss.f * Sa12.f;
875  Stmp2.f = Ss.f * Sa22.f;
876  Sa12.f = Sc.f * Sa12.f;
877  Sa22.f = Sc.f * Sa22.f;
878  Sa12.f = __dadd_rn(Sa12.f, Stmp2.f);
879  Sa22.f = __dsub_rn(Sa22.f, Stmp1.f);
880 
881  Stmp1.f = Ss.f * Sa13.f;
882  Stmp2.f = Ss.f * Sa23.f;
883  Sa13.f = Sc.f * Sa13.f;
884  Sa23.f = Sc.f * Sa23.f;
885  Sa13.f = __dadd_rn(Sa13.f, Stmp2.f);
886  Sa23.f = __dsub_rn(Sa23.f, Stmp1.f);
887 
888  //###########################################################
889  // Update matrix U
890  //###########################################################
891 
892  Stmp1.f = Ss.f * Su11.f;
893  Stmp2.f = Ss.f * Su12.f;
894  Su11.f = Sc.f * Su11.f;
895  Su12.f = Sc.f * Su12.f;
896  Su11.f = __dadd_rn(Su11.f, Stmp2.f);
897  Su12.f = __dsub_rn(Su12.f, Stmp1.f);
898 
899  Stmp1.f = Ss.f * Su21.f;
900  Stmp2.f = Ss.f * Su22.f;
901  Su21.f = Sc.f * Su21.f;
902  Su22.f = Sc.f * Su22.f;
903  Su21.f = __dadd_rn(Su21.f, Stmp2.f);
904  Su22.f = __dsub_rn(Su22.f, Stmp1.f);
905 
906  Stmp1.f = Ss.f * Su31.f;
907  Stmp2.f = Ss.f * Su32.f;
908  Su31.f = Sc.f * Su31.f;
909  Su32.f = Sc.f * Su32.f;
910  Su31.f = __dadd_rn(Su31.f, Stmp2.f);
911  Su32.f = __dsub_rn(Su32.f, Stmp1.f);
912 
913  // Second Givens rotation
914 
915  Ssh.f = Sa31.f * Sa31.f;
916  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
917  Ssh.ui = Ssh.ui & Sa31.ui;
918 
919  Stmp5.f = 0.f;
920  Sch.f = __dsub_rn(Stmp5.f, Sa11.f);
921  Sch.f = max(Sch.f, Sa11.f);
922  Sch.f = max(Sch.f, gsmall_number);
923  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
924 
925  Stmp1.f = Sch.f * Sch.f;
926  Stmp2.f = Ssh.f * Ssh.f;
927  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
928  Stmp1.f = __drsqrt_rn(Stmp2.f);
929 
930  Stmp4.f = Stmp1.f * 0.5;
931  Stmp3.f = Stmp1.f * Stmp4.f;
932  Stmp3.f = Stmp1.f * Stmp3.f;
933  Stmp3.f = Stmp2.f * Stmp3.f;
934  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
935  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
936  Stmp1.f = Stmp1.f * Stmp2.f;
937 
938  Sch.f = __dadd_rn(Sch.f, Stmp1.f);
939 
940  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
941  Stmp2.ui = ~Stmp5.ui & Sch.ui;
942  Sch.ui = Stmp5.ui & Sch.ui;
943  Ssh.ui = Stmp5.ui & Ssh.ui;
944  Sch.ui = Sch.ui | Stmp1.ui;
945  Ssh.ui = Ssh.ui | Stmp2.ui;
946 
947  Stmp1.f = Sch.f * Sch.f;
948  Stmp2.f = Ssh.f * Ssh.f;
949  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
950  Stmp1.f = __drsqrt_rn(Stmp2.f);
951 
952  Stmp4.f = Stmp1.f * 0.5f;
953  Stmp3.f = Stmp1.f * Stmp4.f;
954  Stmp3.f = Stmp1.f * Stmp3.f;
955  Stmp3.f = Stmp2.f * Stmp3.f;
956  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
957  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
958 
959  Sch.f = Sch.f * Stmp1.f;
960  Ssh.f = Ssh.f * Stmp1.f;
961 
962  Sc.f = Sch.f * Sch.f;
963  Ss.f = Ssh.f * Ssh.f;
964  Sc.f = __dsub_rn(Sc.f, Ss.f);
965  Ss.f = Ssh.f * Sch.f;
966  Ss.f = __dadd_rn(Ss.f, Ss.f);
967 
968  //###########################################################
969  // Rotate matrix A
970  //###########################################################
971 
972  Stmp1.f = Ss.f * Sa11.f;
973  Stmp2.f = Ss.f * Sa31.f;
974  Sa11.f = Sc.f * Sa11.f;
975  Sa31.f = Sc.f * Sa31.f;
976  Sa11.f = __dadd_rn(Sa11.f, Stmp2.f);
977  Sa31.f = __dsub_rn(Sa31.f, Stmp1.f);
978 
979  Stmp1.f = Ss.f * Sa12.f;
980  Stmp2.f = Ss.f * Sa32.f;
981  Sa12.f = Sc.f * Sa12.f;
982  Sa32.f = Sc.f * Sa32.f;
983  Sa12.f = __dadd_rn(Sa12.f, Stmp2.f);
984  Sa32.f = __dsub_rn(Sa32.f, Stmp1.f);
985 
986  Stmp1.f = Ss.f * Sa13.f;
987  Stmp2.f = Ss.f * Sa33.f;
988  Sa13.f = Sc.f * Sa13.f;
989  Sa33.f = Sc.f * Sa33.f;
990  Sa13.f = __dadd_rn(Sa13.f, Stmp2.f);
991  Sa33.f = __dsub_rn(Sa33.f, Stmp1.f);
992 
993  //###########################################################
994  // Update matrix U
995  //###########################################################
996 
997  Stmp1.f = Ss.f * Su11.f;
998  Stmp2.f = Ss.f * Su13.f;
999  Su11.f = Sc.f * Su11.f;
1000  Su13.f = Sc.f * Su13.f;
1001  Su11.f = __dadd_rn(Su11.f, Stmp2.f);
1002  Su13.f = __dsub_rn(Su13.f, Stmp1.f);
1003 
1004  Stmp1.f = Ss.f * Su21.f;
1005  Stmp2.f = Ss.f * Su23.f;
1006  Su21.f = Sc.f * Su21.f;
1007  Su23.f = Sc.f * Su23.f;
1008  Su21.f = __dadd_rn(Su21.f, Stmp2.f);
1009  Su23.f = __dsub_rn(Su23.f, Stmp1.f);
1010 
1011  Stmp1.f = Ss.f * Su31.f;
1012  Stmp2.f = Ss.f * Su33.f;
1013  Su31.f = Sc.f * Su31.f;
1014  Su33.f = Sc.f * Su33.f;
1015  Su31.f = __dadd_rn(Su31.f, Stmp2.f);
1016  Su33.f = __dsub_rn(Su33.f, Stmp1.f);
1017 
1018  // Third Givens Rotation
1019 
1020  Ssh.f = Sa32.f * Sa32.f;
1021  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1022  Ssh.ui = Ssh.ui & Sa32.ui;
1023 
1024  Stmp5.f = 0.f;
1025  Sch.f = __dsub_rn(Stmp5.f, Sa22.f);
1026  Sch.f = max(Sch.f, Sa22.f);
1027  Sch.f = max(Sch.f, gsmall_number);
1028  Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;
1029 
1030  Stmp1.f = Sch.f * Sch.f;
1031  Stmp2.f = Ssh.f * Ssh.f;
1032  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
1033  Stmp1.f = __drsqrt_rn(Stmp2.f);
1034 
1035  Stmp4.f = Stmp1.f * 0.5f;
1036  Stmp3.f = Stmp1.f * Stmp4.f;
1037  Stmp3.f = Stmp1.f * Stmp3.f;
1038  Stmp3.f = Stmp2.f * Stmp3.f;
1039  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
1040  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
1041  Stmp1.f = Stmp1.f * Stmp2.f;
1042 
1043  Sch.f = __dadd_rn(Sch.f, Stmp1.f);
1044 
1045  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1046  Stmp2.ui = ~Stmp5.ui & Sch.ui;
1047  Sch.ui = Stmp5.ui & Sch.ui;
1048  Ssh.ui = Stmp5.ui & Ssh.ui;
1049  Sch.ui = Sch.ui | Stmp1.ui;
1050  Ssh.ui = Ssh.ui | Stmp2.ui;
1051 
1052  Stmp1.f = Sch.f * Sch.f;
1053  Stmp2.f = Ssh.f * Ssh.f;
1054  Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
1055  Stmp1.f = __drsqrt_rn(Stmp2.f);
1056 
1057  Stmp4.f = Stmp1.f * 0.5f;
1058  Stmp3.f = Stmp1.f * Stmp4.f;
1059  Stmp3.f = Stmp1.f * Stmp3.f;
1060  Stmp3.f = Stmp2.f * Stmp3.f;
1061  Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
1062  Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
1063 
1064  Sch.f = Sch.f * Stmp1.f;
1065  Ssh.f = Ssh.f * Stmp1.f;
1066 
1067  Sc.f = Sch.f * Sch.f;
1068  Ss.f = Ssh.f * Ssh.f;
1069  Sc.f = __dsub_rn(Sc.f, Ss.f);
1070  Ss.f = Ssh.f * Sch.f;
1071  Ss.f = __dadd_rn(Ss.f, Ss.f);
1072 
1073  //###########################################################
1074  // Rotate matrix A
1075  //###########################################################
1076 
1077  Stmp1.f = Ss.f * Sa21.f;
1078  Stmp2.f = Ss.f * Sa31.f;
1079  Sa21.f = Sc.f * Sa21.f;
1080  Sa31.f = Sc.f * Sa31.f;
1081  Sa21.f = __dadd_rn(Sa21.f, Stmp2.f);
1082  Sa31.f = __dsub_rn(Sa31.f, Stmp1.f);
1083 
1084  Stmp1.f = Ss.f * Sa22.f;
1085  Stmp2.f = Ss.f * Sa32.f;
1086  Sa22.f = Sc.f * Sa22.f;
1087  Sa32.f = Sc.f * Sa32.f;
1088  Sa22.f = __dadd_rn(Sa22.f, Stmp2.f);
1089  Sa32.f = __dsub_rn(Sa32.f, Stmp1.f);
1090 
1091  Stmp1.f = Ss.f * Sa23.f;
1092  Stmp2.f = Ss.f * Sa33.f;
1093  Sa23.f = Sc.f * Sa23.f;
1094  Sa33.f = Sc.f * Sa33.f;
1095  Sa23.f = __dadd_rn(Sa23.f, Stmp2.f);
1096  Sa33.f = __dsub_rn(Sa33.f, Stmp1.f);
1097 
1098  //###########################################################
1099  // Update matrix U
1100  //###########################################################
1101 
1102  Stmp1.f = Ss.f * Su12.f;
1103  Stmp2.f = Ss.f * Su13.f;
1104  Su12.f = Sc.f * Su12.f;
1105  Su13.f = Sc.f * Su13.f;
1106  Su12.f = __dadd_rn(Su12.f, Stmp2.f);
1107  Su13.f = __dsub_rn(Su13.f, Stmp1.f);
1108 
1109  Stmp1.f = Ss.f * Su22.f;
1110  Stmp2.f = Ss.f * Su23.f;
1111  Su22.f = Sc.f * Su22.f;
1112  Su23.f = Sc.f * Su23.f;
1113  Su22.f = __dadd_rn(Su22.f, Stmp2.f);
1114  Su23.f = __dsub_rn(Su23.f, Stmp1.f);
1115 
1116  Stmp1.f = Ss.f * Su32.f;
1117  Stmp2.f = Ss.f * Su33.f;
1118  Su32.f = Sc.f * Su32.f;
1119  Su33.f = Sc.f * Su33.f;
1120  Su32.f = __dadd_rn(Su32.f, Stmp2.f);
1121  Su33.f = __dsub_rn(Su33.f, Stmp1.f);
1122 
1123  V_3x3[0] = Sv11.f;
1124  V_3x3[1] = Sv12.f;
1125  V_3x3[2] = Sv13.f;
1126  V_3x3[3] = Sv21.f;
1127  V_3x3[4] = Sv22.f;
1128  V_3x3[5] = Sv23.f;
1129  V_3x3[6] = Sv31.f;
1130  V_3x3[7] = Sv32.f;
1131  V_3x3[8] = Sv33.f;
1132 
1133  U_3x3[0] = Su11.f;
1134  U_3x3[1] = Su12.f;
1135  U_3x3[2] = Su13.f;
1136  U_3x3[3] = Su21.f;
1137  U_3x3[4] = Su22.f;
1138  U_3x3[5] = Su23.f;
1139  U_3x3[6] = Su31.f;
1140  U_3x3[7] = Su32.f;
1141  U_3x3[8] = Su33.f;
1142 
1143  S_3x1[0] = Sa11.f;
1144  // s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
1145  S_3x1[1] = Sa22.f;
1146  // s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
1147  S_3x1[2] = Sa33.f;
1148 }
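// ---------------------------------------------------------------------------
// Usage sketch (host side, assuming a build where OPEN3D_DEVICE expands to
// nothing; arrays are row-major, matching the loads/stores above).  Because
// U and V are kept proper rotations, the last entry of S may be negative
// when det(A) < 0:
//
//   double A[9] = {1, 2, 3, 4, 5, 6, 7, 8, 10};
//   double U[9], S[3], V[9];
//   open3d::core::linalg::kernel::svd3x3<double>(A, U, S, V);
//   // Now A ~ U * diag(S) * V^T (up to floating-point error).
// ---------------------------------------------------------------------------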
1149 
1150 template <>
1151 OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3<float>(const float *A_3x3,
1152  float *U_3x3,
1153  float *S_3x1,
1154  float *V_3x3) {
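 // This float specialization mirrors the double one above line for line,
 // substituting the single-precision __f*_rn intrinsics; see the comments in
 // the double specialization for the derivations.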
1155  float gsmall_number = 1.e-12;
1156 
1157  un<float> Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
1158  un<float> Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
1159  un<float> Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
1160  un<float> Sc, Ss, Sch, Ssh;
1161  un<float> Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
1162  un<float> Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
1163  un<float> Sqvs, Sqvvx, Sqvvy, Sqvvz;
1164 
1165  Sa11.f = A_3x3[0];
1166  Sa12.f = A_3x3[1];
1167  Sa13.f = A_3x3[2];
1168  Sa21.f = A_3x3[3];
1169  Sa22.f = A_3x3[4];
1170  Sa23.f = A_3x3[5];
1171  Sa31.f = A_3x3[6];
1172  Sa32.f = A_3x3[7];
1173  Sa33.f = A_3x3[8];
1174 
1175  //###########################################################
1176  // Compute normal equations matrix
1177  //###########################################################
1178 
1179  Ss11.f = Sa11.f * Sa11.f;
1180  Stmp1.f = Sa21.f * Sa21.f;
1181  Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);
1182  Stmp1.f = Sa31.f * Sa31.f;
1183  Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);
1184 
1185  Ss21.f = Sa12.f * Sa11.f;
1186  Stmp1.f = Sa22.f * Sa21.f;
1187  Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);
1188  Stmp1.f = Sa32.f * Sa31.f;
1189  Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);
1190 
1191  Ss31.f = Sa13.f * Sa11.f;
1192  Stmp1.f = Sa23.f * Sa21.f;
1193  Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);
1194  Stmp1.f = Sa33.f * Sa31.f;
1195  Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);
1196 
1197  Ss22.f = Sa12.f * Sa12.f;
1198  Stmp1.f = Sa22.f * Sa22.f;
1199  Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);
1200  Stmp1.f = Sa32.f * Sa32.f;
1201  Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);
1202 
1203  Ss32.f = Sa13.f * Sa12.f;
1204  Stmp1.f = Sa23.f * Sa22.f;
1205  Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);
1206  Stmp1.f = Sa33.f * Sa32.f;
1207  Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);
1208 
1209  Ss33.f = Sa13.f * Sa13.f;
1210  Stmp1.f = Sa23.f * Sa23.f;
1211  Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);
1212  Stmp1.f = Sa33.f * Sa33.f;
1213  Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);
1214 
1215  Sqvs.f = 1.f;
1216  Sqvvx.f = 0.f;
1217  Sqvvy.f = 0.f;
1218  Sqvvz.f = 0.f;
1219 
1220  //###########################################################
1221  // Solve symmetric eigenproblem using Jacobi iteration
1222  //###########################################################
1223  for (int i = 0; i < 4; i++) {
1224  Ssh.f = Ss21.f * 0.5f;
1225  Stmp5.f = __fsub_rn(Ss11.f, Ss22.f);
1226 
1227  Stmp2.f = Ssh.f * Ssh.f;
1228  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1229  Ssh.ui = Stmp1.ui & Ssh.ui;
1230  Sch.ui = Stmp1.ui & Stmp5.ui;
1231  Stmp2.ui = ~Stmp1.ui & gone;
1232  Sch.ui = Sch.ui | Stmp2.ui;
1233 
1234  Stmp1.f = Ssh.f * Ssh.f;
1235  Stmp2.f = Sch.f * Sch.f;
1236  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1237  Stmp4.f = __frsqrt_rn(Stmp3.f);
1238 
1239  Ssh.f = Stmp4.f * Ssh.f;
1240  Sch.f = Stmp4.f * Sch.f;
1241  Stmp1.f = gfour_gamma_squared * Stmp1.f;
1242  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1243 
1244  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1245  Ssh.ui = ~Stmp1.ui & Ssh.ui;
1246  Ssh.ui = Ssh.ui | Stmp2.ui;
1247  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1248  Sch.ui = ~Stmp1.ui & Sch.ui;
1249  Sch.ui = Sch.ui | Stmp2.ui;
1250 
1251  Stmp1.f = Ssh.f * Ssh.f;
1252  Stmp2.f = Sch.f * Sch.f;
1253  Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1254  Ss.f = Sch.f * Ssh.f;
1255  Ss.f = __fadd_rn(Ss.f, Ss.f);
1256 
1257 #ifdef DEBUG_JACOBI_CONJUGATE
1258  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1259  Sch.f);
1260 #endif
1261  //###########################################################
1262  // Perform the actual Givens conjugation
1263  //###########################################################
1264 
1265  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1266  Ss33.f = Ss33.f * Stmp3.f;
1267  Ss31.f = Ss31.f * Stmp3.f;
1268  Ss32.f = Ss32.f * Stmp3.f;
1269  Ss33.f = Ss33.f * Stmp3.f;
1270 
1271  Stmp1.f = Ss.f * Ss31.f;
1272  Stmp2.f = Ss.f * Ss32.f;
1273  Ss31.f = Sc.f * Ss31.f;
1274  Ss32.f = Sc.f * Ss32.f;
1275  Ss31.f = __fadd_rn(Stmp2.f, Ss31.f);
1276  Ss32.f = __fsub_rn(Ss32.f, Stmp1.f);
1277 
1278  Stmp2.f = Ss.f * Ss.f;
1279  Stmp1.f = Ss22.f * Stmp2.f;
1280  Stmp3.f = Ss11.f * Stmp2.f;
1281  Stmp4.f = Sc.f * Sc.f;
1282  Ss11.f = Ss11.f * Stmp4.f;
1283  Ss22.f = Ss22.f * Stmp4.f;
1284  Ss11.f = __fadd_rn(Ss11.f, Stmp1.f);
1285  Ss22.f = __fadd_rn(Ss22.f, Stmp3.f);
1286  Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1287  Stmp2.f = __fadd_rn(Ss21.f, Ss21.f);
1288  Ss21.f = Ss21.f * Stmp4.f;
1289  Stmp4.f = Sc.f * Ss.f;
1290  Stmp2.f = Stmp2.f * Stmp4.f;
1291  Stmp5.f = Stmp5.f * Stmp4.f;
1292  Ss11.f = __fadd_rn(Ss11.f, Stmp2.f);
1293  Ss21.f = __fsub_rn(Ss21.f, Stmp5.f);
1294  Ss22.f = __fsub_rn(Ss22.f, Stmp2.f);
1295 
1296 #ifdef DEBUG_JACOBI_CONJUGATE
1297  printf("%.20g\n", Ss11.f);
1298  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1299  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1300 #endif
1301 
1302  //###########################################################
1303  // Compute the cumulative rotation, in quaternion form
1304  //###########################################################
1305 
1306  Stmp1.f = Ssh.f * Sqvvx.f;
1307  Stmp2.f = Ssh.f * Sqvvy.f;
1308  Stmp3.f = Ssh.f * Sqvvz.f;
1309  Ssh.f = Ssh.f * Sqvs.f;
1310 
1311  Sqvs.f = Sch.f * Sqvs.f;
1312  Sqvvx.f = Sch.f * Sqvvx.f;
1313  Sqvvy.f = Sch.f * Sqvvy.f;
1314  Sqvvz.f = Sch.f * Sqvvz.f;
1315 
1316  Sqvvz.f = __fadd_rn(Sqvvz.f, Ssh.f);
1317  Sqvs.f = __fsub_rn(Sqvs.f, Stmp3.f);
1318  Sqvvx.f = __fadd_rn(Sqvvx.f, Stmp2.f);
1319  Sqvvy.f = __fsub_rn(Sqvvy.f, Stmp1.f);
1320 
1321 #ifdef DEBUG_JACOBI_CONJUGATE
1322  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
1323  Sqvs.f);
1324 #endif
1325 
1326  //###########################################################
1327  // (1->3)
1328  //###########################################################
1329  Ssh.f = Ss32.f * 0.5f;
1330  Stmp5.f = __fsub_rn(Ss22.f, Ss33.f);
1331 
1332  Stmp2.f = Ssh.f * Ssh.f;
1333  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1334  Ssh.ui = Stmp1.ui & Ssh.ui;
1335  Sch.ui = Stmp1.ui & Stmp5.ui;
1336  Stmp2.ui = ~Stmp1.ui & gone;
1337  Sch.ui = Sch.ui | Stmp2.ui;
1338 
1339  Stmp1.f = Ssh.f * Ssh.f;
1340  Stmp2.f = Sch.f * Sch.f;
1341  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1342  Stmp4.f = __frsqrt_rn(Stmp3.f);
1343 
1344  Ssh.f = Stmp4.f * Ssh.f;
1345  Sch.f = Stmp4.f * Sch.f;
1346  Stmp1.f = gfour_gamma_squared * Stmp1.f;
1347  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1348 
1349  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1350  Ssh.ui = ~Stmp1.ui & Ssh.ui;
1351  Ssh.ui = Ssh.ui | Stmp2.ui;
1352  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1353  Sch.ui = ~Stmp1.ui & Sch.ui;
1354  Sch.ui = Sch.ui | Stmp2.ui;
1355 
1356  Stmp1.f = Ssh.f * Ssh.f;
1357  Stmp2.f = Sch.f * Sch.f;
1358  Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1359  Ss.f = Sch.f * Ssh.f;
1360  Ss.f = __fadd_rn(Ss.f, Ss.f);
1361 
1362 #ifdef DEBUG_JACOBI_CONJUGATE
1363  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1364  Sch.f);
1365 #endif
1366 
1367  //###########################################################
1368  // Perform the actual Givens conjugation
1369  //###########################################################
1370 
1371  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1372  Ss11.f = Ss11.f * Stmp3.f;
1373  Ss21.f = Ss21.f * Stmp3.f;
1374  Ss31.f = Ss31.f * Stmp3.f;
1375  Ss11.f = Ss11.f * Stmp3.f;
1376 
1377  Stmp1.f = Ss.f * Ss21.f;
1378  Stmp2.f = Ss.f * Ss31.f;
1379  Ss21.f = Sc.f * Ss21.f;
1380  Ss31.f = Sc.f * Ss31.f;
1381  Ss21.f = __fadd_rn(Stmp2.f, Ss21.f);
1382  Ss31.f = __fsub_rn(Ss31.f, Stmp1.f);
1383 
1384  Stmp2.f = Ss.f * Ss.f;
1385  Stmp1.f = Ss33.f * Stmp2.f;
1386  Stmp3.f = Ss22.f * Stmp2.f;
1387  Stmp4.f = Sc.f * Sc.f;
1388  Ss22.f = Ss22.f * Stmp4.f;
1389  Ss33.f = Ss33.f * Stmp4.f;
1390  Ss22.f = __fadd_rn(Ss22.f, Stmp1.f);
1391  Ss33.f = __fadd_rn(Ss33.f, Stmp3.f);
1392  Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1393  Stmp2.f = __fadd_rn(Ss32.f, Ss32.f);
1394  Ss32.f = Ss32.f * Stmp4.f;
1395  Stmp4.f = Sc.f * Ss.f;
1396  Stmp2.f = Stmp2.f * Stmp4.f;
1397  Stmp5.f = Stmp5.f * Stmp4.f;
1398  Ss22.f = __fadd_rn(Ss22.f, Stmp2.f);
1399  Ss32.f = __fsub_rn(Ss32.f, Stmp5.f);
1400  Ss33.f = __fsub_rn(Ss33.f, Stmp2.f);
1401 
1402 #ifdef DEBUG_JACOBI_CONJUGATE
1403  printf("%.20g\n", Ss11.f);
1404  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1405  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1406 #endif
1407 
1408  //###########################################################
1409  // Compute the cumulative rotation, in quaternion form
1410  //###########################################################
1411 
1412  Stmp1.f = Ssh.f * Sqvvx.f;
1413  Stmp2.f = Ssh.f * Sqvvy.f;
1414  Stmp3.f = Ssh.f * Sqvvz.f;
1415  Ssh.f = Ssh.f * Sqvs.f;
1416 
1417  Sqvs.f = Sch.f * Sqvs.f;
1418  Sqvvx.f = Sch.f * Sqvvx.f;
1419  Sqvvy.f = Sch.f * Sqvvy.f;
1420  Sqvvz.f = Sch.f * Sqvvz.f;
1421 
1422  Sqvvx.f = __fadd_rn(Sqvvx.f, Ssh.f);
1423  Sqvs.f = __fsub_rn(Sqvs.f, Stmp1.f);
1424  Sqvvy.f = __fadd_rn(Sqvvy.f, Stmp3.f);
1425  Sqvvz.f = __fsub_rn(Sqvvz.f, Stmp2.f);
1426 
1427 #ifdef DEBUG_JACOBI_CONJUGATE
1428  printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
1429  Sqvs.f);
1430 #endif
1431 #if 1
1432  //###########################################################
1433  // 1 -> 2
1434  //###########################################################
1435 
1436  Ssh.f = Ss31.f * 0.5f;
1437  Stmp5.f = __fsub_rn(Ss33.f, Ss11.f);
1438 
1439  Stmp2.f = Ssh.f * Ssh.f;
1440  Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1441  Ssh.ui = Stmp1.ui & Ssh.ui;
1442  Sch.ui = Stmp1.ui & Stmp5.ui;
1443  Stmp2.ui = ~Stmp1.ui & gone;
1444  Sch.ui = Sch.ui | Stmp2.ui;
1445 
1446  Stmp1.f = Ssh.f * Ssh.f;
1447  Stmp2.f = Sch.f * Sch.f;
1448  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1449  Stmp4.f = __frsqrt_rn(Stmp3.f);
1450 
1451  Ssh.f = Stmp4.f * Ssh.f;
1452  Sch.f = Stmp4.f * Sch.f;
1453  Stmp1.f = gfour_gamma_squared * Stmp1.f;
1454  Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1455 
1456  Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1457  Ssh.ui = ~Stmp1.ui & Ssh.ui;
1458  Ssh.ui = Ssh.ui | Stmp2.ui;
1459  Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1460  Sch.ui = ~Stmp1.ui & Sch.ui;
1461  Sch.ui = Sch.ui | Stmp2.ui;
1462 
1463  Stmp1.f = Ssh.f * Ssh.f;
1464  Stmp2.f = Sch.f * Sch.f;
1465  Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1466  Ss.f = Sch.f * Ssh.f;
1467  Ss.f = __fadd_rn(Ss.f, Ss.f);
1468 
1469 #ifdef DEBUG_JACOBI_CONJUGATE
1470  printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1471  Sch.f);
1472 #endif
1473 
1474  //###########################################################
1475  // Perform the actual Givens conjugation
1476  //###########################################################
1477 
1478  Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1479  Ss22.f = Ss22.f * Stmp3.f;
1480  Ss32.f = Ss32.f * Stmp3.f;
1481  Ss21.f = Ss21.f * Stmp3.f;
1482  Ss22.f = Ss22.f * Stmp3.f;
1483 
1484  Stmp1.f = Ss.f * Ss32.f;
1485  Stmp2.f = Ss.f * Ss21.f;
1486  Ss32.f = Sc.f * Ss32.f;
1487  Ss21.f = Sc.f * Ss21.f;
1488  Ss32.f = __fadd_rn(Stmp2.f, Ss32.f);
1489  Ss21.f = __fsub_rn(Ss21.f, Stmp1.f);
1490 
1491  Stmp2.f = Ss.f * Ss.f;
1492  Stmp1.f = Ss11.f * Stmp2.f;
1493  Stmp3.f = Ss33.f * Stmp2.f;
1494  Stmp4.f = Sc.f * Sc.f;
1495  Ss33.f = Ss33.f * Stmp4.f;
1496  Ss11.f = Ss11.f * Stmp4.f;
1497  Ss33.f = __fadd_rn(Ss33.f, Stmp1.f);
1498  Ss11.f = __fadd_rn(Ss11.f, Stmp3.f);
1499  Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1500  Stmp2.f = __fadd_rn(Ss31.f, Ss31.f);
1501  Ss31.f = Ss31.f * Stmp4.f;
1502  Stmp4.f = Sc.f * Ss.f;
1503  Stmp2.f = Stmp2.f * Stmp4.f;
1504  Stmp5.f = Stmp5.f * Stmp4.f;
1505  Ss33.f = __fadd_rn(Ss33.f, Stmp2.f);
1506  Ss31.f = __fsub_rn(Ss31.f, Stmp5.f);
1507  Ss11.f = __fsub_rn(Ss11.f, Stmp2.f);
1508 
1509 #ifdef DEBUG_JACOBI_CONJUGATE
1510  printf("%.20g\n", Ss11.f);
1511  printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1512  printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1513 #endif
1514 
1515  //###########################################################
1516  // Compute the cumulative rotation, in quaternion form
1517  //###########################################################
1518 
1519  Stmp1.f = Ssh.f * Sqvvx.f;
1520  Stmp2.f = Ssh.f * Sqvvy.f;
1521  Stmp3.f = Ssh.f * Sqvvz.f;
1522  Ssh.f = Ssh.f * Sqvs.f;
1523 
1524  Sqvs.f = Sch.f * Sqvs.f;
1525  Sqvvx.f = Sch.f * Sqvvx.f;
1526  Sqvvy.f = Sch.f * Sqvvy.f;
1527  Sqvvz.f = Sch.f * Sqvvz.f;
1528 
1529  Sqvvy.f = __fadd_rn(Sqvvy.f, Ssh.f);
1530  Sqvs.f = __fsub_rn(Sqvs.f, Stmp2.f);
1531  Sqvvz.f = __fadd_rn(Sqvvz.f, Stmp1.f);
1532  Sqvvx.f = __fsub_rn(Sqvvx.f, Stmp3.f);
1533 #endif
1534  }
1535 
1536  //###########################################################
1537  // Normalize quaternion for matrix V
1538  //###########################################################
1539 
1540  Stmp2.f = Sqvs.f * Sqvs.f;
1541  Stmp1.f = Sqvvx.f * Sqvvx.f;
1542  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1543  Stmp1.f = Sqvvy.f * Sqvvy.f;
1544  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1545  Stmp1.f = Sqvvz.f * Sqvvz.f;
1546  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1547 
1548  Stmp1.f = __frsqrt_rn(Stmp2.f);
1549  Stmp4.f = Stmp1.f * 0.5f;
1550  Stmp3.f = Stmp1.f * Stmp4.f;
1551  Stmp3.f = Stmp1.f * Stmp3.f;
1552  Stmp3.f = Stmp2.f * Stmp3.f;
1553  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1554  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1555 
1556  Sqvs.f = Sqvs.f * Stmp1.f;
1557  Sqvvx.f = Sqvvx.f * Stmp1.f;
1558  Sqvvy.f = Sqvvy.f * Stmp1.f;
1559  Sqvvz.f = Sqvvz.f * Stmp1.f;
1560 
1561  //###########################################################
1562  // Transform quaternion to matrix V
1563  //###########################################################
1564 
1565  Stmp1.f = Sqvvx.f * Sqvvx.f;
1566  Stmp2.f = Sqvvy.f * Sqvvy.f;
1567  Stmp3.f = Sqvvz.f * Sqvvz.f;
1568  Sv11.f = Sqvs.f * Sqvs.f;
1569  Sv22.f = __fsub_rn(Sv11.f, Stmp1.f);
1570  Sv33.f = __fsub_rn(Sv22.f, Stmp2.f);
1571  Sv33.f = __fadd_rn(Sv33.f, Stmp3.f);
1572  Sv22.f = __fadd_rn(Sv22.f, Stmp2.f);
1573  Sv22.f = __fsub_rn(Sv22.f, Stmp3.f);
1574  Sv11.f = __fadd_rn(Sv11.f, Stmp1.f);
1575  Sv11.f = __fsub_rn(Sv11.f, Stmp2.f);
1576  Sv11.f = __fsub_rn(Sv11.f, Stmp3.f);
1577  Stmp1.f = __fadd_rn(Sqvvx.f, Sqvvx.f);
1578  Stmp2.f = __fadd_rn(Sqvvy.f, Sqvvy.f);
1579  Stmp3.f = __fadd_rn(Sqvvz.f, Sqvvz.f);
1580  Sv32.f = Sqvs.f * Stmp1.f;
1581  Sv13.f = Sqvs.f * Stmp2.f;
1582  Sv21.f = Sqvs.f * Stmp3.f;
1583  Stmp1.f = Sqvvy.f * Stmp1.f;
1584  Stmp2.f = Sqvvz.f * Stmp2.f;
1585  Stmp3.f = Sqvvx.f * Stmp3.f;
1586  Sv12.f = __fsub_rn(Stmp1.f, Sv21.f);
1587  Sv23.f = __fsub_rn(Stmp2.f, Sv32.f);
1588  Sv31.f = __fsub_rn(Stmp3.f, Sv13.f);
1589  Sv21.f = __fadd_rn(Stmp1.f, Sv21.f);
1590  Sv32.f = __fadd_rn(Stmp2.f, Sv32.f);
1591  Sv13.f = __fadd_rn(Stmp3.f, Sv13.f);
1592 
1593  //###########################################################
1594  // Multiply (from the right) with V
1595  //###########################################################
1596 
1597  Stmp2.f = Sa12.f;
1598  Stmp3.f = Sa13.f;
1599  Sa12.f = Sv12.f * Sa11.f;
1600  Sa13.f = Sv13.f * Sa11.f;
1601  Sa11.f = Sv11.f * Sa11.f;
1602  Stmp1.f = Sv21.f * Stmp2.f;
1603  Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
1604  Stmp1.f = Sv31.f * Stmp3.f;
1605  Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
1606  Stmp1.f = Sv22.f * Stmp2.f;
1607  Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
1608  Stmp1.f = Sv32.f * Stmp3.f;
1609  Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
1610  Stmp1.f = Sv23.f * Stmp2.f;
1611  Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
1612  Stmp1.f = Sv33.f * Stmp3.f;
1613  Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
1614 
1615  Stmp2.f = Sa22.f;
1616  Stmp3.f = Sa23.f;
1617  Sa22.f = Sv12.f * Sa21.f;
1618  Sa23.f = Sv13.f * Sa21.f;
1619  Sa21.f = Sv11.f * Sa21.f;
1620  Stmp1.f = Sv21.f * Stmp2.f;
1621  Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
1622  Stmp1.f = Sv31.f * Stmp3.f;
1623  Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
1624  Stmp1.f = Sv22.f * Stmp2.f;
1625  Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
1626  Stmp1.f = Sv32.f * Stmp3.f;
1627  Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
1628  Stmp1.f = Sv23.f * Stmp2.f;
1629  Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
1630  Stmp1.f = Sv33.f * Stmp3.f;
1631  Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
1632 
1633  Stmp2.f = Sa32.f;
1634  Stmp3.f = Sa33.f;
1635  Sa32.f = Sv12.f * Sa31.f;
1636  Sa33.f = Sv13.f * Sa31.f;
1637  Sa31.f = Sv11.f * Sa31.f;
1638  Stmp1.f = Sv21.f * Stmp2.f;
1639  Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
1640  Stmp1.f = Sv31.f * Stmp3.f;
1641  Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
1642  Stmp1.f = Sv22.f * Stmp2.f;
1643  Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
1644  Stmp1.f = Sv32.f * Stmp3.f;
1645  Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
1646  Stmp1.f = Sv23.f * Stmp2.f;
1647  Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
1648  Stmp1.f = Sv33.f * Stmp3.f;
1649  Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
1650 
1651  //###########################################################
1652  // Permute columns such that the singular values are sorted
1653  //###########################################################
1654 
1655  Stmp1.f = Sa11.f * Sa11.f;
1656  Stmp4.f = Sa21.f * Sa21.f;
1657  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1658  Stmp4.f = Sa31.f * Sa31.f;
1659  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1660 
1661  Stmp2.f = Sa12.f * Sa12.f;
1662  Stmp4.f = Sa22.f * Sa22.f;
1663  Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
1664  Stmp4.f = Sa32.f * Sa32.f;
1665  Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
1666 
1667  Stmp3.f = Sa13.f * Sa13.f;
1668  Stmp4.f = Sa23.f * Sa23.f;
1669  Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
1670  Stmp4.f = Sa33.f * Sa33.f;
1671  Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
1672 
1673  // Swap columns 1-2 if necessary
1674 
1675  Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
1676  Stmp5.ui = Sa11.ui ^ Sa12.ui;
1677  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1678  Sa11.ui = Sa11.ui ^ Stmp5.ui;
1679  Sa12.ui = Sa12.ui ^ Stmp5.ui;
1680 
1681  Stmp5.ui = Sa21.ui ^ Sa22.ui;
1682  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1683  Sa21.ui = Sa21.ui ^ Stmp5.ui;
1684  Sa22.ui = Sa22.ui ^ Stmp5.ui;
1685 
1686  Stmp5.ui = Sa31.ui ^ Sa32.ui;
1687  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1688  Sa31.ui = Sa31.ui ^ Stmp5.ui;
1689  Sa32.ui = Sa32.ui ^ Stmp5.ui;
1690 
1691  Stmp5.ui = Sv11.ui ^ Sv12.ui;
1692  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1693  Sv11.ui = Sv11.ui ^ Stmp5.ui;
1694  Sv12.ui = Sv12.ui ^ Stmp5.ui;
1695 
1696  Stmp5.ui = Sv21.ui ^ Sv22.ui;
1697  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1698  Sv21.ui = Sv21.ui ^ Stmp5.ui;
1699  Sv22.ui = Sv22.ui ^ Stmp5.ui;
1700 
1701  Stmp5.ui = Sv31.ui ^ Sv32.ui;
1702  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1703  Sv31.ui = Sv31.ui ^ Stmp5.ui;
1704  Sv32.ui = Sv32.ui ^ Stmp5.ui;
1705 
1706  Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
1707  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1708  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
1709  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
1710 
1711  // If columns 1-2 have been swapped, negate 2nd column of A and V so that V
1712  // is still a rotation
1713 
1714  Stmp5.f = -2.f;
1715  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1716  Stmp4.f = 1.f;
1717  Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1718 
1719  Sa12.f = Sa12.f * Stmp4.f;
1720  Sa22.f = Sa22.f * Stmp4.f;
1721  Sa32.f = Sa32.f * Stmp4.f;
1722 
1723  Sv12.f = Sv12.f * Stmp4.f;
1724  Sv22.f = Sv22.f * Stmp4.f;
1725  Sv32.f = Sv32.f * Stmp4.f;
1726 
1727  // Swap columns 1-3 if necessary
1728 
1729  Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
1730  Stmp5.ui = Sa11.ui ^ Sa13.ui;
1731  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1732  Sa11.ui = Sa11.ui ^ Stmp5.ui;
1733  Sa13.ui = Sa13.ui ^ Stmp5.ui;
1734 
1735  Stmp5.ui = Sa21.ui ^ Sa23.ui;
1736  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1737  Sa21.ui = Sa21.ui ^ Stmp5.ui;
1738  Sa23.ui = Sa23.ui ^ Stmp5.ui;
1739 
1740  Stmp5.ui = Sa31.ui ^ Sa33.ui;
1741  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1742  Sa31.ui = Sa31.ui ^ Stmp5.ui;
1743  Sa33.ui = Sa33.ui ^ Stmp5.ui;
1744 
1745  Stmp5.ui = Sv11.ui ^ Sv13.ui;
1746  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1747  Sv11.ui = Sv11.ui ^ Stmp5.ui;
1748  Sv13.ui = Sv13.ui ^ Stmp5.ui;
1749 
1750  Stmp5.ui = Sv21.ui ^ Sv23.ui;
1751  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1752  Sv21.ui = Sv21.ui ^ Stmp5.ui;
1753  Sv23.ui = Sv23.ui ^ Stmp5.ui;
1754 
1755  Stmp5.ui = Sv31.ui ^ Sv33.ui;
1756  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1757  Sv31.ui = Sv31.ui ^ Stmp5.ui;
1758  Sv33.ui = Sv33.ui ^ Stmp5.ui;
1759 
1760  Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
1761  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1762  Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
1763  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
1764 
1765  // If columns 1-3 have been swapped, negate 1st column of A and V so that V
1766  // is still a rotation
1767 
1768  Stmp5.f = -2.f;
1769  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1770  Stmp4.f = 1.f;
1771  Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1772 
1773  Sa11.f = Sa11.f * Stmp4.f;
1774  Sa21.f = Sa21.f * Stmp4.f;
1775  Sa31.f = Sa31.f * Stmp4.f;
1776 
1777  Sv11.f = Sv11.f * Stmp4.f;
1778  Sv21.f = Sv21.f * Stmp4.f;
1779  Sv31.f = Sv31.f * Stmp4.f;
1780 
1781  // Swap columns 2-3 if necessary
1782 
1783  Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
1784  Stmp5.ui = Sa12.ui ^ Sa13.ui;
1785  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1786  Sa12.ui = Sa12.ui ^ Stmp5.ui;
1787  Sa13.ui = Sa13.ui ^ Stmp5.ui;
1788 
1789  Stmp5.ui = Sa22.ui ^ Sa23.ui;
1790  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1791  Sa22.ui = Sa22.ui ^ Stmp5.ui;
1792  Sa23.ui = Sa23.ui ^ Stmp5.ui;
1793 
1794  Stmp5.ui = Sa32.ui ^ Sa33.ui;
1795  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1796  Sa32.ui = Sa32.ui ^ Stmp5.ui;
1797  Sa33.ui = Sa33.ui ^ Stmp5.ui;
1798 
1799  Stmp5.ui = Sv12.ui ^ Sv13.ui;
1800  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1801  Sv12.ui = Sv12.ui ^ Stmp5.ui;
1802  Sv13.ui = Sv13.ui ^ Stmp5.ui;
1803 
1804  Stmp5.ui = Sv22.ui ^ Sv23.ui;
1805  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1806  Sv22.ui = Sv22.ui ^ Stmp5.ui;
1807  Sv23.ui = Sv23.ui ^ Stmp5.ui;
1808 
1809  Stmp5.ui = Sv32.ui ^ Sv33.ui;
1810  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1811  Sv32.ui = Sv32.ui ^ Stmp5.ui;
1812  Sv33.ui = Sv33.ui ^ Stmp5.ui;
1813 
1814  Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
1815  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1816  Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
1817  Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
1818 
1819  // If columns 2-3 have been swapped, negate 3rd column of A and V so that V
1820  // is still a rotation
1821 
1822  Stmp5.f = -2.f;
1823  Stmp5.ui = Stmp5.ui & Stmp4.ui;
1824  Stmp4.f = 1.f;
1825  Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1826 
1827  Sa13.f = Sa13.f * Stmp4.f;
1828  Sa23.f = Sa23.f * Stmp4.f;
1829  Sa33.f = Sa33.f * Stmp4.f;
1830 
1831  Sv13.f = Sv13.f * Stmp4.f;
1832  Sv23.f = Sv23.f * Stmp4.f;
1833  Sv33.f = Sv33.f * Stmp4.f;
1834 
1835  //###########################################################
1836  // Construct QR factorization of A*V (=U*D) using Givens rotations
1837  //###########################################################
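// A currently holds B = A*V with columns sorted by decreasing norm. Three
// Givens rotations, applied from the left, annihilate the subdiagonal
// entries a21, a31 and a32 in turn; their product is accumulated in U
// (initialized to the identity below), and the diagonal that remains in A
// carries the singular values.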
1838 
1839  Su11.f = 1.f;
1840  Su12.f = 0.f;
1841  Su13.f = 0.f;
1842  Su21.f = 0.f;
1843  Su22.f = 1.f;
1844  Su23.f = 0.f;
1845  Su31.f = 0.f;
1846  Su32.f = 0.f;
1847  Su33.f = 1.f;
1848 
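// First Givens rotation: build the half-angle pair (ch, sh) that zeroes
// a21 against a11. Ssh keeps a21 only when a21^2 is significant, Sch starts
// as |a11| clamped away from zero, and the Stmp5 mask records the sign of
// a11 so (ch, sh) can be swapped on the negative branch without a
// conditional.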
1849  Ssh.f = Sa21.f * Sa21.f;
1850  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1851  Ssh.ui = Ssh.ui & Sa21.ui;
1852 
1853  Stmp5.f = 0.f;
1854  Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
1855  Sch.f = max(Sch.f, Sa11.f);
1856  Sch.f = max(Sch.f, gsmall_number);
1857  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
1858 
1859  Stmp1.f = Sch.f * Sch.f;
1860  Stmp2.f = Ssh.f * Ssh.f;
1861  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1862  Stmp1.f = __frsqrt_rn(Stmp2.f);
1863 
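// One Newton-Raphson step refines the fast reciprocal square root:
// y' = 1.5*y - (x*y^3)/2 = y*(3 - x*y*y)/2. The trailing multiply by
// Stmp2 (= x) converts the refined 1/sqrt(x) into sqrt(x), yielding
// sqrt(a11^2 + a21^2) without a full-precision sqrt.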
1864  Stmp4.f = Stmp1.f * 0.5f;
1865  Stmp3.f = Stmp1.f * Stmp4.f;
1866  Stmp3.f = Stmp1.f * Stmp3.f;
1867  Stmp3.f = Stmp2.f * Stmp3.f;
1868  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1869  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1870  Stmp1.f = Stmp1.f * Stmp2.f;
1871 
1872  Sch.f = __fadd_rn(Sch.f, Stmp1.f);
1873 
1874  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1875  Stmp2.ui = ~Stmp5.ui & Sch.ui;
1876  Sch.ui = Stmp5.ui & Sch.ui;
1877  Ssh.ui = Stmp5.ui & Ssh.ui;
1878  Sch.ui = Sch.ui | Stmp1.ui;
1879  Ssh.ui = Ssh.ui | Stmp2.ui;
1880 
1881  Stmp1.f = Sch.f * Sch.f;
1882  Stmp2.f = Ssh.f * Ssh.f;
1883  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1884  Stmp1.f = __frsqrt_rn(Stmp2.f);
1885 
1886  Stmp4.f = Stmp1.f * 0.5f;
1887  Stmp3.f = Stmp1.f * Stmp4.f;
1888  Stmp3.f = Stmp1.f * Stmp3.f;
1889  Stmp3.f = Stmp2.f * Stmp3.f;
1890  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1891  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1892 
1893  Sch.f = Sch.f * Stmp1.f;
1894  Ssh.f = Ssh.f * Stmp1.f;
1895 
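// Double-angle identities turn the normalized half-angle pair into the
// actual rotation: c = ch^2 - sh^2, s = 2*sh*ch.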
1896  Sc.f = Sch.f * Sch.f;
1897  Ss.f = Ssh.f * Ssh.f;
1898  Sc.f = __fsub_rn(Sc.f, Ss.f);
1899  Ss.f = Ssh.f * Sch.f;
1900  Ss.f = __fadd_rn(Ss.f, Ss.f);
1901 
1902  //###########################################################
1903  // Rotate matrix A
1904  //###########################################################
1905 
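// Apply the rotation from the left: row 1 becomes c*row1 + s*row2 and
// row 2 becomes c*row2 - s*row1, eliminating a21.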
1906  Stmp1.f = Ss.f * Sa11.f;
1907  Stmp2.f = Ss.f * Sa21.f;
1908  Sa11.f = Sc.f * Sa11.f;
1909  Sa21.f = Sc.f * Sa21.f;
1910  Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
1911  Sa21.f = __fsub_rn(Sa21.f, Stmp1.f);
1912 
1913  Stmp1.f = Ss.f * Sa12.f;
1914  Stmp2.f = Ss.f * Sa22.f;
1915  Sa12.f = Sc.f * Sa12.f;
1916  Sa22.f = Sc.f * Sa22.f;
1917  Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
1918  Sa22.f = __fsub_rn(Sa22.f, Stmp1.f);
1919 
1920  Stmp1.f = Ss.f * Sa13.f;
1921  Stmp2.f = Ss.f * Sa23.f;
1922  Sa13.f = Sc.f * Sa13.f;
1923  Sa23.f = Sc.f * Sa23.f;
1924  Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
1925  Sa23.f = __fsub_rn(Sa23.f, Stmp1.f);
1926 
1927  //###########################################################
1928  // Update matrix U
1929  //###########################################################
1930 
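// U is updated on the right by the transpose rotation (columns 1 and 2 are
// mixed), so the product U*A is preserved while A is driven triangular.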
1931  Stmp1.f = Ss.f * Su11.f;
1932  Stmp2.f = Ss.f * Su12.f;
1933  Su11.f = Sc.f * Su11.f;
1934  Su12.f = Sc.f * Su12.f;
1935  Su11.f = __fadd_rn(Su11.f, Stmp2.f);
1936  Su12.f = __fsub_rn(Su12.f, Stmp1.f);
1937 
1938  Stmp1.f = Ss.f * Su21.f;
1939  Stmp2.f = Ss.f * Su22.f;
1940  Su21.f = Sc.f * Su21.f;
1941  Su22.f = Sc.f * Su22.f;
1942  Su21.f = __fadd_rn(Su21.f, Stmp2.f);
1943  Su22.f = __fsub_rn(Su22.f, Stmp1.f);
1944 
1945  Stmp1.f = Ss.f * Su31.f;
1946  Stmp2.f = Ss.f * Su32.f;
1947  Su31.f = Sc.f * Su31.f;
1948  Su32.f = Sc.f * Su32.f;
1949  Su31.f = __fadd_rn(Su31.f, Stmp2.f);
1950  Su32.f = __fsub_rn(Su32.f, Stmp1.f);
1951 
1952  // Second Givens rotation
1953 
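// Same half-angle construction as above, now zeroing a31 against a11; the
// rotation acts in the (1,3) plane and is accumulated into columns 1 and 3
// of U.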
1954  Ssh.f = Sa31.f * Sa31.f;
1955  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1956  Ssh.ui = Ssh.ui & Sa31.ui;
1957 
1958  Stmp5.f = 0.f;
1959  Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
1960  Sch.f = max(Sch.f, Sa11.f);
1961  Sch.f = max(Sch.f, gsmall_number);
1962  Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
1963 
1964  Stmp1.f = Sch.f * Sch.f;
1965  Stmp2.f = Ssh.f * Ssh.f;
1966  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1967  Stmp1.f = __frsqrt_rn(Stmp2.f);
1968 
1969  Stmp4.f = Stmp1.f * 0.5f;
1970  Stmp3.f = Stmp1.f * Stmp4.f;
1971  Stmp3.f = Stmp1.f * Stmp3.f;
1972  Stmp3.f = Stmp2.f * Stmp3.f;
1973  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1974  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1975  Stmp1.f = Stmp1.f * Stmp2.f;
1976 
1977  Sch.f = __fadd_rn(Sch.f, Stmp1.f);
1978 
1979  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1980  Stmp2.ui = ~Stmp5.ui & Sch.ui;
1981  Sch.ui = Stmp5.ui & Sch.ui;
1982  Ssh.ui = Stmp5.ui & Ssh.ui;
1983  Sch.ui = Sch.ui | Stmp1.ui;
1984  Ssh.ui = Ssh.ui | Stmp2.ui;
1985 
1986  Stmp1.f = Sch.f * Sch.f;
1987  Stmp2.f = Ssh.f * Ssh.f;
1988  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1989  Stmp1.f = __frsqrt_rn(Stmp2.f);
1990 
1991  Stmp4.f = Stmp1.f * 0.5f;
1992  Stmp3.f = Stmp1.f * Stmp4.f;
1993  Stmp3.f = Stmp1.f * Stmp3.f;
1994  Stmp3.f = Stmp2.f * Stmp3.f;
1995  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1996  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1997 
1998  Sch.f = Sch.f * Stmp1.f;
1999  Ssh.f = Ssh.f * Stmp1.f;
2000 
2001  Sc.f = Sch.f * Sch.f;
2002  Ss.f = Ssh.f * Ssh.f;
2003  Sc.f = __fsub_rn(Sc.f, Ss.f);
2004  Ss.f = Ssh.f * Sch.f;
2005  Ss.f = __fadd_rn(Ss.f, Ss.f);
2006 
2007  //###########################################################
2008  // Rotate matrix A
2009  //###########################################################
2010 
2011  Stmp1.f = Ss.f * Sa11.f;
2012  Stmp2.f = Ss.f * Sa31.f;
2013  Sa11.f = Sc.f * Sa11.f;
2014  Sa31.f = Sc.f * Sa31.f;
2015  Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
2016  Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);
2017 
2018  Stmp1.f = Ss.f * Sa12.f;
2019  Stmp2.f = Ss.f * Sa32.f;
2020  Sa12.f = Sc.f * Sa12.f;
2021  Sa32.f = Sc.f * Sa32.f;
2022  Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
2023  Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);
2024 
2025  Stmp1.f = Ss.f * Sa13.f;
2026  Stmp2.f = Ss.f * Sa33.f;
2027  Sa13.f = Sc.f * Sa13.f;
2028  Sa33.f = Sc.f * Sa33.f;
2029  Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
2030  Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);
2031 
2032  //###########################################################
2033  // Update matrix U
2034  //###########################################################
2035 
2036  Stmp1.f = Ss.f * Su11.f;
2037  Stmp2.f = Ss.f * Su13.f;
2038  Su11.f = Sc.f * Su11.f;
2039  Su13.f = Sc.f * Su13.f;
2040  Su11.f = __fadd_rn(Su11.f, Stmp2.f);
2041  Su13.f = __fsub_rn(Su13.f, Stmp1.f);
2042 
2043  Stmp1.f = Ss.f * Su21.f;
2044  Stmp2.f = Ss.f * Su23.f;
2045  Su21.f = Sc.f * Su21.f;
2046  Su23.f = Sc.f * Su23.f;
2047  Su21.f = __fadd_rn(Su21.f, Stmp2.f);
2048  Su23.f = __fsub_rn(Su23.f, Stmp1.f);
2049 
2050  Stmp1.f = Ss.f * Su31.f;
2051  Stmp2.f = Ss.f * Su33.f;
2052  Su31.f = Sc.f * Su31.f;
2053  Su33.f = Sc.f * Su33.f;
2054  Su31.f = __fadd_rn(Su31.f, Stmp2.f);
2055  Su33.f = __fsub_rn(Su33.f, Stmp1.f);
2056 
2057  // Third Givens rotation
2058 
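// Last rotation, in the (2,3) plane, zeroes a32 against a22. Afterwards A
// is upper triangular; its remaining off-diagonal entries are negligible
// (B's columns were already near-orthogonal) and are discarded when S is
// written out below.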
2059  Ssh.f = Sa32.f * Sa32.f;
2060  Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
2061  Ssh.ui = Ssh.ui & Sa32.ui;
2062 
2063  Stmp5.f = 0.f;
2064  Sch.f = __fsub_rn(Stmp5.f, Sa22.f);
2065  Sch.f = max(Sch.f, Sa22.f);
2066  Sch.f = max(Sch.f, gsmall_number);
2067  Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;
2068 
2069  Stmp1.f = Sch.f * Sch.f;
2070  Stmp2.f = Ssh.f * Ssh.f;
2071  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
2072  Stmp1.f = __frsqrt_rn(Stmp2.f);
2073 
2074  Stmp4.f = Stmp1.f * 0.5f;
2075  Stmp3.f = Stmp1.f * Stmp4.f;
2076  Stmp3.f = Stmp1.f * Stmp3.f;
2077  Stmp3.f = Stmp2.f * Stmp3.f;
2078  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
2079  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
2080  Stmp1.f = Stmp1.f * Stmp2.f;
2081 
2082  Sch.f = __fadd_rn(Sch.f, Stmp1.f);
2083 
2084  Stmp1.ui = ~Stmp5.ui & Ssh.ui;
2085  Stmp2.ui = ~Stmp5.ui & Sch.ui;
2086  Sch.ui = Stmp5.ui & Sch.ui;
2087  Ssh.ui = Stmp5.ui & Ssh.ui;
2088  Sch.ui = Sch.ui | Stmp1.ui;
2089  Ssh.ui = Ssh.ui | Stmp2.ui;
2090 
2091  Stmp1.f = Sch.f * Sch.f;
2092  Stmp2.f = Ssh.f * Ssh.f;
2093  Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
2094  Stmp1.f = __frsqrt_rn(Stmp2.f);
2095 
2096  Stmp4.f = Stmp1.f * 0.5f;
2097  Stmp3.f = Stmp1.f * Stmp4.f;
2098  Stmp3.f = Stmp1.f * Stmp3.f;
2099  Stmp3.f = Stmp2.f * Stmp3.f;
2100  Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
2101  Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
2102 
2103  Sch.f = Sch.f * Stmp1.f;
2104  Ssh.f = Ssh.f * Stmp1.f;
2105 
2106  Sc.f = Sch.f * Sch.f;
2107  Ss.f = Ssh.f * Ssh.f;
2108  Sc.f = __fsub_rn(Sc.f, Ss.f);
2109  Ss.f = Ssh.f * Sch.f;
2110  Ss.f = __fadd_rn(Ss.f, Ss.f);
2111 
2112  //###########################################################
2113  // Rotate matrix A
2114  //###########################################################
2115 
2116  Stmp1.f = Ss.f * Sa21.f;
2117  Stmp2.f = Ss.f * Sa31.f;
2118  Sa21.f = Sc.f * Sa21.f;
2119  Sa31.f = Sc.f * Sa31.f;
2120  Sa21.f = __fadd_rn(Sa21.f, Stmp2.f);
2121  Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);
2122 
2123  Stmp1.f = Ss.f * Sa22.f;
2124  Stmp2.f = Ss.f * Sa32.f;
2125  Sa22.f = Sc.f * Sa22.f;
2126  Sa32.f = Sc.f * Sa32.f;
2127  Sa22.f = __fadd_rn(Sa22.f, Stmp2.f);
2128  Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);
2129 
2130  Stmp1.f = Ss.f * Sa23.f;
2131  Stmp2.f = Ss.f * Sa33.f;
2132  Sa23.f = Sc.f * Sa23.f;
2133  Sa33.f = Sc.f * Sa33.f;
2134  Sa23.f = __fadd_rn(Sa23.f, Stmp2.f);
2135  Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);
2136 
2137  //###########################################################
2138  // Update matrix U
2139  //###########################################################
2140 
2141  Stmp1.f = Ss.f * Su12.f;
2142  Stmp2.f = Ss.f * Su13.f;
2143  Su12.f = Sc.f * Su12.f;
2144  Su13.f = Sc.f * Su13.f;
2145  Su12.f = __fadd_rn(Su12.f, Stmp2.f);
2146  Su13.f = __fsub_rn(Su13.f, Stmp1.f);
2147 
2148  Stmp1.f = Ss.f * Su22.f;
2149  Stmp2.f = Ss.f * Su23.f;
2150  Su22.f = Sc.f * Su22.f;
2151  Su23.f = Sc.f * Su23.f;
2152  Su22.f = __fadd_rn(Su22.f, Stmp2.f);
2153  Su23.f = __fsub_rn(Su23.f, Stmp1.f);
2154 
2155  Stmp1.f = Ss.f * Su32.f;
2156  Stmp2.f = Ss.f * Su33.f;
2157  Su32.f = Sc.f * Su32.f;
2158  Su33.f = Sc.f * Su33.f;
2159  Su32.f = __fadd_rn(Su32.f, Stmp2.f);
2160  Su33.f = __fsub_rn(Su33.f, Stmp1.f);
2161 
2162  V_3x3[0] = Sv11.f;
2163  V_3x3[1] = Sv12.f;
2164  V_3x3[2] = Sv13.f;
2165  V_3x3[3] = Sv21.f;
2166  V_3x3[4] = Sv22.f;
2167  V_3x3[5] = Sv23.f;
2168  V_3x3[6] = Sv31.f;
2169  V_3x3[7] = Sv32.f;
2170  V_3x3[8] = Sv33.f;
2171 
2172  U_3x3[0] = Su11.f;
2173  U_3x3[1] = Su12.f;
2174  U_3x3[2] = Su13.f;
2175  U_3x3[3] = Su21.f;
2176  U_3x3[4] = Su22.f;
2177  U_3x3[5] = Su23.f;
2178  U_3x3[6] = Su31.f;
2179  U_3x3[7] = Su32.f;
2180  U_3x3[8] = Su33.f;
2181 
2182  S_3x1[0] = Sa11.f;
2183  // s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
2184  S_3x1[1] = Sa22.f;
2185  // s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
2186  S_3x1[2] = Sa33.f;
2187 }
2188 
2189 template <typename scalar_t>
2190 OPEN3D_DEVICE OPEN3D_FORCE_INLINE void solve_svd3x3(
2191  const scalar_t *A_3x3, // input A {3,3}
2192  const scalar_t *B_3x1, // input b {3,1}
2193  scalar_t *X_3x1) // output x {3,1}
2194 {
2195  scalar_t U[9];
2196  scalar_t V[9];
2197  scalar_t S[3];
2198  svd3x3(A_3x3, U, S, V);
2199 
2200  //###########################################################
2201  // Sigma^+
2202  //###########################################################
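// Invert the diagonal of Sigma, treating singular values below epsilon as
// zero. This forms the Moore-Penrose pseudo-inverse, so the solve below
// still returns the minimum-norm least-squares solution when A is singular
// or ill-conditioned.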
2203  const scalar_t epsilon = 1e-10;
2204  S[0] = abs(S[0]) < epsilon ? 0 : 1.0 / S[0];
2205  S[1] = abs(S[1]) < epsilon ? 0 : 1.0 / S[1];
2206  S[2] = abs(S[2]) < epsilon ? 0 : 1.0 / S[2];
2207 
2208  //###########################################################
2209  // (Sigma^+) * UT
2210  //###########################################################
2211  scalar_t S_UT[9];
2212 
2213  S_UT[0] = U[0] * S[0];
2214  S_UT[1] = U[3] * S[0];
2215  S_UT[2] = U[6] * S[0];
2216  S_UT[3] = U[1] * S[1];
2217  S_UT[4] = U[4] * S[1];
2218  S_UT[5] = U[7] * S[1];
2219  S_UT[6] = U[2] * S[2];
2220  S_UT[7] = U[5] * S[2];
2221  S_UT[8] = U[8] * S[2];
2222 
2223  //###########################################################
2224  // Ainv = V * [(Sigma^+) * UT]
2225  //###########################################################
2226  scalar_t Ainv[9] = {0};
2227  matmul3x3_3x3(V, S_UT, Ainv);
2228 
2229  //###########################################################
2230  // x = Ainv * b
2231  //###########################################################
2232 
2233  matmul3x3_3x1(Ainv, B_3x1, X_3x1);
2234 }
2235 
2236 } // namespace kernel
2237 } // namespace linalg
2238 } // namespace core
2239 } // namespace open3d
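// ----------------------------------------------------------------------------
// A minimal usage sketch (not part of the header itself): solving A x = b on
// the host with the double instantiation. The matrix, right-hand side, and
// include path are illustrative assumptions only.
//
// #include "open3d/core/linalg/kernel/SVD3x3.h"
//
// int main() {
//     // Row-major 3x3 system; nonsingular, so A^+ b is the exact solution.
//     const double A[9] = {4.0, 1.0, 0.0,
//                          1.0, 3.0, 1.0,
//                          0.0, 1.0, 2.0};
//     const double b[3] = {1.0, 2.0, 3.0};
//     double x[3] = {0.0, 0.0, 0.0};
//     open3d::core::linalg::kernel::solve_svd3x3(A, b, x);
//     return 0;
// }
// ----------------------------------------------------------------------------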