Open3D (C++ API)  0.13.0
SVD3x3CPU.h
/**************************************************************************
**
** svd3
**
** Quick singular value decomposition as described by:
** A. McAdams, A. Selle, R. Tamstorf, J. Teran and E. Sifakis,
** Computing the Singular Value Decomposition of 3x3 matrices
** with minimal branching and elementary floating point operations,
** University of Wisconsin - Madison technical report TR1690, May 2011
**
** GPU version implemented by: Kui Wu
** kwu@cs.utah.edu
** Ported back to the CPU by: Wei Dong
**
** May 2018
**
**************************************************************************/
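
// The routine below follows the pipeline of the paper: form the normal
// equations matrix A^T * A, diagonalize it with a fixed number of Jacobi
// rotation sweeps (accumulated as a quaternion) to obtain V, sort the
// singular values, and recover U and the singular values from a Givens QR
// factorization of A * V.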

#pragma once

#include <math.h>

// The integer literals below are IEEE-754 bit patterns of float constants:
// 1065353216 = 0x3F800000 is 1.0f, and the next two encode sin(pi/8) and
// cos(pi/8); gfour_gamma_squared is approximately (sqrt(2) + 1)^2.
#define gone 1065353216
#define gsine_pi_over_eight 1053028117

#define gcosine_pi_over_eight 1064076127
#define gone_half 0.5f
#define gsmall_number 1.e-12f
#define gtiny_number 1.e-20f
#define gfour_gamma_squared 5.8284273147583007813f
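
// Branchless select idiom used throughout this file: a comparison produces
// an all-ones or all-zero mask, and the mask picks between two bit
// patterns, e.g.
//   mask.ui = (cond) ? 0xffffffff : 0;
//   x.ui = (mask.ui & a.ui) | (~mask.ui & b.ui);  // x = cond ? a : b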

union un {
    float f;
    unsigned int ui;
};
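
// 'un' gives the same 32 bits both a float view and an integer view, which
// is what lets the mask trick above be applied to floating point values.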

// CPU stand-ins for the CUDA intrinsics used by the original GPU version
// (round-to-nearest add/sub and reciprocal square root).
#define max(x, y) ((x) > (y) ? (x) : (y))
#define __fadd_rn(x, y) ((x) + (y))
#define __fsub_rn(x, y) ((x) - (y))
#define __frsqrt_rn(x) (1.0 / sqrt(x))

inline void svd(// input A
                float a11, float a12, float a13,
                float a21, float a22, float a23,
                float a31, float a32, float a33,
                // output U
                float &u11, float &u12, float &u13,
                float &u21, float &u22, float &u23,
                float &u31, float &u32, float &u33,
                // output S
                float &s11, float &s22, float &s33,
                // output V
                float &v11, float &v12, float &v13,
                float &v21, float &v22, float &v23,
                float &v31, float &v32, float &v33) {
    un Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
    un Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
    un Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
    un Sc, Ss, Sch, Ssh;
    un Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
    un Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
    un Sqvs, Sqvvx, Sqvvy, Sqvvz;

    Sa11.f = a11;
    Sa12.f = a12;
    Sa13.f = a13;
    Sa21.f = a21;
    Sa22.f = a22;
    Sa23.f = a23;
    Sa31.f = a31;
    Sa32.f = a32;
    Sa33.f = a33;

    //###########################################################
    // Compute normal equations matrix
    //###########################################################
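    // S = A^T * A is symmetric; its eigenvectors are the right singular
    // vectors of A, so only the lower triangle (s11, s21, s31, s22, s32,
    // s33) needs to be formed.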

    Ss11.f = Sa11.f * Sa11.f;
    Stmp1.f = Sa21.f * Sa21.f;
    Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);
    Stmp1.f = Sa31.f * Sa31.f;
    Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);

    Ss21.f = Sa12.f * Sa11.f;
    Stmp1.f = Sa22.f * Sa21.f;
    Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);
    Stmp1.f = Sa32.f * Sa31.f;
    Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);

    Ss31.f = Sa13.f * Sa11.f;
    Stmp1.f = Sa23.f * Sa21.f;
    Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);
    Stmp1.f = Sa33.f * Sa31.f;
    Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);

    Ss22.f = Sa12.f * Sa12.f;
    Stmp1.f = Sa22.f * Sa22.f;
    Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);
    Stmp1.f = Sa32.f * Sa32.f;
    Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);

    Ss32.f = Sa13.f * Sa12.f;
    Stmp1.f = Sa23.f * Sa22.f;
    Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);
    Stmp1.f = Sa33.f * Sa32.f;
    Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);

    Ss33.f = Sa13.f * Sa13.f;
    Stmp1.f = Sa23.f * Sa23.f;
    Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);
    Stmp1.f = Sa33.f * Sa33.f;
    Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);

    Sqvs.f = 1.f;
    Sqvvx.f = 0.f;
    Sqvvy.f = 0.f;
    Sqvvz.f = 0.f;

    //###########################################################
    // Solve symmetric eigenproblem using Jacobi iteration
    //###########################################################
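    // Each of the 4 sweeps applies three Givens conjugations, one per
    // off-diagonal element (s21, s32, s31). When the exact half-angle pair
    // would exceed the paper's bound, it is replaced by the fixed
    // (sin(pi/8), cos(pi/8)) pair, and the accumulated rotation is kept in
    // quaternion form (Sqvs, Sqvvx, Sqvvy, Sqvvz).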
    for (int i = 0; i < 4; i++) {
        Ssh.f = Ss21.f * 0.5f;
        Stmp5.f = __fsub_rn(Ss11.f, Ss22.f);

        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __frsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
               Sch.f);
#endif
        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Ss33.f = Ss33.f * Stmp3.f;
        Ss31.f = Ss31.f * Stmp3.f;
        Ss32.f = Ss32.f * Stmp3.f;
        Ss33.f = Ss33.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss31.f;
        Stmp2.f = Ss.f * Ss32.f;
        Ss31.f = Sc.f * Ss31.f;
        Ss32.f = Sc.f * Ss32.f;
        Ss31.f = __fadd_rn(Stmp2.f, Ss31.f);
        Ss32.f = __fsub_rn(Ss32.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss22.f * Stmp2.f;
        Stmp3.f = Ss11.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss11.f = Ss11.f * Stmp4.f;
        Ss22.f = Ss22.f * Stmp4.f;
        Ss11.f = __fadd_rn(Ss11.f, Stmp1.f);
        Ss22.f = __fadd_rn(Ss22.f, Stmp3.f);
        Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __fadd_rn(Ss21.f, Ss21.f);
        Ss21.f = Ss21.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss11.f = __fadd_rn(Ss11.f, Stmp2.f);
        Ss21.f = __fsub_rn(Ss21.f, Stmp5.f);
        Ss22.f = __fsub_rn(Ss22.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        Sqvvz.f = __fadd_rn(Sqvvz.f, Ssh.f);
        Sqvs.f = __fsub_rn(Sqvs.f, Stmp3.f);
        Sqvvx.f = __fadd_rn(Sqvvx.f, Stmp2.f);
        Sqvvy.f = __fsub_rn(Sqvvy.f, Stmp1.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
               Sqvs.f);
#endif

        //###########################################################
        // (1->3): second rotation of the sweep, acting on s22, s33, s32
        //###########################################################
        Ssh.f = Ss32.f * 0.5f;
        Stmp5.f = __fsub_rn(Ss22.f, Ss33.f);

        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __frsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
               Sch.f);
#endif

        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Ss11.f = Ss11.f * Stmp3.f;
        Ss21.f = Ss21.f * Stmp3.f;
        Ss31.f = Ss31.f * Stmp3.f;
        Ss11.f = Ss11.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss21.f;
        Stmp2.f = Ss.f * Ss31.f;
        Ss21.f = Sc.f * Ss21.f;
        Ss31.f = Sc.f * Ss31.f;
        Ss21.f = __fadd_rn(Stmp2.f, Ss21.f);
        Ss31.f = __fsub_rn(Ss31.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss33.f * Stmp2.f;
        Stmp3.f = Ss22.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss22.f = Ss22.f * Stmp4.f;
        Ss33.f = Ss33.f * Stmp4.f;
        Ss22.f = __fadd_rn(Ss22.f, Stmp1.f);
        Ss33.f = __fadd_rn(Ss33.f, Stmp3.f);
        Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __fadd_rn(Ss32.f, Ss32.f);
        Ss32.f = Ss32.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss22.f = __fadd_rn(Ss22.f, Stmp2.f);
        Ss32.f = __fsub_rn(Ss32.f, Stmp5.f);
        Ss33.f = __fsub_rn(Ss33.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        Sqvvx.f = __fadd_rn(Sqvvx.f, Ssh.f);
        Sqvs.f = __fsub_rn(Sqvs.f, Stmp1.f);
        Sqvvy.f = __fadd_rn(Sqvvy.f, Stmp3.f);
        Sqvvz.f = __fsub_rn(Sqvvz.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
               Sqvs.f);
#endif
#if 1
        //###########################################################
        // (1->2): third rotation of the sweep, acting on s33, s11, s31
        //###########################################################

        Ssh.f = Ss31.f * 0.5f;
        Stmp5.f = __fsub_rn(Ss33.f, Ss11.f);

        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __frsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
               Sch.f);
#endif

        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Ss22.f = Ss22.f * Stmp3.f;
        Ss32.f = Ss32.f * Stmp3.f;
        Ss21.f = Ss21.f * Stmp3.f;
        Ss22.f = Ss22.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss32.f;
        Stmp2.f = Ss.f * Ss21.f;
        Ss32.f = Sc.f * Ss32.f;
        Ss21.f = Sc.f * Ss21.f;
        Ss32.f = __fadd_rn(Stmp2.f, Ss32.f);
        Ss21.f = __fsub_rn(Ss21.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss11.f * Stmp2.f;
        Stmp3.f = Ss33.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss33.f = Ss33.f * Stmp4.f;
        Ss11.f = Ss11.f * Stmp4.f;
        Ss33.f = __fadd_rn(Ss33.f, Stmp1.f);
        Ss11.f = __fadd_rn(Ss11.f, Stmp3.f);
        Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __fadd_rn(Ss31.f, Ss31.f);
        Ss31.f = Ss31.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss33.f = __fadd_rn(Ss33.f, Stmp2.f);
        Ss31.f = __fsub_rn(Ss31.f, Stmp5.f);
        Ss11.f = __fsub_rn(Ss11.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        Sqvvy.f = __fadd_rn(Sqvvy.f, Ssh.f);
        Sqvs.f = __fsub_rn(Sqvs.f, Stmp2.f);
        Sqvvz.f = __fadd_rn(Sqvvz.f, Stmp1.f);
        Sqvvx.f = __fsub_rn(Sqvvx.f, Stmp3.f);
#endif
    }

    //###########################################################
    // Normalize quaternion for matrix V
    //###########################################################

    Stmp2.f = Sqvs.f * Sqvs.f;
    Stmp1.f = Sqvvx.f * Sqvvx.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = Sqvvy.f * Sqvvy.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = Sqvvz.f * Sqvvz.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);

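    // One Newton-Raphson step refines the approximate reciprocal square
    // root: y' = y * (1.5 - 0.5 * x * y * y), written out with temporaries.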
    Stmp1.f = __frsqrt_rn(Stmp2.f);
    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);

    Sqvs.f = Sqvs.f * Stmp1.f;
    Sqvvx.f = Sqvvx.f * Stmp1.f;
    Sqvvy.f = Sqvvy.f * Stmp1.f;
    Sqvvz.f = Sqvvz.f * Stmp1.f;

    //###########################################################
    // Transform quaternion to matrix V
    //###########################################################
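    // Standard quaternion-to-rotation-matrix conversion:
    // V = (qs^2 - qv.qv) * I + 2 * qv * qv^T + 2 * qs * [qv]_x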

    Stmp1.f = Sqvvx.f * Sqvvx.f;
    Stmp2.f = Sqvvy.f * Sqvvy.f;
    Stmp3.f = Sqvvz.f * Sqvvz.f;
    Sv11.f = Sqvs.f * Sqvs.f;
    Sv22.f = __fsub_rn(Sv11.f, Stmp1.f);
    Sv33.f = __fsub_rn(Sv22.f, Stmp2.f);
    Sv33.f = __fadd_rn(Sv33.f, Stmp3.f);
    Sv22.f = __fadd_rn(Sv22.f, Stmp2.f);
    Sv22.f = __fsub_rn(Sv22.f, Stmp3.f);
    Sv11.f = __fadd_rn(Sv11.f, Stmp1.f);
    Sv11.f = __fsub_rn(Sv11.f, Stmp2.f);
    Sv11.f = __fsub_rn(Sv11.f, Stmp3.f);
    Stmp1.f = __fadd_rn(Sqvvx.f, Sqvvx.f);
    Stmp2.f = __fadd_rn(Sqvvy.f, Sqvvy.f);
    Stmp3.f = __fadd_rn(Sqvvz.f, Sqvvz.f);
    Sv32.f = Sqvs.f * Stmp1.f;
    Sv13.f = Sqvs.f * Stmp2.f;
    Sv21.f = Sqvs.f * Stmp3.f;
    Stmp1.f = Sqvvy.f * Stmp1.f;
    Stmp2.f = Sqvvz.f * Stmp2.f;
    Stmp3.f = Sqvvx.f * Stmp3.f;
    Sv12.f = __fsub_rn(Stmp1.f, Sv21.f);
    Sv23.f = __fsub_rn(Stmp2.f, Sv32.f);
    Sv31.f = __fsub_rn(Stmp3.f, Sv13.f);
    Sv21.f = __fadd_rn(Stmp1.f, Sv21.f);
    Sv32.f = __fadd_rn(Stmp2.f, Sv32.f);
    Sv13.f = __fadd_rn(Stmp3.f, Sv13.f);

    //###########################################################
    // Multiply (from the right) with V
    //###########################################################

    Stmp2.f = Sa12.f;
    Stmp3.f = Sa13.f;
    Sa12.f = Sv12.f * Sa11.f;
    Sa13.f = Sv13.f * Sa11.f;
    Sa11.f = Sv11.f * Sa11.f;
    Stmp1.f = Sv21.f * Stmp2.f;
    Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
    Stmp1.f = Sv31.f * Stmp3.f;
    Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
    Stmp1.f = Sv22.f * Stmp2.f;
    Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
    Stmp1.f = Sv32.f * Stmp3.f;
    Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
    Stmp1.f = Sv23.f * Stmp2.f;
    Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
    Stmp1.f = Sv33.f * Stmp3.f;
    Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);

    Stmp2.f = Sa22.f;
    Stmp3.f = Sa23.f;
    Sa22.f = Sv12.f * Sa21.f;
    Sa23.f = Sv13.f * Sa21.f;
    Sa21.f = Sv11.f * Sa21.f;
    Stmp1.f = Sv21.f * Stmp2.f;
    Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
    Stmp1.f = Sv31.f * Stmp3.f;
    Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
    Stmp1.f = Sv22.f * Stmp2.f;
    Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
    Stmp1.f = Sv32.f * Stmp3.f;
    Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
    Stmp1.f = Sv23.f * Stmp2.f;
    Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
    Stmp1.f = Sv33.f * Stmp3.f;
    Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);

    Stmp2.f = Sa32.f;
    Stmp3.f = Sa33.f;
    Sa32.f = Sv12.f * Sa31.f;
    Sa33.f = Sv13.f * Sa31.f;
    Sa31.f = Sv11.f * Sa31.f;
    Stmp1.f = Sv21.f * Stmp2.f;
    Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
    Stmp1.f = Sv31.f * Stmp3.f;
    Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
    Stmp1.f = Sv22.f * Stmp2.f;
    Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
    Stmp1.f = Sv32.f * Stmp3.f;
    Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
    Stmp1.f = Sv23.f * Stmp2.f;
    Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
    Stmp1.f = Sv33.f * Stmp3.f;
    Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);

    //###########################################################
    // Permute columns such that the singular values are sorted
    //###########################################################
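    // The conditional swaps below are branchless: an all-ones/all-zero mask
    // gates an XOR swap (t = a ^ b; t &= mask; a ^= t; b ^= t), and the same
    // mask turns +1 into -1 to flip the sign of one column so V stays a
    // rotation.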

    Stmp1.f = Sa11.f * Sa11.f;
    Stmp4.f = Sa21.f * Sa21.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp4.f = Sa31.f * Sa31.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);

    Stmp2.f = Sa12.f * Sa12.f;
    Stmp4.f = Sa22.f * Sa22.f;
    Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
    Stmp4.f = Sa32.f * Sa32.f;
    Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);

    Stmp3.f = Sa13.f * Sa13.f;
    Stmp4.f = Sa23.f * Sa23.f;
    Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
    Stmp4.f = Sa33.f * Sa33.f;
    Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);

    // Swap columns 1-2 if necessary

    Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
    Stmp5.ui = Sa11.ui ^ Sa12.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa11.ui = Sa11.ui ^ Stmp5.ui;
    Sa12.ui = Sa12.ui ^ Stmp5.ui;

    Stmp5.ui = Sa21.ui ^ Sa22.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa21.ui = Sa21.ui ^ Stmp5.ui;
    Sa22.ui = Sa22.ui ^ Stmp5.ui;

    Stmp5.ui = Sa31.ui ^ Sa32.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa31.ui = Sa31.ui ^ Stmp5.ui;
    Sa32.ui = Sa32.ui ^ Stmp5.ui;

    Stmp5.ui = Sv11.ui ^ Sv12.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv11.ui = Sv11.ui ^ Stmp5.ui;
    Sv12.ui = Sv12.ui ^ Stmp5.ui;

    Stmp5.ui = Sv21.ui ^ Sv22.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv21.ui = Sv21.ui ^ Stmp5.ui;
    Sv22.ui = Sv22.ui ^ Stmp5.ui;

    Stmp5.ui = Sv31.ui ^ Sv32.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv31.ui = Sv31.ui ^ Stmp5.ui;
    Sv32.ui = Sv32.ui ^ Stmp5.ui;

    Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
    Stmp2.ui = Stmp2.ui ^ Stmp5.ui;

    // If columns 1-2 have been swapped, negate 2nd column of A and V so that
    // V is still a rotation

    Stmp5.f = -2.f;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp4.f = 1.f;
    Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);

    Sa12.f = Sa12.f * Stmp4.f;
    Sa22.f = Sa22.f * Stmp4.f;
    Sa32.f = Sa32.f * Stmp4.f;

    Sv12.f = Sv12.f * Stmp4.f;
    Sv22.f = Sv22.f * Stmp4.f;
    Sv32.f = Sv32.f * Stmp4.f;

    // Swap columns 1-3 if necessary

    Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
    Stmp5.ui = Sa11.ui ^ Sa13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa11.ui = Sa11.ui ^ Stmp5.ui;
    Sa13.ui = Sa13.ui ^ Stmp5.ui;

    Stmp5.ui = Sa21.ui ^ Sa23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa21.ui = Sa21.ui ^ Stmp5.ui;
    Sa23.ui = Sa23.ui ^ Stmp5.ui;

    Stmp5.ui = Sa31.ui ^ Sa33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa31.ui = Sa31.ui ^ Stmp5.ui;
    Sa33.ui = Sa33.ui ^ Stmp5.ui;

    Stmp5.ui = Sv11.ui ^ Sv13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv11.ui = Sv11.ui ^ Stmp5.ui;
    Sv13.ui = Sv13.ui ^ Stmp5.ui;

    Stmp5.ui = Sv21.ui ^ Sv23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv21.ui = Sv21.ui ^ Stmp5.ui;
    Sv23.ui = Sv23.ui ^ Stmp5.ui;

    Stmp5.ui = Sv31.ui ^ Sv33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv31.ui = Sv31.ui ^ Stmp5.ui;
    Sv33.ui = Sv33.ui ^ Stmp5.ui;

    Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
    Stmp3.ui = Stmp3.ui ^ Stmp5.ui;

    // If columns 1-3 have been swapped, negate 1st column of A and V so that
    // V is still a rotation

    Stmp5.f = -2.f;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp4.f = 1.f;
    Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);

    Sa11.f = Sa11.f * Stmp4.f;
    Sa21.f = Sa21.f * Stmp4.f;
    Sa31.f = Sa31.f * Stmp4.f;

    Sv11.f = Sv11.f * Stmp4.f;
    Sv21.f = Sv21.f * Stmp4.f;
    Sv31.f = Sv31.f * Stmp4.f;

    // Swap columns 2-3 if necessary

    Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
    Stmp5.ui = Sa12.ui ^ Sa13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa12.ui = Sa12.ui ^ Stmp5.ui;
    Sa13.ui = Sa13.ui ^ Stmp5.ui;

    Stmp5.ui = Sa22.ui ^ Sa23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa22.ui = Sa22.ui ^ Stmp5.ui;
    Sa23.ui = Sa23.ui ^ Stmp5.ui;

    Stmp5.ui = Sa32.ui ^ Sa33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa32.ui = Sa32.ui ^ Stmp5.ui;
    Sa33.ui = Sa33.ui ^ Stmp5.ui;

    Stmp5.ui = Sv12.ui ^ Sv13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv12.ui = Sv12.ui ^ Stmp5.ui;
    Sv13.ui = Sv13.ui ^ Stmp5.ui;

    Stmp5.ui = Sv22.ui ^ Sv23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv22.ui = Sv22.ui ^ Stmp5.ui;
    Sv23.ui = Sv23.ui ^ Stmp5.ui;

    Stmp5.ui = Sv32.ui ^ Sv33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv32.ui = Sv32.ui ^ Stmp5.ui;
    Sv33.ui = Sv33.ui ^ Stmp5.ui;

    Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
    Stmp3.ui = Stmp3.ui ^ Stmp5.ui;

    // If columns 2-3 have been swapped, negate 3rd column of A and V so that
    // V is still a rotation

    Stmp5.f = -2.f;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp4.f = 1.f;
    Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);

    Sa13.f = Sa13.f * Stmp4.f;
    Sa23.f = Sa23.f * Stmp4.f;
    Sa33.f = Sa33.f * Stmp4.f;

    Sv13.f = Sv13.f * Stmp4.f;
    Sv23.f = Sv23.f * Stmp4.f;
    Sv33.f = Sv33.f * Stmp4.f;

    //###########################################################
    // Construct QR factorization of A*V (=U*D) using Givens rotations
    //###########################################################
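    // Each Givens rotation zeroes one subdiagonal entry of A*V. (ch, sh)
    // start as an unnormalized half-angle cosine/sine pair, are normalized
    // with the same rsqrt-plus-Newton-step pattern as above, and are then
    // converted to (c, s) via the double-angle identities.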

    Su11.f = 1.f;
    Su12.f = 0.f;
    Su13.f = 0.f;
    Su21.f = 0.f;
    Su22.f = 1.f;
    Su23.f = 0.f;
    Su31.f = 0.f;
    Su32.f = 0.f;
    Su33.f = 1.f;

    // First Givens rotation

    Ssh.f = Sa21.f * Sa21.f;
    Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
    Ssh.ui = Ssh.ui & Sa21.ui;

    Stmp5.f = 0.f;
    Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
    Sch.f = max(Sch.f, Sa11.f);
    Sch.f = max(Sch.f, gsmall_number);
    Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
    Stmp1.f = Stmp1.f * Stmp2.f;

    Sch.f = __fadd_rn(Sch.f, Stmp1.f);

    Stmp1.ui = ~Stmp5.ui & Ssh.ui;
    Stmp2.ui = ~Stmp5.ui & Sch.ui;
    Sch.ui = Stmp5.ui & Sch.ui;
    Ssh.ui = Stmp5.ui & Ssh.ui;
    Sch.ui = Sch.ui | Stmp1.ui;
    Ssh.ui = Ssh.ui | Stmp2.ui;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);

    Sch.f = Sch.f * Stmp1.f;
    Ssh.f = Ssh.f * Stmp1.f;

    Sc.f = Sch.f * Sch.f;
    Ss.f = Ssh.f * Ssh.f;
    Sc.f = __fsub_rn(Sc.f, Ss.f);
    Ss.f = Ssh.f * Sch.f;
    Ss.f = __fadd_rn(Ss.f, Ss.f);

    //###########################################################
    // Rotate matrix A
    //###########################################################

    Stmp1.f = Ss.f * Sa11.f;
    Stmp2.f = Ss.f * Sa21.f;
    Sa11.f = Sc.f * Sa11.f;
    Sa21.f = Sc.f * Sa21.f;
    Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
    Sa21.f = __fsub_rn(Sa21.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa12.f;
    Stmp2.f = Ss.f * Sa22.f;
    Sa12.f = Sc.f * Sa12.f;
    Sa22.f = Sc.f * Sa22.f;
    Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
    Sa22.f = __fsub_rn(Sa22.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa13.f;
    Stmp2.f = Ss.f * Sa23.f;
    Sa13.f = Sc.f * Sa13.f;
    Sa23.f = Sc.f * Sa23.f;
    Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
    Sa23.f = __fsub_rn(Sa23.f, Stmp1.f);

    //###########################################################
    // Update matrix U
    //###########################################################

    Stmp1.f = Ss.f * Su11.f;
    Stmp2.f = Ss.f * Su12.f;
    Su11.f = Sc.f * Su11.f;
    Su12.f = Sc.f * Su12.f;
    Su11.f = __fadd_rn(Su11.f, Stmp2.f);
    Su12.f = __fsub_rn(Su12.f, Stmp1.f);

    Stmp1.f = Ss.f * Su21.f;
    Stmp2.f = Ss.f * Su22.f;
    Su21.f = Sc.f * Su21.f;
    Su22.f = Sc.f * Su22.f;
    Su21.f = __fadd_rn(Su21.f, Stmp2.f);
    Su22.f = __fsub_rn(Su22.f, Stmp1.f);

    Stmp1.f = Ss.f * Su31.f;
    Stmp2.f = Ss.f * Su32.f;
    Su31.f = Sc.f * Su31.f;
    Su32.f = Sc.f * Su32.f;
    Su31.f = __fadd_rn(Su31.f, Stmp2.f);
    Su32.f = __fsub_rn(Su32.f, Stmp1.f);

    // Second Givens rotation

    Ssh.f = Sa31.f * Sa31.f;
    Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
    Ssh.ui = Ssh.ui & Sa31.ui;

    Stmp5.f = 0.f;
    Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
    Sch.f = max(Sch.f, Sa11.f);
    Sch.f = max(Sch.f, gsmall_number);
    Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
    Stmp1.f = Stmp1.f * Stmp2.f;

    Sch.f = __fadd_rn(Sch.f, Stmp1.f);

    Stmp1.ui = ~Stmp5.ui & Ssh.ui;
    Stmp2.ui = ~Stmp5.ui & Sch.ui;
    Sch.ui = Stmp5.ui & Sch.ui;
    Ssh.ui = Stmp5.ui & Ssh.ui;
    Sch.ui = Sch.ui | Stmp1.ui;
    Ssh.ui = Ssh.ui | Stmp2.ui;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);

    Sch.f = Sch.f * Stmp1.f;
    Ssh.f = Ssh.f * Stmp1.f;

    Sc.f = Sch.f * Sch.f;
    Ss.f = Ssh.f * Ssh.f;
    Sc.f = __fsub_rn(Sc.f, Ss.f);
    Ss.f = Ssh.f * Sch.f;
    Ss.f = __fadd_rn(Ss.f, Ss.f);

    //###########################################################
    // Rotate matrix A
    //###########################################################

    Stmp1.f = Ss.f * Sa11.f;
    Stmp2.f = Ss.f * Sa31.f;
    Sa11.f = Sc.f * Sa11.f;
    Sa31.f = Sc.f * Sa31.f;
    Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
    Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa12.f;
    Stmp2.f = Ss.f * Sa32.f;
    Sa12.f = Sc.f * Sa12.f;
    Sa32.f = Sc.f * Sa32.f;
    Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
    Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa13.f;
    Stmp2.f = Ss.f * Sa33.f;
    Sa13.f = Sc.f * Sa13.f;
    Sa33.f = Sc.f * Sa33.f;
    Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
    Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);

    //###########################################################
    // Update matrix U
    //###########################################################

    Stmp1.f = Ss.f * Su11.f;
    Stmp2.f = Ss.f * Su13.f;
    Su11.f = Sc.f * Su11.f;
    Su13.f = Sc.f * Su13.f;
    Su11.f = __fadd_rn(Su11.f, Stmp2.f);
    Su13.f = __fsub_rn(Su13.f, Stmp1.f);

    Stmp1.f = Ss.f * Su21.f;
    Stmp2.f = Ss.f * Su23.f;
    Su21.f = Sc.f * Su21.f;
    Su23.f = Sc.f * Su23.f;
    Su21.f = __fadd_rn(Su21.f, Stmp2.f);
    Su23.f = __fsub_rn(Su23.f, Stmp1.f);

    Stmp1.f = Ss.f * Su31.f;
    Stmp2.f = Ss.f * Su33.f;
    Su31.f = Sc.f * Su31.f;
    Su33.f = Sc.f * Su33.f;
    Su31.f = __fadd_rn(Su31.f, Stmp2.f);
    Su33.f = __fsub_rn(Su33.f, Stmp1.f);

    // Third Givens rotation

    Ssh.f = Sa32.f * Sa32.f;
    Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
    Ssh.ui = Ssh.ui & Sa32.ui;

    Stmp5.f = 0.f;
    Sch.f = __fsub_rn(Stmp5.f, Sa22.f);
    Sch.f = max(Sch.f, Sa22.f);
    Sch.f = max(Sch.f, gsmall_number);
    Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
    Stmp1.f = Stmp1.f * Stmp2.f;

    Sch.f = __fadd_rn(Sch.f, Stmp1.f);

    Stmp1.ui = ~Stmp5.ui & Ssh.ui;
    Stmp2.ui = ~Stmp5.ui & Sch.ui;
    Sch.ui = Stmp5.ui & Sch.ui;
    Ssh.ui = Stmp5.ui & Ssh.ui;
    Sch.ui = Sch.ui | Stmp1.ui;
    Ssh.ui = Ssh.ui | Stmp2.ui;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);

    Sch.f = Sch.f * Stmp1.f;
    Ssh.f = Ssh.f * Stmp1.f;

    Sc.f = Sch.f * Sch.f;
    Ss.f = Ssh.f * Ssh.f;
    Sc.f = __fsub_rn(Sc.f, Ss.f);
    Ss.f = Ssh.f * Sch.f;
    Ss.f = __fadd_rn(Ss.f, Ss.f);

    //###########################################################
    // Rotate matrix A
    //###########################################################

    Stmp1.f = Ss.f * Sa21.f;
    Stmp2.f = Ss.f * Sa31.f;
    Sa21.f = Sc.f * Sa21.f;
    Sa31.f = Sc.f * Sa31.f;
    Sa21.f = __fadd_rn(Sa21.f, Stmp2.f);
    Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa22.f;
    Stmp2.f = Ss.f * Sa32.f;
    Sa22.f = Sc.f * Sa22.f;
    Sa32.f = Sc.f * Sa32.f;
    Sa22.f = __fadd_rn(Sa22.f, Stmp2.f);
    Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa23.f;
    Stmp2.f = Ss.f * Sa33.f;
    Sa23.f = Sc.f * Sa23.f;
    Sa33.f = Sc.f * Sa33.f;
    Sa23.f = __fadd_rn(Sa23.f, Stmp2.f);
    Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);

    //###########################################################
    // Update matrix U
    //###########################################################

    Stmp1.f = Ss.f * Su12.f;
    Stmp2.f = Ss.f * Su13.f;
    Su12.f = Sc.f * Su12.f;
    Su13.f = Sc.f * Su13.f;
    Su12.f = __fadd_rn(Su12.f, Stmp2.f);
    Su13.f = __fsub_rn(Su13.f, Stmp1.f);

    Stmp1.f = Ss.f * Su22.f;
    Stmp2.f = Ss.f * Su23.f;
    Su22.f = Sc.f * Su22.f;
    Su23.f = Sc.f * Su23.f;
    Su22.f = __fadd_rn(Su22.f, Stmp2.f);
    Su23.f = __fsub_rn(Su23.f, Stmp1.f);

    Stmp1.f = Ss.f * Su32.f;
    Stmp2.f = Ss.f * Su33.f;
    Su32.f = Sc.f * Su32.f;
    Su33.f = Sc.f * Su33.f;
    Su32.f = __fadd_rn(Su32.f, Stmp2.f);
    Su33.f = __fsub_rn(Su33.f, Stmp1.f);

    v11 = Sv11.f;
    v12 = Sv12.f;
    v13 = Sv13.f;
    v21 = Sv21.f;
    v22 = Sv22.f;
    v23 = Sv23.f;
    v31 = Sv31.f;
    v32 = Sv32.f;
    v33 = Sv33.f;

    u11 = Su11.f;
    u12 = Su12.f;
    u13 = Su13.f;
    u21 = Su21.f;
    u22 = Su22.f;
    u23 = Su23.f;
    u31 = Su31.f;
    u32 = Su32.f;
    u33 = Su33.f;

    s11 = Sa11.f;
    // s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
    s22 = Sa22.f;
    // s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
    s33 = Sa33.f;
}
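
A minimal usage sketch (an editor's addition, not part of the original header): it assumes this file is reachable as "SVD3x3CPU.h" and checks that U * diag(s11, s22, s33) * V^T reproduces the input; the test matrix and the expected error magnitude are illustrative only.

#include <stdio.h>
#include "SVD3x3CPU.h"

int main() {
    float u[3][3], v[3][3], s[3];
    // An arbitrary test matrix.
    float a[3][3] = {{2.f, -1.f, 0.f}, {4.f, 3.f, -2.f}, {1.f, 0.f, 5.f}};
    svd(a[0][0], a[0][1], a[0][2], a[1][0], a[1][1], a[1][2],
        a[2][0], a[2][1], a[2][2],
        u[0][0], u[0][1], u[0][2], u[1][0], u[1][1], u[1][2],
        u[2][0], u[2][1], u[2][2],
        s[0], s[1], s[2],
        v[0][0], v[0][1], v[0][2], v[1][0], v[1][1], v[1][2],
        v[2][0], v[2][1], v[2][2]);
    float max_err = 0.f;
    for (int i = 0; i < 3; i++)
        for (int j = 0; j < 3; j++) {
            // (U * diag(s) * V^T)_ij = sum_k u_ik * s_k * v_jk
            float r = 0.f;
            for (int k = 0; k < 3; k++) r += u[i][k] * s[k] * v[j][k];
            max_err = max(max_err, fabsf(r - a[i][j]));
        }
    // Single-precision arithmetic: expect roughly 1e-6-level error for
    // well-conditioned inputs.
    printf("max reconstruction error: %g\n", max_err);
    return 0;
}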