// Open3D (C++ API) 0.16.0 — SVD3x3.h
// (Doxygen page header retained as a comment.)
// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// The MIT License (MIT)
//
// Copyright (c) 2018-2021 www.open3d.org
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// ----------------------------------------------------------------------------
/**************************************************************************
**
**  svd3
**
**  Quick singular value decomposition as described by:
**  A. McAdams, A. Selle, R. Tamstorf, J. Teran and E. Sifakis,
**  Computing the Singular Value Decomposition of 3x3 matrices
**  with minimal branching and elementary floating point operations,
**  University of Wisconsin - Madison technical report TR1690, May 2011
**
**  Implemented by: Kui Wu
**  kwu@cs.utah.edu
**  Modified by: Wei Dong and Rishabh Singh
**
**  May 2018
**
**************************************************************************/
43
44#pragma once
45
46#include <cmath>
47
50
51#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
52#include <cuda.h>
53#endif
54
55#include "math.h"
57
58#define gone 1065353216
59#define gsine_pi_over_eight 1053028117
60
61#define gcosine_pi_over_eight 1064076127
62#define gtiny_number 1.e-20
63#define gfour_gamma_squared 5.8284273147583007813
64
65#ifndef __CUDACC__
66using std::abs;
67using std::max;
68using std::sqrt;
69#endif
70
71#if defined(BUILD_CUDA_MODULE) && defined(__CUDACC__)
72#define __fadd_rn(x, y) __fadd_rn(x, y)
73#define __fsub_rn(x, y) __fsub_rn(x, y)
74#define __frsqrt_rn(x) __frsqrt_rn(x)
75
76#define __dadd_rn(x, y) __dadd_rn(x, y)
77#define __dsub_rn(x, y) __dsub_rn(x, y)
78#define __drsqrt_rn(x) __drcp_rn(__dsqrt_rn(x))
79#else
80
81#define __fadd_rn(x, y) (x + y)
82#define __fsub_rn(x, y) (x - y)
83#define __frsqrt_rn(x) (1.0 / sqrt(x))
84
85#define __dadd_rn(x, y) (x + y)
86#define __dsub_rn(x, y) (x - y)
87#define __drsqrt_rn(x) (1.0 / sqrt(x))
88
89#define __add_rn(x, y) (x + y)
90#define __sub_rn(x, y) (x - y)
91#define __rsqrt_rn(x) (1.0 / sqrt(x))
92#endif
93
94namespace open3d {
95namespace core {
96namespace linalg {
97namespace kernel {
98
99template <typename scalar_t>
100union un {
101 scalar_t f;
102 unsigned int ui;
103};
104
105template <typename scalar_t>
106OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3(const scalar_t *A_3x3,
107 scalar_t *U_3x3,
108 scalar_t *S_3x1,
109 scalar_t *V_3x3);
110
111template <>
113 double *U_3x3,
114 double *S_3x1,
115 double *V_3x3) {
116 double gsmall_number = 1.e-12;
117
118 un<double> Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
119 un<double> Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
120 un<double> Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
121 un<double> Sc, Ss, Sch, Ssh;
122 un<double> Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
123 un<double> Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
124 un<double> Sqvs, Sqvvx, Sqvvy, Sqvvz;
125
126 Sa11.f = A_3x3[0];
127 Sa12.f = A_3x3[1];
128 Sa13.f = A_3x3[2];
129 Sa21.f = A_3x3[3];
130 Sa22.f = A_3x3[4];
131 Sa23.f = A_3x3[5];
132 Sa31.f = A_3x3[6];
133 Sa32.f = A_3x3[7];
134 Sa33.f = A_3x3[8];
135
136 //###########################################################
137 // Compute normal equations matrix
138 //###########################################################
139
140 Ss11.f = Sa11.f * Sa11.f;
141 Stmp1.f = Sa21.f * Sa21.f;
142 Ss11.f = __dadd_rn(Stmp1.f, Ss11.f);
143 Stmp1.f = Sa31.f * Sa31.f;
144 Ss11.f = __dadd_rn(Stmp1.f, Ss11.f);
145
146 Ss21.f = Sa12.f * Sa11.f;
147 Stmp1.f = Sa22.f * Sa21.f;
148 Ss21.f = __dadd_rn(Stmp1.f, Ss21.f);
149 Stmp1.f = Sa32.f * Sa31.f;
150 Ss21.f = __dadd_rn(Stmp1.f, Ss21.f);
151
152 Ss31.f = Sa13.f * Sa11.f;
153 Stmp1.f = Sa23.f * Sa21.f;
154 Ss31.f = __dadd_rn(Stmp1.f, Ss31.f);
155 Stmp1.f = Sa33.f * Sa31.f;
156 Ss31.f = __dadd_rn(Stmp1.f, Ss31.f);
157
158 Ss22.f = Sa12.f * Sa12.f;
159 Stmp1.f = Sa22.f * Sa22.f;
160 Ss22.f = __dadd_rn(Stmp1.f, Ss22.f);
161 Stmp1.f = Sa32.f * Sa32.f;
162 Ss22.f = __dadd_rn(Stmp1.f, Ss22.f);
163
164 Ss32.f = Sa13.f * Sa12.f;
165 Stmp1.f = Sa23.f * Sa22.f;
166 Ss32.f = __dadd_rn(Stmp1.f, Ss32.f);
167 Stmp1.f = Sa33.f * Sa32.f;
168 Ss32.f = __dadd_rn(Stmp1.f, Ss32.f);
169
170 Ss33.f = Sa13.f * Sa13.f;
171 Stmp1.f = Sa23.f * Sa23.f;
172 Ss33.f = __dadd_rn(Stmp1.f, Ss33.f);
173 Stmp1.f = Sa33.f * Sa33.f;
174 Ss33.f = __dadd_rn(Stmp1.f, Ss33.f);
175
176 Sqvs.f = 1.f;
177 Sqvvx.f = 0.f;
178 Sqvvy.f = 0.f;
179 Sqvvz.f = 0.f;
180
181 //###########################################################
182 // Solve symmetric eigenproblem using Jacobi iteration
183 //###########################################################
184 for (int i = 0; i < 4; i++) {
185 Ssh.f = Ss21.f * 0.5f;
186 Stmp5.f = __dsub_rn(Ss11.f, Ss22.f);
187
188 Stmp2.f = Ssh.f * Ssh.f;
189 Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
190 Ssh.ui = Stmp1.ui & Ssh.ui;
191 Sch.ui = Stmp1.ui & Stmp5.ui;
192 Stmp2.ui = ~Stmp1.ui & gone;
193 Sch.ui = Sch.ui | Stmp2.ui;
194
195 Stmp1.f = Ssh.f * Ssh.f;
196 Stmp2.f = Sch.f * Sch.f;
197 Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
198 Stmp4.f = __drsqrt_rn(Stmp3.f);
199
200 Ssh.f = Stmp4.f * Ssh.f;
201 Sch.f = Stmp4.f * Sch.f;
202 Stmp1.f = gfour_gamma_squared * Stmp1.f;
203 Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
204
205 Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
206 Ssh.ui = ~Stmp1.ui & Ssh.ui;
207 Ssh.ui = Ssh.ui | Stmp2.ui;
208 Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
209 Sch.ui = ~Stmp1.ui & Sch.ui;
210 Sch.ui = Sch.ui | Stmp2.ui;
211
212 Stmp1.f = Ssh.f * Ssh.f;
213 Stmp2.f = Sch.f * Sch.f;
214 Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
215 Ss.f = Sch.f * Ssh.f;
216 Ss.f = __dadd_rn(Ss.f, Ss.f);
217
218#ifdef DEBUG_JACOBI_CONJUGATE
219 printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
220 Sch.f);
221#endif
222 //###########################################################
223 // Perform the actual Givens conjugation
224 //###########################################################
225
226 Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
227 Ss33.f = Ss33.f * Stmp3.f;
228 Ss31.f = Ss31.f * Stmp3.f;
229 Ss32.f = Ss32.f * Stmp3.f;
230 Ss33.f = Ss33.f * Stmp3.f;
231
232 Stmp1.f = Ss.f * Ss31.f;
233 Stmp2.f = Ss.f * Ss32.f;
234 Ss31.f = Sc.f * Ss31.f;
235 Ss32.f = Sc.f * Ss32.f;
236 Ss31.f = __dadd_rn(Stmp2.f, Ss31.f);
237 Ss32.f = __dsub_rn(Ss32.f, Stmp1.f);
238
239 Stmp2.f = Ss.f * Ss.f;
240 Stmp1.f = Ss22.f * Stmp2.f;
241 Stmp3.f = Ss11.f * Stmp2.f;
242 Stmp4.f = Sc.f * Sc.f;
243 Ss11.f = Ss11.f * Stmp4.f;
244 Ss22.f = Ss22.f * Stmp4.f;
245 Ss11.f = __dadd_rn(Ss11.f, Stmp1.f);
246 Ss22.f = __dadd_rn(Ss22.f, Stmp3.f);
247 Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
248 Stmp2.f = __dadd_rn(Ss21.f, Ss21.f);
249 Ss21.f = Ss21.f * Stmp4.f;
250 Stmp4.f = Sc.f * Ss.f;
251 Stmp2.f = Stmp2.f * Stmp4.f;
252 Stmp5.f = Stmp5.f * Stmp4.f;
253 Ss11.f = __dadd_rn(Ss11.f, Stmp2.f);
254 Ss21.f = __dsub_rn(Ss21.f, Stmp5.f);
255 Ss22.f = __dsub_rn(Ss22.f, Stmp2.f);
256
257#ifdef DEBUG_JACOBI_CONJUGATE
258 printf("%.20g\n", Ss11.f);
259 printf("%.20g %.20g\n", Ss21.f, Ss22.f);
260 printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
261#endif
262
263 //###########################################################
264 // Compute the cumulative rotation, in quaternion form
265 //###########################################################
266
267 Stmp1.f = Ssh.f * Sqvvx.f;
268 Stmp2.f = Ssh.f * Sqvvy.f;
269 Stmp3.f = Ssh.f * Sqvvz.f;
270 Ssh.f = Ssh.f * Sqvs.f;
271
272 Sqvs.f = Sch.f * Sqvs.f;
273 Sqvvx.f = Sch.f * Sqvvx.f;
274 Sqvvy.f = Sch.f * Sqvvy.f;
275 Sqvvz.f = Sch.f * Sqvvz.f;
276
277 Sqvvz.f = __dadd_rn(Sqvvz.f, Ssh.f);
278 Sqvs.f = __dsub_rn(Sqvs.f, Stmp3.f);
279 Sqvvx.f = __dadd_rn(Sqvvx.f, Stmp2.f);
280 Sqvvy.f = __dsub_rn(Sqvvy.f, Stmp1.f);
281
282#ifdef DEBUG_JACOBI_CONJUGATE
283 printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
284 Sqvs.f);
285#endif
286
288 // (1->3)
290 Ssh.f = Ss32.f * 0.5f;
291 Stmp5.f = __dsub_rn(Ss22.f, Ss33.f);
292
293 Stmp2.f = Ssh.f * Ssh.f;
294 Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
295 Ssh.ui = Stmp1.ui & Ssh.ui;
296 Sch.ui = Stmp1.ui & Stmp5.ui;
297 Stmp2.ui = ~Stmp1.ui & gone;
298 Sch.ui = Sch.ui | Stmp2.ui;
299
300 Stmp1.f = Ssh.f * Ssh.f;
301 Stmp2.f = Sch.f * Sch.f;
302 Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
303 Stmp4.f = __drsqrt_rn(Stmp3.f);
304
305 Ssh.f = Stmp4.f * Ssh.f;
306 Sch.f = Stmp4.f * Sch.f;
307 Stmp1.f = gfour_gamma_squared * Stmp1.f;
308 Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
309
310 Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
311 Ssh.ui = ~Stmp1.ui & Ssh.ui;
312 Ssh.ui = Ssh.ui | Stmp2.ui;
313 Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
314 Sch.ui = ~Stmp1.ui & Sch.ui;
315 Sch.ui = Sch.ui | Stmp2.ui;
316
317 Stmp1.f = Ssh.f * Ssh.f;
318 Stmp2.f = Sch.f * Sch.f;
319 Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
320 Ss.f = Sch.f * Ssh.f;
321 Ss.f = __dadd_rn(Ss.f, Ss.f);
322
323#ifdef DEBUG_JACOBI_CONJUGATE
324 printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
325 Sch.f);
326#endif
327
328 //###########################################################
329 // Perform the actual Givens conjugation
330 //###########################################################
331
332 Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
333 Ss11.f = Ss11.f * Stmp3.f;
334 Ss21.f = Ss21.f * Stmp3.f;
335 Ss31.f = Ss31.f * Stmp3.f;
336 Ss11.f = Ss11.f * Stmp3.f;
337
338 Stmp1.f = Ss.f * Ss21.f;
339 Stmp2.f = Ss.f * Ss31.f;
340 Ss21.f = Sc.f * Ss21.f;
341 Ss31.f = Sc.f * Ss31.f;
342 Ss21.f = __dadd_rn(Stmp2.f, Ss21.f);
343 Ss31.f = __dsub_rn(Ss31.f, Stmp1.f);
344
345 Stmp2.f = Ss.f * Ss.f;
346 Stmp1.f = Ss33.f * Stmp2.f;
347 Stmp3.f = Ss22.f * Stmp2.f;
348 Stmp4.f = Sc.f * Sc.f;
349 Ss22.f = Ss22.f * Stmp4.f;
350 Ss33.f = Ss33.f * Stmp4.f;
351 Ss22.f = __dadd_rn(Ss22.f, Stmp1.f);
352 Ss33.f = __dadd_rn(Ss33.f, Stmp3.f);
353 Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
354 Stmp2.f = __dadd_rn(Ss32.f, Ss32.f);
355 Ss32.f = Ss32.f * Stmp4.f;
356 Stmp4.f = Sc.f * Ss.f;
357 Stmp2.f = Stmp2.f * Stmp4.f;
358 Stmp5.f = Stmp5.f * Stmp4.f;
359 Ss22.f = __dadd_rn(Ss22.f, Stmp2.f);
360 Ss32.f = __dsub_rn(Ss32.f, Stmp5.f);
361 Ss33.f = __dsub_rn(Ss33.f, Stmp2.f);
362
363#ifdef DEBUG_JACOBI_CONJUGATE
364 printf("%.20g\n", Ss11.f);
365 printf("%.20g %.20g\n", Ss21.f, Ss22.f);
366 printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
367#endif
368
369 //###########################################################
370 // Compute the cumulative rotation, in quaternion form
371 //###########################################################
372
373 Stmp1.f = Ssh.f * Sqvvx.f;
374 Stmp2.f = Ssh.f * Sqvvy.f;
375 Stmp3.f = Ssh.f * Sqvvz.f;
376 Ssh.f = Ssh.f * Sqvs.f;
377
378 Sqvs.f = Sch.f * Sqvs.f;
379 Sqvvx.f = Sch.f * Sqvvx.f;
380 Sqvvy.f = Sch.f * Sqvvy.f;
381 Sqvvz.f = Sch.f * Sqvvz.f;
382
383 Sqvvx.f = __dadd_rn(Sqvvx.f, Ssh.f);
384 Sqvs.f = __dsub_rn(Sqvs.f, Stmp1.f);
385 Sqvvy.f = __dadd_rn(Sqvvy.f, Stmp3.f);
386 Sqvvz.f = __dsub_rn(Sqvvz.f, Stmp2.f);
387
388#ifdef DEBUG_JACOBI_CONJUGATE
389 printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
390 Sqvs.f);
391#endif
392#if 1
394 // 1 -> 2
396
397 Ssh.f = Ss31.f * 0.5f;
398 Stmp5.f = __dsub_rn(Ss33.f, Ss11.f);
399
400 Stmp2.f = Ssh.f * Ssh.f;
401 Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
402 Ssh.ui = Stmp1.ui & Ssh.ui;
403 Sch.ui = Stmp1.ui & Stmp5.ui;
404 Stmp2.ui = ~Stmp1.ui & gone;
405 Sch.ui = Sch.ui | Stmp2.ui;
406
407 Stmp1.f = Ssh.f * Ssh.f;
408 Stmp2.f = Sch.f * Sch.f;
409 Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
410 Stmp4.f = __drsqrt_rn(Stmp3.f);
411
412 Ssh.f = Stmp4.f * Ssh.f;
413 Sch.f = Stmp4.f * Sch.f;
414 Stmp1.f = gfour_gamma_squared * Stmp1.f;
415 Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
416
417 Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
418 Ssh.ui = ~Stmp1.ui & Ssh.ui;
419 Ssh.ui = Ssh.ui | Stmp2.ui;
420 Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
421 Sch.ui = ~Stmp1.ui & Sch.ui;
422 Sch.ui = Sch.ui | Stmp2.ui;
423
424 Stmp1.f = Ssh.f * Ssh.f;
425 Stmp2.f = Sch.f * Sch.f;
426 Sc.f = __dsub_rn(Stmp2.f, Stmp1.f);
427 Ss.f = Sch.f * Ssh.f;
428 Ss.f = __dadd_rn(Ss.f, Ss.f);
429
430#ifdef DEBUG_JACOBI_CONJUGATE
431 printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
432 Sch.f);
433#endif
434
435 //###########################################################
436 // Perform the actual Givens conjugation
437 //###########################################################
438
439 Stmp3.f = __dadd_rn(Stmp1.f, Stmp2.f);
440 Ss22.f = Ss22.f * Stmp3.f;
441 Ss32.f = Ss32.f * Stmp3.f;
442 Ss21.f = Ss21.f * Stmp3.f;
443 Ss22.f = Ss22.f * Stmp3.f;
444
445 Stmp1.f = Ss.f * Ss32.f;
446 Stmp2.f = Ss.f * Ss21.f;
447 Ss32.f = Sc.f * Ss32.f;
448 Ss21.f = Sc.f * Ss21.f;
449 Ss32.f = __dadd_rn(Stmp2.f, Ss32.f);
450 Ss21.f = __dsub_rn(Ss21.f, Stmp1.f);
451
452 Stmp2.f = Ss.f * Ss.f;
453 Stmp1.f = Ss11.f * Stmp2.f;
454 Stmp3.f = Ss33.f * Stmp2.f;
455 Stmp4.f = Sc.f * Sc.f;
456 Ss33.f = Ss33.f * Stmp4.f;
457 Ss11.f = Ss11.f * Stmp4.f;
458 Ss33.f = __dadd_rn(Ss33.f, Stmp1.f);
459 Ss11.f = __dadd_rn(Ss11.f, Stmp3.f);
460 Stmp4.f = __dsub_rn(Stmp4.f, Stmp2.f);
461 Stmp2.f = __dadd_rn(Ss31.f, Ss31.f);
462 Ss31.f = Ss31.f * Stmp4.f;
463 Stmp4.f = Sc.f * Ss.f;
464 Stmp2.f = Stmp2.f * Stmp4.f;
465 Stmp5.f = Stmp5.f * Stmp4.f;
466 Ss33.f = __dadd_rn(Ss33.f, Stmp2.f);
467 Ss31.f = __dsub_rn(Ss31.f, Stmp5.f);
468 Ss11.f = __dsub_rn(Ss11.f, Stmp2.f);
469
470#ifdef DEBUG_JACOBI_CONJUGATE
471 printf("%.20g\n", Ss11.f);
472 printf("%.20g %.20g\n", Ss21.f, Ss22.f);
473 printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
474#endif
475
476 //###########################################################
477 // Compute the cumulative rotation, in quaternion form
478 //###########################################################
479
480 Stmp1.f = Ssh.f * Sqvvx.f;
481 Stmp2.f = Ssh.f * Sqvvy.f;
482 Stmp3.f = Ssh.f * Sqvvz.f;
483 Ssh.f = Ssh.f * Sqvs.f;
484
485 Sqvs.f = Sch.f * Sqvs.f;
486 Sqvvx.f = Sch.f * Sqvvx.f;
487 Sqvvy.f = Sch.f * Sqvvy.f;
488 Sqvvz.f = Sch.f * Sqvvz.f;
489
490 Sqvvy.f = __dadd_rn(Sqvvy.f, Ssh.f);
491 Sqvs.f = __dsub_rn(Sqvs.f, Stmp2.f);
492 Sqvvz.f = __dadd_rn(Sqvvz.f, Stmp1.f);
493 Sqvvx.f = __dsub_rn(Sqvvx.f, Stmp3.f);
494#endif
495 }
496
497 //###########################################################
498 // Normalize quaternion for matrix V
499 //###########################################################
500
501 Stmp2.f = Sqvs.f * Sqvs.f;
502 Stmp1.f = Sqvvx.f * Sqvvx.f;
503 Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
504 Stmp1.f = Sqvvy.f * Sqvvy.f;
505 Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
506 Stmp1.f = Sqvvz.f * Sqvvz.f;
507 Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
508
509 Stmp1.f = __drsqrt_rn(Stmp2.f);
510 Stmp4.f = Stmp1.f * 0.5f;
511 Stmp3.f = Stmp1.f * Stmp4.f;
512 Stmp3.f = Stmp1.f * Stmp3.f;
513 Stmp3.f = Stmp2.f * Stmp3.f;
514 Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
515 Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
516
517 Sqvs.f = Sqvs.f * Stmp1.f;
518 Sqvvx.f = Sqvvx.f * Stmp1.f;
519 Sqvvy.f = Sqvvy.f * Stmp1.f;
520 Sqvvz.f = Sqvvz.f * Stmp1.f;
521
522 //###########################################################
523 // Transform quaternion to matrix V
524 //###########################################################
525
526 Stmp1.f = Sqvvx.f * Sqvvx.f;
527 Stmp2.f = Sqvvy.f * Sqvvy.f;
528 Stmp3.f = Sqvvz.f * Sqvvz.f;
529 Sv11.f = Sqvs.f * Sqvs.f;
530 Sv22.f = __dsub_rn(Sv11.f, Stmp1.f);
531 Sv33.f = __dsub_rn(Sv22.f, Stmp2.f);
532 Sv33.f = __dadd_rn(Sv33.f, Stmp3.f);
533 Sv22.f = __dadd_rn(Sv22.f, Stmp2.f);
534 Sv22.f = __dsub_rn(Sv22.f, Stmp3.f);
535 Sv11.f = __dadd_rn(Sv11.f, Stmp1.f);
536 Sv11.f = __dsub_rn(Sv11.f, Stmp2.f);
537 Sv11.f = __dsub_rn(Sv11.f, Stmp3.f);
538 Stmp1.f = __dadd_rn(Sqvvx.f, Sqvvx.f);
539 Stmp2.f = __dadd_rn(Sqvvy.f, Sqvvy.f);
540 Stmp3.f = __dadd_rn(Sqvvz.f, Sqvvz.f);
541 Sv32.f = Sqvs.f * Stmp1.f;
542 Sv13.f = Sqvs.f * Stmp2.f;
543 Sv21.f = Sqvs.f * Stmp3.f;
544 Stmp1.f = Sqvvy.f * Stmp1.f;
545 Stmp2.f = Sqvvz.f * Stmp2.f;
546 Stmp3.f = Sqvvx.f * Stmp3.f;
547 Sv12.f = __dsub_rn(Stmp1.f, Sv21.f);
548 Sv23.f = __dsub_rn(Stmp2.f, Sv32.f);
549 Sv31.f = __dsub_rn(Stmp3.f, Sv13.f);
550 Sv21.f = __dadd_rn(Stmp1.f, Sv21.f);
551 Sv32.f = __dadd_rn(Stmp2.f, Sv32.f);
552 Sv13.f = __dadd_rn(Stmp3.f, Sv13.f);
553
555 // Multiply (from the right) with V
556 //###########################################################
557
558 Stmp2.f = Sa12.f;
559 Stmp3.f = Sa13.f;
560 Sa12.f = Sv12.f * Sa11.f;
561 Sa13.f = Sv13.f * Sa11.f;
562 Sa11.f = Sv11.f * Sa11.f;
563 Stmp1.f = Sv21.f * Stmp2.f;
564 Sa11.f = __dadd_rn(Sa11.f, Stmp1.f);
565 Stmp1.f = Sv31.f * Stmp3.f;
566 Sa11.f = __dadd_rn(Sa11.f, Stmp1.f);
567 Stmp1.f = Sv22.f * Stmp2.f;
568 Sa12.f = __dadd_rn(Sa12.f, Stmp1.f);
569 Stmp1.f = Sv32.f * Stmp3.f;
570 Sa12.f = __dadd_rn(Sa12.f, Stmp1.f);
571 Stmp1.f = Sv23.f * Stmp2.f;
572 Sa13.f = __dadd_rn(Sa13.f, Stmp1.f);
573 Stmp1.f = Sv33.f * Stmp3.f;
574 Sa13.f = __dadd_rn(Sa13.f, Stmp1.f);
575
576 Stmp2.f = Sa22.f;
577 Stmp3.f = Sa23.f;
578 Sa22.f = Sv12.f * Sa21.f;
579 Sa23.f = Sv13.f * Sa21.f;
580 Sa21.f = Sv11.f * Sa21.f;
581 Stmp1.f = Sv21.f * Stmp2.f;
582 Sa21.f = __dadd_rn(Sa21.f, Stmp1.f);
583 Stmp1.f = Sv31.f * Stmp3.f;
584 Sa21.f = __dadd_rn(Sa21.f, Stmp1.f);
585 Stmp1.f = Sv22.f * Stmp2.f;
586 Sa22.f = __dadd_rn(Sa22.f, Stmp1.f);
587 Stmp1.f = Sv32.f * Stmp3.f;
588 Sa22.f = __dadd_rn(Sa22.f, Stmp1.f);
589 Stmp1.f = Sv23.f * Stmp2.f;
590 Sa23.f = __dadd_rn(Sa23.f, Stmp1.f);
591 Stmp1.f = Sv33.f * Stmp3.f;
592 Sa23.f = __dadd_rn(Sa23.f, Stmp1.f);
593
594 Stmp2.f = Sa32.f;
595 Stmp3.f = Sa33.f;
596 Sa32.f = Sv12.f * Sa31.f;
597 Sa33.f = Sv13.f * Sa31.f;
598 Sa31.f = Sv11.f * Sa31.f;
599 Stmp1.f = Sv21.f * Stmp2.f;
600 Sa31.f = __dadd_rn(Sa31.f, Stmp1.f);
601 Stmp1.f = Sv31.f * Stmp3.f;
602 Sa31.f = __dadd_rn(Sa31.f, Stmp1.f);
603 Stmp1.f = Sv22.f * Stmp2.f;
604 Sa32.f = __dadd_rn(Sa32.f, Stmp1.f);
605 Stmp1.f = Sv32.f * Stmp3.f;
606 Sa32.f = __dadd_rn(Sa32.f, Stmp1.f);
607 Stmp1.f = Sv23.f * Stmp2.f;
608 Sa33.f = __dadd_rn(Sa33.f, Stmp1.f);
609 Stmp1.f = Sv33.f * Stmp3.f;
610 Sa33.f = __dadd_rn(Sa33.f, Stmp1.f);
611
612 //###########################################################
613 // Permute columns such that the singular values are sorted
614 //###########################################################
615
616 Stmp1.f = Sa11.f * Sa11.f;
617 Stmp4.f = Sa21.f * Sa21.f;
618 Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
619 Stmp4.f = Sa31.f * Sa31.f;
620 Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
621
622 Stmp2.f = Sa12.f * Sa12.f;
623 Stmp4.f = Sa22.f * Sa22.f;
624 Stmp2.f = __dadd_rn(Stmp2.f, Stmp4.f);
625 Stmp4.f = Sa32.f * Sa32.f;
626 Stmp2.f = __dadd_rn(Stmp2.f, Stmp4.f);
627
628 Stmp3.f = Sa13.f * Sa13.f;
629 Stmp4.f = Sa23.f * Sa23.f;
630 Stmp3.f = __dadd_rn(Stmp3.f, Stmp4.f);
631 Stmp4.f = Sa33.f * Sa33.f;
632 Stmp3.f = __dadd_rn(Stmp3.f, Stmp4.f);
633
634 // Swap columns 1-2 if necessary
635
636 Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
637 Stmp5.ui = Sa11.ui ^ Sa12.ui;
638 Stmp5.ui = Stmp5.ui & Stmp4.ui;
639 Sa11.ui = Sa11.ui ^ Stmp5.ui;
640 Sa12.ui = Sa12.ui ^ Stmp5.ui;
641
642 Stmp5.ui = Sa21.ui ^ Sa22.ui;
643 Stmp5.ui = Stmp5.ui & Stmp4.ui;
644 Sa21.ui = Sa21.ui ^ Stmp5.ui;
645 Sa22.ui = Sa22.ui ^ Stmp5.ui;
646
647 Stmp5.ui = Sa31.ui ^ Sa32.ui;
648 Stmp5.ui = Stmp5.ui & Stmp4.ui;
649 Sa31.ui = Sa31.ui ^ Stmp5.ui;
650 Sa32.ui = Sa32.ui ^ Stmp5.ui;
651
652 Stmp5.ui = Sv11.ui ^ Sv12.ui;
653 Stmp5.ui = Stmp5.ui & Stmp4.ui;
654 Sv11.ui = Sv11.ui ^ Stmp5.ui;
655 Sv12.ui = Sv12.ui ^ Stmp5.ui;
656
657 Stmp5.ui = Sv21.ui ^ Sv22.ui;
658 Stmp5.ui = Stmp5.ui & Stmp4.ui;
659 Sv21.ui = Sv21.ui ^ Stmp5.ui;
660 Sv22.ui = Sv22.ui ^ Stmp5.ui;
661
662 Stmp5.ui = Sv31.ui ^ Sv32.ui;
663 Stmp5.ui = Stmp5.ui & Stmp4.ui;
664 Sv31.ui = Sv31.ui ^ Stmp5.ui;
665 Sv32.ui = Sv32.ui ^ Stmp5.ui;
666
667 Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
668 Stmp5.ui = Stmp5.ui & Stmp4.ui;
669 Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
670 Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
671
672 // If columns 1-2 have been swapped, negate 2nd column of A and V so that V
673 // is still a rotation
674
675 Stmp5.f = -2.f;
676 Stmp5.ui = Stmp5.ui & Stmp4.ui;
677 Stmp4.f = 1.f;
678 Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
679
680 Sa12.f = Sa12.f * Stmp4.f;
681 Sa22.f = Sa22.f * Stmp4.f;
682 Sa32.f = Sa32.f * Stmp4.f;
683
684 Sv12.f = Sv12.f * Stmp4.f;
685 Sv22.f = Sv22.f * Stmp4.f;
686 Sv32.f = Sv32.f * Stmp4.f;
687
688 // Swap columns 1-3 if necessary
689
690 Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
691 Stmp5.ui = Sa11.ui ^ Sa13.ui;
692 Stmp5.ui = Stmp5.ui & Stmp4.ui;
693 Sa11.ui = Sa11.ui ^ Stmp5.ui;
694 Sa13.ui = Sa13.ui ^ Stmp5.ui;
695
696 Stmp5.ui = Sa21.ui ^ Sa23.ui;
697 Stmp5.ui = Stmp5.ui & Stmp4.ui;
698 Sa21.ui = Sa21.ui ^ Stmp5.ui;
699 Sa23.ui = Sa23.ui ^ Stmp5.ui;
700
701 Stmp5.ui = Sa31.ui ^ Sa33.ui;
702 Stmp5.ui = Stmp5.ui & Stmp4.ui;
703 Sa31.ui = Sa31.ui ^ Stmp5.ui;
704 Sa33.ui = Sa33.ui ^ Stmp5.ui;
705
706 Stmp5.ui = Sv11.ui ^ Sv13.ui;
707 Stmp5.ui = Stmp5.ui & Stmp4.ui;
708 Sv11.ui = Sv11.ui ^ Stmp5.ui;
709 Sv13.ui = Sv13.ui ^ Stmp5.ui;
710
711 Stmp5.ui = Sv21.ui ^ Sv23.ui;
712 Stmp5.ui = Stmp5.ui & Stmp4.ui;
713 Sv21.ui = Sv21.ui ^ Stmp5.ui;
714 Sv23.ui = Sv23.ui ^ Stmp5.ui;
715
716 Stmp5.ui = Sv31.ui ^ Sv33.ui;
717 Stmp5.ui = Stmp5.ui & Stmp4.ui;
718 Sv31.ui = Sv31.ui ^ Stmp5.ui;
719 Sv33.ui = Sv33.ui ^ Stmp5.ui;
720
721 Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
722 Stmp5.ui = Stmp5.ui & Stmp4.ui;
723 Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
724 Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
725
726 // If columns 1-3 have been swapped, negate 1st column of A and V so that V
727 // is still a rotation
728
729 Stmp5.f = -2.f;
730 Stmp5.ui = Stmp5.ui & Stmp4.ui;
731 Stmp4.f = 1.f;
732 Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
733
734 Sa11.f = Sa11.f * Stmp4.f;
735 Sa21.f = Sa21.f * Stmp4.f;
736 Sa31.f = Sa31.f * Stmp4.f;
737
738 Sv11.f = Sv11.f * Stmp4.f;
739 Sv21.f = Sv21.f * Stmp4.f;
740 Sv31.f = Sv31.f * Stmp4.f;
741
742 // Swap columns 2-3 if necessary
743
744 Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
745 Stmp5.ui = Sa12.ui ^ Sa13.ui;
746 Stmp5.ui = Stmp5.ui & Stmp4.ui;
747 Sa12.ui = Sa12.ui ^ Stmp5.ui;
748 Sa13.ui = Sa13.ui ^ Stmp5.ui;
749
750 Stmp5.ui = Sa22.ui ^ Sa23.ui;
751 Stmp5.ui = Stmp5.ui & Stmp4.ui;
752 Sa22.ui = Sa22.ui ^ Stmp5.ui;
753 Sa23.ui = Sa23.ui ^ Stmp5.ui;
754
755 Stmp5.ui = Sa32.ui ^ Sa33.ui;
756 Stmp5.ui = Stmp5.ui & Stmp4.ui;
757 Sa32.ui = Sa32.ui ^ Stmp5.ui;
758 Sa33.ui = Sa33.ui ^ Stmp5.ui;
759
760 Stmp5.ui = Sv12.ui ^ Sv13.ui;
761 Stmp5.ui = Stmp5.ui & Stmp4.ui;
762 Sv12.ui = Sv12.ui ^ Stmp5.ui;
763 Sv13.ui = Sv13.ui ^ Stmp5.ui;
764
765 Stmp5.ui = Sv22.ui ^ Sv23.ui;
766 Stmp5.ui = Stmp5.ui & Stmp4.ui;
767 Sv22.ui = Sv22.ui ^ Stmp5.ui;
768 Sv23.ui = Sv23.ui ^ Stmp5.ui;
769
770 Stmp5.ui = Sv32.ui ^ Sv33.ui;
771 Stmp5.ui = Stmp5.ui & Stmp4.ui;
772 Sv32.ui = Sv32.ui ^ Stmp5.ui;
773 Sv33.ui = Sv33.ui ^ Stmp5.ui;
774
775 Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
776 Stmp5.ui = Stmp5.ui & Stmp4.ui;
777 Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
778 Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
779
780 // If columns 2-3 have been swapped, negate 3rd column of A and V so that V
781 // is still a rotation
782
783 Stmp5.f = -2.f;
784 Stmp5.ui = Stmp5.ui & Stmp4.ui;
785 Stmp4.f = 1.f;
786 Stmp4.f = __dadd_rn(Stmp4.f, Stmp5.f);
787
788 Sa13.f = Sa13.f * Stmp4.f;
789 Sa23.f = Sa23.f * Stmp4.f;
790 Sa33.f = Sa33.f * Stmp4.f;
791
792 Sv13.f = Sv13.f * Stmp4.f;
793 Sv23.f = Sv23.f * Stmp4.f;
794 Sv33.f = Sv33.f * Stmp4.f;
795
796 //###########################################################
797 // Construct QR factorization of A*V (=U*D) using Givens rotations
798 //###########################################################
799
800 Su11.f = 1.f;
801 Su12.f = 0.f;
802 Su13.f = 0.f;
803 Su21.f = 0.f;
804 Su22.f = 1.f;
805 Su23.f = 0.f;
806 Su31.f = 0.f;
807 Su32.f = 0.f;
808 Su33.f = 1.f;
809
810 Ssh.f = Sa21.f * Sa21.f;
811 Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
812 Ssh.ui = Ssh.ui & Sa21.ui;
813
814 Stmp5.f = 0.f;
815 Sch.f = __dsub_rn(Stmp5.f, Sa11.f);
816 Sch.f = max(Sch.f, Sa11.f);
817 Sch.f = max(Sch.f, gsmall_number);
818 Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
819
820 Stmp1.f = Sch.f * Sch.f;
821 Stmp2.f = Ssh.f * Ssh.f;
822 Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
823 Stmp1.f = __drsqrt_rn(Stmp2.f);
824
825 Stmp4.f = Stmp1.f * 0.5f;
826 Stmp3.f = Stmp1.f * Stmp4.f;
827 Stmp3.f = Stmp1.f * Stmp3.f;
828 Stmp3.f = Stmp2.f * Stmp3.f;
829 Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
830 Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
831 Stmp1.f = Stmp1.f * Stmp2.f;
832
833 Sch.f = __dadd_rn(Sch.f, Stmp1.f);
834
835 Stmp1.ui = ~Stmp5.ui & Ssh.ui;
836 Stmp2.ui = ~Stmp5.ui & Sch.ui;
837 Sch.ui = Stmp5.ui & Sch.ui;
838 Ssh.ui = Stmp5.ui & Ssh.ui;
839 Sch.ui = Sch.ui | Stmp1.ui;
840 Ssh.ui = Ssh.ui | Stmp2.ui;
841
842 Stmp1.f = Sch.f * Sch.f;
843 Stmp2.f = Ssh.f * Ssh.f;
844 Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
845 Stmp1.f = __drsqrt_rn(Stmp2.f);
846
847 Stmp4.f = Stmp1.f * 0.5f;
848 Stmp3.f = Stmp1.f * Stmp4.f;
849 Stmp3.f = Stmp1.f * Stmp3.f;
850 Stmp3.f = Stmp2.f * Stmp3.f;
851 Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
852 Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
853
854 Sch.f = Sch.f * Stmp1.f;
855 Ssh.f = Ssh.f * Stmp1.f;
856
857 Sc.f = Sch.f * Sch.f;
858 Ss.f = Ssh.f * Ssh.f;
859 Sc.f = __dsub_rn(Sc.f, Ss.f);
860 Ss.f = Ssh.f * Sch.f;
861 Ss.f = __dadd_rn(Ss.f, Ss.f);
862
863 //###########################################################
864 // Rotate matrix A
865 //###########################################################
866
867 Stmp1.f = Ss.f * Sa11.f;
868 Stmp2.f = Ss.f * Sa21.f;
869 Sa11.f = Sc.f * Sa11.f;
870 Sa21.f = Sc.f * Sa21.f;
871 Sa11.f = __dadd_rn(Sa11.f, Stmp2.f);
872 Sa21.f = __dsub_rn(Sa21.f, Stmp1.f);
873
874 Stmp1.f = Ss.f * Sa12.f;
875 Stmp2.f = Ss.f * Sa22.f;
876 Sa12.f = Sc.f * Sa12.f;
877 Sa22.f = Sc.f * Sa22.f;
878 Sa12.f = __dadd_rn(Sa12.f, Stmp2.f);
879 Sa22.f = __dsub_rn(Sa22.f, Stmp1.f);
880
881 Stmp1.f = Ss.f * Sa13.f;
882 Stmp2.f = Ss.f * Sa23.f;
883 Sa13.f = Sc.f * Sa13.f;
884 Sa23.f = Sc.f * Sa23.f;
885 Sa13.f = __dadd_rn(Sa13.f, Stmp2.f);
886 Sa23.f = __dsub_rn(Sa23.f, Stmp1.f);
887
888 //###########################################################
889 // Update matrix U
890 //###########################################################
891
892 Stmp1.f = Ss.f * Su11.f;
893 Stmp2.f = Ss.f * Su12.f;
894 Su11.f = Sc.f * Su11.f;
895 Su12.f = Sc.f * Su12.f;
896 Su11.f = __dadd_rn(Su11.f, Stmp2.f);
897 Su12.f = __dsub_rn(Su12.f, Stmp1.f);
898
899 Stmp1.f = Ss.f * Su21.f;
900 Stmp2.f = Ss.f * Su22.f;
901 Su21.f = Sc.f * Su21.f;
902 Su22.f = Sc.f * Su22.f;
903 Su21.f = __dadd_rn(Su21.f, Stmp2.f);
904 Su22.f = __dsub_rn(Su22.f, Stmp1.f);
905
906 Stmp1.f = Ss.f * Su31.f;
907 Stmp2.f = Ss.f * Su32.f;
908 Su31.f = Sc.f * Su31.f;
909 Su32.f = Sc.f * Su32.f;
910 Su31.f = __dadd_rn(Su31.f, Stmp2.f);
911 Su32.f = __dsub_rn(Su32.f, Stmp1.f);
912
913 // Second Givens rotation
914
915 Ssh.f = Sa31.f * Sa31.f;
916 Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
917 Ssh.ui = Ssh.ui & Sa31.ui;
918
919 Stmp5.f = 0.f;
920 Sch.f = __dsub_rn(Stmp5.f, Sa11.f);
921 Sch.f = max(Sch.f, Sa11.f);
922 Sch.f = max(Sch.f, gsmall_number);
923 Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
924
925 Stmp1.f = Sch.f * Sch.f;
926 Stmp2.f = Ssh.f * Ssh.f;
927 Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
928 Stmp1.f = __drsqrt_rn(Stmp2.f);
929
930 Stmp4.f = Stmp1.f * 0.5;
931 Stmp3.f = Stmp1.f * Stmp4.f;
932 Stmp3.f = Stmp1.f * Stmp3.f;
933 Stmp3.f = Stmp2.f * Stmp3.f;
934 Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
935 Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
936 Stmp1.f = Stmp1.f * Stmp2.f;
937
938 Sch.f = __dadd_rn(Sch.f, Stmp1.f);
939
940 Stmp1.ui = ~Stmp5.ui & Ssh.ui;
941 Stmp2.ui = ~Stmp5.ui & Sch.ui;
942 Sch.ui = Stmp5.ui & Sch.ui;
943 Ssh.ui = Stmp5.ui & Ssh.ui;
944 Sch.ui = Sch.ui | Stmp1.ui;
945 Ssh.ui = Ssh.ui | Stmp2.ui;
946
947 Stmp1.f = Sch.f * Sch.f;
948 Stmp2.f = Ssh.f * Ssh.f;
949 Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
950 Stmp1.f = __drsqrt_rn(Stmp2.f);
951
952 Stmp4.f = Stmp1.f * 0.5f;
953 Stmp3.f = Stmp1.f * Stmp4.f;
954 Stmp3.f = Stmp1.f * Stmp3.f;
955 Stmp3.f = Stmp2.f * Stmp3.f;
956 Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
957 Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
958
959 Sch.f = Sch.f * Stmp1.f;
960 Ssh.f = Ssh.f * Stmp1.f;
961
962 Sc.f = Sch.f * Sch.f;
963 Ss.f = Ssh.f * Ssh.f;
964 Sc.f = __dsub_rn(Sc.f, Ss.f);
965 Ss.f = Ssh.f * Sch.f;
966 Ss.f = __dadd_rn(Ss.f, Ss.f);
967
968 //###########################################################
969 // Rotate matrix A
970 //###########################################################
971
972 Stmp1.f = Ss.f * Sa11.f;
973 Stmp2.f = Ss.f * Sa31.f;
974 Sa11.f = Sc.f * Sa11.f;
975 Sa31.f = Sc.f * Sa31.f;
976 Sa11.f = __dadd_rn(Sa11.f, Stmp2.f);
977 Sa31.f = __dsub_rn(Sa31.f, Stmp1.f);
978
979 Stmp1.f = Ss.f * Sa12.f;
980 Stmp2.f = Ss.f * Sa32.f;
981 Sa12.f = Sc.f * Sa12.f;
982 Sa32.f = Sc.f * Sa32.f;
983 Sa12.f = __dadd_rn(Sa12.f, Stmp2.f);
984 Sa32.f = __dsub_rn(Sa32.f, Stmp1.f);
985
986 Stmp1.f = Ss.f * Sa13.f;
987 Stmp2.f = Ss.f * Sa33.f;
988 Sa13.f = Sc.f * Sa13.f;
989 Sa33.f = Sc.f * Sa33.f;
990 Sa13.f = __dadd_rn(Sa13.f, Stmp2.f);
991 Sa33.f = __dsub_rn(Sa33.f, Stmp1.f);
992
993 //###########################################################
994 // Update matrix U
995 //###########################################################
996
997 Stmp1.f = Ss.f * Su11.f;
998 Stmp2.f = Ss.f * Su13.f;
999 Su11.f = Sc.f * Su11.f;
1000 Su13.f = Sc.f * Su13.f;
1001 Su11.f = __dadd_rn(Su11.f, Stmp2.f);
1002 Su13.f = __dsub_rn(Su13.f, Stmp1.f);
1003
1004 Stmp1.f = Ss.f * Su21.f;
1005 Stmp2.f = Ss.f * Su23.f;
1006 Su21.f = Sc.f * Su21.f;
1007 Su23.f = Sc.f * Su23.f;
1008 Su21.f = __dadd_rn(Su21.f, Stmp2.f);
1009 Su23.f = __dsub_rn(Su23.f, Stmp1.f);
1010
1011 Stmp1.f = Ss.f * Su31.f;
1012 Stmp2.f = Ss.f * Su33.f;
1013 Su31.f = Sc.f * Su31.f;
1014 Su33.f = Sc.f * Su33.f;
1015 Su31.f = __dadd_rn(Su31.f, Stmp2.f);
1016 Su33.f = __dsub_rn(Su33.f, Stmp1.f);
1017
1018 // Third Givens Rotation
1019
1020 Ssh.f = Sa32.f * Sa32.f;
1021 Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1022 Ssh.ui = Ssh.ui & Sa32.ui;
1023
1024 Stmp5.f = 0.f;
1025 Sch.f = __dsub_rn(Stmp5.f, Sa22.f);
1026 Sch.f = max(Sch.f, Sa22.f);
1027 Sch.f = max(Sch.f, gsmall_number);
1028 Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;
1029
1030 Stmp1.f = Sch.f * Sch.f;
1031 Stmp2.f = Ssh.f * Ssh.f;
1032 Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
1033 Stmp1.f = __drsqrt_rn(Stmp2.f);
1034
1035 Stmp4.f = Stmp1.f * 0.5f;
1036 Stmp3.f = Stmp1.f * Stmp4.f;
1037 Stmp3.f = Stmp1.f * Stmp3.f;
1038 Stmp3.f = Stmp2.f * Stmp3.f;
1039 Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
1040 Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
1041 Stmp1.f = Stmp1.f * Stmp2.f;
1042
1043 Sch.f = __dadd_rn(Sch.f, Stmp1.f);
1044
1045 Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1046 Stmp2.ui = ~Stmp5.ui & Sch.ui;
1047 Sch.ui = Stmp5.ui & Sch.ui;
1048 Ssh.ui = Stmp5.ui & Ssh.ui;
1049 Sch.ui = Sch.ui | Stmp1.ui;
1050 Ssh.ui = Ssh.ui | Stmp2.ui;
1051
1052 Stmp1.f = Sch.f * Sch.f;
1053 Stmp2.f = Ssh.f * Ssh.f;
1054 Stmp2.f = __dadd_rn(Stmp1.f, Stmp2.f);
1055 Stmp1.f = __drsqrt_rn(Stmp2.f);
1056
1057 Stmp4.f = Stmp1.f * 0.5f;
1058 Stmp3.f = Stmp1.f * Stmp4.f;
1059 Stmp3.f = Stmp1.f * Stmp3.f;
1060 Stmp3.f = Stmp2.f * Stmp3.f;
1061 Stmp1.f = __dadd_rn(Stmp1.f, Stmp4.f);
1062 Stmp1.f = __dsub_rn(Stmp1.f, Stmp3.f);
1063
1064 Sch.f = Sch.f * Stmp1.f;
1065 Ssh.f = Ssh.f * Stmp1.f;
1066
1067 Sc.f = Sch.f * Sch.f;
1068 Ss.f = Ssh.f * Ssh.f;
1069 Sc.f = __dsub_rn(Sc.f, Ss.f);
1070 Ss.f = Ssh.f * Sch.f;
1071 Ss.f = __dadd_rn(Ss.f, Ss.f);
1072
1073 //###########################################################
1074 // Rotate matrix A
1075 //###########################################################
1076
1077 Stmp1.f = Ss.f * Sa21.f;
1078 Stmp2.f = Ss.f * Sa31.f;
1079 Sa21.f = Sc.f * Sa21.f;
1080 Sa31.f = Sc.f * Sa31.f;
1081 Sa21.f = __dadd_rn(Sa21.f, Stmp2.f);
1082 Sa31.f = __dsub_rn(Sa31.f, Stmp1.f);
1083
1084 Stmp1.f = Ss.f * Sa22.f;
1085 Stmp2.f = Ss.f * Sa32.f;
1086 Sa22.f = Sc.f * Sa22.f;
1087 Sa32.f = Sc.f * Sa32.f;
1088 Sa22.f = __dadd_rn(Sa22.f, Stmp2.f);
1089 Sa32.f = __dsub_rn(Sa32.f, Stmp1.f);
1090
1091 Stmp1.f = Ss.f * Sa23.f;
1092 Stmp2.f = Ss.f * Sa33.f;
1093 Sa23.f = Sc.f * Sa23.f;
1094 Sa33.f = Sc.f * Sa33.f;
1095 Sa23.f = __dadd_rn(Sa23.f, Stmp2.f);
1096 Sa33.f = __dsub_rn(Sa33.f, Stmp1.f);
1097
1098 //###########################################################
1099 // Update matrix U
1100 //###########################################################
1101
1102 Stmp1.f = Ss.f * Su12.f;
1103 Stmp2.f = Ss.f * Su13.f;
1104 Su12.f = Sc.f * Su12.f;
1105 Su13.f = Sc.f * Su13.f;
1106 Su12.f = __dadd_rn(Su12.f, Stmp2.f);
1107 Su13.f = __dsub_rn(Su13.f, Stmp1.f);
1108
1109 Stmp1.f = Ss.f * Su22.f;
1110 Stmp2.f = Ss.f * Su23.f;
1111 Su22.f = Sc.f * Su22.f;
1112 Su23.f = Sc.f * Su23.f;
1113 Su22.f = __dadd_rn(Su22.f, Stmp2.f);
1114 Su23.f = __dsub_rn(Su23.f, Stmp1.f);
1115
1116 Stmp1.f = Ss.f * Su32.f;
1117 Stmp2.f = Ss.f * Su33.f;
1118 Su32.f = Sc.f * Su32.f;
1119 Su33.f = Sc.f * Su33.f;
1120 Su32.f = __dadd_rn(Su32.f, Stmp2.f);
1121 Su33.f = __dsub_rn(Su33.f, Stmp1.f);
1122
1123 V_3x3[0] = Sv11.f;
1124 V_3x3[1] = Sv12.f;
1125 V_3x3[2] = Sv13.f;
1126 V_3x3[3] = Sv21.f;
1127 V_3x3[4] = Sv22.f;
1128 V_3x3[5] = Sv23.f;
1129 V_3x3[6] = Sv31.f;
1130 V_3x3[7] = Sv32.f;
1131 V_3x3[8] = Sv33.f;
1132
1133 U_3x3[0] = Su11.f;
1134 U_3x3[1] = Su12.f;
1135 U_3x3[2] = Su13.f;
1136 U_3x3[3] = Su21.f;
1137 U_3x3[4] = Su22.f;
1138 U_3x3[5] = Su23.f;
1139 U_3x3[6] = Su31.f;
1140 U_3x3[7] = Su32.f;
1141 U_3x3[8] = Su33.f;
1142
1143 S_3x1[0] = Sa11.f;
1144 // s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
1145 S_3x1[1] = Sa22.f;
1146 // s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
1147 S_3x1[2] = Sa33.f;
1148}
1149
1150template <>
1152 float *U_3x3,
1153 float *S_3x1,
1154 float *V_3x3) {
1155 float gsmall_number = 1.e-12;
1156
1157 un<float> Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
1158 un<float> Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
1159 un<float> Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
1160 un<float> Sc, Ss, Sch, Ssh;
1161 un<float> Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
1162 un<float> Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
1163 un<float> Sqvs, Sqvvx, Sqvvy, Sqvvz;
1164
1165 Sa11.f = A_3x3[0];
1166 Sa12.f = A_3x3[1];
1167 Sa13.f = A_3x3[2];
1168 Sa21.f = A_3x3[3];
1169 Sa22.f = A_3x3[4];
1170 Sa23.f = A_3x3[5];
1171 Sa31.f = A_3x3[6];
1172 Sa32.f = A_3x3[7];
1173 Sa33.f = A_3x3[8];
1174
1175 //###########################################################
1176 // Compute normal equations matrix
1177 //###########################################################
1178
1179 Ss11.f = Sa11.f * Sa11.f;
1180 Stmp1.f = Sa21.f * Sa21.f;
1181 Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);
1182 Stmp1.f = Sa31.f * Sa31.f;
1183 Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);
1184
1185 Ss21.f = Sa12.f * Sa11.f;
1186 Stmp1.f = Sa22.f * Sa21.f;
1187 Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);
1188 Stmp1.f = Sa32.f * Sa31.f;
1189 Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);
1190
1191 Ss31.f = Sa13.f * Sa11.f;
1192 Stmp1.f = Sa23.f * Sa21.f;
1193 Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);
1194 Stmp1.f = Sa33.f * Sa31.f;
1195 Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);
1196
1197 Ss22.f = Sa12.f * Sa12.f;
1198 Stmp1.f = Sa22.f * Sa22.f;
1199 Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);
1200 Stmp1.f = Sa32.f * Sa32.f;
1201 Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);
1202
1203 Ss32.f = Sa13.f * Sa12.f;
1204 Stmp1.f = Sa23.f * Sa22.f;
1205 Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);
1206 Stmp1.f = Sa33.f * Sa32.f;
1207 Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);
1208
1209 Ss33.f = Sa13.f * Sa13.f;
1210 Stmp1.f = Sa23.f * Sa23.f;
1211 Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);
1212 Stmp1.f = Sa33.f * Sa33.f;
1213 Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);
1214
1215 Sqvs.f = 1.f;
1216 Sqvvx.f = 0.f;
1217 Sqvvy.f = 0.f;
1218 Sqvvz.f = 0.f;
1219
1220 //###########################################################
1221 // Solve symmetric eigenproblem using Jacobi iteration
1222 //###########################################################
1223 for (int i = 0; i < 4; i++) {
1224 Ssh.f = Ss21.f * 0.5f;
1225 Stmp5.f = __fsub_rn(Ss11.f, Ss22.f);
1226
1227 Stmp2.f = Ssh.f * Ssh.f;
1228 Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1229 Ssh.ui = Stmp1.ui & Ssh.ui;
1230 Sch.ui = Stmp1.ui & Stmp5.ui;
1231 Stmp2.ui = ~Stmp1.ui & gone;
1232 Sch.ui = Sch.ui | Stmp2.ui;
1233
1234 Stmp1.f = Ssh.f * Ssh.f;
1235 Stmp2.f = Sch.f * Sch.f;
1236 Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1237 Stmp4.f = __frsqrt_rn(Stmp3.f);
1238
1239 Ssh.f = Stmp4.f * Ssh.f;
1240 Sch.f = Stmp4.f * Sch.f;
1241 Stmp1.f = gfour_gamma_squared * Stmp1.f;
1242 Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1243
1244 Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1245 Ssh.ui = ~Stmp1.ui & Ssh.ui;
1246 Ssh.ui = Ssh.ui | Stmp2.ui;
1247 Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1248 Sch.ui = ~Stmp1.ui & Sch.ui;
1249 Sch.ui = Sch.ui | Stmp2.ui;
1250
1251 Stmp1.f = Ssh.f * Ssh.f;
1252 Stmp2.f = Sch.f * Sch.f;
1253 Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1254 Ss.f = Sch.f * Ssh.f;
1255 Ss.f = __fadd_rn(Ss.f, Ss.f);
1256
1257#ifdef DEBUG_JACOBI_CONJUGATE
1258 printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1259 Sch.f);
1260#endif
1261 //###########################################################
1262 // Perform the actual Givens conjugation
1263 //###########################################################
1264
1265 Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1266 Ss33.f = Ss33.f * Stmp3.f;
1267 Ss31.f = Ss31.f * Stmp3.f;
1268 Ss32.f = Ss32.f * Stmp3.f;
1269 Ss33.f = Ss33.f * Stmp3.f;
1270
1271 Stmp1.f = Ss.f * Ss31.f;
1272 Stmp2.f = Ss.f * Ss32.f;
1273 Ss31.f = Sc.f * Ss31.f;
1274 Ss32.f = Sc.f * Ss32.f;
1275 Ss31.f = __fadd_rn(Stmp2.f, Ss31.f);
1276 Ss32.f = __fsub_rn(Ss32.f, Stmp1.f);
1277
1278 Stmp2.f = Ss.f * Ss.f;
1279 Stmp1.f = Ss22.f * Stmp2.f;
1280 Stmp3.f = Ss11.f * Stmp2.f;
1281 Stmp4.f = Sc.f * Sc.f;
1282 Ss11.f = Ss11.f * Stmp4.f;
1283 Ss22.f = Ss22.f * Stmp4.f;
1284 Ss11.f = __fadd_rn(Ss11.f, Stmp1.f);
1285 Ss22.f = __fadd_rn(Ss22.f, Stmp3.f);
1286 Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1287 Stmp2.f = __fadd_rn(Ss21.f, Ss21.f);
1288 Ss21.f = Ss21.f * Stmp4.f;
1289 Stmp4.f = Sc.f * Ss.f;
1290 Stmp2.f = Stmp2.f * Stmp4.f;
1291 Stmp5.f = Stmp5.f * Stmp4.f;
1292 Ss11.f = __fadd_rn(Ss11.f, Stmp2.f);
1293 Ss21.f = __fsub_rn(Ss21.f, Stmp5.f);
1294 Ss22.f = __fsub_rn(Ss22.f, Stmp2.f);
1295
1296#ifdef DEBUG_JACOBI_CONJUGATE
1297 printf("%.20g\n", Ss11.f);
1298 printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1299 printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1300#endif
1301
1302 //###########################################################
1303 // Compute the cumulative rotation, in quaternion form
1304 //###########################################################
1305
1306 Stmp1.f = Ssh.f * Sqvvx.f;
1307 Stmp2.f = Ssh.f * Sqvvy.f;
1308 Stmp3.f = Ssh.f * Sqvvz.f;
1309 Ssh.f = Ssh.f * Sqvs.f;
1310
1311 Sqvs.f = Sch.f * Sqvs.f;
1312 Sqvvx.f = Sch.f * Sqvvx.f;
1313 Sqvvy.f = Sch.f * Sqvvy.f;
1314 Sqvvz.f = Sch.f * Sqvvz.f;
1315
1316 Sqvvz.f = __fadd_rn(Sqvvz.f, Ssh.f);
1317 Sqvs.f = __fsub_rn(Sqvs.f, Stmp3.f);
1318 Sqvvx.f = __fadd_rn(Sqvvx.f, Stmp2.f);
1319 Sqvvy.f = __fsub_rn(Sqvvy.f, Stmp1.f);
1320
1321#ifdef DEBUG_JACOBI_CONJUGATE
1322 printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
1323 Sqvs.f);
1324#endif
1325
1327 // (1->3)
1329 Ssh.f = Ss32.f * 0.5f;
1330 Stmp5.f = __fsub_rn(Ss22.f, Ss33.f);
1331
1332 Stmp2.f = Ssh.f * Ssh.f;
1333 Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1334 Ssh.ui = Stmp1.ui & Ssh.ui;
1335 Sch.ui = Stmp1.ui & Stmp5.ui;
1336 Stmp2.ui = ~Stmp1.ui & gone;
1337 Sch.ui = Sch.ui | Stmp2.ui;
1338
1339 Stmp1.f = Ssh.f * Ssh.f;
1340 Stmp2.f = Sch.f * Sch.f;
1341 Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1342 Stmp4.f = __frsqrt_rn(Stmp3.f);
1343
1344 Ssh.f = Stmp4.f * Ssh.f;
1345 Sch.f = Stmp4.f * Sch.f;
1346 Stmp1.f = gfour_gamma_squared * Stmp1.f;
1347 Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1348
1349 Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1350 Ssh.ui = ~Stmp1.ui & Ssh.ui;
1351 Ssh.ui = Ssh.ui | Stmp2.ui;
1352 Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1353 Sch.ui = ~Stmp1.ui & Sch.ui;
1354 Sch.ui = Sch.ui | Stmp2.ui;
1355
1356 Stmp1.f = Ssh.f * Ssh.f;
1357 Stmp2.f = Sch.f * Sch.f;
1358 Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1359 Ss.f = Sch.f * Ssh.f;
1360 Ss.f = __fadd_rn(Ss.f, Ss.f);
1361
1362#ifdef DEBUG_JACOBI_CONJUGATE
1363 printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1364 Sch.f);
1365#endif
1366
1367 //###########################################################
1368 // Perform the actual Givens conjugation
1369 //###########################################################
1370
1371 Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1372 Ss11.f = Ss11.f * Stmp3.f;
1373 Ss21.f = Ss21.f * Stmp3.f;
1374 Ss31.f = Ss31.f * Stmp3.f;
1375 Ss11.f = Ss11.f * Stmp3.f;
1376
1377 Stmp1.f = Ss.f * Ss21.f;
1378 Stmp2.f = Ss.f * Ss31.f;
1379 Ss21.f = Sc.f * Ss21.f;
1380 Ss31.f = Sc.f * Ss31.f;
1381 Ss21.f = __fadd_rn(Stmp2.f, Ss21.f);
1382 Ss31.f = __fsub_rn(Ss31.f, Stmp1.f);
1383
1384 Stmp2.f = Ss.f * Ss.f;
1385 Stmp1.f = Ss33.f * Stmp2.f;
1386 Stmp3.f = Ss22.f * Stmp2.f;
1387 Stmp4.f = Sc.f * Sc.f;
1388 Ss22.f = Ss22.f * Stmp4.f;
1389 Ss33.f = Ss33.f * Stmp4.f;
1390 Ss22.f = __fadd_rn(Ss22.f, Stmp1.f);
1391 Ss33.f = __fadd_rn(Ss33.f, Stmp3.f);
1392 Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1393 Stmp2.f = __fadd_rn(Ss32.f, Ss32.f);
1394 Ss32.f = Ss32.f * Stmp4.f;
1395 Stmp4.f = Sc.f * Ss.f;
1396 Stmp2.f = Stmp2.f * Stmp4.f;
1397 Stmp5.f = Stmp5.f * Stmp4.f;
1398 Ss22.f = __fadd_rn(Ss22.f, Stmp2.f);
1399 Ss32.f = __fsub_rn(Ss32.f, Stmp5.f);
1400 Ss33.f = __fsub_rn(Ss33.f, Stmp2.f);
1401
1402#ifdef DEBUG_JACOBI_CONJUGATE
1403 printf("%.20g\n", Ss11.f);
1404 printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1405 printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1406#endif
1407
1408 //###########################################################
1409 // Compute the cumulative rotation, in quaternion form
1410 //###########################################################
1411
1412 Stmp1.f = Ssh.f * Sqvvx.f;
1413 Stmp2.f = Ssh.f * Sqvvy.f;
1414 Stmp3.f = Ssh.f * Sqvvz.f;
1415 Ssh.f = Ssh.f * Sqvs.f;
1416
1417 Sqvs.f = Sch.f * Sqvs.f;
1418 Sqvvx.f = Sch.f * Sqvvx.f;
1419 Sqvvy.f = Sch.f * Sqvvy.f;
1420 Sqvvz.f = Sch.f * Sqvvz.f;
1421
1422 Sqvvx.f = __fadd_rn(Sqvvx.f, Ssh.f);
1423 Sqvs.f = __fsub_rn(Sqvs.f, Stmp1.f);
1424 Sqvvy.f = __fadd_rn(Sqvvy.f, Stmp3.f);
1425 Sqvvz.f = __fsub_rn(Sqvvz.f, Stmp2.f);
1426
1427#ifdef DEBUG_JACOBI_CONJUGATE
1428 printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f,
1429 Sqvs.f);
1430#endif
1431#if 1
1433 // 1 -> 2
1435
1436 Ssh.f = Ss31.f * 0.5f;
1437 Stmp5.f = __fsub_rn(Ss33.f, Ss11.f);
1438
1439 Stmp2.f = Ssh.f * Ssh.f;
1440 Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
1441 Ssh.ui = Stmp1.ui & Ssh.ui;
1442 Sch.ui = Stmp1.ui & Stmp5.ui;
1443 Stmp2.ui = ~Stmp1.ui & gone;
1444 Sch.ui = Sch.ui | Stmp2.ui;
1445
1446 Stmp1.f = Ssh.f * Ssh.f;
1447 Stmp2.f = Sch.f * Sch.f;
1448 Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1449 Stmp4.f = __frsqrt_rn(Stmp3.f);
1450
1451 Ssh.f = Stmp4.f * Ssh.f;
1452 Sch.f = Stmp4.f * Sch.f;
1453 Stmp1.f = gfour_gamma_squared * Stmp1.f;
1454 Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;
1455
1456 Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
1457 Ssh.ui = ~Stmp1.ui & Ssh.ui;
1458 Ssh.ui = Ssh.ui | Stmp2.ui;
1459 Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
1460 Sch.ui = ~Stmp1.ui & Sch.ui;
1461 Sch.ui = Sch.ui | Stmp2.ui;
1462
1463 Stmp1.f = Ssh.f * Ssh.f;
1464 Stmp2.f = Sch.f * Sch.f;
1465 Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
1466 Ss.f = Sch.f * Ssh.f;
1467 Ss.f = __fadd_rn(Ss.f, Ss.f);
1468
1469#ifdef DEBUG_JACOBI_CONJUGATE
1470 printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f,
1471 Sch.f);
1472#endif
1473
1474 //###########################################################
1475 // Perform the actual Givens conjugation
1476 //###########################################################
1477
1478 Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
1479 Ss22.f = Ss22.f * Stmp3.f;
1480 Ss32.f = Ss32.f * Stmp3.f;
1481 Ss21.f = Ss21.f * Stmp3.f;
1482 Ss22.f = Ss22.f * Stmp3.f;
1483
1484 Stmp1.f = Ss.f * Ss32.f;
1485 Stmp2.f = Ss.f * Ss21.f;
1486 Ss32.f = Sc.f * Ss32.f;
1487 Ss21.f = Sc.f * Ss21.f;
1488 Ss32.f = __fadd_rn(Stmp2.f, Ss32.f);
1489 Ss21.f = __fsub_rn(Ss21.f, Stmp1.f);
1490
1491 Stmp2.f = Ss.f * Ss.f;
1492 Stmp1.f = Ss11.f * Stmp2.f;
1493 Stmp3.f = Ss33.f * Stmp2.f;
1494 Stmp4.f = Sc.f * Sc.f;
1495 Ss33.f = Ss33.f * Stmp4.f;
1496 Ss11.f = Ss11.f * Stmp4.f;
1497 Ss33.f = __fadd_rn(Ss33.f, Stmp1.f);
1498 Ss11.f = __fadd_rn(Ss11.f, Stmp3.f);
1499 Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
1500 Stmp2.f = __fadd_rn(Ss31.f, Ss31.f);
1501 Ss31.f = Ss31.f * Stmp4.f;
1502 Stmp4.f = Sc.f * Ss.f;
1503 Stmp2.f = Stmp2.f * Stmp4.f;
1504 Stmp5.f = Stmp5.f * Stmp4.f;
1505 Ss33.f = __fadd_rn(Ss33.f, Stmp2.f);
1506 Ss31.f = __fsub_rn(Ss31.f, Stmp5.f);
1507 Ss11.f = __fsub_rn(Ss11.f, Stmp2.f);
1508
1509#ifdef DEBUG_JACOBI_CONJUGATE
1510 printf("%.20g\n", Ss11.f);
1511 printf("%.20g %.20g\n", Ss21.f, Ss22.f);
1512 printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
1513#endif
1514
1515 //###########################################################
1516 // Compute the cumulative rotation, in quaternion form
1517 //###########################################################
1518
1519 Stmp1.f = Ssh.f * Sqvvx.f;
1520 Stmp2.f = Ssh.f * Sqvvy.f;
1521 Stmp3.f = Ssh.f * Sqvvz.f;
1522 Ssh.f = Ssh.f * Sqvs.f;
1523
1524 Sqvs.f = Sch.f * Sqvs.f;
1525 Sqvvx.f = Sch.f * Sqvvx.f;
1526 Sqvvy.f = Sch.f * Sqvvy.f;
1527 Sqvvz.f = Sch.f * Sqvvz.f;
1528
1529 Sqvvy.f = __fadd_rn(Sqvvy.f, Ssh.f);
1530 Sqvs.f = __fsub_rn(Sqvs.f, Stmp2.f);
1531 Sqvvz.f = __fadd_rn(Sqvvz.f, Stmp1.f);
1532 Sqvvx.f = __fsub_rn(Sqvvx.f, Stmp3.f);
1533#endif
1534 }
1535
1536 //###########################################################
1537 // Normalize quaternion for matrix V
1538 //###########################################################
1539
1540 Stmp2.f = Sqvs.f * Sqvs.f;
1541 Stmp1.f = Sqvvx.f * Sqvvx.f;
1542 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1543 Stmp1.f = Sqvvy.f * Sqvvy.f;
1544 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1545 Stmp1.f = Sqvvz.f * Sqvvz.f;
1546 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1547
1548 Stmp1.f = __frsqrt_rn(Stmp2.f);
1549 Stmp4.f = Stmp1.f * 0.5f;
1550 Stmp3.f = Stmp1.f * Stmp4.f;
1551 Stmp3.f = Stmp1.f * Stmp3.f;
1552 Stmp3.f = Stmp2.f * Stmp3.f;
1553 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1554 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1555
1556 Sqvs.f = Sqvs.f * Stmp1.f;
1557 Sqvvx.f = Sqvvx.f * Stmp1.f;
1558 Sqvvy.f = Sqvvy.f * Stmp1.f;
1559 Sqvvz.f = Sqvvz.f * Stmp1.f;
1560
1561 //###########################################################
1562 // Transform quaternion to matrix V
1563 //###########################################################
1564
1565 Stmp1.f = Sqvvx.f * Sqvvx.f;
1566 Stmp2.f = Sqvvy.f * Sqvvy.f;
1567 Stmp3.f = Sqvvz.f * Sqvvz.f;
1568 Sv11.f = Sqvs.f * Sqvs.f;
1569 Sv22.f = __fsub_rn(Sv11.f, Stmp1.f);
1570 Sv33.f = __fsub_rn(Sv22.f, Stmp2.f);
1571 Sv33.f = __fadd_rn(Sv33.f, Stmp3.f);
1572 Sv22.f = __fadd_rn(Sv22.f, Stmp2.f);
1573 Sv22.f = __fsub_rn(Sv22.f, Stmp3.f);
1574 Sv11.f = __fadd_rn(Sv11.f, Stmp1.f);
1575 Sv11.f = __fsub_rn(Sv11.f, Stmp2.f);
1576 Sv11.f = __fsub_rn(Sv11.f, Stmp3.f);
1577 Stmp1.f = __fadd_rn(Sqvvx.f, Sqvvx.f);
1578 Stmp2.f = __fadd_rn(Sqvvy.f, Sqvvy.f);
1579 Stmp3.f = __fadd_rn(Sqvvz.f, Sqvvz.f);
1580 Sv32.f = Sqvs.f * Stmp1.f;
1581 Sv13.f = Sqvs.f * Stmp2.f;
1582 Sv21.f = Sqvs.f * Stmp3.f;
1583 Stmp1.f = Sqvvy.f * Stmp1.f;
1584 Stmp2.f = Sqvvz.f * Stmp2.f;
1585 Stmp3.f = Sqvvx.f * Stmp3.f;
1586 Sv12.f = __fsub_rn(Stmp1.f, Sv21.f);
1587 Sv23.f = __fsub_rn(Stmp2.f, Sv32.f);
1588 Sv31.f = __fsub_rn(Stmp3.f, Sv13.f);
1589 Sv21.f = __fadd_rn(Stmp1.f, Sv21.f);
1590 Sv32.f = __fadd_rn(Stmp2.f, Sv32.f);
1591 Sv13.f = __fadd_rn(Stmp3.f, Sv13.f);
1592
1594 // Multiply (from the right) with V
1595 //###########################################################
1596
1597 Stmp2.f = Sa12.f;
1598 Stmp3.f = Sa13.f;
1599 Sa12.f = Sv12.f * Sa11.f;
1600 Sa13.f = Sv13.f * Sa11.f;
1601 Sa11.f = Sv11.f * Sa11.f;
1602 Stmp1.f = Sv21.f * Stmp2.f;
1603 Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
1604 Stmp1.f = Sv31.f * Stmp3.f;
1605 Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
1606 Stmp1.f = Sv22.f * Stmp2.f;
1607 Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
1608 Stmp1.f = Sv32.f * Stmp3.f;
1609 Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
1610 Stmp1.f = Sv23.f * Stmp2.f;
1611 Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
1612 Stmp1.f = Sv33.f * Stmp3.f;
1613 Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
1614
1615 Stmp2.f = Sa22.f;
1616 Stmp3.f = Sa23.f;
1617 Sa22.f = Sv12.f * Sa21.f;
1618 Sa23.f = Sv13.f * Sa21.f;
1619 Sa21.f = Sv11.f * Sa21.f;
1620 Stmp1.f = Sv21.f * Stmp2.f;
1621 Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
1622 Stmp1.f = Sv31.f * Stmp3.f;
1623 Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
1624 Stmp1.f = Sv22.f * Stmp2.f;
1625 Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
1626 Stmp1.f = Sv32.f * Stmp3.f;
1627 Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
1628 Stmp1.f = Sv23.f * Stmp2.f;
1629 Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
1630 Stmp1.f = Sv33.f * Stmp3.f;
1631 Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
1632
1633 Stmp2.f = Sa32.f;
1634 Stmp3.f = Sa33.f;
1635 Sa32.f = Sv12.f * Sa31.f;
1636 Sa33.f = Sv13.f * Sa31.f;
1637 Sa31.f = Sv11.f * Sa31.f;
1638 Stmp1.f = Sv21.f * Stmp2.f;
1639 Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
1640 Stmp1.f = Sv31.f * Stmp3.f;
1641 Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
1642 Stmp1.f = Sv22.f * Stmp2.f;
1643 Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
1644 Stmp1.f = Sv32.f * Stmp3.f;
1645 Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
1646 Stmp1.f = Sv23.f * Stmp2.f;
1647 Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
1648 Stmp1.f = Sv33.f * Stmp3.f;
1649 Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
1650
1651 //###########################################################
1652 // Permute columns such that the singular values are sorted
1653 //###########################################################
1654
1655 Stmp1.f = Sa11.f * Sa11.f;
1656 Stmp4.f = Sa21.f * Sa21.f;
1657 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1658 Stmp4.f = Sa31.f * Sa31.f;
1659 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1660
1661 Stmp2.f = Sa12.f * Sa12.f;
1662 Stmp4.f = Sa22.f * Sa22.f;
1663 Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
1664 Stmp4.f = Sa32.f * Sa32.f;
1665 Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
1666
1667 Stmp3.f = Sa13.f * Sa13.f;
1668 Stmp4.f = Sa23.f * Sa23.f;
1669 Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
1670 Stmp4.f = Sa33.f * Sa33.f;
1671 Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
1672
1673 // Swap columns 1-2 if necessary
1674
1675 Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
1676 Stmp5.ui = Sa11.ui ^ Sa12.ui;
1677 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1678 Sa11.ui = Sa11.ui ^ Stmp5.ui;
1679 Sa12.ui = Sa12.ui ^ Stmp5.ui;
1680
1681 Stmp5.ui = Sa21.ui ^ Sa22.ui;
1682 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1683 Sa21.ui = Sa21.ui ^ Stmp5.ui;
1684 Sa22.ui = Sa22.ui ^ Stmp5.ui;
1685
1686 Stmp5.ui = Sa31.ui ^ Sa32.ui;
1687 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1688 Sa31.ui = Sa31.ui ^ Stmp5.ui;
1689 Sa32.ui = Sa32.ui ^ Stmp5.ui;
1690
1691 Stmp5.ui = Sv11.ui ^ Sv12.ui;
1692 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1693 Sv11.ui = Sv11.ui ^ Stmp5.ui;
1694 Sv12.ui = Sv12.ui ^ Stmp5.ui;
1695
1696 Stmp5.ui = Sv21.ui ^ Sv22.ui;
1697 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1698 Sv21.ui = Sv21.ui ^ Stmp5.ui;
1699 Sv22.ui = Sv22.ui ^ Stmp5.ui;
1700
1701 Stmp5.ui = Sv31.ui ^ Sv32.ui;
1702 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1703 Sv31.ui = Sv31.ui ^ Stmp5.ui;
1704 Sv32.ui = Sv32.ui ^ Stmp5.ui;
1705
1706 Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
1707 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1708 Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
1709 Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
1710
1711 // If columns 1-2 have been swapped, negate 2nd column of A and V so that V
1712 // is still a rotation
1713
1714 Stmp5.f = -2.f;
1715 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1716 Stmp4.f = 1.f;
1717 Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1718
1719 Sa12.f = Sa12.f * Stmp4.f;
1720 Sa22.f = Sa22.f * Stmp4.f;
1721 Sa32.f = Sa32.f * Stmp4.f;
1722
1723 Sv12.f = Sv12.f * Stmp4.f;
1724 Sv22.f = Sv22.f * Stmp4.f;
1725 Sv32.f = Sv32.f * Stmp4.f;
1726
1727 // Swap columns 1-3 if necessary
1728
1729 Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
1730 Stmp5.ui = Sa11.ui ^ Sa13.ui;
1731 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1732 Sa11.ui = Sa11.ui ^ Stmp5.ui;
1733 Sa13.ui = Sa13.ui ^ Stmp5.ui;
1734
1735 Stmp5.ui = Sa21.ui ^ Sa23.ui;
1736 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1737 Sa21.ui = Sa21.ui ^ Stmp5.ui;
1738 Sa23.ui = Sa23.ui ^ Stmp5.ui;
1739
1740 Stmp5.ui = Sa31.ui ^ Sa33.ui;
1741 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1742 Sa31.ui = Sa31.ui ^ Stmp5.ui;
1743 Sa33.ui = Sa33.ui ^ Stmp5.ui;
1744
1745 Stmp5.ui = Sv11.ui ^ Sv13.ui;
1746 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1747 Sv11.ui = Sv11.ui ^ Stmp5.ui;
1748 Sv13.ui = Sv13.ui ^ Stmp5.ui;
1749
1750 Stmp5.ui = Sv21.ui ^ Sv23.ui;
1751 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1752 Sv21.ui = Sv21.ui ^ Stmp5.ui;
1753 Sv23.ui = Sv23.ui ^ Stmp5.ui;
1754
1755 Stmp5.ui = Sv31.ui ^ Sv33.ui;
1756 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1757 Sv31.ui = Sv31.ui ^ Stmp5.ui;
1758 Sv33.ui = Sv33.ui ^ Stmp5.ui;
1759
1760 Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
1761 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1762 Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
1763 Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
1764
1765 // If columns 1-3 have been swapped, negate 1st column of A and V so that V
1766 // is still a rotation
1767
1768 Stmp5.f = -2.f;
1769 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1770 Stmp4.f = 1.f;
1771 Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1772
1773 Sa11.f = Sa11.f * Stmp4.f;
1774 Sa21.f = Sa21.f * Stmp4.f;
1775 Sa31.f = Sa31.f * Stmp4.f;
1776
1777 Sv11.f = Sv11.f * Stmp4.f;
1778 Sv21.f = Sv21.f * Stmp4.f;
1779 Sv31.f = Sv31.f * Stmp4.f;
1780
1781 // Swap columns 2-3 if necessary
1782
1783 Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
1784 Stmp5.ui = Sa12.ui ^ Sa13.ui;
1785 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1786 Sa12.ui = Sa12.ui ^ Stmp5.ui;
1787 Sa13.ui = Sa13.ui ^ Stmp5.ui;
1788
1789 Stmp5.ui = Sa22.ui ^ Sa23.ui;
1790 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1791 Sa22.ui = Sa22.ui ^ Stmp5.ui;
1792 Sa23.ui = Sa23.ui ^ Stmp5.ui;
1793
1794 Stmp5.ui = Sa32.ui ^ Sa33.ui;
1795 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1796 Sa32.ui = Sa32.ui ^ Stmp5.ui;
1797 Sa33.ui = Sa33.ui ^ Stmp5.ui;
1798
1799 Stmp5.ui = Sv12.ui ^ Sv13.ui;
1800 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1801 Sv12.ui = Sv12.ui ^ Stmp5.ui;
1802 Sv13.ui = Sv13.ui ^ Stmp5.ui;
1803
1804 Stmp5.ui = Sv22.ui ^ Sv23.ui;
1805 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1806 Sv22.ui = Sv22.ui ^ Stmp5.ui;
1807 Sv23.ui = Sv23.ui ^ Stmp5.ui;
1808
1809 Stmp5.ui = Sv32.ui ^ Sv33.ui;
1810 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1811 Sv32.ui = Sv32.ui ^ Stmp5.ui;
1812 Sv33.ui = Sv33.ui ^ Stmp5.ui;
1813
1814 Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
1815 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1816 Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
1817 Stmp3.ui = Stmp3.ui ^ Stmp5.ui;
1818
1819 // If columns 2-3 have been swapped, negate 3rd column of A and V so that V
1820 // is still a rotation
1821
1822 Stmp5.f = -2.f;
1823 Stmp5.ui = Stmp5.ui & Stmp4.ui;
1824 Stmp4.f = 1.f;
1825 Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);
1826
1827 Sa13.f = Sa13.f * Stmp4.f;
1828 Sa23.f = Sa23.f * Stmp4.f;
1829 Sa33.f = Sa33.f * Stmp4.f;
1830
1831 Sv13.f = Sv13.f * Stmp4.f;
1832 Sv23.f = Sv23.f * Stmp4.f;
1833 Sv33.f = Sv33.f * Stmp4.f;
1834
1835 //###########################################################
1836 // Construct QR factorization of A*V (=U*D) using Givens rotations
1837 //###########################################################
1838
1839 Su11.f = 1.f;
1840 Su12.f = 0.f;
1841 Su13.f = 0.f;
1842 Su21.f = 0.f;
1843 Su22.f = 1.f;
1844 Su23.f = 0.f;
1845 Su31.f = 0.f;
1846 Su32.f = 0.f;
1847 Su33.f = 1.f;
1848
1849 Ssh.f = Sa21.f * Sa21.f;
1850 Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1851 Ssh.ui = Ssh.ui & Sa21.ui;
1852
1853 Stmp5.f = 0.f;
1854 Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
1855 Sch.f = max(Sch.f, Sa11.f);
1856 Sch.f = max(Sch.f, gsmall_number);
1857 Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
1858
1859 Stmp1.f = Sch.f * Sch.f;
1860 Stmp2.f = Ssh.f * Ssh.f;
1861 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1862 Stmp1.f = __frsqrt_rn(Stmp2.f);
1863
1864 Stmp4.f = Stmp1.f * 0.5f;
1865 Stmp3.f = Stmp1.f * Stmp4.f;
1866 Stmp3.f = Stmp1.f * Stmp3.f;
1867 Stmp3.f = Stmp2.f * Stmp3.f;
1868 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1869 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1870 Stmp1.f = Stmp1.f * Stmp2.f;
1871
1872 Sch.f = __fadd_rn(Sch.f, Stmp1.f);
1873
1874 Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1875 Stmp2.ui = ~Stmp5.ui & Sch.ui;
1876 Sch.ui = Stmp5.ui & Sch.ui;
1877 Ssh.ui = Stmp5.ui & Ssh.ui;
1878 Sch.ui = Sch.ui | Stmp1.ui;
1879 Ssh.ui = Ssh.ui | Stmp2.ui;
1880
1881 Stmp1.f = Sch.f * Sch.f;
1882 Stmp2.f = Ssh.f * Ssh.f;
1883 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1884 Stmp1.f = __frsqrt_rn(Stmp2.f);
1885
1886 Stmp4.f = Stmp1.f * 0.5f;
1887 Stmp3.f = Stmp1.f * Stmp4.f;
1888 Stmp3.f = Stmp1.f * Stmp3.f;
1889 Stmp3.f = Stmp2.f * Stmp3.f;
1890 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1891 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1892
1893 Sch.f = Sch.f * Stmp1.f;
1894 Ssh.f = Ssh.f * Stmp1.f;
1895
1896 Sc.f = Sch.f * Sch.f;
1897 Ss.f = Ssh.f * Ssh.f;
1898 Sc.f = __fsub_rn(Sc.f, Ss.f);
1899 Ss.f = Ssh.f * Sch.f;
1900 Ss.f = __fadd_rn(Ss.f, Ss.f);
1901
1902 //###########################################################
1903 // Rotate matrix A
1904 //###########################################################
1905
1906 Stmp1.f = Ss.f * Sa11.f;
1907 Stmp2.f = Ss.f * Sa21.f;
1908 Sa11.f = Sc.f * Sa11.f;
1909 Sa21.f = Sc.f * Sa21.f;
1910 Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
1911 Sa21.f = __fsub_rn(Sa21.f, Stmp1.f);
1912
1913 Stmp1.f = Ss.f * Sa12.f;
1914 Stmp2.f = Ss.f * Sa22.f;
1915 Sa12.f = Sc.f * Sa12.f;
1916 Sa22.f = Sc.f * Sa22.f;
1917 Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
1918 Sa22.f = __fsub_rn(Sa22.f, Stmp1.f);
1919
1920 Stmp1.f = Ss.f * Sa13.f;
1921 Stmp2.f = Ss.f * Sa23.f;
1922 Sa13.f = Sc.f * Sa13.f;
1923 Sa23.f = Sc.f * Sa23.f;
1924 Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
1925 Sa23.f = __fsub_rn(Sa23.f, Stmp1.f);
1926
1927 //###########################################################
1928 // Update matrix U
1929 //###########################################################
1930
1931 Stmp1.f = Ss.f * Su11.f;
1932 Stmp2.f = Ss.f * Su12.f;
1933 Su11.f = Sc.f * Su11.f;
1934 Su12.f = Sc.f * Su12.f;
1935 Su11.f = __fadd_rn(Su11.f, Stmp2.f);
1936 Su12.f = __fsub_rn(Su12.f, Stmp1.f);
1937
1938 Stmp1.f = Ss.f * Su21.f;
1939 Stmp2.f = Ss.f * Su22.f;
1940 Su21.f = Sc.f * Su21.f;
1941 Su22.f = Sc.f * Su22.f;
1942 Su21.f = __fadd_rn(Su21.f, Stmp2.f);
1943 Su22.f = __fsub_rn(Su22.f, Stmp1.f);
1944
1945 Stmp1.f = Ss.f * Su31.f;
1946 Stmp2.f = Ss.f * Su32.f;
1947 Su31.f = Sc.f * Su31.f;
1948 Su32.f = Sc.f * Su32.f;
1949 Su31.f = __fadd_rn(Su31.f, Stmp2.f);
1950 Su32.f = __fsub_rn(Su32.f, Stmp1.f);
1951
1952 // Second Givens rotation
1953
1954 Ssh.f = Sa31.f * Sa31.f;
1955 Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
1956 Ssh.ui = Ssh.ui & Sa31.ui;
1957
1958 Stmp5.f = 0.f;
1959 Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
1960 Sch.f = max(Sch.f, Sa11.f);
1961 Sch.f = max(Sch.f, gsmall_number);
1962 Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;
1963
1964 Stmp1.f = Sch.f * Sch.f;
1965 Stmp2.f = Ssh.f * Ssh.f;
1966 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1967 Stmp1.f = __frsqrt_rn(Stmp2.f);
1968
1969 Stmp4.f = Stmp1.f * 0.5;
1970 Stmp3.f = Stmp1.f * Stmp4.f;
1971 Stmp3.f = Stmp1.f * Stmp3.f;
1972 Stmp3.f = Stmp2.f * Stmp3.f;
1973 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1974 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1975 Stmp1.f = Stmp1.f * Stmp2.f;
1976
1977 Sch.f = __fadd_rn(Sch.f, Stmp1.f);
1978
1979 Stmp1.ui = ~Stmp5.ui & Ssh.ui;
1980 Stmp2.ui = ~Stmp5.ui & Sch.ui;
1981 Sch.ui = Stmp5.ui & Sch.ui;
1982 Ssh.ui = Stmp5.ui & Ssh.ui;
1983 Sch.ui = Sch.ui | Stmp1.ui;
1984 Ssh.ui = Ssh.ui | Stmp2.ui;
1985
1986 Stmp1.f = Sch.f * Sch.f;
1987 Stmp2.f = Ssh.f * Ssh.f;
1988 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
1989 Stmp1.f = __frsqrt_rn(Stmp2.f);
1990
1991 Stmp4.f = Stmp1.f * 0.5f;
1992 Stmp3.f = Stmp1.f * Stmp4.f;
1993 Stmp3.f = Stmp1.f * Stmp3.f;
1994 Stmp3.f = Stmp2.f * Stmp3.f;
1995 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
1996 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
1997
1998 Sch.f = Sch.f * Stmp1.f;
1999 Ssh.f = Ssh.f * Stmp1.f;
2000
2001 Sc.f = Sch.f * Sch.f;
2002 Ss.f = Ssh.f * Ssh.f;
2003 Sc.f = __fsub_rn(Sc.f, Ss.f);
2004 Ss.f = Ssh.f * Sch.f;
2005 Ss.f = __fadd_rn(Ss.f, Ss.f);
2006
2007 //###########################################################
2008 // Rotate matrix A
2009 //###########################################################
2010
2011 Stmp1.f = Ss.f * Sa11.f;
2012 Stmp2.f = Ss.f * Sa31.f;
2013 Sa11.f = Sc.f * Sa11.f;
2014 Sa31.f = Sc.f * Sa31.f;
2015 Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
2016 Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);
2017
2018 Stmp1.f = Ss.f * Sa12.f;
2019 Stmp2.f = Ss.f * Sa32.f;
2020 Sa12.f = Sc.f * Sa12.f;
2021 Sa32.f = Sc.f * Sa32.f;
2022 Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
2023 Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);
2024
2025 Stmp1.f = Ss.f * Sa13.f;
2026 Stmp2.f = Ss.f * Sa33.f;
2027 Sa13.f = Sc.f * Sa13.f;
2028 Sa33.f = Sc.f * Sa33.f;
2029 Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
2030 Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);
2031
2032 //###########################################################
2033 // Update matrix U
2034 //###########################################################
2035
2036 Stmp1.f = Ss.f * Su11.f;
2037 Stmp2.f = Ss.f * Su13.f;
2038 Su11.f = Sc.f * Su11.f;
2039 Su13.f = Sc.f * Su13.f;
2040 Su11.f = __fadd_rn(Su11.f, Stmp2.f);
2041 Su13.f = __fsub_rn(Su13.f, Stmp1.f);
2042
2043 Stmp1.f = Ss.f * Su21.f;
2044 Stmp2.f = Ss.f * Su23.f;
2045 Su21.f = Sc.f * Su21.f;
2046 Su23.f = Sc.f * Su23.f;
2047 Su21.f = __fadd_rn(Su21.f, Stmp2.f);
2048 Su23.f = __fsub_rn(Su23.f, Stmp1.f);
2049
2050 Stmp1.f = Ss.f * Su31.f;
2051 Stmp2.f = Ss.f * Su33.f;
2052 Su31.f = Sc.f * Su31.f;
2053 Su33.f = Sc.f * Su33.f;
2054 Su31.f = __fadd_rn(Su31.f, Stmp2.f);
2055 Su33.f = __fsub_rn(Su33.f, Stmp1.f);
2056
2057 // Third Givens Rotation
2058
2059 Ssh.f = Sa32.f * Sa32.f;
2060 Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
2061 Ssh.ui = Ssh.ui & Sa32.ui;
2062
2063 Stmp5.f = 0.f;
2064 Sch.f = __fsub_rn(Stmp5.f, Sa22.f);
2065 Sch.f = max(Sch.f, Sa22.f);
2066 Sch.f = max(Sch.f, gsmall_number);
2067 Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;
2068
2069 Stmp1.f = Sch.f * Sch.f;
2070 Stmp2.f = Ssh.f * Ssh.f;
2071 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
2072 Stmp1.f = __frsqrt_rn(Stmp2.f);
2073
2074 Stmp4.f = Stmp1.f * 0.5f;
2075 Stmp3.f = Stmp1.f * Stmp4.f;
2076 Stmp3.f = Stmp1.f * Stmp3.f;
2077 Stmp3.f = Stmp2.f * Stmp3.f;
2078 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
2079 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
2080 Stmp1.f = Stmp1.f * Stmp2.f;
2081
2082 Sch.f = __fadd_rn(Sch.f, Stmp1.f);
2083
2084 Stmp1.ui = ~Stmp5.ui & Ssh.ui;
2085 Stmp2.ui = ~Stmp5.ui & Sch.ui;
2086 Sch.ui = Stmp5.ui & Sch.ui;
2087 Ssh.ui = Stmp5.ui & Ssh.ui;
2088 Sch.ui = Sch.ui | Stmp1.ui;
2089 Ssh.ui = Ssh.ui | Stmp2.ui;
2090
2091 Stmp1.f = Sch.f * Sch.f;
2092 Stmp2.f = Ssh.f * Ssh.f;
2093 Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
2094 Stmp1.f = __frsqrt_rn(Stmp2.f);
2095
2096 Stmp4.f = Stmp1.f * 0.5f;
2097 Stmp3.f = Stmp1.f * Stmp4.f;
2098 Stmp3.f = Stmp1.f * Stmp3.f;
2099 Stmp3.f = Stmp2.f * Stmp3.f;
2100 Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
2101 Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
2102
2103 Sch.f = Sch.f * Stmp1.f;
2104 Ssh.f = Ssh.f * Stmp1.f;
2105
2106 Sc.f = Sch.f * Sch.f;
2107 Ss.f = Ssh.f * Ssh.f;
2108 Sc.f = __fsub_rn(Sc.f, Ss.f);
2109 Ss.f = Ssh.f * Sch.f;
2110 Ss.f = __fadd_rn(Ss.f, Ss.f);
2111
2112 //###########################################################
2113 // Rotate matrix A
2114 //###########################################################
2115
2116 Stmp1.f = Ss.f * Sa21.f;
2117 Stmp2.f = Ss.f * Sa31.f;
2118 Sa21.f = Sc.f * Sa21.f;
2119 Sa31.f = Sc.f * Sa31.f;
2120 Sa21.f = __fadd_rn(Sa21.f, Stmp2.f);
2121 Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);
2122
2123 Stmp1.f = Ss.f * Sa22.f;
2124 Stmp2.f = Ss.f * Sa32.f;
2125 Sa22.f = Sc.f * Sa22.f;
2126 Sa32.f = Sc.f * Sa32.f;
2127 Sa22.f = __fadd_rn(Sa22.f, Stmp2.f);
2128 Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);
2129
2130 Stmp1.f = Ss.f * Sa23.f;
2131 Stmp2.f = Ss.f * Sa33.f;
2132 Sa23.f = Sc.f * Sa23.f;
2133 Sa33.f = Sc.f * Sa33.f;
2134 Sa23.f = __fadd_rn(Sa23.f, Stmp2.f);
2135 Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);
2136
2137 //###########################################################
2138 // Update matrix U
2139 //###########################################################
2140
2141 Stmp1.f = Ss.f * Su12.f;
2142 Stmp2.f = Ss.f * Su13.f;
2143 Su12.f = Sc.f * Su12.f;
2144 Su13.f = Sc.f * Su13.f;
2145 Su12.f = __fadd_rn(Su12.f, Stmp2.f);
2146 Su13.f = __fsub_rn(Su13.f, Stmp1.f);
2147
2148 Stmp1.f = Ss.f * Su22.f;
2149 Stmp2.f = Ss.f * Su23.f;
2150 Su22.f = Sc.f * Su22.f;
2151 Su23.f = Sc.f * Su23.f;
2152 Su22.f = __fadd_rn(Su22.f, Stmp2.f);
2153 Su23.f = __fsub_rn(Su23.f, Stmp1.f);
2154
2155 Stmp1.f = Ss.f * Su32.f;
2156 Stmp2.f = Ss.f * Su33.f;
2157 Su32.f = Sc.f * Su32.f;
2158 Su33.f = Sc.f * Su33.f;
2159 Su32.f = __fadd_rn(Su32.f, Stmp2.f);
2160 Su33.f = __fsub_rn(Su33.f, Stmp1.f);
2161
2162 V_3x3[0] = Sv11.f;
2163 V_3x3[1] = Sv12.f;
2164 V_3x3[2] = Sv13.f;
2165 V_3x3[3] = Sv21.f;
2166 V_3x3[4] = Sv22.f;
2167 V_3x3[5] = Sv23.f;
2168 V_3x3[6] = Sv31.f;
2169 V_3x3[7] = Sv32.f;
2170 V_3x3[8] = Sv33.f;
2171
2172 U_3x3[0] = Su11.f;
2173 U_3x3[1] = Su12.f;
2174 U_3x3[2] = Su13.f;
2175 U_3x3[3] = Su21.f;
2176 U_3x3[4] = Su22.f;
2177 U_3x3[5] = Su23.f;
2178 U_3x3[6] = Su31.f;
2179 U_3x3[7] = Su32.f;
2180 U_3x3[8] = Su33.f;
2181
2182 S_3x1[0] = Sa11.f;
2183 // s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
2184 S_3x1[1] = Sa22.f;
2185 // s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
2186 S_3x1[2] = Sa33.f;
2187}
2188
template <typename scalar_t>
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void solve_svd3x3(
        const scalar_t *A_3x3,  // input A {3,3}
        const scalar_t *B_3x1,  // input b {3,1}
        scalar_t *X_3x1)        // output x {3,1}
{
    // Solve the 3x3 linear system A * x = b through the Moore-Penrose
    // pseudo-inverse built from the SVD of A:
    //     A = U * Sigma * V^T   =>   x = V * Sigma^+ * U^T * b.
    // This stays well-defined for rank-deficient A: singular values that are
    // numerically zero are not inverted, they are zeroed in Sigma^+.
    scalar_t U[9];
    scalar_t V[9];
    scalar_t S[3];
    svd3x3(A_3x3, U, S, V);

    //###########################################################
    // Sigma^+ : invert each singular value, zeroing the tiny ones
    //###########################################################
    // The magnitude test is written out explicitly instead of calling abs():
    // an unqualified abs() can resolve to the C integer overload on host
    // compilers, truncating scalar_t values toward zero (0.5 -> 0). The
    // constants are cast to scalar_t so the float instantiation never
    // promotes the division to double on the device.
    const scalar_t epsilon = static_cast<scalar_t>(1e-10);
    for (int i = 0; i < 3; ++i) {
        const scalar_t mag = S[i] < 0 ? -S[i] : S[i];
        S[i] = mag < epsilon ? static_cast<scalar_t>(0)
                             : static_cast<scalar_t>(1) / S[i];
    }

    //###########################################################
    // (Sigma^+) * UT
    //###########################################################
    // Row i of UT is column i of U; each row is scaled by S[i].
    scalar_t S_UT[9];

    S_UT[0] = U[0] * S[0];
    S_UT[1] = U[3] * S[0];
    S_UT[2] = U[6] * S[0];
    S_UT[3] = U[1] * S[1];
    S_UT[4] = U[4] * S[1];
    S_UT[5] = U[7] * S[1];
    S_UT[6] = U[2] * S[2];
    S_UT[7] = U[5] * S[2];
    S_UT[8] = U[8] * S[2];

    //###########################################################
    // Ainv = V * [(Sigma^+) * UT]
    //###########################################################
    scalar_t Ainv[9] = {0};
    matmul3x3_3x3(V, S_UT, Ainv);

    //###########################################################
    // x = Ainv * b
    //###########################################################
    matmul3x3_3x1(Ainv, B_3x1, X_3x1);
}
2235
2236} // namespace kernel
2237} // namespace linalg
2238} // namespace core
2239} // namespace open3d
Common CUDA utilities.
#define OPEN3D_DEVICE
Definition: CUDAUtils.h:64
#define OPEN3D_FORCE_INLINE
Definition: CUDAUtils.h:62
#define __fsub_rn(x, y)
Definition: SVD3x3.h:82
#define __fadd_rn(x, y)
Definition: SVD3x3.h:81
#define __drsqrt_rn(x)
Definition: SVD3x3.h:87
#define gone
Definition: SVD3x3.h:58
#define __frsqrt_rn(x)
Definition: SVD3x3.h:83
#define gcosine_pi_over_eight
Definition: SVD3x3.h:61
#define __dadd_rn(x, y)
Definition: SVD3x3.h:85
#define gsine_pi_over_eight
Definition: SVD3x3.h:59
#define gfour_gamma_squared
Definition: SVD3x3.h:63
#define __dsub_rn(x, y)
Definition: SVD3x3.h:86
#define gtiny_number
Definition: SVD3x3.h:62
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3< double >(const double *A_3x3, double *U_3x3, double *S_3x1, double *V_3x3)
Definition: SVD3x3.h:112
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void matmul3x3_3x3(const scalar_t *A_3x3, const scalar_t *B_3x3, scalar_t *C_3x3)
Definition: Matrix.h:67
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3< float >(const float *A_3x3, float *U_3x3, float *S_3x1, float *V_3x3)
Definition: SVD3x3.h:1151
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void svd3x3(const scalar_t *A_3x3, scalar_t *U_3x3, scalar_t *S_3x1, scalar_t *V_3x3)
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void matmul3x3_3x1(const scalar_t *A_3x3, const scalar_t *B_3x1, scalar_t *C_3x1)
Definition: Matrix.h:58
OPEN3D_DEVICE OPEN3D_FORCE_INLINE void solve_svd3x3(const scalar_t *A_3x3, const scalar_t *B_3x1, scalar_t *X_3x1)
Definition: SVD3x3.h:2190
Definition: PinholeCameraIntrinsic.cpp:35
Definition: SVD3x3.h:100
unsigned int ui
Definition: SVD3x3.h:102
scalar_t f
Definition: SVD3x3.h:101