g3 0.1.4

Neat library for computer graphics based on geometric algebra
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562

use std::simd::{f32x4};

// a + b is a general bivector but it is most likely *non-simple* meaning
// that it is neither purely real nor purely ideal.
// Exponentiates the bivector and returns the motor defined by partitions 1
// and 2.
pub(crate) fn exp(a:&f32x4, b:&f32x4)->(f32x4,f32x4) {
  // The exponential map produces a continuous group of rotations about an
  // axis. We'd *like* to evaluate the exp(a + b) as exp(a)exp(b) but we
  // cannot do that in general because a and b do not commute (consider
  // the differences between the Taylor expansion of exp(ab) and
  // exp(a)exp(b)).

  // Check if the bivector we're exponentiating is ideal

  if *a == f32x4::splat(0.0) {
    // When exponentiating an ideal line, the terms past the linear
    // term in the Taylor series expansion vanishes
    return ([1.0, 0.0, 0.0, 0.0].into(),b.clone());
  }

  // First, we need to decompose the bivector into the sum of two
  // commutative bivectors (the product of these two parts will be a
  // scalar multiple of the pseudoscalar; see "Bivector times its ideal
  // axis and vice versa in demo.klein"). To do this, we compute the
  // squared norm of the bivector:
  //
  // NOTE: a sign flip is introduced since the square of a Euclidean
  // line is negative
  //
  // (a1^2 + a2^2 + a3^2) - 2(a1 b1 + a2 b2 + a3 b3) e0123

  // Broadcast dot(a, a) ignoring the scalar component to all components
  // of a2
  let a2 = &hi_dp_bc(a, a);
  let ab = &hi_dp_bc(a, b);

  // Next, we need the sqrt of that quantity. Since e0123 squares to 0,
  // this has a closed form solution.
  //
  // sqrt(a1^2 + a2^2 + a3^2)
  //  - (a1 b1 + a2 b2 + a3 b3) / sqrt(a1^2 + a2^2 + a3^2) e0123
  //
  // (relabeling) = u + vI
  //
  // (square the above quantity yourself to quickly verify the claim)
  // Maximum relative error < 1.5*2e-12
  let a2_sqrt_rcp = &rsqrt_nr1(a2);
  let u:f32x4 = a2 * a2_sqrt_rcp;
  // Don't forget the minus later!
  let minus_v = ab * a2_sqrt_rcp;

  // Last, we need the reciprocal of the norm to compute the normalized
  // bivector.
  //
  // 1 / sqrt(a1^2 + a2^2 + a3^2) + (a1 b1 + a2 b2 + a3 b3) / (a1^2 + a2^2 + a3^2)^(3/2) e0123
  //
  // The original bivector * the inverse norm gives us a normalized
  // bivector.
  let norm_real:&f32x4  = &(a * a2_sqrt_rcp);
  let mut norm_ideal = b * a2_sqrt_rcp;
  // The real part of the bivector also interacts with the pseudoscalar to
  // produce a portion of the normalized ideal part
  // e12 e0123 = -e03, e31 e0123 = -e02, e23 e0123 = -e01
  // Notice how the products above actually commute
  norm_ideal = norm_ideal - a * ab * a2_sqrt_rcp * rcp_nr1(a2);

  // The norm * our normalized bivector is the original bivector (a + b).
  // Thus, we have:
  //
  // (u + vI)n = u n + v n e0123
  //
  // Note that n and n e0123 are perpendicular (n e0123 lies on the ideal
  // plane, and all ideal components of n are extinguished after
  // polarization). As a result, we can now decompose the exponential.
  //
  // e^(u n + v n e0123) = e^(u n) e^(v n e0123) =
  // (cosu + sinu n) * (1 + v n e0123) =
  // cosu + sinu n + v n cosu e0123 + v sinu n^2 e0123 =
  // cosu + sinu n + v n cosu e0123 - v sinu e0123
  //
  // where we've used the fact that n is normalized and squares to -1.
  let uv_0 = u[0];
  // Note the v here corresponds to minus_v
  let uv_1 = minus_v[0];

  let sincosu_0 = uv_0.sin();
  let sincosu_1:f32 = uv_0.cos();

  let sinu = f32x4::splat(sincosu_0);
  let tmp:f32x4 = [sincosu_1, 0.0, 0.0, 0.0].into();
  let p1_out:f32x4  = tmp + (sinu * norm_real.clone());

  // The second partition has contributions from both the real and ideal
  // parts.
  let cosu:f32x4 = [0.0, sincosu_1, sincosu_1, sincosu_1].into();
  let minus_vcosu = minus_v * cosu;
  let mut p2_out = sinu * norm_ideal;
  p2_out = p2_out + minus_vcosu * norm_real;
  let minus_vsinu = uv_1 * sincosu_0;
  let tmp:f32x4 = [minus_vsinu, 0.0, 0.0, 0.0].into();
  p2_out = tmp + p2_out;
  return (p1_out,p2_out);
}

pub(crate) fn logarithm(p1:&f32x4, p2:&f32x4) ->(f32x4, f32x4) {
  // The logarithm follows from the derivation of the exponential. Working
  // backwards, we ended up computing the exponential like so:
  //
  // cosu + sinu n + v n cosu e0123 - v sinu e0123 =
  // (cosu - v sinu e0123) + (sinu + v cosu e0123) n
  //
  // where n is the normalized bivector. If we compute the norm, that will
  // allow us to match it to sinu + vcosu e0123, which will then allow us
  // to deduce u and v.

  // The first thing we need to do is extract only the bivector components
  // from the motor.
  let bv_mask:f32x4 = [0.0, 1.0, 1.0, 1.0].into();
  let a = bv_mask * p1;

  // Early out if we're taking the log of a motor without any rotation
  if a == f32x4::splat(0.0) { return (a, p2.clone()); }

  let b = bv_mask * p2;

  // Next, we need to compute the norm as in the exponential.
  let a2 = &hi_dp_bc(&a, &a);
  let ab = hi_dp_bc(&a, &b);
  let a2_sqrt_rcp = &rsqrt_nr1(a2);
  let s = a2 * a2_sqrt_rcp;
  let minus_t = ab * a2_sqrt_rcp;
  // s + t e0123 is the norm of our bivector.

  // Store the scalar component
  let p = p1[0];

  // Store the pseudoscalar component
  let q = p2[0];

  let s_scalar = s[0];
  let t_scalar = minus_t[0] * -1.0;
  // p = cosu
  // q = -v sinu
  // s_scalar = sinu
  // t_scalar = v cosu

  let p_zero = p.abs() < 0.000_001;
  let u = if p_zero { (-q).atan2(t_scalar) } else { s_scalar.atan2(p) };
  let v = if p_zero { -q / s_scalar } else { t_scalar / p };

  // Now, (u + v e0123) * n when exponentiated will give us the motor, so
  // (u + v e0123) * n is the logarithm. To proceed, we need to compute
  // the normalized bivector.
  let norm_real = a * a2_sqrt_rcp;
  let mut norm_ideal = b * a2_sqrt_rcp;
  norm_ideal -= &a * &ab * a2_sqrt_rcp * rcp_nr1(a2);

  let uvec = f32x4::splat(u);
  let p1_out = uvec * norm_real;
  let p2_out = (uvec * norm_ideal) - (f32x4::splat(v) * norm_real);
  (p1_out, p2_out)
}

// point * plane
pub fn gp30(a:&f32x4, b:&f32x4)->(f32x4,f32x4) {
  let mut p1 = a * b.xxxx();
  p1 = b0a1a2a3(&p1, &f32x4::splat(0.0));

  // (_, a3 b2, a1 b3, a2 b1)
  let mut p2 = a.xwyz() * b.xzwy();
  p2 -= a.xzwy() * b.xwyz();
  // Compute a0 b0 + a1 b1 + a2 b2 + a3 b3 and store it in the low component
  let mut tmp = dp(a, b);
  tmp = flip_signs(&tmp, [true, false, false, false].into());
  p2 = b0a1a2a3(&p2, &tmp);
  (p1,p2)
}

// plane * point
pub fn gp03(a:&f32x4, b:&f32x4)->(f32x4,f32x4) {
  // (_, a3 b2, a1 b3, a2 b1)
  // Compute a0 b0 + a1 b1 + a2 b2 + a3 b3 and store it in the low component
  (b0a1a2a3(&(a * b.xxxx()), &f32x4::splat(0.0)),
   a.xwyz() * b.xzwy() - a.xzwy() * b.xwyz() + dp(a, b))
}

// p1: (1, e23, e31, e12)
pub fn gp11(a:&f32x4, b:&f32x4)->f32x4 {
  // (a0 b0 - a1 b1 - a2 b2 - a3 b3) +
  // (a0 b1 - a2 b3 + a1 b0 + a3 b2)*e23
  // (a0 b2 - a3 b1 + a2 b0 + a1 b3)*e31
  // (a0 b3 - a1 b2 + a3 b0 + a2 b1)*e12

  // We use abcd to refer to the slots to avoid conflating bivector/scalar
  // coefficients with cartesian coordinates

  // In general, we can get rid of at most one swizzle
  let mut p1_out = a.xxxx() * b;
  p1_out -= a.yzwy() * b.ywyz();
  // In a separate register, accumulate the later components so we can
  // negate the lower single-precision element with a single instruction
  let tmp1 = a.zyzw() * b.zxxx();
  let tmp2 = a.wwyz() * b.wzwy();
  let tmp = f32x4_xor(&(tmp1 + tmp2), &[-0.0, 0.0, 0.0, 0.0].into());
  p1_out + tmp
}

pub fn gptr(a:&f32x4, b:&f32x4)->f32x4 {
  // (a1 b1 + a2 b2 + a3 b3) e0123 +
  // (a0 b1 + a2 b3 - a3 b2) e01 +
  // (a0 b2 + a3 b1 - a1 b3) e02 +
  // (a0 b3 + a1 b2 - a2 b1) e03
  let mut p2 = a.yxxx() * b.yyzw();
  p2 += a.zzwy() * b.zwyz();
  let tmp = a.wwyz() * b.wzwy();
  p2 - f32x4_xor(&tmp, &[-0.0,0.0,0.0,0.0].into())
}

pub fn gprt(a:&f32x4, b:&f32x4)->f32x4 {
  // (a1 b1 + a2 b2 + a3 b3) e0123 +
  // (a0 b1 + a3 b2 - a2 b3) e01 +
  // (a0 b2 + a1 b3 - a3 b1) e02 +
  // (a0 b3 + a2 b1 - a1 b2) e03
  let mut p2 = a.yxxx() * b.yyzw();
  p2 += a.zwyz() * b.zzwy();
  let tmp = a.wzwy() * b.wwyz();
  p2 - f32x4_xor(&tmp, &[-0.0,0.0,0.0,0.0].into())
}

pub fn gp12(a:&f32x4, b:&f32x4)->f32x4 {
  let p2 = gprt(a, b);
  let tmp = a * b.xxxx();
  p2 - flip_signs(&tmp, mask32x4::from_array([true,false,false,false]))
}

pub fn gp21(a:&f32x4, b:&f32x4)->f32x4 {
  let p2 = gptr(a, b);
  let tmp = a * b.xxxx();
  p2 - flip_signs(&tmp, mask32x4::from_array([true,false,false,false]))
}

pub(crate) fn gpll(a:&f32x4, d:&f32x4, b:&f32x4, c:&f32x4)->(f32x4, f32x4) {
  let flip = &[-0.0,0.0,0.0,0.0].into();
  let mut p1 = a.yzyw() * b.yywz();
  p1 = f32x4_xor(&p1, flip);
  p1 -= a.wywz() * b.wzyw();
  let a2 = a.zzww();
  let b2 = b.zzww();
  let p1 = sub_ss(&p1, mul_ss(&a2, &b2));

  let mut p2 = a.ywyz() * c.yzwy();
  p2 -= f32x4_xor(flip, &(a.wzwy() * c.wwyz()));
  p2 += b.yzwy() * d.ywyz();
  p2 -= f32x4_xor(flip, &(b.wwyz() * d.wzwy()));
  let c2 = c.zzww();
  let d2 = d.zzww();
  p2 = add_ss(&p2, &(a2*c2));
  p2 = add_ss(&p2, &(b2*d2));
  (p1, p2)
}

pub fn gpmm(a:&f32x4, b:&f32x4, c:&f32x4, d:&f32x4)->(f32x4,f32x4) {
  // (a0 c0 - a1 c1 - a2 c2 - a3 c3) +
  // (a0 c1 + a3 c2 + a1 c0 - a2 c3) e23 +
  // (a0 c2 + a1 c3 + a2 c0 - a3 c1) e31 +
  // (a0 c3 + a2 c1 + a3 c0 - a1 c2) e12 +
  //
  // (a0 d0 + b0 c0 + a1 d1 + b1 c1 + a2 d2 + a3 d3 + b2 c2 + b3 c3) e0123 +
  // (a0 d1 + b1 c0 + a3 d2 + b3 c2 - a1 d0 - a2 d3 - b0 c1 - b2 c3) e01 +
  // (a0 d2 + b2 c0 + a1 d3 + b1 c3 - a2 d0 - a3 d1 - b0 c2 - b3 c1) e02 +
  // (a0 d3 + b3 c0 + a2 d1 + b2 c1 - a3 d0 - a1 d2 - b0 c3 - b1 c2) e03

  let a_xxxx = a.xxxx();
  let a_zyzw = a.zyzw();
  let a_ywyz = a.ywyz();
  let a_wzwy = a.wzwy();
  let c_wwyz = c.wwyz();
  let c_yzwy = c.yzwy();
  let s_flip = mask32x4::from_array([true, false, false, false]);

  let mut e = a_xxxx * c;
  let mut t = a_ywyz * c_yzwy;

  t += a_zyzw * c.zxxx();
  t = flip_signs(&t, s_flip);

  e = e + t;
  e = e - a_wzwy * c_wwyz;

  let f = a_xxxx * d + b * c.xxxx() + a_ywyz * d.yzwy() + b.ywyz() * c_yzwy;

  let mut t = a_zyzw * d.zxxx();
  t += a_wzwy * d.wwyz();
  t += b.zxxx() * c.zyzw();
  t += b.wzwy() * c_wwyz;
  t = f32x4_xor(&t, &[-0.0,0.0,0.0,0.0].into());

  return (e, f-t);
}

pub fn gpdl(u:f32, v:f32, b:&f32x4, c:&f32x4)->(f32x4,f32x4) {
  // b1 u e23 +
  // b2 u e31 +
  // b3 u e12 +
  // (-b1 v + c1 u) e01 +
  // (-b2 v + c2 u) e02 +
  // (-b3 v + c3 u) e03
  let u_vec = f32x4::splat(u);
  let v_vec = f32x4::splat(v);
  let p1 = u_vec * b;
  let p2 = c * u_vec - b * v_vec;
  (p1,p2)
}

use std::simd::{*, simd_swizzle as swizzle, Which::{First, Second}, StdFloat};

// Workaround for to_bits issue, TODO remove if fixed
pub fn to_bits(a:&f32x4)->u32x4 { unsafe { std::mem::transmute::<f32x4, u32x4>((*a).clone()) } }
fn rsqrt(a:&f32x4)->f32x4 { f32x4::splat(1.0) / a.sqrt() } // TODO fast rsqrt...
#[inline] fn rcp(a:&f32x4)->f32x4 { f32x4::splat(1.0) / a }
#[inline] pub fn f32x4_xor(a:&f32x4,b:&f32x4)->f32x4 {
  f32x4::from_bits(to_bits(&a) ^ to_bits(&b))
}
#[inline] pub fn refined_reciprocal(s:f32)->f32x4 { rcp_nr1(&f32x4::splat(s)) }
#[inline] pub fn sqrt_nr1(a:&f32x4)->f32x4 { a * rsqrt_nr1(&a) } // TODO either write faster rsqrt_nr1, or derive sqrt_nr1 yourself...

// Reciprocal sqrt with an additional single Newton-Raphson refinement.
// f(x) = 1/x^2 - a
// f'(x) = -1/(2x^(3/2))
// Let x_n be the estimate, and x_{n+1} be the refinement
// x_{n+1} = x_n - f(x)/f'(x)
//         = 0.5 * x_n * (3 - a x_n^2)
pub fn rsqrt_nr1(a:&f32x4)->f32x4 {
  // TODO find portable version of _mm_rsqrt_ps in core_simd
  // From Intel optimization manual: expected performance is ~5.2x
  // baseline (sqrtps + divps) with ~22 bits of accuracy
  let xn = &rsqrt(&a);
  let axn2 = xn * xn * a;
  let xn3 = f32x4::splat(3.0) - axn2;
  f32x4::splat(0.5) * xn * xn3
}

// Reciprocal with an additional single Newton-Raphson refinement
// f(x) = 1/x - a
// f'(x) = -1/x^2
// x_{n+1} = x_n - f(x)/f'(x)
//         = 2x_n - a x_n^2 = x_n (2 - a x_n)
#[inline] pub fn rcp_nr1(a:&f32x4)->f32x4 {
  let xn = &rcp(a);
  xn * (f32x4::splat(2.0) - a * xn)
}

pub fn hi_dp(a:&f32x4, b:&f32x4)->f32x4 {
  let ab = a*b;
  swizzle!((ab.yyww() + ab + ab.xxyy()).zwzw(), f32x4::splat(0.0), [First(0),Second(1),Second(2),Second(3)]) // TODO make faster???
}

pub fn hi_dp_bc(a:&f32x4, b:&f32x4)->f32x4 {
  let ab = a*b;
  (ab.yyww() + ab + ab.xxyy()).zzzz()
}

pub fn hi_dp_ss(a:&f32x4, b:&f32x4)->f32x4 {
  let mut ab = a * b;
  let hi = &ab.yyww();
  let sum = hi + ab;
  ab = sum + ab.xxyy();
  ab.zwzw()
}

pub fn dp(a:&f32x4, b:&f32x4)->f32x4 {
  let mut ab = a * b;
  let hi = &ab.yyww();

  // (a1 b1, a2 b2, a3 b3, 0) + (a2 b2, a2 b2, 0, 0)
  // = (a1 b1 + a2 b2, _, a3 b3, 0)
  ab = hi + ab;
  ab[0] += b2b3a2a3(hi, &ab)[0];
  let true_falses = mask32x4::from_array([true, false, false, false]);
  true_falses.select(ab, f32x4::splat(0.0))
}

pub fn dp_bc(a:&f32x4, b:&f32x4)->f32x4 {
  let mut ab = a * b;
  let hi = &ab.yyww();

  // (a1 b1, a2 b2, a3 b3, 0) + (a2 b2, a2 b2, 0, 0)
  // = (a1 b1 + a2 b2, _, a3 b3, 0)
  ab = hi + ab;
  ab = add_ss(&ab, &b2b3a2a3(hi, &ab));
  ab.xxxx()
}

#[inline] pub fn zero_first(a:f32x4)->f32x4 { swizzle!(a, f32x4::splat(0.0), [Second(0), First(1), First(2), First(3)]) } // TODO find a faster way
#[inline] pub fn f32x4_and(a:f32x4,b:f32x4)->f32x4 { f32x4::from_bits(a.to_bits() & b.to_bits()) }
#[inline] pub fn flip_signs(x:&f32x4, mask:mask32x4)->f32x4 { mask.select(-x.clone(), x.clone())}
pub fn add_ss(a:&f32x4,b:&f32x4)->f32x4 { swizzle!(a + b, a.clone(), [First(0), Second(1), Second(2), Second(3)]) }
#[inline] pub fn sub_ss(a:&f32x4,b:f32x4)->f32x4 { swizzle!(a - b, a.clone(), [First(0), Second(1), Second(2), Second(3)]) }
pub fn mul_ss(a:&f32x4,b:&f32x4)->f32x4 { swizzle!(a * b, a.clone(), [First(0), Second(1), Second(2), Second(3)]) }
#[inline] pub fn b2b3a2a3(a:&f32x4,b:&f32x4)->f32x4 { swizzle!(a.clone(), b.clone(), [Second(2),Second(3),First(2),First(3)]) }
#[inline] pub fn b0a1a2a3(a:&f32x4,b:&f32x4)->f32x4 { swizzle!(a.clone(), b.clone(), [Second(0),First(1),First(2),First(3)]) }

pub trait Shuffle:Clone {
  fn xxxx(&self)->Self;
  fn xyxy(&self)->Self;
  fn xxyy(&self)->Self;
  fn xxzz(&self)->Self;
  fn xzxx(&self)->Self;
  fn xzwy(&self)->Self;
  fn xwyz(&self)->Self;
  fn xwzy(&self)->Self;
  fn yxxx(&self)->Self;
  fn yxwx(&self)->Self;
  fn yyyy(&self)->Self;
  fn yyzw(&self)->Self;
  fn yywz(&self)->Self;
  fn yyww(&self)->Self;
  fn yzxx(&self)->Self;
  fn yzyz(&self)->Self;
  fn yzyw(&self)->Self;
  fn yzwx(&self)->Self;
  fn yzwy(&self)->Self;
  fn ywyx(&self)->Self;
  fn ywyz(&self)->Self;
  fn zxxx(&self)->Self;
  fn zxzx(&self)->Self;
  fn zyzw(&self)->Self;
  fn zzzz(&self)->Self;
  fn zzwx(&self)->Self;
  fn zzwy(&self)->Self;
  fn zzww(&self)->Self;
  fn zwxx(&self)->Self;
  fn zwxy(&self)->Self;
  fn zwyx(&self)->Self;
  fn zwyz(&self)->Self;
  fn zwzw(&self)->Self;
  fn zwwy(&self)->Self;
  fn wxxx(&self)->Self;
  fn wyzx(&self)->Self;
  fn wyzw(&self)->Self;
  fn wywx(&self)->Self;
  fn wywz(&self)->Self;
  fn wzyz(&self)->Self;
  fn wzyw(&self)->Self;
  fn wzwy(&self)->Self;
  fn wwxx(&self)->Self;
  fn wwyz(&self)->Self;
  fn wwww(&self)->Self;
}

// impl<T:Clone+StdFloat> Shuffle for T {}
impl Shuffle for f32x4 {
  #[inline] fn xxxx(&self)->Self { swizzle!(self.clone(), [0,0,0,0]) }
  #[inline] fn xyxy(&self)->Self { swizzle!(self.clone(), [0,1,0,1]) }
  #[inline] fn xxyy(&self)->Self { swizzle!(self.clone(), [0,0,1,1]) }
  #[inline] fn xxzz(&self)->Self { swizzle!(self.clone(), [0,0,2,2]) }
  #[inline] fn xzxx(&self)->Self { swizzle!(self.clone(), [0,2,0,0]) }
  #[inline] fn xzwy(&self)->Self { swizzle!(self.clone(), [0,2,3,1]) }
  #[inline] fn xwyz(&self)->Self { swizzle!(self.clone(), [0,3,1,2]) }
  #[inline] fn xwzy(&self)->Self { swizzle!(self.clone(), [0,3,2,1]) }
  #[inline] fn yxxx(&self)->Self { swizzle!(self.clone(), [1,0,0,0]) }
  #[inline] fn yxwx(&self)->Self { swizzle!(self.clone(), [1,0,3,0]) }
  #[inline] fn yyyy(&self)->Self { swizzle!(self.clone(), [1,1,1,1]) }
  #[inline] fn yyzw(&self)->Self { swizzle!(self.clone(), [1,1,2,3]) }
  #[inline] fn yywz(&self)->Self { swizzle!(self.clone(), [1,1,3,2]) }
  #[inline] fn yyww(&self)->Self { swizzle!(self.clone(), [1,1,3,3]) }
  #[inline] fn yzxx(&self)->Self { swizzle!(self.clone(), [1,2,0,0]) }
  #[inline] fn yzyz(&self)->Self { swizzle!(self.clone(), [1,2,1,2]) }
  #[inline] fn yzyw(&self)->Self { swizzle!(self.clone(), [1,2,1,3]) }
  #[inline] fn yzwx(&self)->Self { swizzle!(self.clone(), [1,2,3,0]) }
  #[inline] fn yzwy(&self)->Self { swizzle!(self.clone(), [1,2,3,1]) }
  #[inline] fn ywyx(&self)->Self { swizzle!(self.clone(), [1,3,1,0]) }
  #[inline] fn ywyz(&self)->Self { swizzle!(self.clone(), [1,3,1,2]) }
  #[inline] fn zxxx(&self)->Self { swizzle!(self.clone(), [2,0,0,0]) }
  #[inline] fn zxzx(&self)->Self { swizzle!(self.clone(), [2,0,2,0]) }
  #[inline] fn zyzw(&self)->Self { swizzle!(self.clone(), [2,1,2,3]) }
  #[inline] fn zzzz(&self)->Self { swizzle!(self.clone(), [2,2,2,2]) }
  #[inline] fn zzwx(&self)->Self { swizzle!(self.clone(), [2,2,3,0]) }
  #[inline] fn zzwy(&self)->Self { swizzle!(self.clone(), [2,2,3,1]) }
  #[inline] fn zzww(&self)->Self { swizzle!(self.clone(), [2,2,3,3]) }
  #[inline] fn zwxx(&self)->Self { swizzle!(self.clone(), [2,3,0,0]) }
  #[inline] fn zwxy(&self)->Self { swizzle!(self.clone(), [2,3,0,1]) }
  #[inline] fn zwyx(&self)->Self { swizzle!(self.clone(), [2,3,1,0]) }
  #[inline] fn zwyz(&self)->Self { swizzle!(self.clone(), [2,3,1,2]) }
  #[inline] fn zwzw(&self)->Self { swizzle!(self.clone(), [2,3,2,3]) }
  #[inline] fn zwwy(&self)->Self { swizzle!(self.clone(), [2,3,3,1]) }
  #[inline] fn wxxx(&self)->Self { swizzle!(self.clone(), [3,0,0,0]) }
  #[inline] fn wyzx(&self)->Self { swizzle!(self.clone(), [3,1,2,0]) }
  #[inline] fn wyzw(&self)->Self { swizzle!(self.clone(), [3,1,2,3]) }
  #[inline] fn wywx(&self)->Self { swizzle!(self.clone(), [3,1,3,0]) }
  #[inline] fn wywz(&self)->Self { swizzle!(self.clone(), [3,1,3,2]) }
  #[inline] fn wzyz(&self)->Self { swizzle!(self.clone(), [3,2,1,2]) }
  #[inline] fn wzyw(&self)->Self { swizzle!(self.clone(), [3,2,1,3]) }
  #[inline] fn wzwy(&self)->Self { swizzle!(self.clone(), [3,2,3,1]) }
  #[inline] fn wwxx(&self)->Self { swizzle!(self.clone(), [3,3,0,0]) }
  #[inline] fn wwyz(&self)->Self { swizzle!(self.clone(), [3,3,1,2]) }
  #[inline] fn wwww(&self)->Self { swizzle!(self.clone(), [3,3,3,3]) }
}

#[inline] pub fn bits_wwww(a:u32x4)->u32x4 { swizzle!(a, [0,0,0,0]) }

#[cfg(test)]
pub fn approx_eq(result: [f32; 4], expected: [f32; 4]) {
  const EPSILON: f32 = 0.02;
  assert_eq!(result.len(), expected.len());
  for (i, a) in result.iter().enumerate() {
    let b = expected[i];
    assert!((a - b).abs() < EPSILON, "{:?} ≉ {:?}, at index {:}", result, expected, i);
  }
}

#[cfg(test)]
mod tests {
  use super::*;
  use std::{simd::{f32x4}};

  #[test] fn hi_dp_ss_test() {
    let a:f32x4 = [1.0, 2.0, 3.0, 5.0].into();
    let b = [-4.0, -3.0, -2.0, -1.0].into();
    assert_eq!(hi_dp_ss(&a, &b), [-17.0, -16.0, -17.0, -16.0].into());
  }

  #[test] fn add_first() {
    let a:f32x4 = [2.0, 2.0, 3.0, 4.0].into();
    assert_eq!(add_ss(&a, &a), [4.0, 2.0, 3.0, 4.0].into());
  }

  #[test] fn rcp_test() {
    let a:f32x4 = [2.0, 4.0, 8.0, 10.0].into();
    approx_eq(rcp(&a).into(), [0.5, 0.25, 0.125, 0.1].into());
  }

  #[test] fn multiply_first() {
    let a = <f32x4>::from([2.0, 2.0, 3.0, 4.0]);
    assert_eq!(mul_ss(&a, &a), [4.0, 2.0, 3.0, 4.0].into());
  }

  #[test] fn inverse_sqrt() {
    let a = <f32x4>::from([4.0, 9.0, 16.0, 25.0]);
    assert_eq!(a.sqrt(), [2.0, 3.0, 4.0, 5.0].into());
    assert_eq!(f32x4::splat(1.0)/a.sqrt(), [1.0/2.0, 1.0/3.0, 1.0/4.0, 1.0/5.0].into());
  }

  #[test] fn rcp_nr1_test() {
    let a = <f32x4>::from([1.0, 2.0, 3.0, 4.0]);
    let b = rcp_nr1(&a);
    approx_eq(*b.as_array(), [1.0, 0.5, 1.0/3.0, 0.25]);
  }

  #[test] #[ignore] fn sqrt_nr1_test() {}

  #[test] #[ignore] fn rsqrt_nr1_test() {}

  #[test] fn test_f32_sign_flipping() {
    let v1 = f32x4::from_array([1.0,2.0,3.0,4.0]);
    let expected = f32x4::from_array([-1.0,-2.0,-3.0,4.0]);
    assert_eq!(f32x4_xor(&v1, &f32x4::from_array([-0.0,-0.0,-0.0,0.0])), expected);
    assert_eq!(flip_signs(&v1, [true,true,true,false].into()), expected);
  }
}