// linear_srgb/simd.rs

//! SIMD-accelerated sRGB ↔ linear conversion.
//!
//! This module provides high-performance conversion functions using AVX2/SSE SIMD
//! instructions via the `wide` crate with runtime CPU feature detection.
//!
//! # API Overview
//!
//! ## x8 Functions (process 8 values at once)
//! - [`srgb_to_linear_x8`] - f32x8 sRGB → f32x8 linear
//! - [`linear_to_srgb_x8`] - f32x8 linear → f32x8 sRGB
//! - [`srgb_u8_to_linear_x8`] - \[u8; 8\] sRGB → f32x8 linear
//! - [`linear_to_srgb_u8_x8`] - f32x8 linear → \[u8; 8\] sRGB
//!
//! ## Slice Functions (process entire slices)
//! - [`srgb_to_linear_slice`] - &mut \[f32\] sRGB → linear in-place
//! - [`linear_to_srgb_slice`] - &mut \[f32\] linear → sRGB in-place
//! - [`srgb_u8_to_linear_slice`] - &\[u8\] sRGB → &mut \[f32\] linear
//! - [`linear_to_srgb_u8_slice`] - &\[f32\] linear → &mut \[u8\] sRGB

use archmage::{Desktop64, ScalarToken, arcane, incant, rite};
use wide::{CmpLt, f32x8};

use crate::fast_math::pow_x8;

// Alias magetypes f32x8 to avoid name clash with wide::f32x8
#[cfg(target_arch = "x86_64")]
use magetypes::simd::f32x8 as mt_f32x8;
28
// sRGB transfer function constants (C0-continuous, moxcms-derived)
// These ensure exact continuity at the linear/power segment junction.
// Standard IEC values (0.055, 1.055, 0.04045) have a tiny discontinuity.
const SRGB_LINEAR_THRESHOLD: f32x8 = f32x8::splat(0.039_293_37); // encoded-domain split point
const LINEAR_THRESHOLD: f32x8 = f32x8::splat(0.003_041_282_6); // linear-domain split point
const LINEAR_SCALE: f32x8 = f32x8::splat(1.0 / 12.92); // slope of linear segment (decode)
const TWELVE_92: f32x8 = f32x8::splat(12.92); // slope of linear segment (encode)
const ZERO: f32x8 = f32x8::splat(0.0); // clamp lower bound
const ONE: f32x8 = f32x8::splat(1.0); // clamp upper bound
const HALF: f32x8 = f32x8::splat(0.5); // round-to-nearest offset for LUT indexing
39
/// Precomputed sRGB u8 → linear f32 lookup table.
/// Uses the same constants as the transfer module (C0-continuous IEC 61966-2-1).
/// Generated by computing `srgb_u8_to_linear(i)` for each i in 0..=255.
/// To regenerate: `cargo run --release --example generate_lut`
// Grouped 8 values per row; the trailing comment gives the row's index range.
const SRGB_U8_TO_LINEAR_LUT: [f32; 256] = [
    0.0_f32, 0.000303527_f32, 0.000607054_f32, 0.000910581_f32, 0.001214108_f32, 0.001517635_f32, 0.001821162_f32, 0.0021246888_f32, // 0-7
    0.002428216_f32, 0.002731743_f32, 0.00303527_f32, 0.0033473307_f32, 0.0036773437_f32, 0.0040255957_f32, 0.004392362_f32, 0.004777916_f32, // 8-15
    0.0051825214_f32, 0.00560644_f32, 0.006049924_f32, 0.0065132244_f32, 0.0069965874_f32, 0.007500253_f32, 0.008024457_f32, 0.008569433_f32, // 16-23
    0.009135411_f32, 0.009722613_f32, 0.010331264_f32, 0.010961577_f32, 0.011613773_f32, 0.012288062_f32, 0.012984648_f32, 0.013703744_f32, // 24-31
    0.01444555_f32, 0.015210266_f32, 0.01599809_f32, 0.016809216_f32, 0.01764384_f32, 0.018502146_f32, 0.019384334_f32, 0.02029058_f32, // 32-39
    0.02122107_f32, 0.022175988_f32, 0.023155512_f32, 0.024159823_f32, 0.025189094_f32, 0.026243499_f32, 0.027323212_f32, 0.0284284_f32, // 40-47
    0.02955924_f32, 0.030715894_f32, 0.03189852_f32, 0.0331073_f32, 0.034342386_f32, 0.03560393_f32, 0.036892105_f32, 0.03820707_f32, // 48-55
    0.039548974_f32, 0.04091798_f32, 0.04231424_f32, 0.04373789_f32, 0.045189105_f32, 0.04666803_f32, 0.04817481_f32, 0.049709592_f32, // 56-63
    0.051272515_f32, 0.052863743_f32, 0.054483414_f32, 0.05613167_f32, 0.05780865_f32, 0.05951448_f32, 0.061249338_f32, 0.063013345_f32, // 64-71
    0.06480663_f32, 0.06662934_f32, 0.068481594_f32, 0.07036356_f32, 0.072275355_f32, 0.07421711_f32, 0.07618896_f32, 0.07819102_f32, // 72-79
    0.080223456_f32, 0.08228638_f32, 0.08437992_f32, 0.086504206_f32, 0.088659346_f32, 0.09084551_f32, 0.093062796_f32, 0.09531133_f32, // 80-87
    0.09759124_f32, 0.09990266_f32, 0.10224568_f32, 0.104620464_f32, 0.10702711_f32, 0.109465756_f32, 0.1119365_f32, 0.11443946_f32, // 88-95
    0.116974786_f32, 0.11954258_f32, 0.12214295_f32, 0.12477602_f32, 0.1274419_f32, 0.13014072_f32, 0.1328726_f32, 0.13563763_f32, // 96-103
    0.13843594_f32, 0.14126763_f32, 0.14413282_f32, 0.14703165_f32, 0.1499642_f32, 0.15293059_f32, 0.15593089_f32, 0.15896529_f32, // 104-111
    0.16203386_f32, 0.1651367_f32, 0.16827393_f32, 0.17144562_f32, 0.17465195_f32, 0.17789298_f32, 0.18116882_f32, 0.1844796_f32, // 112-119
    0.18782537_f32, 0.1912063_f32, 0.19462249_f32, 0.19807397_f32, 0.2015609_f32, 0.20508343_f32, 0.20864154_f32, 0.21223548_f32, // 120-127
    0.21586527_f32, 0.21953095_f32, 0.22323275_f32, 0.22697066_f32, 0.23074481_f32, 0.2345554_f32, 0.23840237_f32, 0.24228595_f32, // 128-135
    0.24620613_f32, 0.25016314_f32, 0.25415692_f32, 0.25818765_f32, 0.26225552_f32, 0.26636043_f32, 0.27050266_f32, 0.27468216_f32, // 136-143
    0.27889907_f32, 0.2831536_f32, 0.28744566_f32, 0.29177552_f32, 0.2961431_f32, 0.30054858_f32, 0.30499217_f32, 0.30947372_f32, // 144-151
    0.31399357_f32, 0.3185516_f32, 0.32314798_f32, 0.3277829_f32, 0.33245632_f32, 0.33716843_f32, 0.34191918_f32, 0.34670877_f32, // 152-159
    0.35153738_f32, 0.35640487_f32, 0.36131153_f32, 0.3662573_f32, 0.37124234_f32, 0.37626684_f32, 0.38133067_f32, 0.3864341_f32, // 160-167
    0.39157712_f32, 0.3967598_f32, 0.4019824_f32, 0.40724477_f32, 0.4125472_f32, 0.41788962_f32, 0.42327216_f32, 0.42869502_f32, // 168-175
    0.4341581_f32, 0.43966165_f32, 0.44520563_f32, 0.45079017_f32, 0.4564154_f32, 0.46208134_f32, 0.46778816_f32, 0.4735358_f32, // 176-183
    0.47932443_f32, 0.4851542_f32, 0.49102503_f32, 0.49693722_f32, 0.5028906_f32, 0.5088854_f32, 0.5149218_f32, 0.5209996_f32, // 184-191
    0.52711916_f32, 0.5332804_f32, 0.53948337_f32, 0.5457284_f32, 0.55201524_f32, 0.55834424_f32, 0.56471527_f32, 0.57112855_f32, // 192-199
    0.57758415_f32, 0.58408207_f32, 0.5906225_f32, 0.59720534_f32, 0.6038308_f32, 0.6104991_f32, 0.61721_f32, 0.62396383_f32, // 200-207
    0.6307605_f32, 0.6376001_f32, 0.644483_f32, 0.6514088_f32, 0.658378_f32, 0.6653904_f32, 0.67244613_f32, 0.67954546_f32, // 208-215
    0.68668824_f32, 0.6938747_f32, 0.7011047_f32, 0.7083785_f32, 0.7156962_f32, 0.72305775_f32, 0.7304634_f32, 0.73791295_f32, // 216-223
    0.7454066_f32, 0.75294465_f32, 0.76052684_f32, 0.7681535_f32, 0.7758244_f32, 0.7835399_f32, 0.79130006_f32, 0.79910475_f32, // 224-231
    0.80695426_f32, 0.8148484_f32, 0.82278764_f32, 0.8307716_f32, 0.83880067_f32, 0.8468749_f32, 0.8549941_f32, 0.8631587_f32, // 232-239
    0.8713685_f32, 0.87962353_f32, 0.8879244_f32, 0.89627033_f32, 0.9046623_f32, 0.9130995_f32, 0.9215827_f32, 0.9301116_f32, // 240-247
    0.93868643_f32, 0.9473071_f32, 0.9559739_f32, 0.9646866_f32, 0.9734457_f32, 0.9822507_f32, 0.9911024_f32, 1.0_f32, // 248-255
];
302
303#[inline]
304fn get_lut() -> &'static [f32; 256] {
305    &SRGB_U8_TO_LINEAR_LUT
306}
307
308/// Convert a single sRGB u8 value to linear f32 using LUT lookup.
309///
310/// This is the fastest method for u8 input as it uses a precomputed lookup table
311/// embedded in the binary. For batch conversions, use [`srgb_u8_to_linear_slice`].
312///
313/// # Example
314/// ```
315/// use linear_srgb::simd::srgb_u8_to_linear;
316///
317/// let linear = srgb_u8_to_linear(128);
318/// assert!((linear - 0.2158).abs() < 0.001);
319/// ```
320#[inline]
321pub fn srgb_u8_to_linear(value: u8) -> f32 {
322    get_lut()[value as usize]
323}
324
325// ============================================================================
326// x8 Inline Functions - Always inlined, for use in caller's magetypes code
327// ============================================================================
328
329/// Convert 8 sRGB f32 values to linear (always inlined).
330///
331/// Use this variant inside your own `#[magetypes]` functions to avoid
332/// double dispatch overhead. For standalone calls, use [`srgb_to_linear_x8_dispatch`].
333///
334/// Input values are clamped to \[0, 1\].
335#[inline(always)]
336pub fn srgb_to_linear_x8_inline(srgb: f32x8) -> f32x8 {
337    let srgb = srgb.max(ZERO).min(ONE);
338    let linear_result = srgb * LINEAR_SCALE;
339
340    // Degree-11 Chebyshev polynomial (Estrin evaluation)
341    let u = srgb.mul_add(f32x8::splat(S2L_INV_HW), f32x8::splat(S2L_BIAS));
342    let u2 = u * u;
343    let u4 = u2 * u2;
344    let u_8 = u4 * u4;
345    let p01 = f32x8::splat(S2L_C1).mul_add(u, f32x8::splat(S2L_C0));
346    let p23 = f32x8::splat(S2L_C3).mul_add(u, f32x8::splat(S2L_C2));
347    let p45 = f32x8::splat(S2L_C5).mul_add(u, f32x8::splat(S2L_C4));
348    let p67 = f32x8::splat(S2L_C7).mul_add(u, f32x8::splat(S2L_C6));
349    let p89 = f32x8::splat(S2L_C9).mul_add(u, f32x8::splat(S2L_C8));
350    let pab = f32x8::splat(S2L_C11).mul_add(u, f32x8::splat(S2L_C10));
351    let p0123 = p23.mul_add(u2, p01);
352    let p4567 = p67.mul_add(u2, p45);
353    let p8_11 = pab.mul_add(u2, p89);
354    let p0_7 = p4567.mul_add(u4, p0123);
355    let power_result = p8_11.mul_add(u_8, p0_7);
356
357    let mask = srgb.simd_lt(SRGB_LINEAR_THRESHOLD);
358    mask.blend(linear_result, power_result)
359}
360
361/// Convert 8 linear f32 values to sRGB (always inlined).
362///
363/// Use this variant inside your own `#[magetypes]` functions to avoid
364/// double dispatch overhead. For standalone calls, use [`linear_to_srgb_x8_dispatch`].
365///
366/// Input values are clamped to \[0, 1\].
367#[inline(always)]
368pub fn linear_to_srgb_x8_inline(linear: f32x8) -> f32x8 {
369    let linear = linear.max(ZERO).min(ONE);
370    let linear_result = linear * TWELVE_92;
371
372    // sqrt transform + degree-15 Chebyshev polynomial (Estrin evaluation)
373    let s = linear.sqrt();
374    let u = s.mul_add(f32x8::splat(L2S_INV_HW), f32x8::splat(L2S_BIAS));
375    let u2 = u * u;
376    let u4 = u2 * u2;
377    let u_8 = u4 * u4;
378    let p01 = f32x8::splat(L2S_C1).mul_add(u, f32x8::splat(L2S_C0));
379    let p23 = f32x8::splat(L2S_C3).mul_add(u, f32x8::splat(L2S_C2));
380    let p45 = f32x8::splat(L2S_C5).mul_add(u, f32x8::splat(L2S_C4));
381    let p67 = f32x8::splat(L2S_C7).mul_add(u, f32x8::splat(L2S_C6));
382    let p89 = f32x8::splat(L2S_C9).mul_add(u, f32x8::splat(L2S_C8));
383    let pab = f32x8::splat(L2S_C11).mul_add(u, f32x8::splat(L2S_C10));
384    let pcd = f32x8::splat(L2S_C13).mul_add(u, f32x8::splat(L2S_C12));
385    let pef = f32x8::splat(L2S_C15).mul_add(u, f32x8::splat(L2S_C14));
386    let p0123 = p23.mul_add(u2, p01);
387    let p4567 = p67.mul_add(u2, p45);
388    let p89ab = pab.mul_add(u2, p89);
389    let pcdef = pef.mul_add(u2, pcd);
390    let p0_7 = p4567.mul_add(u4, p0123);
391    let p8_f = pcdef.mul_add(u4, p89ab);
392    let power_result = p8_f.mul_add(u_8, p0_7);
393
394    let mask = linear.simd_lt(LINEAR_THRESHOLD);
395    mask.blend(linear_result, power_result)
396}
397
/// Convert 8 linear f32 values to sRGB u8 (always inlined).
///
/// Uses a 4096-entry const LUT for direct lookup — no pow/log/exp computation.
/// Max error: ±1 u8 level (same as the SIMD polynomial path).
#[inline(always)]
pub fn linear_to_srgb_u8_x8_inline(linear: f32x8) -> [u8; 8] {
    // Thin delegation to the crate-internal LUT kernel.
    linear_to_srgb_u8_lut_x8(linear)
}
406
407/// Convert 8 linear f32 values to sRGB u8 using const LUT.
408///
409/// Clamps to [0,1], scales to LUT index, does 8 scalar lookups from
410/// a 4KB table (fits L1 cache). No pow/exp/log computation.
411#[inline(always)]
412pub(crate) fn linear_to_srgb_u8_lut_x8(linear: f32x8) -> [u8; 8] {
413    let clamped = linear.max(ZERO).min(ONE);
414    let scaled = clamped * f32x8::splat(4095.0) + HALF;
415    let arr: [f32; 8] = scaled.into();
416    let lut = &crate::const_luts::LINEAR_TO_SRGB_U8;
417    [
418        lut[arr[0] as usize & 0xFFF],
419        lut[arr[1] as usize & 0xFFF],
420        lut[arr[2] as usize & 0xFFF],
421        lut[arr[3] as usize & 0xFFF],
422        lut[arr[4] as usize & 0xFFF],
423        lut[arr[5] as usize & 0xFFF],
424        lut[arr[6] as usize & 0xFFF],
425        lut[arr[7] as usize & 0xFFF],
426    ]
427}
428
429/// Convert 8 gamma-encoded f32 values to linear (always inlined).
430///
431/// Use this variant inside your own `#[magetypes]` functions to avoid
432/// double dispatch overhead.
433#[inline(always)]
434pub fn gamma_to_linear_x8_inline(encoded: f32x8, gamma: f32) -> f32x8 {
435    let encoded = encoded.max(ZERO).min(ONE);
436    pow_x8(encoded, gamma)
437}
438
439/// Convert 8 linear f32 values to gamma-encoded (always inlined).
440///
441/// Use this variant inside your own `#[magetypes]` functions to avoid
442/// double dispatch overhead.
443#[inline(always)]
444pub fn linear_to_gamma_x8_inline(linear: f32x8, gamma: f32) -> f32x8 {
445    let linear = linear.max(ZERO).min(ONE);
446    pow_x8(linear, 1.0 / gamma)
447}
448
449// ============================================================================
450// magetypes #[rite] helpers (x86-64 only) — real AVX2+FMA SIMD
451// ============================================================================
452
// sRGB transfer function scalar constants (for magetypes which needs token-gated splat)
// Same values as the f32x8 splat constants above, in scalar form.
const MT_SRGB_LINEAR_THRESHOLD: f32 = 0.039_293_37;
const MT_LINEAR_THRESHOLD: f32 = 0.003_041_282_6;
const MT_LINEAR_SCALE: f32 = 1.0 / 12.92;
const MT_TWELVE_92: f32 = 12.92;

// sRGB→linear degree-11 Chebyshev polynomial (Estrin's scheme)
// Approximates ((s + offset) / scale)^2.4 on [threshold, 1.0]
// u = s * INV_HW + BIAS maps [threshold, 1] → [-1, 1]
// C0..C11 are the polynomial coefficients in ascending power of u.
const S2L_INV_HW: f32 = 2.081_801;
const S2L_BIAS: f32 = -1.081_800_9;
const S2L_C0: f32 = 2.326_832_7e-1;
const S2L_C1: f32 = 4.667_970_8e-1;
const S2L_C2: f32 = 2.731_341e-1;
const S2L_C3: f32 = 3.044_251_2e-2;
const S2L_C4: f32 = -3.802_638_5e-3;
const S2L_C5: f32 = 1.011_499_3e-3;
const S2L_C6: f32 = -4.267_19e-4;
const S2L_C7: f32 = 1.966_666_5e-4;
const S2L_C8: f32 = 2.025_719_4e-5;
const S2L_C9: f32 = -2.400_594_3e-5;
const S2L_C10: f32 = -8.762_017e-5;
const S2L_C11: f32 = 5.557_536_5e-5;

// linear→sRGB degree-15 Chebyshev polynomial via sqrt transform (Estrin's scheme)
// Approximates scale * (√l)^(5/6) - offset on [sqrt(threshold), 1.0]
// u = √l * INV_HW + BIAS maps [sqrt(threshold), 1] → [-1, 1]
// C0..C15 are the polynomial coefficients in ascending power of u.
const L2S_INV_HW: f32 = 2.116_733_3;
const L2S_BIAS: f32 = -1.116_733_2;
const L2S_C0: f32 = 5.641_828e-1;
const L2S_C1: f32 = 4.620_569_3e-1;
const L2S_C2: f32 = -3.450_065e-2;
const L2S_C3: f32 = 1.202_464_2e-2;
const L2S_C4: f32 = -5.398_721e-3;
const L2S_C5: f32 = 2.946_610_3e-3;
const L2S_C6: f32 = -5.274_399_6e-3;
const L2S_C7: f32 = 4.055_202e-3;
const L2S_C8: f32 = 1.062_489_9e-2;
const L2S_C9: f32 = -9.012_202e-3;
const L2S_C10: f32 = -2.186_026_6e-2;
const L2S_C11: f32 = 1.824_478_4e-2;
const L2S_C12: f32 = 1.958_387_2e-2;
const L2S_C13: f32 = -1.638_288e-2;
const L2S_C14: f32 = -7.710_282_7e-3;
const L2S_C15: f32 = 6.419_743e-3;
498
/// sRGB → linear kernel over magetypes' token-gated `f32x8` (AVX2+FMA path).
///
/// Same algorithm as [`srgb_to_linear_x8_inline`]: clamp to [0, 1], evaluate
/// the linear segment and the degree-11 Chebyshev polynomial (Estrin scheme),
/// then blend per lane at the encoded-domain threshold.
#[cfg(target_arch = "x86_64")]
#[rite]
fn srgb_to_linear_mt(token: Desktop64, srgb: mt_f32x8) -> mt_f32x8 {
    // Clamp input to the valid sRGB domain [0, 1].
    let zero = mt_f32x8::zero(token);
    let one = mt_f32x8::splat(token, 1.0);
    let srgb = srgb.max(zero).min(one);

    // Linear segment: l = s / 12.92.
    let linear_result = srgb * mt_f32x8::splat(token, MT_LINEAR_SCALE);

    // Degree-11 Chebyshev polynomial (Estrin evaluation)
    // u maps [threshold, 1] → [-1, 1].
    let u = srgb.mul_add(
        mt_f32x8::splat(token, S2L_INV_HW),
        mt_f32x8::splat(token, S2L_BIAS),
    );
    let u2 = u * u;
    let u4 = u2 * u2;
    let u_8 = u4 * u4;
    // Pairwise FMA terms, then combine by powers of u², u⁴, u⁸.
    let p01 = mt_f32x8::splat(token, S2L_C1).mul_add(u, mt_f32x8::splat(token, S2L_C0));
    let p23 = mt_f32x8::splat(token, S2L_C3).mul_add(u, mt_f32x8::splat(token, S2L_C2));
    let p45 = mt_f32x8::splat(token, S2L_C5).mul_add(u, mt_f32x8::splat(token, S2L_C4));
    let p67 = mt_f32x8::splat(token, S2L_C7).mul_add(u, mt_f32x8::splat(token, S2L_C6));
    let p89 = mt_f32x8::splat(token, S2L_C9).mul_add(u, mt_f32x8::splat(token, S2L_C8));
    let pab = mt_f32x8::splat(token, S2L_C11).mul_add(u, mt_f32x8::splat(token, S2L_C10));
    let p0123 = p23.mul_add(u2, p01);
    let p4567 = p67.mul_add(u2, p45);
    let p8_11 = pab.mul_add(u2, p89);
    let p0_7 = p4567.mul_add(u4, p0123);
    let power_result = p8_11.mul_add(u_8, p0_7);

    // Per-lane select: linear segment below the threshold, polynomial above.
    let mask = srgb.simd_lt(mt_f32x8::splat(token, MT_SRGB_LINEAR_THRESHOLD));
    mt_f32x8::blend(mask, linear_result, power_result)
}
531
/// Linear → sRGB kernel over magetypes' token-gated `f32x8` (AVX2+FMA path).
///
/// Same algorithm as [`linear_to_srgb_x8_inline`]: clamp to [0, 1], evaluate
/// the linear segment and the √l-transformed degree-15 Chebyshev polynomial
/// (Estrin scheme), then blend per lane at the linear-domain threshold.
#[cfg(target_arch = "x86_64")]
#[rite]
fn linear_to_srgb_mt(token: Desktop64, linear: mt_f32x8) -> mt_f32x8 {
    // Clamp input to the valid linear domain [0, 1].
    let zero = mt_f32x8::zero(token);
    let one = mt_f32x8::splat(token, 1.0);
    let linear = linear.max(zero).min(one);

    // Linear segment: s = l * 12.92.
    let linear_result = linear * mt_f32x8::splat(token, MT_TWELVE_92);

    // sqrt transform + degree-15 Chebyshev polynomial (Estrin evaluation)
    // u maps [√threshold, 1] → [-1, 1].
    let s = linear.sqrt();
    let u = s.mul_add(
        mt_f32x8::splat(token, L2S_INV_HW),
        mt_f32x8::splat(token, L2S_BIAS),
    );
    let u2 = u * u;
    let u4 = u2 * u2;
    let u_8 = u4 * u4;
    // Pairwise FMA terms, then combine by powers of u², u⁴, u⁸.
    let p01 = mt_f32x8::splat(token, L2S_C1).mul_add(u, mt_f32x8::splat(token, L2S_C0));
    let p23 = mt_f32x8::splat(token, L2S_C3).mul_add(u, mt_f32x8::splat(token, L2S_C2));
    let p45 = mt_f32x8::splat(token, L2S_C5).mul_add(u, mt_f32x8::splat(token, L2S_C4));
    let p67 = mt_f32x8::splat(token, L2S_C7).mul_add(u, mt_f32x8::splat(token, L2S_C6));
    let p89 = mt_f32x8::splat(token, L2S_C9).mul_add(u, mt_f32x8::splat(token, L2S_C8));
    let pab = mt_f32x8::splat(token, L2S_C11).mul_add(u, mt_f32x8::splat(token, L2S_C10));
    let pcd = mt_f32x8::splat(token, L2S_C13).mul_add(u, mt_f32x8::splat(token, L2S_C12));
    let pef = mt_f32x8::splat(token, L2S_C15).mul_add(u, mt_f32x8::splat(token, L2S_C14));
    let p0123 = p23.mul_add(u2, p01);
    let p4567 = p67.mul_add(u2, p45);
    let p89ab = pab.mul_add(u2, p89);
    let pcdef = pef.mul_add(u2, pcd);
    let p0_7 = p4567.mul_add(u4, p0123);
    let p8_f = pcdef.mul_add(u4, p89ab);
    let power_result = p8_f.mul_add(u_8, p0_7);

    // Per-lane select: linear segment below the threshold, polynomial above.
    let mask = linear.simd_lt(mt_f32x8::splat(token, MT_LINEAR_THRESHOLD));
    mt_f32x8::blend(mask, linear_result, power_result)
}
569
570#[cfg(target_arch = "x86_64")]
571#[rite]
572fn gamma_to_linear_mt(token: Desktop64, encoded: mt_f32x8, gamma: f32) -> mt_f32x8 {
573    let zero = mt_f32x8::zero(token);
574    let one = mt_f32x8::splat(token, 1.0);
575    let encoded = encoded.max(zero).min(one);
576    encoded.pow_midp(gamma)
577}
578
579#[cfg(target_arch = "x86_64")]
580#[rite]
581fn linear_to_gamma_mt(token: Desktop64, linear: mt_f32x8, gamma: f32) -> mt_f32x8 {
582    let zero = mt_f32x8::zero(token);
583    let one = mt_f32x8::splat(token, 1.0);
584    let linear = linear.max(zero).min(one);
585    linear.pow_midp(1.0 / gamma)
586}
587
588// ============================================================================
589// x8 Dispatch Functions - Runtime CPU feature detection
590// ============================================================================
591
592#[cfg(target_arch = "x86_64")]
593#[arcane]
594fn srgb_to_linear_x8_tier_v3(token: Desktop64, srgb: f32x8) -> f32x8 {
595    let arr: [f32; 8] = srgb.into();
596    let v = mt_f32x8::from_array(token, arr);
597    let result = srgb_to_linear_mt(token, v);
598    f32x8::from(result.to_array())
599}
600
/// Scalar-token tier: falls back to the `wide`-based polynomial path.
fn srgb_to_linear_x8_tier_scalar(_token: ScalarToken, srgb: f32x8) -> f32x8 {
    srgb_to_linear_x8_inline(srgb)
}

/// Convert 8 sRGB f32 values to linear (with CPU dispatch).
///
/// This variant uses runtime CPU feature detection to select the optimal
/// implementation. Use [`srgb_to_linear_x8_inline`] inside your own
/// `#[magetypes]` functions to avoid double dispatch.
///
/// Input values are clamped to \[0, 1\].
#[inline]
pub fn srgb_to_linear_x8_dispatch(srgb: f32x8) -> f32x8 {
    // incant! selects `_tier_v3` (AVX2+FMA) or `_tier_scalar` at runtime.
    incant!(srgb_to_linear_x8_tier(srgb), [v3])
}
616
617#[cfg(target_arch = "x86_64")]
618#[arcane]
619fn linear_to_srgb_x8_tier_v3(token: Desktop64, linear: f32x8) -> f32x8 {
620    let arr: [f32; 8] = linear.into();
621    let v = mt_f32x8::from_array(token, arr);
622    let result = linear_to_srgb_mt(token, v);
623    f32x8::from(result.to_array())
624}
625
/// Scalar-token tier: falls back to the `wide`-based polynomial path.
fn linear_to_srgb_x8_tier_scalar(_token: ScalarToken, linear: f32x8) -> f32x8 {
    linear_to_srgb_x8_inline(linear)
}

/// Convert 8 linear f32 values to sRGB (with CPU dispatch).
///
/// This variant uses runtime CPU feature detection to select the optimal
/// implementation. Use [`linear_to_srgb_x8_inline`] inside your own
/// `#[magetypes]` functions to avoid double dispatch.
///
/// Input values are clamped to \[0, 1\].
#[inline]
pub fn linear_to_srgb_x8_dispatch(linear: f32x8) -> f32x8 {
    // incant! selects `_tier_v3` (AVX2+FMA) or `_tier_scalar` at runtime.
    incant!(linear_to_srgb_x8_tier(linear), [v3])
}
641
/// Convert 8 linear f32 values to sRGB u8 (LUT-based, no dispatch needed).
///
/// Kept for API symmetry with the other `_dispatch` variants; the LUT path
/// requires no CPU-feature detection.
#[inline]
pub fn linear_to_srgb_u8_x8_dispatch(linear: f32x8) -> [u8; 8] {
    linear_to_srgb_u8_x8_inline(linear)
}
647
648#[cfg(target_arch = "x86_64")]
649#[arcane]
650fn gamma_to_linear_x8_tier_v3(token: Desktop64, encoded: f32x8, gamma: f32) -> f32x8 {
651    let arr: [f32; 8] = encoded.into();
652    let v = mt_f32x8::from_array(token, arr);
653    let result = gamma_to_linear_mt(token, v, gamma);
654    f32x8::from(result.to_array())
655}
656
/// Scalar-token tier: falls back to the `wide`-based pow path.
fn gamma_to_linear_x8_tier_scalar(_token: ScalarToken, encoded: f32x8, gamma: f32) -> f32x8 {
    gamma_to_linear_x8_inline(encoded, gamma)
}

/// Convert 8 gamma-encoded f32 values to linear (with CPU dispatch).
#[inline]
pub fn gamma_to_linear_x8_dispatch(encoded: f32x8, gamma: f32) -> f32x8 {
    // incant! selects `_tier_v3` (AVX2+FMA) or `_tier_scalar` at runtime.
    incant!(gamma_to_linear_x8_tier(encoded, gamma), [v3])
}
666
667#[cfg(target_arch = "x86_64")]
668#[arcane]
669fn linear_to_gamma_x8_tier_v3(token: Desktop64, linear: f32x8, gamma: f32) -> f32x8 {
670    let arr: [f32; 8] = linear.into();
671    let v = mt_f32x8::from_array(token, arr);
672    let result = linear_to_gamma_mt(token, v, gamma);
673    f32x8::from(result.to_array())
674}
675
/// Scalar-token tier: falls back to the `wide`-based pow path.
fn linear_to_gamma_x8_tier_scalar(_token: ScalarToken, linear: f32x8, gamma: f32) -> f32x8 {
    linear_to_gamma_x8_inline(linear, gamma)
}

/// Convert 8 linear f32 values to gamma-encoded (with CPU dispatch).
#[inline]
pub fn linear_to_gamma_x8_dispatch(linear: f32x8, gamma: f32) -> f32x8 {
    // incant! selects `_tier_v3` (AVX2+FMA) or `_tier_scalar` at runtime.
    incant!(linear_to_gamma_x8_tier(linear, gamma), [v3])
}
685
686// ============================================================================
687// x8 Default Functions - Calls inline variant, compiler decides inlining
688// ============================================================================
689
/// Convert 8 sRGB f32 values to linear.
///
/// This is the default variant that calls the inline implementation.
/// Use `_dispatch` for guaranteed CPU feature detection, or `_inline`
/// inside your own `#[magetypes]` functions.
///
/// Input values are clamped to \[0, 1\].
///
/// # Example
/// ```
/// use linear_srgb::simd::srgb_to_linear_x8;
/// use wide::f32x8;
///
/// let srgb = f32x8::from([0.0, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.5]);
/// let linear = srgb_to_linear_x8(srgb);
/// ```
#[inline]
pub fn srgb_to_linear_x8(srgb: f32x8) -> f32x8 {
    // Thin alias; the compiler inlines the `wide` polynomial path here.
    srgb_to_linear_x8_inline(srgb)
}

/// Convert 8 linear f32 values to sRGB.
///
/// This is the default variant that calls the inline implementation.
/// Use `_dispatch` for guaranteed CPU feature detection, or `_inline`
/// inside your own `#[magetypes]` functions.
///
/// Input values are clamped to \[0, 1\].
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_srgb_x8;
/// use wide::f32x8;
///
/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
/// let srgb = linear_to_srgb_x8(linear);
/// ```
#[inline]
pub fn linear_to_srgb_x8(linear: f32x8) -> f32x8 {
    // Thin alias; the compiler inlines the `wide` polynomial path here.
    linear_to_srgb_x8_inline(linear)
}
731
732/// Convert 8 sRGB u8 values to linear f32 using LUT lookup.
733///
734/// This is the fastest method for u8 input as it uses a precomputed lookup table.
735///
736/// # Example
737/// ```
738/// use linear_srgb::simd::srgb_u8_to_linear_x8;
739///
740/// let srgb = [0u8, 64, 128, 192, 255, 32, 96, 160];
741/// let linear = srgb_u8_to_linear_x8(srgb);
742/// ```
743#[inline]
744pub fn srgb_u8_to_linear_x8(srgb: [u8; 8]) -> f32x8 {
745    let lut = get_lut();
746    f32x8::from([
747        lut[srgb[0] as usize],
748        lut[srgb[1] as usize],
749        lut[srgb[2] as usize],
750        lut[srgb[3] as usize],
751        lut[srgb[4] as usize],
752        lut[srgb[5] as usize],
753        lut[srgb[6] as usize],
754        lut[srgb[7] as usize],
755    ])
756}
757
/// Convert 8 linear f32 values to sRGB u8.
///
/// Input values are clamped to \[0, 1\], output is rounded to nearest u8.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_srgb_u8_x8;
/// use wide::f32x8;
///
/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
/// let srgb = linear_to_srgb_u8_x8(linear);
/// ```
#[inline]
pub fn linear_to_srgb_u8_x8(linear: f32x8) -> [u8; 8] {
    // Thin alias for the LUT-based inline variant.
    linear_to_srgb_u8_x8_inline(linear)
}

/// Convert 8 gamma-encoded f32 values to linear.
///
/// # Example
/// ```
/// use linear_srgb::simd::gamma_to_linear_x8;
/// use wide::f32x8;
///
/// let encoded = f32x8::from([0.0, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.5]);
/// let linear = gamma_to_linear_x8(encoded, 2.2);
/// ```
#[inline]
pub fn gamma_to_linear_x8(encoded: f32x8, gamma: f32) -> f32x8 {
    // Thin alias for the inline pow-based variant.
    gamma_to_linear_x8_inline(encoded, gamma)
}

/// Convert 8 linear f32 values to gamma-encoded.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_gamma_x8;
/// use wide::f32x8;
///
/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
/// let encoded = linear_to_gamma_x8(linear, 2.2);
/// ```
#[inline]
pub fn linear_to_gamma_x8(linear: f32x8, gamma: f32) -> f32x8 {
    // Thin alias for the inline pow-based variant.
    linear_to_gamma_x8_inline(linear, gamma)
}
804
805// ============================================================================
806// Slice Functions - Process entire slices
807// ============================================================================
808
809#[cfg(target_arch = "x86_64")]
810#[arcane]
811fn srgb_to_linear_slice_tier_v3(token: Desktop64, values: &mut [f32]) {
812    let (chunks, remainder) = values.as_chunks_mut::<8>();
813
814    for chunk in chunks {
815        let v = mt_f32x8::from_array(token, *chunk);
816        let result = srgb_to_linear_mt(token, v);
817        *chunk = result.to_array();
818    }
819
820    for v in remainder {
821        *v = crate::scalar::srgb_to_linear(*v);
822    }
823}
824
825fn srgb_to_linear_slice_tier_scalar(_token: ScalarToken, values: &mut [f32]) {
826    for v in values.iter_mut() {
827        *v = crate::scalar::srgb_to_linear(*v);
828    }
829}
830
/// Convert sRGB f32 values to linear in-place.
///
/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
///
/// # Example
/// ```
/// use linear_srgb::simd::srgb_to_linear_slice;
///
/// let mut values = vec![0.0f32, 0.25, 0.5, 0.75, 1.0];
/// srgb_to_linear_slice(&mut values);
/// ```
#[inline]
pub fn srgb_to_linear_slice(values: &mut [f32]) {
    // incant! picks the AVX2+FMA tier when available, scalar otherwise.
    incant!(srgb_to_linear_slice_tier(values), [v3])
}
846
847#[cfg(target_arch = "x86_64")]
848#[arcane]
849fn linear_to_srgb_slice_tier_v3(token: Desktop64, values: &mut [f32]) {
850    let (chunks, remainder) = values.as_chunks_mut::<8>();
851
852    for chunk in chunks {
853        let v = mt_f32x8::from_array(token, *chunk);
854        let result = linear_to_srgb_mt(token, v);
855        *chunk = result.to_array();
856    }
857
858    for v in remainder {
859        *v = crate::scalar::linear_to_srgb(*v);
860    }
861}
862
863fn linear_to_srgb_slice_tier_scalar(_token: ScalarToken, values: &mut [f32]) {
864    for v in values.iter_mut() {
865        *v = crate::scalar::linear_to_srgb(*v);
866    }
867}
868
/// Convert linear f32 values to sRGB in-place.
///
/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_srgb_slice;
///
/// let mut values = vec![0.0f32, 0.1, 0.2, 0.5, 1.0];
/// linear_to_srgb_slice(&mut values);
/// ```
#[inline]
pub fn linear_to_srgb_slice(values: &mut [f32]) {
    // incant! picks the AVX2+FMA tier when available, scalar otherwise.
    incant!(linear_to_srgb_slice_tier(values), [v3])
}
884
885/// Convert sRGB u8 values to linear f32.
886///
887/// Uses a precomputed LUT for each u8 value, processed in SIMD batches of 8.
888///
889/// # Panics
890/// Panics if `input.len() != output.len()`.
891///
892/// # Example
893/// ```
894/// use linear_srgb::simd::srgb_u8_to_linear_slice;
895///
896/// let input: Vec<u8> = (0..=255).collect();
897/// let mut output = vec![0.0f32; 256];
898/// srgb_u8_to_linear_slice(&input, &mut output);
899/// ```
900#[inline]
901pub fn srgb_u8_to_linear_slice(input: &[u8], output: &mut [f32]) {
902    assert_eq!(input.len(), output.len());
903    let lut = get_lut();
904
905    let (in_chunks, in_remainder) = input.as_chunks::<8>();
906    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
907
908    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
909        *out = [
910            lut[inp[0] as usize],
911            lut[inp[1] as usize],
912            lut[inp[2] as usize],
913            lut[inp[3] as usize],
914            lut[inp[4] as usize],
915            lut[inp[5] as usize],
916            lut[inp[6] as usize],
917            lut[inp[7] as usize],
918        ];
919    }
920
921    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
922        *out = lut[*inp as usize];
923    }
924}
925
926/// Convert linear f32 values to sRGB u8.
927///
928/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
929///
930/// # Panics
931/// Panics if `input.len() != output.len()`.
932///
933/// # Example
934/// ```
935/// use linear_srgb::simd::linear_to_srgb_u8_slice;
936///
937/// let input: Vec<f32> = (0..=255).map(|i| i as f32 / 255.0).collect();
938/// let mut output = vec![0u8; 256];
939/// linear_to_srgb_u8_slice(&input, &mut output);
940/// ```
941pub fn linear_to_srgb_u8_slice(input: &[f32], output: &mut [u8]) {
942    assert_eq!(input.len(), output.len());
943
944    let lut = &crate::const_luts::LINEAR_TO_SRGB_U8;
945
946    // Process 8 at a time using SIMD for index computation
947    let (in_chunks, in_remainder) = input.as_chunks::<8>();
948    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
949
950    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
951        let linear = f32x8::from(*inp);
952        let clamped = linear.max(ZERO).min(ONE);
953        let scaled = clamped * f32x8::splat(4095.0) + HALF;
954        let arr: [f32; 8] = scaled.into();
955        *out = [
956            lut[arr[0] as usize & 0xFFF],
957            lut[arr[1] as usize & 0xFFF],
958            lut[arr[2] as usize & 0xFFF],
959            lut[arr[3] as usize & 0xFFF],
960            lut[arr[4] as usize & 0xFFF],
961            lut[arr[5] as usize & 0xFFF],
962            lut[arr[6] as usize & 0xFFF],
963            lut[arr[7] as usize & 0xFFF],
964        ];
965    }
966
967    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
968        *out = crate::scalar::linear_to_srgb_u8(*inp);
969    }
970}
971
972// ============================================================================
973// u16 Batch Functions (LUT-based)
974// ============================================================================
975
976/// Convert sRGB u16 values to linear f32 using a 65536-entry const LUT.
977///
978/// Pure table lookup, no math. The LUT is 256KB.
979///
980/// # Panics
981/// Panics if `input.len() != output.len()`.
982pub fn srgb_u16_to_linear_slice(input: &[u16], output: &mut [f32]) {
983    assert_eq!(input.len(), output.len());
984    let lut = &crate::const_luts_u16::SRGB_U16_TO_LINEAR_F32;
985
986    let (in_chunks, in_remainder) = input.as_chunks::<8>();
987    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
988
989    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
990        *out = [
991            lut[inp[0] as usize],
992            lut[inp[1] as usize],
993            lut[inp[2] as usize],
994            lut[inp[3] as usize],
995            lut[inp[4] as usize],
996            lut[inp[5] as usize],
997            lut[inp[6] as usize],
998            lut[inp[7] as usize],
999        ];
1000    }
1001
1002    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
1003        *out = lut[*inp as usize];
1004    }
1005}
1006
1007/// Convert linear f32 values to sRGB u16 using a 65537-entry const LUT.
1008///
1009/// # Panics
1010/// Panics if `input.len() != output.len()`.
1011pub fn linear_to_srgb_u16_slice(input: &[f32], output: &mut [u16]) {
1012    assert_eq!(input.len(), output.len());
1013    let lut = &crate::const_luts_u16::LINEAR_TO_SRGB_U16_65536;
1014
1015    let (in_chunks, in_remainder) = input.as_chunks::<8>();
1016    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
1017
1018    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
1019        let linear = f32x8::from(*inp);
1020        let clamped = linear.max(ZERO).min(ONE);
1021        let scaled = clamped * f32x8::splat(65536.0) + HALF;
1022        let arr: [f32; 8] = scaled.into();
1023        *out = [
1024            lut[arr[0] as usize],
1025            lut[arr[1] as usize],
1026            lut[arr[2] as usize],
1027            lut[arr[3] as usize],
1028            lut[arr[4] as usize],
1029            lut[arr[5] as usize],
1030            lut[arr[6] as usize],
1031            lut[arr[7] as usize],
1032        ];
1033    }
1034
1035    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
1036        *out = crate::scalar::linear_to_srgb_u16(*inp);
1037    }
1038}
1039
1040// ============================================================================
1041// Custom Gamma Slice Functions
1042// ============================================================================
1043
/// v3 SIMD tier for [`gamma_to_linear_slice`]: converts the slice in
/// batches of 8 lanes, with a scalar loop for the trailing remainder.
///
/// `token` is the archmage capability token for this tier (presumably
/// x86-64-v3 / AVX2 — confirm against the `archmage` docs).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn gamma_to_linear_slice_tier_v3(token: Desktop64, values: &mut [f32], gamma: f32) {
    // Split into batches of 8 plus a < 8-element tail.
    let (chunks, remainder) = values.as_chunks_mut::<8>();

    for chunk in chunks {
        // Bridge into the magetypes vector type, convert, write back.
        let v = mt_f32x8::from_array(token, *chunk);
        let result = gamma_to_linear_mt(token, v, gamma);
        *chunk = result.to_array();
    }

    // Tail: scalar transfer function, one value at a time.
    for v in remainder {
        *v = crate::scalar::gamma_to_linear(*v, gamma);
    }
}
1059
1060fn gamma_to_linear_slice_tier_scalar(_token: ScalarToken, values: &mut [f32], gamma: f32) {
1061    for v in values.iter_mut() {
1062        *v = crate::scalar::gamma_to_linear(*v, gamma);
1063    }
1064}
1065
/// Convert gamma-encoded f32 values to linear in-place using a custom gamma.
///
/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
///
/// # Example
/// ```
/// use linear_srgb::simd::gamma_to_linear_slice;
///
/// let mut values = vec![0.0f32, 0.25, 0.5, 0.75, 1.0];
/// gamma_to_linear_slice(&mut values, 2.2);
/// ```
#[inline]
pub fn gamma_to_linear_slice(values: &mut [f32], gamma: f32) {
    // Runtime dispatch: picks the v3 SIMD tier when the CPU supports it,
    // otherwise the `_scalar` tier.
    incant!(gamma_to_linear_slice_tier(values, gamma), [v3])
}
1081
/// v3 SIMD tier for [`linear_to_gamma_slice`]: converts the slice in
/// batches of 8 lanes, with a scalar loop for the trailing remainder.
///
/// `token` is the archmage capability token for this tier (presumably
/// x86-64-v3 / AVX2 — confirm against the `archmage` docs).
#[cfg(target_arch = "x86_64")]
#[arcane]
fn linear_to_gamma_slice_tier_v3(token: Desktop64, values: &mut [f32], gamma: f32) {
    // Split into batches of 8 plus a < 8-element tail.
    let (chunks, remainder) = values.as_chunks_mut::<8>();

    for chunk in chunks {
        // Bridge into the magetypes vector type, convert, write back.
        let v = mt_f32x8::from_array(token, *chunk);
        let result = linear_to_gamma_mt(token, v, gamma);
        *chunk = result.to_array();
    }

    // Tail: scalar transfer function, one value at a time.
    for v in remainder {
        *v = crate::scalar::linear_to_gamma(*v, gamma);
    }
}
1097
1098fn linear_to_gamma_slice_tier_scalar(_token: ScalarToken, values: &mut [f32], gamma: f32) {
1099    for v in values.iter_mut() {
1100        *v = crate::scalar::linear_to_gamma(*v, gamma);
1101    }
1102}
1103
/// Convert linear f32 values to gamma-encoded in-place using a custom gamma.
///
/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_gamma_slice;
///
/// let mut values = vec![0.0f32, 0.1, 0.2, 0.5, 1.0];
/// linear_to_gamma_slice(&mut values, 2.2);
/// ```
#[inline]
pub fn linear_to_gamma_slice(values: &mut [f32], gamma: f32) {
    // Runtime dispatch: picks the v3 SIMD tier when the CPU supports it,
    // otherwise the `_scalar` tier.
    incant!(linear_to_gamma_slice_tier(values, gamma), [v3])
}
1119
1120// ============================================================================
1121// f32x8 Slice Functions (for pre-aligned SIMD data)
1122// ============================================================================
1123
/// v3 SIMD tier for [`srgb_to_linear_x8_slice`]: converts each `wide::f32x8`
/// element through the magetypes implementation.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn srgb_to_linear_x8_slice_tier_v3(token: Desktop64, values: &mut [f32x8]) {
    for v in values.iter_mut() {
        // Round-trip through a [f32; 8] to bridge wide::f32x8 <-> magetypes f32x8.
        let arr: [f32; 8] = (*v).into();
        let mt_v = mt_f32x8::from_array(token, arr);
        let result = srgb_to_linear_mt(token, mt_v);
        *v = f32x8::from(result.to_array());
    }
}
1134
1135fn srgb_to_linear_x8_slice_tier_scalar(_token: ScalarToken, values: &mut [f32x8]) {
1136    for v in values.iter_mut() {
1137        *v = srgb_to_linear_x8_inline(*v);
1138    }
1139}
1140
/// Convert sRGB f32x8 values to linear in-place.
///
/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
/// use [`srgb_to_linear_slice`] instead which handles remainders automatically.
///
/// # Example
/// ```
/// use linear_srgb::simd::srgb_to_linear_x8_slice;
/// use wide::f32x8;
///
/// let mut values = vec![f32x8::splat(0.5); 100];
/// srgb_to_linear_x8_slice(&mut values);
/// ```
#[inline]
pub fn srgb_to_linear_x8_slice(values: &mut [f32x8]) {
    // Runtime dispatch: picks the v3 SIMD tier when the CPU supports it,
    // otherwise the `_scalar` tier.
    incant!(srgb_to_linear_x8_slice_tier(values), [v3])
}
1158
/// v3 SIMD tier for [`linear_to_srgb_x8_slice`]: converts each `wide::f32x8`
/// element through the magetypes implementation.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn linear_to_srgb_x8_slice_tier_v3(token: Desktop64, values: &mut [f32x8]) {
    for v in values.iter_mut() {
        // Round-trip through a [f32; 8] to bridge wide::f32x8 <-> magetypes f32x8.
        let arr: [f32; 8] = (*v).into();
        let mt_v = mt_f32x8::from_array(token, arr);
        let result = linear_to_srgb_mt(token, mt_v);
        *v = f32x8::from(result.to_array());
    }
}
1169
1170fn linear_to_srgb_x8_slice_tier_scalar(_token: ScalarToken, values: &mut [f32x8]) {
1171    for v in values.iter_mut() {
1172        *v = linear_to_srgb_x8_inline(*v);
1173    }
1174}
1175
/// Convert linear f32x8 values to sRGB in-place.
///
/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
/// use [`linear_to_srgb_slice`] instead which handles remainders automatically.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_srgb_x8_slice;
/// use wide::f32x8;
///
/// let mut values = vec![f32x8::splat(0.5); 100];
/// linear_to_srgb_x8_slice(&mut values);
/// ```
#[inline]
pub fn linear_to_srgb_x8_slice(values: &mut [f32x8]) {
    // Runtime dispatch: picks the v3 SIMD tier when the CPU supports it,
    // otherwise the `_scalar` tier.
    incant!(linear_to_srgb_x8_slice_tier(values), [v3])
}
1193
/// v3 SIMD tier for [`gamma_to_linear_x8_slice`]: converts each `wide::f32x8`
/// element through the magetypes implementation.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn gamma_to_linear_x8_slice_tier_v3(token: Desktop64, values: &mut [f32x8], gamma: f32) {
    for v in values.iter_mut() {
        // Round-trip through a [f32; 8] to bridge wide::f32x8 <-> magetypes f32x8.
        let arr: [f32; 8] = (*v).into();
        let mt_v = mt_f32x8::from_array(token, arr);
        let result = gamma_to_linear_mt(token, mt_v, gamma);
        *v = f32x8::from(result.to_array());
    }
}
1204
1205fn gamma_to_linear_x8_slice_tier_scalar(_token: ScalarToken, values: &mut [f32x8], gamma: f32) {
1206    for v in values.iter_mut() {
1207        *v = gamma_to_linear_x8_inline(*v, gamma);
1208    }
1209}
1210
/// Convert gamma-encoded f32x8 values to linear in-place using a custom gamma.
///
/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
/// use [`gamma_to_linear_slice`] instead which handles remainders automatically.
///
/// # Example
/// ```
/// use linear_srgb::simd::gamma_to_linear_x8_slice;
/// use wide::f32x8;
///
/// let mut values = vec![f32x8::splat(0.5); 100];
/// gamma_to_linear_x8_slice(&mut values, 2.2);
/// ```
#[inline]
pub fn gamma_to_linear_x8_slice(values: &mut [f32x8], gamma: f32) {
    // Runtime dispatch: picks the v3 SIMD tier when the CPU supports it,
    // otherwise the `_scalar` tier.
    incant!(gamma_to_linear_x8_slice_tier(values, gamma), [v3])
}
1228
/// v3 SIMD tier for [`linear_to_gamma_x8_slice`]: converts each `wide::f32x8`
/// element through the magetypes implementation.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn linear_to_gamma_x8_slice_tier_v3(token: Desktop64, values: &mut [f32x8], gamma: f32) {
    for v in values.iter_mut() {
        // Round-trip through a [f32; 8] to bridge wide::f32x8 <-> magetypes f32x8.
        let arr: [f32; 8] = (*v).into();
        let mt_v = mt_f32x8::from_array(token, arr);
        let result = linear_to_gamma_mt(token, mt_v, gamma);
        *v = f32x8::from(result.to_array());
    }
}
1239
1240fn linear_to_gamma_x8_slice_tier_scalar(_token: ScalarToken, values: &mut [f32x8], gamma: f32) {
1241    for v in values.iter_mut() {
1242        *v = linear_to_gamma_x8_inline(*v, gamma);
1243    }
1244}
1245
/// Convert linear f32x8 values to gamma-encoded in-place using a custom gamma.
///
/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
/// use [`linear_to_gamma_slice`] instead which handles remainders automatically.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_gamma_x8_slice;
/// use wide::f32x8;
///
/// let mut values = vec![f32x8::splat(0.2); 100];
/// linear_to_gamma_x8_slice(&mut values, 2.2);
/// ```
#[inline]
pub fn linear_to_gamma_x8_slice(values: &mut [f32x8], gamma: f32) {
    // Runtime dispatch: picks the v3 SIMD tier when the CPU supports it,
    // otherwise the `_scalar` tier.
    incant!(linear_to_gamma_x8_slice_tier(values, gamma), [v3])
}
1263
1264// ============================================================================
1265// f32x8 Slice Inline Functions (for use inside caller's magetypes code)
1266// ============================================================================
1267
1268/// Convert sRGB f32x8 values to linear in-place (always inlined).
1269///
1270/// Use this variant inside your own `#[magetypes]` functions to avoid
1271/// double dispatch overhead. For standalone calls, use [`srgb_to_linear_x8_slice`].
1272#[inline(always)]
1273pub fn srgb_to_linear_x8_slice_inline(values: &mut [f32x8]) {
1274    for v in values.iter_mut() {
1275        *v = srgb_to_linear_x8_inline(*v);
1276    }
1277}
1278
1279/// Convert linear f32x8 values to sRGB in-place (always inlined).
1280///
1281/// Use this variant inside your own `#[magetypes]` functions to avoid
1282/// double dispatch overhead. For standalone calls, use [`linear_to_srgb_x8_slice`].
1283#[inline(always)]
1284pub fn linear_to_srgb_x8_slice_inline(values: &mut [f32x8]) {
1285    for v in values.iter_mut() {
1286        *v = linear_to_srgb_x8_inline(*v);
1287    }
1288}
1289
1290/// Convert gamma-encoded f32x8 values to linear in-place (always inlined).
1291///
1292/// Use this variant inside your own `#[magetypes]` functions to avoid
1293/// double dispatch overhead. For standalone calls, use [`gamma_to_linear_x8_slice`].
1294#[inline(always)]
1295pub fn gamma_to_linear_x8_slice_inline(values: &mut [f32x8], gamma: f32) {
1296    for v in values.iter_mut() {
1297        *v = gamma_to_linear_x8_inline(*v, gamma);
1298    }
1299}
1300
1301/// Convert linear f32x8 values to gamma-encoded in-place (always inlined).
1302///
1303/// Use this variant inside your own `#[magetypes]` functions to avoid
1304/// double dispatch overhead. For standalone calls, use [`linear_to_gamma_x8_slice`].
1305#[inline(always)]
1306pub fn linear_to_gamma_x8_slice_inline(values: &mut [f32x8], gamma: f32) {
1307    for v in values.iter_mut() {
1308        *v = linear_to_gamma_x8_inline(*v, gamma);
1309    }
1310}
1311
1312// ============================================================================
1313// Tests
1314// ============================================================================
1315
// Unit tests: SIMD results are checked against the scalar reference
// implementations, plus roundtrip, edge-case, and tier-permutation coverage.
#[cfg(test)]
mod tests {
    use super::*;

    // In no_std builds, pull Vec/vec! from `alloc` instead of std's prelude.
    #[cfg(not(feature = "std"))]
    use alloc::{vec, vec::Vec};

    // ---- x8 function tests ----

    #[test]
    #[allow(deprecated)]
    fn test_srgb_to_linear_x8() {
        let input = [0.0f32, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.04];
        let result = srgb_to_linear_x8(f32x8::from(input));
        let result_arr: [f32; 8] = result.into();

        // SIMD output must agree with the scalar reference per lane.
        for (i, &inp) in input.iter().enumerate() {
            let expected = crate::scalar::srgb_to_linear(inp);
            assert!(
                (result_arr[i] - expected).abs() < 1e-5,
                "srgb_to_linear_x8 mismatch at {}: got {}, expected {}",
                i,
                result_arr[i],
                expected
            );
        }
    }

    #[test]
    fn test_linear_to_srgb_x8() {
        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.001, 0.8];
        let result = linear_to_srgb_x8(f32x8::from(input));
        let result_arr: [f32; 8] = result.into();

        for (i, &inp) in input.iter().enumerate() {
            let expected = crate::scalar::linear_to_srgb(inp);
            assert!(
                (result_arr[i] - expected).abs() < 1e-5,
                "linear_to_srgb_x8 mismatch at {}: got {}, expected {}",
                i,
                result_arr[i],
                expected
            );
        }
    }

    #[test]
    #[allow(deprecated)]
    fn test_srgb_u8_to_linear_x8() {
        let input: [u8; 8] = [0, 64, 128, 192, 255, 32, 96, 160];
        let result = srgb_u8_to_linear_x8(input);
        let result_arr: [f32; 8] = result.into();

        // Tighter tolerance here: the u8 path is a pure LUT lookup.
        for (i, &inp) in input.iter().enumerate() {
            let expected = crate::scalar::srgb_u8_to_linear(inp);
            assert!(
                (result_arr[i] - expected).abs() < 1e-6,
                "srgb_u8_to_linear_x8 mismatch at {}: got {}, expected {}",
                i,
                result_arr[i],
                expected
            );
        }
    }

    #[test]
    fn test_linear_to_srgb_u8_x8() {
        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8];
        let result = linear_to_srgb_u8_x8(f32x8::from(input));

        // LUT quantization may differ from exact rounding by at most 1 step.
        for (i, &inp) in input.iter().enumerate() {
            let expected = (crate::scalar::linear_to_srgb(inp) * 255.0 + 0.5) as u8;
            assert!(
                (result[i] as i16 - expected as i16).abs() <= 1,
                "linear_to_srgb_u8_x8 mismatch at {}: got {}, expected {}",
                i,
                result[i],
                expected
            );
        }
    }

    // ---- Slice function tests ----

    #[test]
    fn test_srgb_to_linear_slice() {
        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
        let expected: Vec<f32> = values
            .iter()
            .map(|&v| crate::scalar::srgb_to_linear(v))
            .collect();

        srgb_to_linear_slice(&mut values);

        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - exp).abs() < 1e-5,
                "srgb_to_linear_slice mismatch at {}: got {}, expected {}",
                i,
                got,
                exp
            );
        }
    }

    #[test]
    fn test_linear_to_srgb_slice() {
        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
        let expected: Vec<f32> = values
            .iter()
            .map(|&v| crate::scalar::linear_to_srgb(v))
            .collect();

        linear_to_srgb_slice(&mut values);

        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - exp).abs() < 1e-5,
                "linear_to_srgb_slice mismatch at {}: got {}, expected {}",
                i,
                got,
                exp
            );
        }
    }

    #[test]
    #[allow(deprecated)]
    fn test_srgb_u8_to_linear_slice() {
        // Exhaustive: all 256 possible input bytes.
        let input: Vec<u8> = (0..=255).collect();
        let mut output = vec![0.0f32; 256];

        srgb_u8_to_linear_slice(&input, &mut output);

        for (i, &out) in output.iter().enumerate() {
            let expected = crate::scalar::srgb_u8_to_linear(i as u8);
            assert!(
                (out - expected).abs() < 1e-6,
                "srgb_u8_to_linear_slice mismatch at {}: got {}, expected {}",
                i,
                out,
                expected
            );
        }
    }

    #[test]
    fn test_linear_to_srgb_u8_slice() {
        let input: Vec<f32> = (0..=255).map(|i| i as f32 / 255.0).collect();
        let mut output = vec![0u8; 256];

        linear_to_srgb_u8_slice(&input, &mut output);

        for i in 0..256 {
            let expected = (crate::scalar::linear_to_srgb(input[i]) * 255.0 + 0.5) as u8;
            assert!(
                (output[i] as i16 - expected as i16).abs() <= 1,
                "linear_to_srgb_u8_slice mismatch at {}: got {}, expected {}",
                i,
                output[i],
                expected
            );
        }
    }

    // ---- Roundtrip tests ----

    #[test]
    fn test_f32_roundtrip() {
        let mut values: Vec<f32> = (0..1000).map(|i| i as f32 / 999.0).collect();
        let original = values.clone();

        // sRGB -> linear -> sRGB should reproduce the input within tolerance.
        srgb_to_linear_slice(&mut values);
        linear_to_srgb_slice(&mut values);

        for (i, (&orig, &conv)) in original.iter().zip(values.iter()).enumerate() {
            assert!(
                (orig - conv).abs() < 1e-4,
                "f32 roundtrip failed at {}: {} -> {}",
                i,
                orig,
                conv
            );
        }
    }

    #[test]
    fn test_u8_roundtrip() {
        let input: Vec<u8> = (0..=255).collect();
        let mut linear = vec![0.0f32; 256];
        let mut back = vec![0u8; 256];

        srgb_u8_to_linear_slice(&input, &mut linear);
        linear_to_srgb_u8_slice(&linear, &mut back);

        for i in 0..256 {
            assert!(
                (input[i] as i16 - back[i] as i16).abs() <= 1,
                "u8 roundtrip failed at {}: {} -> {} -> {}",
                i,
                input[i],
                linear[i],
                back[i]
            );
        }
    }

    // ---- Edge case tests ----

    #[test]
    #[allow(deprecated)]
    fn test_clamping() {
        // Test that out-of-range values are clamped
        let input = f32x8::from([-0.5, -0.1, 0.0, 0.5, 1.0, 1.5, 2.0, 10.0]);
        let result = srgb_to_linear_x8(input);
        let arr: [f32; 8] = result.into();

        assert_eq!(arr[0], 0.0, "negative should clamp to 0");
        assert_eq!(arr[1], 0.0, "negative should clamp to 0");
        assert!(arr[4] > 0.99 && arr[4] <= 1.0, "1.0 should stay ~1.0");
        assert!(arr[5] > 0.99 && arr[5] <= 1.0, "values > 1 should clamp");
    }

    #[test]
    #[allow(deprecated)]
    fn test_linear_segment() {
        // Test values in the linear segment (< 0.04045)
        let input = f32x8::from([0.0, 0.01, 0.02, 0.03, 0.04, 0.005, 0.015, 0.035]);
        let result = srgb_to_linear_x8(input);
        let arr: [f32; 8] = result.into();
        let input_arr: [f32; 8] = input.into();

        // In the linear segment the transfer function is just division by 12.92.
        for i in 0..8 {
            let expected = input_arr[i] / 12.92;
            assert!(
                (arr[i] - expected).abs() < 1e-6,
                "linear segment mismatch at {}: got {}, expected {}",
                i,
                arr[i],
                expected
            );
        }
    }

    /// Verify the const LUT stays in sync with the transfer function.
    /// Allows 1 ULP difference for cross-platform float variance (powf isn't
    /// perfectly deterministic across architectures).
    #[test]
    #[allow(deprecated)]
    fn test_lut_matches_transfer_function() {
        let lut = get_lut();
        for i in 0..=255u8 {
            let expected = crate::scalar::srgb_u8_to_linear(i);
            let got = lut[i as usize];
            // Compare bit patterns: for same-sign finite floats, the bit
            // difference equals the ULP distance.
            let got_bits = got.to_bits();
            let expected_bits = expected.to_bits();
            let ulp_diff = (got_bits as i64 - expected_bits as i64).unsigned_abs();
            assert!(
                ulp_diff <= 1,
                "LUT[{}] = {} ({:08x}) differs by {} ULP from srgb_u8_to_linear({}) = {} ({:08x}). \
                 LUT needs regeneration if transfer constants changed.",
                i,
                got,
                got_bits,
                ulp_diff,
                i,
                expected,
                expected_bits
            );
        }
    }

    #[test]
    fn test_empty_slice() {
        // Empty inputs must be a no-op, not a panic.
        let mut empty: Vec<f32> = vec![];
        srgb_to_linear_slice(&mut empty);
        assert!(empty.is_empty());

        let empty_u8: Vec<u8> = vec![];
        let mut empty_out: Vec<f32> = vec![];
        srgb_u8_to_linear_slice(&empty_u8, &mut empty_out);
    }

    #[test]
    fn test_non_multiple_of_8() {
        // Test slices that aren't multiples of 8
        for len in [1, 3, 7, 9, 15, 17, 100] {
            let mut values: Vec<f32> = (0..len).map(|i| i as f32 / len as f32).collect();
            let expected: Vec<f32> = values
                .iter()
                .map(|&v| crate::scalar::srgb_to_linear(v))
                .collect();

            srgb_to_linear_slice(&mut values);

            for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
                assert!(
                    (got - exp).abs() < 1e-5,
                    "len={} mismatch at {}: got {}, expected {}",
                    len,
                    i,
                    got,
                    exp
                );
            }
        }
    }

    // ---- Custom gamma tests ----

    #[test]
    fn test_gamma_to_linear_x8() {
        let input = [0.0f32, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.04];
        let gamma = 2.2f32;
        let result = gamma_to_linear_x8(f32x8::from(input), gamma);
        let result_arr: [f32; 8] = result.into();

        for (i, &inp) in input.iter().enumerate() {
            let expected = crate::scalar::gamma_to_linear(inp, gamma);
            assert!(
                (result_arr[i] - expected).abs() < 1e-5,
                "gamma_to_linear_x8 mismatch at {}: got {}, expected {}",
                i,
                result_arr[i],
                expected
            );
        }
    }

    #[test]
    fn test_linear_to_gamma_x8() {
        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.001, 0.8];
        let gamma = 2.2f32;
        let result = linear_to_gamma_x8(f32x8::from(input), gamma);
        let result_arr: [f32; 8] = result.into();

        for (i, &inp) in input.iter().enumerate() {
            let expected = crate::scalar::linear_to_gamma(inp, gamma);
            assert!(
                (result_arr[i] - expected).abs() < 1e-5,
                "linear_to_gamma_x8 mismatch at {}: got {}, expected {}",
                i,
                result_arr[i],
                expected
            );
        }
    }

    #[test]
    fn test_gamma_roundtrip_x8() {
        let input = [0.0f32, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 1.0];
        // Cover the common display gamma values.
        for gamma in [1.8f32, 2.0, 2.2, 2.4] {
            let linear = gamma_to_linear_x8(f32x8::from(input), gamma);
            let back = linear_to_gamma_x8(linear, gamma);
            let back_arr: [f32; 8] = back.into();

            for (i, &inp) in input.iter().enumerate() {
                assert!(
                    (inp - back_arr[i]).abs() < 1e-4,
                    "gamma {} roundtrip failed at {}: {} -> {}",
                    gamma,
                    i,
                    inp,
                    back_arr[i]
                );
            }
        }
    }

    #[test]
    fn test_gamma_slice_functions() {
        let gamma = 2.2f32;

        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
        let expected: Vec<f32> = values
            .iter()
            .map(|&v| crate::scalar::gamma_to_linear(v, gamma))
            .collect();

        gamma_to_linear_slice(&mut values, gamma);

        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - exp).abs() < 1e-5,
                "gamma_to_linear_slice mismatch at {}: got {}, expected {}",
                i,
                got,
                exp
            );
        }

        // Test linear_to_gamma_slice
        let expected_back: Vec<f32> = values
            .iter()
            .map(|&v| crate::scalar::linear_to_gamma(v, gamma))
            .collect();

        linear_to_gamma_slice(&mut values, gamma);

        for (i, (&got, &exp)) in values.iter().zip(expected_back.iter()).enumerate() {
            assert!(
                (got - exp).abs() < 1e-5,
                "linear_to_gamma_slice mismatch at {}: got {}, expected {}",
                i,
                got,
                exp
            );
        }
    }

    // ---- Permutation tests (archmage tier testing) ----

    #[test]
    fn srgb_roundtrip_all_tiers() {
        // Runs the roundtrip under every archmage token permutation so both
        // the SIMD and scalar tiers are exercised regardless of host CPU.
        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |_perm| {
                let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
                let original = values.clone();
                srgb_to_linear_slice(&mut values);
                linear_to_srgb_slice(&mut values);
                for (i, (&orig, &conv)) in original.iter().zip(values.iter()).enumerate() {
                    assert!(
                        (orig - conv).abs() < 1e-4,
                        "tier roundtrip failed at {i}: {orig} -> {conv}"
                    );
                }
            },
        );
        eprintln!("{report}");
    }
}