//! SIMD-accelerated sRGB ↔ linear conversion.
//!
//! This module provides high-performance conversion functions using AVX2/SSE SIMD
//! instructions via the `wide` crate with runtime CPU feature detection.
//!
//! # API Overview
//!
//! ## x8 Functions (process 8 values at once)
//! - `srgb_to_linear_x8` - f32x8 sRGB → f32x8 linear
//! - `linear_to_srgb_x8` - f32x8 linear → f32x8 sRGB
//! - `srgb_u8_to_linear_x8` - \[u8; 8\] sRGB → f32x8 linear
//! - `linear_to_srgb_u8_x8` - f32x8 linear → \[u8; 8\] sRGB
//!
//! ## Slice Functions (process entire slices)
//! - `srgb_to_linear_slice` - &mut \[f32\] sRGB → linear in-place
//! - `linear_to_srgb_slice` - &mut \[f32\] linear → sRGB in-place
//! - `srgb_u8_to_linear_slice` - &\[u8\] sRGB → &mut \[f32\] linear
//! - `linear_to_srgb_u8_slice` - &\[f32\] linear → &mut \[u8\] sRGB
20#[cfg(target_arch = "x86_64")]
21use archmage::{Desktop64, arcane, rite};
22use archmage::{ScalarToken, incant};
23use wide::{CmpLt, f32x8};
24
25use crate::fast_math::pow_x8;
26
27// Alias magetypes f32x8 to avoid name clash with wide::f32x8
28#[cfg(target_arch = "x86_64")]
29use magetypes::simd::f32x8 as mt_f32x8;
30
// sRGB transfer function constants (C0-continuous, moxcms-derived)
// These ensure exact continuity at the linear/power segment junction.
// Standard IEC values (0.055, 1.055, 0.04045) have a tiny discontinuity.
// Encoded values below this threshold use the linear (/12.92) segment.
const SRGB_LINEAR_THRESHOLD: f32x8 = f32x8::splat(0.039_293_37);
// Linear-light values below this threshold use the linear (*12.92) segment.
const LINEAR_THRESHOLD: f32x8 = f32x8::splat(0.003_041_282_6);
// Scale factor for the linear segment of sRGB → linear.
const LINEAR_SCALE: f32x8 = f32x8::splat(1.0 / 12.92);
// Scale factor for the linear segment of linear → sRGB.
const TWELVE_92: f32x8 = f32x8::splat(12.92);
const ZERO: f32x8 = f32x8::splat(0.0);
const ONE: f32x8 = f32x8::splat(1.0);
// Added before the truncating cast to get round-to-nearest LUT indices.
const HALF: f32x8 = f32x8::splat(0.5);
41
/// Precomputed sRGB u8 → linear f32 lookup table (indexed by the encoded byte).
/// Uses the same constants as the transfer module (C0-continuous IEC 61966-2-1).
/// Generated by computing `srgb_u8_to_linear(i)` for each i in 0..=255.
/// To regenerate: `cargo run --release --example generate_lut`
const SRGB_U8_TO_LINEAR_LUT: [f32; 256] = [
    // index 0
    0.0_f32,
    0.000303527_f32,
    0.000607054_f32,
    0.000910581_f32,
    0.001214108_f32,
    0.001517635_f32,
    0.001821162_f32,
    0.0021246888_f32,
    0.002428216_f32,
    0.002731743_f32,
    0.00303527_f32,
    0.0033473307_f32,
    0.0036773437_f32,
    0.0040255957_f32,
    0.004392362_f32,
    0.004777916_f32,
    0.0051825214_f32,
    0.00560644_f32,
    0.006049924_f32,
    0.0065132244_f32,
    0.0069965874_f32,
    0.007500253_f32,
    0.008024457_f32,
    0.008569433_f32,
    0.009135411_f32,
    0.009722613_f32,
    0.010331264_f32,
    0.010961577_f32,
    0.011613773_f32,
    0.012288062_f32,
    0.012984648_f32,
    0.013703744_f32,
    0.01444555_f32,
    0.015210266_f32,
    0.01599809_f32,
    0.016809216_f32,
    0.01764384_f32,
    0.018502146_f32,
    0.019384334_f32,
    0.02029058_f32,
    0.02122107_f32,
    0.022175988_f32,
    0.023155512_f32,
    0.024159823_f32,
    0.025189094_f32,
    0.026243499_f32,
    0.027323212_f32,
    0.0284284_f32,
    0.02955924_f32,
    0.030715894_f32,
    0.03189852_f32,
    0.0331073_f32,
    0.034342386_f32,
    0.03560393_f32,
    0.036892105_f32,
    0.03820707_f32,
    0.039548974_f32,
    0.04091798_f32,
    0.04231424_f32,
    0.04373789_f32,
    0.045189105_f32,
    0.04666803_f32,
    0.04817481_f32,
    0.049709592_f32,
    // index 64
    0.051272515_f32,
    0.052863743_f32,
    0.054483414_f32,
    0.05613167_f32,
    0.05780865_f32,
    0.05951448_f32,
    0.061249338_f32,
    0.063013345_f32,
    0.06480663_f32,
    0.06662934_f32,
    0.068481594_f32,
    0.07036356_f32,
    0.072275355_f32,
    0.07421711_f32,
    0.07618896_f32,
    0.07819102_f32,
    0.080223456_f32,
    0.08228638_f32,
    0.08437992_f32,
    0.086504206_f32,
    0.088659346_f32,
    0.09084551_f32,
    0.093062796_f32,
    0.09531133_f32,
    0.09759124_f32,
    0.09990266_f32,
    0.10224568_f32,
    0.104620464_f32,
    0.10702711_f32,
    0.109465756_f32,
    0.1119365_f32,
    0.11443946_f32,
    0.116974786_f32,
    0.11954258_f32,
    0.12214295_f32,
    0.12477602_f32,
    0.1274419_f32,
    0.13014072_f32,
    0.1328726_f32,
    0.13563763_f32,
    0.13843594_f32,
    0.14126763_f32,
    0.14413282_f32,
    0.14703165_f32,
    0.1499642_f32,
    0.15293059_f32,
    0.15593089_f32,
    0.15896529_f32,
    0.16203386_f32,
    0.1651367_f32,
    0.16827393_f32,
    0.17144562_f32,
    0.17465195_f32,
    0.17789298_f32,
    0.18116882_f32,
    0.1844796_f32,
    0.18782537_f32,
    0.1912063_f32,
    0.19462249_f32,
    0.19807397_f32,
    0.2015609_f32,
    0.20508343_f32,
    0.20864154_f32,
    0.21223548_f32,
    // index 128
    0.21586527_f32,
    0.21953095_f32,
    0.22323275_f32,
    0.22697066_f32,
    0.23074481_f32,
    0.2345554_f32,
    0.23840237_f32,
    0.24228595_f32,
    0.24620613_f32,
    0.25016314_f32,
    0.25415692_f32,
    0.25818765_f32,
    0.26225552_f32,
    0.26636043_f32,
    0.27050266_f32,
    0.27468216_f32,
    0.27889907_f32,
    0.2831536_f32,
    0.28744566_f32,
    0.29177552_f32,
    0.2961431_f32,
    0.30054858_f32,
    0.30499217_f32,
    0.30947372_f32,
    0.31399357_f32,
    0.3185516_f32,
    0.32314798_f32,
    0.3277829_f32,
    0.33245632_f32,
    0.33716843_f32,
    0.34191918_f32,
    0.34670877_f32,
    0.35153738_f32,
    0.35640487_f32,
    0.36131153_f32,
    0.3662573_f32,
    0.37124234_f32,
    0.37626684_f32,
    0.38133067_f32,
    0.3864341_f32,
    0.39157712_f32,
    0.3967598_f32,
    0.4019824_f32,
    0.40724477_f32,
    0.4125472_f32,
    0.41788962_f32,
    0.42327216_f32,
    0.42869502_f32,
    0.4341581_f32,
    0.43966165_f32,
    0.44520563_f32,
    0.45079017_f32,
    0.4564154_f32,
    0.46208134_f32,
    0.46778816_f32,
    0.4735358_f32,
    0.47932443_f32,
    0.4851542_f32,
    0.49102503_f32,
    0.49693722_f32,
    0.5028906_f32,
    0.5088854_f32,
    0.5149218_f32,
    0.5209996_f32,
    // index 192
    0.52711916_f32,
    0.5332804_f32,
    0.53948337_f32,
    0.5457284_f32,
    0.55201524_f32,
    0.55834424_f32,
    0.56471527_f32,
    0.57112855_f32,
    0.57758415_f32,
    0.58408207_f32,
    0.5906225_f32,
    0.59720534_f32,
    0.6038308_f32,
    0.6104991_f32,
    0.61721_f32,
    0.62396383_f32,
    0.6307605_f32,
    0.6376001_f32,
    0.644483_f32,
    0.6514088_f32,
    0.658378_f32,
    0.6653904_f32,
    0.67244613_f32,
    0.67954546_f32,
    0.68668824_f32,
    0.6938747_f32,
    0.7011047_f32,
    0.7083785_f32,
    0.7156962_f32,
    0.72305775_f32,
    0.7304634_f32,
    0.73791295_f32,
    0.7454066_f32,
    0.75294465_f32,
    0.76052684_f32,
    0.7681535_f32,
    0.7758244_f32,
    0.7835399_f32,
    0.79130006_f32,
    0.79910475_f32,
    0.80695426_f32,
    0.8148484_f32,
    0.82278764_f32,
    0.8307716_f32,
    0.83880067_f32,
    0.8468749_f32,
    0.8549941_f32,
    0.8631587_f32,
    0.8713685_f32,
    0.87962353_f32,
    0.8879244_f32,
    0.89627033_f32,
    0.9046623_f32,
    0.9130995_f32,
    0.9215827_f32,
    0.9301116_f32,
    0.93868643_f32,
    0.9473071_f32,
    0.9559739_f32,
    0.9646866_f32,
    0.9734457_f32,
    0.9822507_f32,
    0.9911024_f32,
    // index 255
    1.0_f32,
];
304
305#[inline]
306fn get_lut() -> &'static [f32; 256] {
307    &SRGB_U8_TO_LINEAR_LUT
308}
309
310/// Convert a single sRGB u8 value to linear f32 using LUT lookup.
311///
312/// This is the fastest method for u8 input as it uses a precomputed lookup table
313/// embedded in the binary. For batch conversions, use [`srgb_u8_to_linear_slice`].
314///
315/// # Example
316/// ```
317/// use linear_srgb::simd::srgb_u8_to_linear;
318///
319/// let linear = srgb_u8_to_linear(128);
320/// assert!((linear - 0.2158).abs() < 0.001);
321/// ```
322#[inline]
323pub fn srgb_u8_to_linear(value: u8) -> f32 {
324    get_lut()[value as usize]
325}
326
// ============================================================================
// x8 Inline Functions - Always inlined, for use in caller's magetypes code
// ============================================================================
330
331/// Convert 8 sRGB f32 values to linear (always inlined).
332///
333/// Use this variant inside your own `#[magetypes]` functions to avoid
334/// double dispatch overhead. For standalone calls, use [`srgb_to_linear_x8_dispatch`].
335///
336/// Input values are clamped to \[0, 1\].
337#[inline(always)]
338pub fn srgb_to_linear_x8_inline(srgb: f32x8) -> f32x8 {
339    let srgb = srgb.max(ZERO).min(ONE);
340    let linear_result = srgb * LINEAR_SCALE;
341
342    // Degree-11 Chebyshev polynomial (Estrin evaluation)
343    let u = srgb.mul_add(f32x8::splat(S2L_INV_HW), f32x8::splat(S2L_BIAS));
344    let u2 = u * u;
345    let u4 = u2 * u2;
346    let u_8 = u4 * u4;
347    let p01 = f32x8::splat(S2L_C1).mul_add(u, f32x8::splat(S2L_C0));
348    let p23 = f32x8::splat(S2L_C3).mul_add(u, f32x8::splat(S2L_C2));
349    let p45 = f32x8::splat(S2L_C5).mul_add(u, f32x8::splat(S2L_C4));
350    let p67 = f32x8::splat(S2L_C7).mul_add(u, f32x8::splat(S2L_C6));
351    let p89 = f32x8::splat(S2L_C9).mul_add(u, f32x8::splat(S2L_C8));
352    let pab = f32x8::splat(S2L_C11).mul_add(u, f32x8::splat(S2L_C10));
353    let p0123 = p23.mul_add(u2, p01);
354    let p4567 = p67.mul_add(u2, p45);
355    let p8_11 = pab.mul_add(u2, p89);
356    let p0_7 = p4567.mul_add(u4, p0123);
357    let power_result = p8_11.mul_add(u_8, p0_7);
358
359    let mask = srgb.simd_lt(SRGB_LINEAR_THRESHOLD);
360    mask.blend(linear_result, power_result)
361}
362
363/// Convert 8 linear f32 values to sRGB (always inlined).
364///
365/// Use this variant inside your own `#[magetypes]` functions to avoid
366/// double dispatch overhead. For standalone calls, use [`linear_to_srgb_x8_dispatch`].
367///
368/// Input values are clamped to \[0, 1\].
369#[inline(always)]
370pub fn linear_to_srgb_x8_inline(linear: f32x8) -> f32x8 {
371    let linear = linear.max(ZERO).min(ONE);
372    let linear_result = linear * TWELVE_92;
373
374    // sqrt transform + degree-15 Chebyshev polynomial (Estrin evaluation)
375    let s = linear.sqrt();
376    let u = s.mul_add(f32x8::splat(L2S_INV_HW), f32x8::splat(L2S_BIAS));
377    let u2 = u * u;
378    let u4 = u2 * u2;
379    let u_8 = u4 * u4;
380    let p01 = f32x8::splat(L2S_C1).mul_add(u, f32x8::splat(L2S_C0));
381    let p23 = f32x8::splat(L2S_C3).mul_add(u, f32x8::splat(L2S_C2));
382    let p45 = f32x8::splat(L2S_C5).mul_add(u, f32x8::splat(L2S_C4));
383    let p67 = f32x8::splat(L2S_C7).mul_add(u, f32x8::splat(L2S_C6));
384    let p89 = f32x8::splat(L2S_C9).mul_add(u, f32x8::splat(L2S_C8));
385    let pab = f32x8::splat(L2S_C11).mul_add(u, f32x8::splat(L2S_C10));
386    let pcd = f32x8::splat(L2S_C13).mul_add(u, f32x8::splat(L2S_C12));
387    let pef = f32x8::splat(L2S_C15).mul_add(u, f32x8::splat(L2S_C14));
388    let p0123 = p23.mul_add(u2, p01);
389    let p4567 = p67.mul_add(u2, p45);
390    let p89ab = pab.mul_add(u2, p89);
391    let pcdef = pef.mul_add(u2, pcd);
392    let p0_7 = p4567.mul_add(u4, p0123);
393    let p8_f = pcdef.mul_add(u4, p89ab);
394    let power_result = p8_f.mul_add(u_8, p0_7);
395
396    let mask = linear.simd_lt(LINEAR_THRESHOLD);
397    mask.blend(linear_result, power_result)
398}
399
400/// Convert 8 linear f32 values to sRGB u8 (always inlined).
401///
402/// Uses a 4096-entry const LUT for direct lookup — no pow/log/exp computation.
403/// Max error: ±1 u8 level (same as the SIMD polynomial path).
404#[inline(always)]
405pub fn linear_to_srgb_u8_x8_inline(linear: f32x8) -> [u8; 8] {
406    linear_to_srgb_u8_lut_x8(linear)
407}
408
409/// Convert 8 linear f32 values to sRGB u8 using const LUT.
410///
411/// Clamps to [0,1], scales to LUT index, does 8 scalar lookups from
412/// a 4KB table (fits L1 cache). No pow/exp/log computation.
413#[inline(always)]
414pub(crate) fn linear_to_srgb_u8_lut_x8(linear: f32x8) -> [u8; 8] {
415    let clamped = linear.max(ZERO).min(ONE);
416    let scaled = clamped * f32x8::splat(4095.0) + HALF;
417    let arr: [f32; 8] = scaled.into();
418    let lut = &crate::const_luts::LINEAR_TO_SRGB_U8;
419    [
420        lut[arr[0] as usize & 0xFFF],
421        lut[arr[1] as usize & 0xFFF],
422        lut[arr[2] as usize & 0xFFF],
423        lut[arr[3] as usize & 0xFFF],
424        lut[arr[4] as usize & 0xFFF],
425        lut[arr[5] as usize & 0xFFF],
426        lut[arr[6] as usize & 0xFFF],
427        lut[arr[7] as usize & 0xFFF],
428    ]
429}
430
431/// Convert 8 gamma-encoded f32 values to linear (always inlined).
432///
433/// Use this variant inside your own `#[magetypes]` functions to avoid
434/// double dispatch overhead.
435#[inline(always)]
436pub fn gamma_to_linear_x8_inline(encoded: f32x8, gamma: f32) -> f32x8 {
437    let encoded = encoded.max(ZERO).min(ONE);
438    pow_x8(encoded, gamma)
439}
440
441/// Convert 8 linear f32 values to gamma-encoded (always inlined).
442///
443/// Use this variant inside your own `#[magetypes]` functions to avoid
444/// double dispatch overhead.
445#[inline(always)]
446pub fn linear_to_gamma_x8_inline(linear: f32x8, gamma: f32) -> f32x8 {
447    let linear = linear.max(ZERO).min(ONE);
448    pow_x8(linear, 1.0 / gamma)
449}
450
// ============================================================================
// magetypes #[rite] helpers (x86-64 only) — real AVX2+FMA SIMD
// ============================================================================
454
// sRGB transfer function scalar constants (for magetypes which needs token-gated splat).
// These mirror the f32x8 constants at the top of the module.
#[cfg(target_arch = "x86_64")]
const MT_SRGB_LINEAR_THRESHOLD: f32 = 0.039_293_37;
#[cfg(target_arch = "x86_64")]
const MT_LINEAR_THRESHOLD: f32 = 0.003_041_282_6;
#[cfg(target_arch = "x86_64")]
const MT_LINEAR_SCALE: f32 = 1.0 / 12.92;
#[cfg(target_arch = "x86_64")]
const MT_TWELVE_92: f32 = 12.92;

// sRGB→linear degree-11 Chebyshev polynomial (Estrin's scheme)
// Approximates ((s + offset) / scale)^2.4 on [threshold, 1.0]
// u = s * INV_HW + BIAS maps [threshold, 1] → [-1, 1]
const S2L_INV_HW: f32 = 2.081_801;
const S2L_BIAS: f32 = -1.081_800_9;
// Coefficients C0..C11 in increasing power order.
const S2L_C0: f32 = 2.326_832_7e-1;
const S2L_C1: f32 = 4.667_970_8e-1;
const S2L_C2: f32 = 2.731_341e-1;
const S2L_C3: f32 = 3.044_251_2e-2;
const S2L_C4: f32 = -3.802_638_5e-3;
const S2L_C5: f32 = 1.011_499_3e-3;
const S2L_C6: f32 = -4.267_19e-4;
const S2L_C7: f32 = 1.966_666_5e-4;
const S2L_C8: f32 = 2.025_719_4e-5;
const S2L_C9: f32 = -2.400_594_3e-5;
const S2L_C10: f32 = -8.762_017e-5;
const S2L_C11: f32 = 5.557_536_5e-5;

// linear→sRGB degree-15 Chebyshev polynomial via sqrt transform (Estrin's scheme)
// Approximates scale * (√l)^(5/6) - offset on [sqrt(threshold), 1.0]
// u = √l * INV_HW + BIAS maps [sqrt(threshold), 1] → [-1, 1]
const L2S_INV_HW: f32 = 2.116_733_3;
const L2S_BIAS: f32 = -1.116_733_2;
// Coefficients C0..C15 in increasing power order.
const L2S_C0: f32 = 5.641_828e-1;
const L2S_C1: f32 = 4.620_569_3e-1;
const L2S_C2: f32 = -3.450_065e-2;
const L2S_C3: f32 = 1.202_464_2e-2;
const L2S_C4: f32 = -5.398_721e-3;
const L2S_C5: f32 = 2.946_610_3e-3;
const L2S_C6: f32 = -5.274_399_6e-3;
const L2S_C7: f32 = 4.055_202e-3;
const L2S_C8: f32 = 1.062_489_9e-2;
const L2S_C9: f32 = -9.012_202e-3;
const L2S_C10: f32 = -2.186_026_6e-2;
const L2S_C11: f32 = 1.824_478_4e-2;
const L2S_C12: f32 = 1.958_387_2e-2;
const L2S_C13: f32 = -1.638_288e-2;
const L2S_C14: f32 = -7.710_282_7e-3;
const L2S_C15: f32 = 6.419_743e-3;
504
/// Token-gated AVX2+FMA kernel: 8 sRGB-encoded lanes → linear light.
/// Mirrors `srgb_to_linear_x8_inline` exactly (same constants, same FMA
/// ordering) but on the magetypes vector type, so the `#[rite]` machinery
/// can compile it with the target features the `Desktop64` token attests.
#[cfg(target_arch = "x86_64")]
#[rite]
fn srgb_to_linear_mt(token: Desktop64, srgb: mt_f32x8) -> mt_f32x8 {
    let zero = mt_f32x8::zero(token);
    let one = mt_f32x8::splat(token, 1.0);
    // Clamp to the domain the polynomial is fitted on.
    let srgb = srgb.max(zero).min(one);

    // Low segment: pure scale by 1/12.92.
    let linear_result = srgb * mt_f32x8::splat(token, MT_LINEAR_SCALE);

    // Degree-11 Chebyshev polynomial (Estrin evaluation);
    // u maps [threshold, 1] onto [-1, 1].
    let u = srgb.mul_add(
        mt_f32x8::splat(token, S2L_INV_HW),
        mt_f32x8::splat(token, S2L_BIAS),
    );
    let u2 = u * u;
    let u4 = u2 * u2;
    let u_8 = u4 * u4;
    let p01 = mt_f32x8::splat(token, S2L_C1).mul_add(u, mt_f32x8::splat(token, S2L_C0));
    let p23 = mt_f32x8::splat(token, S2L_C3).mul_add(u, mt_f32x8::splat(token, S2L_C2));
    let p45 = mt_f32x8::splat(token, S2L_C5).mul_add(u, mt_f32x8::splat(token, S2L_C4));
    let p67 = mt_f32x8::splat(token, S2L_C7).mul_add(u, mt_f32x8::splat(token, S2L_C6));
    let p89 = mt_f32x8::splat(token, S2L_C9).mul_add(u, mt_f32x8::splat(token, S2L_C8));
    let pab = mt_f32x8::splat(token, S2L_C11).mul_add(u, mt_f32x8::splat(token, S2L_C10));
    let p0123 = p23.mul_add(u2, p01);
    let p4567 = p67.mul_add(u2, p45);
    let p8_11 = pab.mul_add(u2, p89);
    let p0_7 = p4567.mul_add(u4, p0123);
    let power_result = p8_11.mul_add(u_8, p0_7);

    // Per-lane select: below the threshold take the linear segment.
    let mask = srgb.simd_lt(mt_f32x8::splat(token, MT_SRGB_LINEAR_THRESHOLD));
    mt_f32x8::blend(mask, linear_result, power_result)
}
537
/// Token-gated AVX2+FMA kernel: 8 linear-light lanes → sRGB encoding.
/// Mirrors `linear_to_srgb_x8_inline` exactly (same constants, same FMA
/// ordering) but on the magetypes vector type, so the `#[rite]` machinery
/// can compile it with the target features the `Desktop64` token attests.
#[cfg(target_arch = "x86_64")]
#[rite]
fn linear_to_srgb_mt(token: Desktop64, linear: mt_f32x8) -> mt_f32x8 {
    let zero = mt_f32x8::zero(token);
    let one = mt_f32x8::splat(token, 1.0);
    // Clamp to the fitted domain.
    let linear = linear.max(zero).min(one);

    // Low segment: pure scale by 12.92.
    let linear_result = linear * mt_f32x8::splat(token, MT_TWELVE_92);

    // sqrt transform + degree-15 Chebyshev polynomial (Estrin evaluation);
    // u maps [sqrt(threshold), 1] onto [-1, 1].
    let s = linear.sqrt();
    let u = s.mul_add(
        mt_f32x8::splat(token, L2S_INV_HW),
        mt_f32x8::splat(token, L2S_BIAS),
    );
    let u2 = u * u;
    let u4 = u2 * u2;
    let u_8 = u4 * u4;
    let p01 = mt_f32x8::splat(token, L2S_C1).mul_add(u, mt_f32x8::splat(token, L2S_C0));
    let p23 = mt_f32x8::splat(token, L2S_C3).mul_add(u, mt_f32x8::splat(token, L2S_C2));
    let p45 = mt_f32x8::splat(token, L2S_C5).mul_add(u, mt_f32x8::splat(token, L2S_C4));
    let p67 = mt_f32x8::splat(token, L2S_C7).mul_add(u, mt_f32x8::splat(token, L2S_C6));
    let p89 = mt_f32x8::splat(token, L2S_C9).mul_add(u, mt_f32x8::splat(token, L2S_C8));
    let pab = mt_f32x8::splat(token, L2S_C11).mul_add(u, mt_f32x8::splat(token, L2S_C10));
    let pcd = mt_f32x8::splat(token, L2S_C13).mul_add(u, mt_f32x8::splat(token, L2S_C12));
    let pef = mt_f32x8::splat(token, L2S_C15).mul_add(u, mt_f32x8::splat(token, L2S_C14));
    let p0123 = p23.mul_add(u2, p01);
    let p4567 = p67.mul_add(u2, p45);
    let p89ab = pab.mul_add(u2, p89);
    let pcdef = pef.mul_add(u2, pcd);
    let p0_7 = p4567.mul_add(u4, p0123);
    let p8_f = pcdef.mul_add(u4, p89ab);
    let power_result = p8_f.mul_add(u_8, p0_7);

    // Per-lane select: below the threshold take the linear segment.
    let mask = linear.simd_lt(mt_f32x8::splat(token, MT_LINEAR_THRESHOLD));
    mt_f32x8::blend(mask, linear_result, power_result)
}
575
576#[cfg(target_arch = "x86_64")]
577#[rite]
578fn gamma_to_linear_mt(token: Desktop64, encoded: mt_f32x8, gamma: f32) -> mt_f32x8 {
579    let zero = mt_f32x8::zero(token);
580    let one = mt_f32x8::splat(token, 1.0);
581    let encoded = encoded.max(zero).min(one);
582    encoded.pow_midp(gamma)
583}
584
585#[cfg(target_arch = "x86_64")]
586#[rite]
587fn linear_to_gamma_mt(token: Desktop64, linear: mt_f32x8, gamma: f32) -> mt_f32x8 {
588    let zero = mt_f32x8::zero(token);
589    let one = mt_f32x8::splat(token, 1.0);
590    let linear = linear.max(zero).min(one);
591    linear.pow_midp(1.0 / gamma)
592}
593
// ============================================================================
// x8 Dispatch Functions - Runtime CPU feature detection
// ============================================================================
597
598#[cfg(target_arch = "x86_64")]
599#[arcane]
600fn srgb_to_linear_x8_tier_v3(token: Desktop64, srgb: f32x8) -> f32x8 {
601    let arr: [f32; 8] = srgb.into();
602    let v = mt_f32x8::from_array(token, arr);
603    let result = srgb_to_linear_mt(token, v);
604    f32x8::from(result.to_array())
605}
606
607fn srgb_to_linear_x8_tier_scalar(_token: ScalarToken, srgb: f32x8) -> f32x8 {
608    srgb_to_linear_x8_inline(srgb)
609}
610
611/// Convert 8 sRGB f32 values to linear (with CPU dispatch).
612///
613/// This variant uses runtime CPU feature detection to select the optimal
614/// implementation. Use [`srgb_to_linear_x8_inline`] inside your own
615/// `#[magetypes]` functions to avoid double dispatch.
616///
617/// Input values are clamped to \[0, 1\].
618#[inline]
619pub fn srgb_to_linear_x8_dispatch(srgb: f32x8) -> f32x8 {
620    incant!(srgb_to_linear_x8_tier(srgb), [v3])
621}
622
623#[cfg(target_arch = "x86_64")]
624#[arcane]
625fn linear_to_srgb_x8_tier_v3(token: Desktop64, linear: f32x8) -> f32x8 {
626    let arr: [f32; 8] = linear.into();
627    let v = mt_f32x8::from_array(token, arr);
628    let result = linear_to_srgb_mt(token, v);
629    f32x8::from(result.to_array())
630}
631
632fn linear_to_srgb_x8_tier_scalar(_token: ScalarToken, linear: f32x8) -> f32x8 {
633    linear_to_srgb_x8_inline(linear)
634}
635
636/// Convert 8 linear f32 values to sRGB (with CPU dispatch).
637///
638/// This variant uses runtime CPU feature detection to select the optimal
639/// implementation. Use [`linear_to_srgb_x8_inline`] inside your own
640/// `#[magetypes]` functions to avoid double dispatch.
641///
642/// Input values are clamped to \[0, 1\].
643#[inline]
644pub fn linear_to_srgb_x8_dispatch(linear: f32x8) -> f32x8 {
645    incant!(linear_to_srgb_x8_tier(linear), [v3])
646}
647
648/// Convert 8 linear f32 values to sRGB u8 (LUT-based, no dispatch needed).
649#[inline]
650pub fn linear_to_srgb_u8_x8_dispatch(linear: f32x8) -> [u8; 8] {
651    linear_to_srgb_u8_x8_inline(linear)
652}
653
654#[cfg(target_arch = "x86_64")]
655#[arcane]
656fn gamma_to_linear_x8_tier_v3(token: Desktop64, encoded: f32x8, gamma: f32) -> f32x8 {
657    let arr: [f32; 8] = encoded.into();
658    let v = mt_f32x8::from_array(token, arr);
659    let result = gamma_to_linear_mt(token, v, gamma);
660    f32x8::from(result.to_array())
661}
662
663fn gamma_to_linear_x8_tier_scalar(_token: ScalarToken, encoded: f32x8, gamma: f32) -> f32x8 {
664    gamma_to_linear_x8_inline(encoded, gamma)
665}
666
667/// Convert 8 gamma-encoded f32 values to linear (with CPU dispatch).
668#[inline]
669pub fn gamma_to_linear_x8_dispatch(encoded: f32x8, gamma: f32) -> f32x8 {
670    incant!(gamma_to_linear_x8_tier(encoded, gamma), [v3])
671}
672
673#[cfg(target_arch = "x86_64")]
674#[arcane]
675fn linear_to_gamma_x8_tier_v3(token: Desktop64, linear: f32x8, gamma: f32) -> f32x8 {
676    let arr: [f32; 8] = linear.into();
677    let v = mt_f32x8::from_array(token, arr);
678    let result = linear_to_gamma_mt(token, v, gamma);
679    f32x8::from(result.to_array())
680}
681
682fn linear_to_gamma_x8_tier_scalar(_token: ScalarToken, linear: f32x8, gamma: f32) -> f32x8 {
683    linear_to_gamma_x8_inline(linear, gamma)
684}
685
686/// Convert 8 linear f32 values to gamma-encoded (with CPU dispatch).
687#[inline]
688pub fn linear_to_gamma_x8_dispatch(linear: f32x8, gamma: f32) -> f32x8 {
689    incant!(linear_to_gamma_x8_tier(linear, gamma), [v3])
690}
691
// ============================================================================
// x8 Default Functions - Calls inline variant, compiler decides inlining
// ============================================================================
695
696/// Convert 8 sRGB f32 values to linear.
697///
698/// This is the default variant that calls the inline implementation.
699/// Use `_dispatch` for guaranteed CPU feature detection, or `_inline`
700/// inside your own `#[magetypes]` functions.
701///
702/// Input values are clamped to \[0, 1\].
703///
704/// # Example
705/// ```
706/// use linear_srgb::simd::srgb_to_linear_x8;
707/// use wide::f32x8;
708///
709/// let srgb = f32x8::from([0.0, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.5]);
710/// let linear = srgb_to_linear_x8(srgb);
711/// ```
712#[inline]
713pub fn srgb_to_linear_x8(srgb: f32x8) -> f32x8 {
714    srgb_to_linear_x8_inline(srgb)
715}
716
717/// Convert 8 linear f32 values to sRGB.
718///
719/// This is the default variant that calls the inline implementation.
720/// Use `_dispatch` for guaranteed CPU feature detection, or `_inline`
721/// inside your own `#[magetypes]` functions.
722///
723/// Input values are clamped to \[0, 1\].
724///
725/// # Example
726/// ```
727/// use linear_srgb::simd::linear_to_srgb_x8;
728/// use wide::f32x8;
729///
730/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
731/// let srgb = linear_to_srgb_x8(linear);
732/// ```
733#[inline]
734pub fn linear_to_srgb_x8(linear: f32x8) -> f32x8 {
735    linear_to_srgb_x8_inline(linear)
736}
737
738/// Convert 8 sRGB u8 values to linear f32 using LUT lookup.
739///
740/// This is the fastest method for u8 input as it uses a precomputed lookup table.
741///
742/// # Example
743/// ```
744/// use linear_srgb::simd::srgb_u8_to_linear_x8;
745///
746/// let srgb = [0u8, 64, 128, 192, 255, 32, 96, 160];
747/// let linear = srgb_u8_to_linear_x8(srgb);
748/// ```
749#[inline]
750pub fn srgb_u8_to_linear_x8(srgb: [u8; 8]) -> f32x8 {
751    let lut = get_lut();
752    f32x8::from([
753        lut[srgb[0] as usize],
754        lut[srgb[1] as usize],
755        lut[srgb[2] as usize],
756        lut[srgb[3] as usize],
757        lut[srgb[4] as usize],
758        lut[srgb[5] as usize],
759        lut[srgb[6] as usize],
760        lut[srgb[7] as usize],
761    ])
762}
763
764/// Convert 8 linear f32 values to sRGB u8.
765///
766/// Input values are clamped to \[0, 1\], output is rounded to nearest u8.
767///
768/// # Example
769/// ```
770/// use linear_srgb::simd::linear_to_srgb_u8_x8;
771/// use wide::f32x8;
772///
773/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
774/// let srgb = linear_to_srgb_u8_x8(linear);
775/// ```
776#[inline]
777pub fn linear_to_srgb_u8_x8(linear: f32x8) -> [u8; 8] {
778    linear_to_srgb_u8_x8_inline(linear)
779}
780
781/// Convert 8 gamma-encoded f32 values to linear.
782///
783/// # Example
784/// ```
785/// use linear_srgb::simd::gamma_to_linear_x8;
786/// use wide::f32x8;
787///
788/// let encoded = f32x8::from([0.0, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.5]);
789/// let linear = gamma_to_linear_x8(encoded, 2.2);
790/// ```
791#[inline]
792pub fn gamma_to_linear_x8(encoded: f32x8, gamma: f32) -> f32x8 {
793    gamma_to_linear_x8_inline(encoded, gamma)
794}
795
796/// Convert 8 linear f32 values to gamma-encoded.
797///
798/// # Example
799/// ```
800/// use linear_srgb::simd::linear_to_gamma_x8;
801/// use wide::f32x8;
802///
803/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
804/// let encoded = linear_to_gamma_x8(linear, 2.2);
805/// ```
806#[inline]
807pub fn linear_to_gamma_x8(linear: f32x8, gamma: f32) -> f32x8 {
808    linear_to_gamma_x8_inline(linear, gamma)
809}
810
// ============================================================================
// Slice Functions - Process entire slices
// ============================================================================
814
815#[cfg(target_arch = "x86_64")]
816#[arcane]
817fn srgb_to_linear_slice_tier_v3(token: Desktop64, values: &mut [f32]) {
818    let (chunks, remainder) = values.as_chunks_mut::<8>();
819
820    for chunk in chunks {
821        let v = mt_f32x8::from_array(token, *chunk);
822        let result = srgb_to_linear_mt(token, v);
823        *chunk = result.to_array();
824    }
825
826    for v in remainder {
827        *v = crate::scalar::srgb_to_linear(*v);
828    }
829}
830
831fn srgb_to_linear_slice_tier_scalar(_token: ScalarToken, values: &mut [f32]) {
832    for v in values.iter_mut() {
833        *v = crate::scalar::srgb_to_linear(*v);
834    }
835}
836
837/// Convert sRGB f32 values to linear in-place.
838///
839/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
840///
841/// # Example
842/// ```
843/// use linear_srgb::simd::srgb_to_linear_slice;
844///
845/// let mut values = vec![0.0f32, 0.25, 0.5, 0.75, 1.0];
846/// srgb_to_linear_slice(&mut values);
847/// ```
848#[inline]
849pub fn srgb_to_linear_slice(values: &mut [f32]) {
850    incant!(srgb_to_linear_slice_tier(values), [v3])
851}
852
853#[cfg(target_arch = "x86_64")]
854#[arcane]
855fn linear_to_srgb_slice_tier_v3(token: Desktop64, values: &mut [f32]) {
856    let (chunks, remainder) = values.as_chunks_mut::<8>();
857
858    for chunk in chunks {
859        let v = mt_f32x8::from_array(token, *chunk);
860        let result = linear_to_srgb_mt(token, v);
861        *chunk = result.to_array();
862    }
863
864    for v in remainder {
865        *v = crate::scalar::linear_to_srgb(*v);
866    }
867}
868
869fn linear_to_srgb_slice_tier_scalar(_token: ScalarToken, values: &mut [f32]) {
870    for v in values.iter_mut() {
871        *v = crate::scalar::linear_to_srgb(*v);
872    }
873}
874
875/// Convert linear f32 values to sRGB in-place.
876///
877/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
878///
879/// # Example
880/// ```
881/// use linear_srgb::simd::linear_to_srgb_slice;
882///
883/// let mut values = vec![0.0f32, 0.1, 0.2, 0.5, 1.0];
884/// linear_to_srgb_slice(&mut values);
885/// ```
886#[inline]
887pub fn linear_to_srgb_slice(values: &mut [f32]) {
888    incant!(linear_to_srgb_slice_tier(values), [v3])
889}
890
891/// Convert sRGB u8 values to linear f32.
892///
893/// Uses a precomputed LUT for each u8 value, processed in SIMD batches of 8.
894///
895/// # Panics
896/// Panics if `input.len() != output.len()`.
897///
898/// # Example
899/// ```
900/// use linear_srgb::simd::srgb_u8_to_linear_slice;
901///
902/// let input: Vec<u8> = (0..=255).collect();
903/// let mut output = vec![0.0f32; 256];
904/// srgb_u8_to_linear_slice(&input, &mut output);
905/// ```
906#[inline]
907pub fn srgb_u8_to_linear_slice(input: &[u8], output: &mut [f32]) {
908    assert_eq!(input.len(), output.len());
909    let lut = get_lut();
910
911    let (in_chunks, in_remainder) = input.as_chunks::<8>();
912    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
913
914    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
915        *out = [
916            lut[inp[0] as usize],
917            lut[inp[1] as usize],
918            lut[inp[2] as usize],
919            lut[inp[3] as usize],
920            lut[inp[4] as usize],
921            lut[inp[5] as usize],
922            lut[inp[6] as usize],
923            lut[inp[7] as usize],
924        ];
925    }
926
927    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
928        *out = lut[*inp as usize];
929    }
930}
931
932/// Convert linear f32 values to sRGB u8.
933///
934/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
935///
936/// # Panics
937/// Panics if `input.len() != output.len()`.
938///
939/// # Example
940/// ```
941/// use linear_srgb::simd::linear_to_srgb_u8_slice;
942///
943/// let input: Vec<f32> = (0..=255).map(|i| i as f32 / 255.0).collect();
944/// let mut output = vec![0u8; 256];
945/// linear_to_srgb_u8_slice(&input, &mut output);
946/// ```
947pub fn linear_to_srgb_u8_slice(input: &[f32], output: &mut [u8]) {
948    assert_eq!(input.len(), output.len());
949
950    let lut = &crate::const_luts::LINEAR_TO_SRGB_U8;
951
952    // Process 8 at a time using SIMD for index computation
953    let (in_chunks, in_remainder) = input.as_chunks::<8>();
954    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
955
956    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
957        let linear = f32x8::from(*inp);
958        let clamped = linear.max(ZERO).min(ONE);
959        let scaled = clamped * f32x8::splat(4095.0) + HALF;
960        let arr: [f32; 8] = scaled.into();
961        *out = [
962            lut[arr[0] as usize & 0xFFF],
963            lut[arr[1] as usize & 0xFFF],
964            lut[arr[2] as usize & 0xFFF],
965            lut[arr[3] as usize & 0xFFF],
966            lut[arr[4] as usize & 0xFFF],
967            lut[arr[5] as usize & 0xFFF],
968            lut[arr[6] as usize & 0xFFF],
969            lut[arr[7] as usize & 0xFFF],
970        ];
971    }
972
973    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
974        *out = crate::scalar::linear_to_srgb_u8(*inp);
975    }
976}
977
978// ============================================================================
979// u16 Batch Functions (LUT-based)
980// ============================================================================
981
982/// Convert sRGB u16 values to linear f32 using a 65536-entry const LUT.
983///
984/// Pure table lookup, no math. The LUT is 256KB.
985///
986/// # Panics
987/// Panics if `input.len() != output.len()`.
988pub fn srgb_u16_to_linear_slice(input: &[u16], output: &mut [f32]) {
989    assert_eq!(input.len(), output.len());
990    let lut = &crate::const_luts_u16::SRGB_U16_TO_LINEAR_F32;
991
992    let (in_chunks, in_remainder) = input.as_chunks::<8>();
993    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
994
995    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
996        *out = [
997            lut[inp[0] as usize],
998            lut[inp[1] as usize],
999            lut[inp[2] as usize],
1000            lut[inp[3] as usize],
1001            lut[inp[4] as usize],
1002            lut[inp[5] as usize],
1003            lut[inp[6] as usize],
1004            lut[inp[7] as usize],
1005        ];
1006    }
1007
1008    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
1009        *out = lut[*inp as usize];
1010    }
1011}
1012
1013/// Convert linear f32 values to sRGB u16 using a 65537-entry const LUT.
1014///
1015/// # Panics
1016/// Panics if `input.len() != output.len()`.
1017pub fn linear_to_srgb_u16_slice(input: &[f32], output: &mut [u16]) {
1018    assert_eq!(input.len(), output.len());
1019    let lut = &crate::const_luts_u16::LINEAR_TO_SRGB_U16_65536;
1020
1021    let (in_chunks, in_remainder) = input.as_chunks::<8>();
1022    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
1023
1024    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
1025        let linear = f32x8::from(*inp);
1026        let clamped = linear.max(ZERO).min(ONE);
1027        let scaled = clamped * f32x8::splat(65536.0) + HALF;
1028        let arr: [f32; 8] = scaled.into();
1029        *out = [
1030            lut[arr[0] as usize],
1031            lut[arr[1] as usize],
1032            lut[arr[2] as usize],
1033            lut[arr[3] as usize],
1034            lut[arr[4] as usize],
1035            lut[arr[5] as usize],
1036            lut[arr[6] as usize],
1037            lut[arr[7] as usize],
1038        ];
1039    }
1040
1041    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
1042        *out = crate::scalar::linear_to_srgb_u16(*inp);
1043    }
1044}
1045
1046// ============================================================================
1047// Custom Gamma Slice Functions
1048// ============================================================================
1049
#[cfg(target_arch = "x86_64")]
#[arcane]
fn gamma_to_linear_slice_tier_v3(token: Desktop64, values: &mut [f32], gamma: f32) {
    // Full 8-wide chunks go through the magetypes SIMD kernel; the tail is
    // finished with the scalar transfer function.
    let (chunks, remainder) = values.as_chunks_mut::<8>();

    for chunk in chunks {
        let v = mt_f32x8::from_array(token, *chunk);
        let result = gamma_to_linear_mt(token, v, gamma);
        *chunk = result.to_array();
    }

    for v in remainder {
        *v = crate::scalar::gamma_to_linear(*v, gamma);
    }
}
1065
1066fn gamma_to_linear_slice_tier_scalar(_token: ScalarToken, values: &mut [f32], gamma: f32) {
1067    for v in values.iter_mut() {
1068        *v = crate::scalar::gamma_to_linear(*v, gamma);
1069    }
1070}
1071
/// Convert gamma-encoded f32 values to linear in-place using a custom gamma.
///
/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
///
/// # Example
/// ```
/// use linear_srgb::simd::gamma_to_linear_slice;
///
/// let mut values = vec![0.0f32, 0.25, 0.5, 0.75, 1.0];
/// gamma_to_linear_slice(&mut values, 2.2);
/// ```
#[inline]
pub fn gamma_to_linear_slice(values: &mut [f32], gamma: f32) {
    // Runtime dispatch to `gamma_to_linear_slice_tier_v3` / `_tier_scalar`.
    incant!(gamma_to_linear_slice_tier(values, gamma), [v3])
}
1087
#[cfg(target_arch = "x86_64")]
#[arcane]
fn linear_to_gamma_slice_tier_v3(token: Desktop64, values: &mut [f32], gamma: f32) {
    // Full 8-wide chunks go through the magetypes SIMD kernel; the tail is
    // finished with the scalar transfer function.
    let (chunks, remainder) = values.as_chunks_mut::<8>();

    for chunk in chunks {
        let v = mt_f32x8::from_array(token, *chunk);
        let result = linear_to_gamma_mt(token, v, gamma);
        *chunk = result.to_array();
    }

    for v in remainder {
        *v = crate::scalar::linear_to_gamma(*v, gamma);
    }
}
1103
1104fn linear_to_gamma_slice_tier_scalar(_token: ScalarToken, values: &mut [f32], gamma: f32) {
1105    for v in values.iter_mut() {
1106        *v = crate::scalar::linear_to_gamma(*v, gamma);
1107    }
1108}
1109
/// Convert linear f32 values to gamma-encoded in-place using a custom gamma.
///
/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_gamma_slice;
///
/// let mut values = vec![0.0f32, 0.1, 0.2, 0.5, 1.0];
/// linear_to_gamma_slice(&mut values, 2.2);
/// ```
#[inline]
pub fn linear_to_gamma_slice(values: &mut [f32], gamma: f32) {
    // Runtime dispatch to `linear_to_gamma_slice_tier_v3` / `_tier_scalar`.
    incant!(linear_to_gamma_slice_tier(values, gamma), [v3])
}
1125
1126// ============================================================================
1127// f32x8 Slice Functions (for pre-aligned SIMD data)
1128// ============================================================================
1129
#[cfg(target_arch = "x86_64")]
#[arcane]
fn srgb_to_linear_x8_slice_tier_v3(token: Desktop64, values: &mut [f32x8]) {
    // Round-trip each vector through the magetypes representation so the
    // token-gated SIMD kernel can process it.
    for v in values.iter_mut() {
        let arr: [f32; 8] = (*v).into();
        let mt_v = mt_f32x8::from_array(token, arr);
        let result = srgb_to_linear_mt(token, mt_v);
        *v = f32x8::from(result.to_array());
    }
}
1140
1141fn srgb_to_linear_x8_slice_tier_scalar(_token: ScalarToken, values: &mut [f32x8]) {
1142    for v in values.iter_mut() {
1143        *v = srgb_to_linear_x8_inline(*v);
1144    }
1145}
1146
/// Convert sRGB f32x8 values to linear in-place.
///
/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
/// use [`srgb_to_linear_slice`] instead which handles remainders automatically.
///
/// # Example
/// ```
/// use linear_srgb::simd::srgb_to_linear_x8_slice;
/// use wide::f32x8;
///
/// let mut values = vec![f32x8::splat(0.5); 100];
/// srgb_to_linear_x8_slice(&mut values);
/// ```
#[inline]
pub fn srgb_to_linear_x8_slice(values: &mut [f32x8]) {
    // Runtime dispatch to `srgb_to_linear_x8_slice_tier_v3` / `_tier_scalar`.
    incant!(srgb_to_linear_x8_slice_tier(values), [v3])
}
1164
#[cfg(target_arch = "x86_64")]
#[arcane]
fn linear_to_srgb_x8_slice_tier_v3(token: Desktop64, values: &mut [f32x8]) {
    // Round-trip each vector through the magetypes representation so the
    // token-gated SIMD kernel can process it.
    for v in values.iter_mut() {
        let arr: [f32; 8] = (*v).into();
        let mt_v = mt_f32x8::from_array(token, arr);
        let result = linear_to_srgb_mt(token, mt_v);
        *v = f32x8::from(result.to_array());
    }
}
1175
1176fn linear_to_srgb_x8_slice_tier_scalar(_token: ScalarToken, values: &mut [f32x8]) {
1177    for v in values.iter_mut() {
1178        *v = linear_to_srgb_x8_inline(*v);
1179    }
1180}
1181
/// Convert linear f32x8 values to sRGB in-place.
///
/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
/// use [`linear_to_srgb_slice`] instead which handles remainders automatically.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_srgb_x8_slice;
/// use wide::f32x8;
///
/// let mut values = vec![f32x8::splat(0.5); 100];
/// linear_to_srgb_x8_slice(&mut values);
/// ```
#[inline]
pub fn linear_to_srgb_x8_slice(values: &mut [f32x8]) {
    // Runtime dispatch to `linear_to_srgb_x8_slice_tier_v3` / `_tier_scalar`.
    incant!(linear_to_srgb_x8_slice_tier(values), [v3])
}
1199
#[cfg(target_arch = "x86_64")]
#[arcane]
fn gamma_to_linear_x8_slice_tier_v3(token: Desktop64, values: &mut [f32x8], gamma: f32) {
    // Round-trip each vector through the magetypes representation so the
    // token-gated SIMD kernel can process it.
    for v in values.iter_mut() {
        let arr: [f32; 8] = (*v).into();
        let mt_v = mt_f32x8::from_array(token, arr);
        let result = gamma_to_linear_mt(token, mt_v, gamma);
        *v = f32x8::from(result.to_array());
    }
}
1210
1211fn gamma_to_linear_x8_slice_tier_scalar(_token: ScalarToken, values: &mut [f32x8], gamma: f32) {
1212    for v in values.iter_mut() {
1213        *v = gamma_to_linear_x8_inline(*v, gamma);
1214    }
1215}
1216
/// Convert gamma-encoded f32x8 values to linear in-place using a custom gamma.
///
/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
/// use [`gamma_to_linear_slice`] instead which handles remainders automatically.
///
/// # Example
/// ```
/// use linear_srgb::simd::gamma_to_linear_x8_slice;
/// use wide::f32x8;
///
/// let mut values = vec![f32x8::splat(0.5); 100];
/// gamma_to_linear_x8_slice(&mut values, 2.2);
/// ```
#[inline]
pub fn gamma_to_linear_x8_slice(values: &mut [f32x8], gamma: f32) {
    // Runtime dispatch to `gamma_to_linear_x8_slice_tier_v3` / `_tier_scalar`.
    incant!(gamma_to_linear_x8_slice_tier(values, gamma), [v3])
}
1234
#[cfg(target_arch = "x86_64")]
#[arcane]
fn linear_to_gamma_x8_slice_tier_v3(token: Desktop64, values: &mut [f32x8], gamma: f32) {
    // Round-trip each vector through the magetypes representation so the
    // token-gated SIMD kernel can process it.
    for v in values.iter_mut() {
        let arr: [f32; 8] = (*v).into();
        let mt_v = mt_f32x8::from_array(token, arr);
        let result = linear_to_gamma_mt(token, mt_v, gamma);
        *v = f32x8::from(result.to_array());
    }
}
1245
1246fn linear_to_gamma_x8_slice_tier_scalar(_token: ScalarToken, values: &mut [f32x8], gamma: f32) {
1247    for v in values.iter_mut() {
1248        *v = linear_to_gamma_x8_inline(*v, gamma);
1249    }
1250}
1251
/// Convert linear f32x8 values to gamma-encoded in-place using a custom gamma.
///
/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
/// use [`linear_to_gamma_slice`] instead which handles remainders automatically.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_gamma_x8_slice;
/// use wide::f32x8;
///
/// let mut values = vec![f32x8::splat(0.2); 100];
/// linear_to_gamma_x8_slice(&mut values, 2.2);
/// ```
#[inline]
pub fn linear_to_gamma_x8_slice(values: &mut [f32x8], gamma: f32) {
    // Runtime dispatch to `linear_to_gamma_x8_slice_tier_v3` / `_tier_scalar`.
    incant!(linear_to_gamma_x8_slice_tier(values, gamma), [v3])
}
1269
1270// ============================================================================
1271// f32x8 Slice Inline Functions (for use inside caller's magetypes code)
1272// ============================================================================
1273
1274/// Convert sRGB f32x8 values to linear in-place (always inlined).
1275///
1276/// Use this variant inside your own `#[magetypes]` functions to avoid
1277/// double dispatch overhead. For standalone calls, use [`srgb_to_linear_x8_slice`].
1278#[inline(always)]
1279pub fn srgb_to_linear_x8_slice_inline(values: &mut [f32x8]) {
1280    for v in values.iter_mut() {
1281        *v = srgb_to_linear_x8_inline(*v);
1282    }
1283}
1284
1285/// Convert linear f32x8 values to sRGB in-place (always inlined).
1286///
1287/// Use this variant inside your own `#[magetypes]` functions to avoid
1288/// double dispatch overhead. For standalone calls, use [`linear_to_srgb_x8_slice`].
1289#[inline(always)]
1290pub fn linear_to_srgb_x8_slice_inline(values: &mut [f32x8]) {
1291    for v in values.iter_mut() {
1292        *v = linear_to_srgb_x8_inline(*v);
1293    }
1294}
1295
1296/// Convert gamma-encoded f32x8 values to linear in-place (always inlined).
1297///
1298/// Use this variant inside your own `#[magetypes]` functions to avoid
1299/// double dispatch overhead. For standalone calls, use [`gamma_to_linear_x8_slice`].
1300#[inline(always)]
1301pub fn gamma_to_linear_x8_slice_inline(values: &mut [f32x8], gamma: f32) {
1302    for v in values.iter_mut() {
1303        *v = gamma_to_linear_x8_inline(*v, gamma);
1304    }
1305}
1306
1307/// Convert linear f32x8 values to gamma-encoded in-place (always inlined).
1308///
1309/// Use this variant inside your own `#[magetypes]` functions to avoid
1310/// double dispatch overhead. For standalone calls, use [`linear_to_gamma_x8_slice`].
1311#[inline(always)]
1312pub fn linear_to_gamma_x8_slice_inline(values: &mut [f32x8], gamma: f32) {
1313    for v in values.iter_mut() {
1314        *v = linear_to_gamma_x8_inline(*v, gamma);
1315    }
1316}
1317
1318// ============================================================================
1319// Tests
1320// ============================================================================
1321
1322#[cfg(test)]
1323mod tests {
1324    use super::*;
1325
1326    #[cfg(not(feature = "std"))]
1327    use alloc::{vec, vec::Vec};
1328
1329    // ---- x8 function tests ----
1330
1331    #[test]
1332    #[allow(deprecated)]
1333    fn test_srgb_to_linear_x8() {
1334        let input = [0.0f32, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.04];
1335        let result = srgb_to_linear_x8(f32x8::from(input));
1336        let result_arr: [f32; 8] = result.into();
1337
1338        for (i, &inp) in input.iter().enumerate() {
1339            let expected = crate::scalar::srgb_to_linear(inp);
1340            assert!(
1341                (result_arr[i] - expected).abs() < 1e-5,
1342                "srgb_to_linear_x8 mismatch at {}: got {}, expected {}",
1343                i,
1344                result_arr[i],
1345                expected
1346            );
1347        }
1348    }
1349
1350    #[test]
1351    fn test_linear_to_srgb_x8() {
1352        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.001, 0.8];
1353        let result = linear_to_srgb_x8(f32x8::from(input));
1354        let result_arr: [f32; 8] = result.into();
1355
1356        for (i, &inp) in input.iter().enumerate() {
1357            let expected = crate::scalar::linear_to_srgb(inp);
1358            assert!(
1359                (result_arr[i] - expected).abs() < 1e-5,
1360                "linear_to_srgb_x8 mismatch at {}: got {}, expected {}",
1361                i,
1362                result_arr[i],
1363                expected
1364            );
1365        }
1366    }
1367
    #[test]
    #[allow(deprecated)]
    fn test_srgb_u8_to_linear_x8() {
        // Each lane must match the scalar reference within a tight 1e-6.
        let input: [u8; 8] = [0, 64, 128, 192, 255, 32, 96, 160];
        let result = srgb_u8_to_linear_x8(input);
        let result_arr: [f32; 8] = result.into();

        for (i, &inp) in input.iter().enumerate() {
            let expected = crate::scalar::srgb_u8_to_linear(inp);
            assert!(
                (result_arr[i] - expected).abs() < 1e-6,
                "srgb_u8_to_linear_x8 mismatch at {}: got {}, expected {}",
                i,
                result_arr[i],
                expected
            );
        }
    }
1386
    #[test]
    fn test_linear_to_srgb_u8_x8() {
        // Quantized output may differ from the reference by at most 1 code value.
        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8];
        let result = linear_to_srgb_u8_x8(f32x8::from(input));

        for (i, &inp) in input.iter().enumerate() {
            let expected = (crate::scalar::linear_to_srgb(inp) * 255.0 + 0.5) as u8;
            assert!(
                (result[i] as i16 - expected as i16).abs() <= 1,
                "linear_to_srgb_u8_x8 mismatch at {}: got {}, expected {}",
                i,
                result[i],
                expected
            );
        }
    }
1403
1404    // ---- Slice function tests ----
1405
1406    #[test]
1407    fn test_srgb_to_linear_slice() {
1408        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
1409        let expected: Vec<f32> = values
1410            .iter()
1411            .map(|&v| crate::scalar::srgb_to_linear(v))
1412            .collect();
1413
1414        srgb_to_linear_slice(&mut values);
1415
1416        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
1417            assert!(
1418                (got - exp).abs() < 1e-5,
1419                "srgb_to_linear_slice mismatch at {}: got {}, expected {}",
1420                i,
1421                got,
1422                exp
1423            );
1424        }
1425    }
1426
1427    #[test]
1428    fn test_linear_to_srgb_slice() {
1429        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
1430        let expected: Vec<f32> = values
1431            .iter()
1432            .map(|&v| crate::scalar::linear_to_srgb(v))
1433            .collect();
1434
1435        linear_to_srgb_slice(&mut values);
1436
1437        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
1438            assert!(
1439                (got - exp).abs() < 1e-5,
1440                "linear_to_srgb_slice mismatch at {}: got {}, expected {}",
1441                i,
1442                got,
1443                exp
1444            );
1445        }
1446    }
1447
    #[test]
    #[allow(deprecated)]
    fn test_srgb_u8_to_linear_slice() {
        // Exhaustive: all 256 u8 codes must match the scalar reference.
        let input: Vec<u8> = (0..=255).collect();
        let mut output = vec![0.0f32; 256];

        srgb_u8_to_linear_slice(&input, &mut output);

        for (i, &out) in output.iter().enumerate() {
            let expected = crate::scalar::srgb_u8_to_linear(i as u8);
            assert!(
                (out - expected).abs() < 1e-6,
                "srgb_u8_to_linear_slice mismatch at {}: got {}, expected {}",
                i,
                out,
                expected
            );
        }
    }
1467
    #[test]
    fn test_linear_to_srgb_u8_slice() {
        // Quantized output may differ from the reference by at most 1 code value.
        let input: Vec<f32> = (0..=255).map(|i| i as f32 / 255.0).collect();
        let mut output = vec![0u8; 256];

        linear_to_srgb_u8_slice(&input, &mut output);

        for i in 0..256 {
            let expected = (crate::scalar::linear_to_srgb(input[i]) * 255.0 + 0.5) as u8;
            assert!(
                (output[i] as i16 - expected as i16).abs() <= 1,
                "linear_to_srgb_u8_slice mismatch at {}: got {}, expected {}",
                i,
                output[i],
                expected
            );
        }
    }
1486
1487    // ---- Roundtrip tests ----
1488
1489    #[test]
1490    fn test_f32_roundtrip() {
1491        let mut values: Vec<f32> = (0..1000).map(|i| i as f32 / 999.0).collect();
1492        let original = values.clone();
1493
1494        srgb_to_linear_slice(&mut values);
1495        linear_to_srgb_slice(&mut values);
1496
1497        for (i, (&orig, &conv)) in original.iter().zip(values.iter()).enumerate() {
1498            assert!(
1499                (orig - conv).abs() < 1e-4,
1500                "f32 roundtrip failed at {}: {} -> {}",
1501                i,
1502                orig,
1503                conv
1504            );
1505        }
1506    }
1507
    #[test]
    fn test_u8_roundtrip() {
        // u8 -> linear -> u8 must land within 1 code value of the original.
        let input: Vec<u8> = (0..=255).collect();
        let mut linear = vec![0.0f32; 256];
        let mut back = vec![0u8; 256];

        srgb_u8_to_linear_slice(&input, &mut linear);
        linear_to_srgb_u8_slice(&linear, &mut back);

        for i in 0..256 {
            assert!(
                (input[i] as i16 - back[i] as i16).abs() <= 1,
                "u8 roundtrip failed at {}: {} -> {} -> {}",
                i,
                input[i],
                linear[i],
                back[i]
            );
        }
    }
1528
1529    // ---- Edge case tests ----
1530
    #[test]
    #[allow(deprecated)]
    fn test_clamping() {
        // Test that out-of-range values are clamped
        let input = f32x8::from([-0.5, -0.1, 0.0, 0.5, 1.0, 1.5, 2.0, 10.0]);
        let result = srgb_to_linear_x8(input);
        let arr: [f32; 8] = result.into();

        assert_eq!(arr[0], 0.0, "negative should clamp to 0");
        assert_eq!(arr[1], 0.0, "negative should clamp to 0");
        assert!(arr[4] > 0.99 && arr[4] <= 1.0, "1.0 should stay ~1.0");
        assert!(arr[5] > 0.99 && arr[5] <= 1.0, "values > 1 should clamp");
        // NOTE(review): lanes 6-7 (2.0, 10.0) are fed in but never asserted —
        // consider extending the checks to cover them as well.
    }
1544
    #[test]
    #[allow(deprecated)]
    fn test_linear_segment() {
        // Test values in/near the linear segment. The actual threshold constant
        // is ~0.0393 (not the IEC 0.04045), so 0.04 falls on the power branch;
        // it still passes because the two segments join almost tangentially,
        // keeping the deviation from x/12.92 below the 1e-6 tolerance.
        let input = f32x8::from([0.0, 0.01, 0.02, 0.03, 0.04, 0.005, 0.015, 0.035]);
        let result = srgb_to_linear_x8(input);
        let arr: [f32; 8] = result.into();
        let input_arr: [f32; 8] = input.into();

        for i in 0..8 {
            let expected = input_arr[i] / 12.92;
            assert!(
                (arr[i] - expected).abs() < 1e-6,
                "linear segment mismatch at {}: got {}, expected {}",
                i,
                arr[i],
                expected
            );
        }
    }
1565
    /// Verify the const LUT stays in sync with the transfer function.
    /// Allows 1 ULP difference for cross-platform float variance (powf isn't
    /// perfectly deterministic across architectures).
    #[test]
    #[allow(deprecated)]
    fn test_lut_matches_transfer_function() {
        let lut = get_lut();
        for i in 0..=255u8 {
            let expected = crate::scalar::srgb_u8_to_linear(i);
            let got = lut[i as usize];
            // Compare raw bit patterns so the tolerance is an exact ULP count.
            let got_bits = got.to_bits();
            let expected_bits = expected.to_bits();
            let ulp_diff = (got_bits as i64 - expected_bits as i64).unsigned_abs();
            assert!(
                ulp_diff <= 1,
                "LUT[{}] = {} ({:08x}) differs by {} ULP from srgb_u8_to_linear({}) = {} ({:08x}). \
                 LUT needs regeneration if transfer constants changed.",
                i,
                got,
                got_bits,
                ulp_diff,
                i,
                expected,
                expected_bits
            );
        }
    }
1593
1594    #[test]
1595    fn test_empty_slice() {
1596        let mut empty: Vec<f32> = vec![];
1597        srgb_to_linear_slice(&mut empty);
1598        assert!(empty.is_empty());
1599
1600        let empty_u8: Vec<u8> = vec![];
1601        let mut empty_out: Vec<f32> = vec![];
1602        srgb_u8_to_linear_slice(&empty_u8, &mut empty_out);
1603    }
1604
    #[test]
    fn test_non_multiple_of_8() {
        // Test slices that aren't multiples of 8
        // (exercises both the SIMD chunk loop and the scalar remainder loop).
        for len in [1, 3, 7, 9, 15, 17, 100] {
            let mut values: Vec<f32> = (0..len).map(|i| i as f32 / len as f32).collect();
            let expected: Vec<f32> = values
                .iter()
                .map(|&v| crate::scalar::srgb_to_linear(v))
                .collect();

            srgb_to_linear_slice(&mut values);

            for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
                assert!(
                    (got - exp).abs() < 1e-5,
                    "len={} mismatch at {}: got {}, expected {}",
                    len,
                    i,
                    got,
                    exp
                );
            }
        }
    }
1629
1630    // ---- Custom gamma tests ----
1631
    #[test]
    fn test_gamma_to_linear_x8() {
        // Each lane must agree with the scalar gamma decode at gamma = 2.2.
        let input = [0.0f32, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.04];
        let gamma = 2.2f32;
        let result = gamma_to_linear_x8(f32x8::from(input), gamma);
        let result_arr: [f32; 8] = result.into();

        for (i, &inp) in input.iter().enumerate() {
            let expected = crate::scalar::gamma_to_linear(inp, gamma);
            assert!(
                (result_arr[i] - expected).abs() < 1e-5,
                "gamma_to_linear_x8 mismatch at {}: got {}, expected {}",
                i,
                result_arr[i],
                expected
            );
        }
    }
1650
    #[test]
    fn test_linear_to_gamma_x8() {
        // Each lane must agree with the scalar gamma encode at gamma = 2.2.
        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.001, 0.8];
        let gamma = 2.2f32;
        let result = linear_to_gamma_x8(f32x8::from(input), gamma);
        let result_arr: [f32; 8] = result.into();

        for (i, &inp) in input.iter().enumerate() {
            let expected = crate::scalar::linear_to_gamma(inp, gamma);
            assert!(
                (result_arr[i] - expected).abs() < 1e-5,
                "linear_to_gamma_x8 mismatch at {}: got {}, expected {}",
                i,
                result_arr[i],
                expected
            );
        }
    }
1669
    #[test]
    fn test_gamma_roundtrip_x8() {
        // decode -> encode must reproduce the input across several gammas.
        let input = [0.0f32, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 1.0];
        for gamma in [1.8f32, 2.0, 2.2, 2.4] {
            let linear = gamma_to_linear_x8(f32x8::from(input), gamma);
            let back = linear_to_gamma_x8(linear, gamma);
            let back_arr: [f32; 8] = back.into();

            for (i, &inp) in input.iter().enumerate() {
                assert!(
                    (inp - back_arr[i]).abs() < 1e-4,
                    "gamma {} roundtrip failed at {}: {} -> {}",
                    gamma,
                    i,
                    inp,
                    back_arr[i]
                );
            }
        }
    }
1690
    #[test]
    fn test_gamma_slice_functions() {
        // Exercise both in-place slice converters against the scalar reference.
        let gamma = 2.2f32;

        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
        let expected: Vec<f32> = values
            .iter()
            .map(|&v| crate::scalar::gamma_to_linear(v, gamma))
            .collect();

        gamma_to_linear_slice(&mut values, gamma);

        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - exp).abs() < 1e-5,
                "gamma_to_linear_slice mismatch at {}: got {}, expected {}",
                i,
                got,
                exp
            );
        }

        // Test linear_to_gamma_slice
        // (reference is computed from the now-linear `values` before encoding).
        let expected_back: Vec<f32> = values
            .iter()
            .map(|&v| crate::scalar::linear_to_gamma(v, gamma))
            .collect();

        linear_to_gamma_slice(&mut values, gamma);

        for (i, (&got, &exp)) in values.iter().zip(expected_back.iter()).enumerate() {
            assert!(
                (got - exp).abs() < 1e-5,
                "linear_to_gamma_slice mismatch at {}: got {}, expected {}",
                i,
                got,
                exp
            );
        }
    }
1731
1732    // ---- Permutation tests (archmage tier testing) ----
1733
    #[test]
    fn srgb_roundtrip_all_tiers() {
        // Run the roundtrip under every archmage token permutation so both the
        // v3 SIMD tier and the scalar tier get exercised on this machine.
        let report = archmage::testing::for_each_token_permutation(
            archmage::testing::CompileTimePolicy::Warn,
            |_perm| {
                let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
                let original = values.clone();
                srgb_to_linear_slice(&mut values);
                linear_to_srgb_slice(&mut values);
                for (i, (&orig, &conv)) in original.iter().zip(values.iter()).enumerate() {
                    assert!(
                        (orig - conv).abs() < 1e-4,
                        "tier roundtrip failed at {i}: {orig} -> {conv}"
                    );
                }
            },
        );
        eprintln!("{report}");
    }
1753}