linear_srgb/
simd.rs

1//! SIMD-accelerated sRGB ↔ linear conversion.
2//!
3//! This module provides high-performance conversion functions using AVX2/SSE SIMD
4//! instructions via the `wide` crate with runtime CPU feature detection.
5//!
6//! # API Overview
7//!
8//! ## x8 Functions (process 8 values at once)
9//! - [`srgb_to_linear_x8`] - f32x8 sRGB → f32x8 linear
10//! - [`linear_to_srgb_x8`] - f32x8 linear → f32x8 sRGB
11//! - [`srgb_u8_to_linear_x8`] - \[u8; 8\] sRGB → f32x8 linear
12//! - [`linear_to_srgb_u8_x8`] - f32x8 linear → \[u8; 8\] sRGB
13//!
14//! ## Slice Functions (process entire slices)
15//! - [`srgb_to_linear_slice`] - &mut \[f32\] sRGB → linear in-place
16//! - [`linear_to_srgb_slice`] - &mut \[f32\] linear → sRGB in-place
17//! - [`srgb_u8_to_linear_slice`] - &\[u8\] sRGB → &mut \[f32\] linear
18//! - [`linear_to_srgb_u8_slice`] - &\[f32\] linear → &mut \[u8\] sRGB
19
20use multiversed::multiversed;
21use wide::{CmpLt, f32x8};
22
23use crate::fast_math::pow_x8;
24
// sRGB transfer function constants (C0-continuous, moxcms-derived)
// These ensure exact continuity at the linear/power segment junction.
// Standard IEC values (0.055, 1.055, 0.04045) have a tiny discontinuity.
//
// Every constant is pre-splatted across all 8 lanes so the x8 kernels below
// can use it directly as an `f32x8` operand.

// Encoded-side cutoff: sRGB lanes below this take the linear segment when decoding.
const SRGB_LINEAR_THRESHOLD: f32x8 = f32x8::splat(0.039_293_37);
// Linear-side cutoff: linear lanes below this take the linear segment when encoding.
const LINEAR_THRESHOLD: f32x8 = f32x8::splat(0.003_041_282_6);
// Slope of the linear segment when decoding (sRGB → linear).
const LINEAR_SCALE: f32x8 = f32x8::splat(1.0 / 12.92);
// Offset of the power segment (added when decoding, subtracted when encoding).
const SRGB_OFFSET: f32x8 = f32x8::splat(0.055_010_72);
// Scale of the power segment (divisor when decoding, multiplier when encoding).
const SRGB_SCALE: f32x8 = f32x8::splat(1.055_010_7);
// Slope of the linear segment when encoding (linear → sRGB).
const TWELVE_92: f32x8 = f32x8::splat(12.92);
// Lower clamp bound applied to inputs of the x8 kernels.
const ZERO: f32x8 = f32x8::splat(0.0);
// Upper clamp bound applied to inputs of the x8 kernels.
const ONE: f32x8 = f32x8::splat(1.0);
// Scale from the [0, 1] range to the u8 code range in the f32 → u8 path.
const U8_MAX: f32x8 = f32x8::splat(255.0);
// Rounding bias added before the truncating `as u8` cast in the f32 → u8 path.
const HALF: f32x8 = f32x8::splat(0.5);
38
/// Precomputed sRGB u8 → linear f32 lookup table.
///
/// The index is the 8-bit sRGB code value; the entry is the corresponding
/// linear value (entry 0 is `0.0`, entry 255 is `1.0`).
///
/// Uses the same constants as the transfer module (C0-continuous IEC 61966-2-1).
/// Generated by computing `srgb_u8_to_linear(i)` for each i in 0..=255.
/// NOTE(review): presumably the scalar implementation was used for generation,
/// since the function of that name in this module reads this table — confirm.
/// To regenerate: `cargo run --release --example generate_lut`
const SRGB_U8_TO_LINEAR_LUT: [f32; 256] = [
    0.0_f32,
    0.000303527_f32,
    0.000607054_f32,
    0.000910581_f32,
    0.001214108_f32,
    0.001517635_f32,
    0.001821162_f32,
    0.0021246888_f32,
    0.002428216_f32,
    0.002731743_f32,
    0.00303527_f32,
    0.0033473307_f32,
    0.0036773437_f32,
    0.0040255957_f32,
    0.004392362_f32,
    0.004777916_f32,
    0.0051825214_f32,
    0.00560644_f32,
    0.006049924_f32,
    0.0065132244_f32,
    0.0069965874_f32,
    0.007500253_f32,
    0.008024457_f32,
    0.008569433_f32,
    0.009135411_f32,
    0.009722613_f32,
    0.010331264_f32,
    0.010961577_f32,
    0.011613773_f32,
    0.012288062_f32,
    0.012984648_f32,
    0.013703744_f32,
    0.01444555_f32,
    0.015210266_f32,
    0.01599809_f32,
    0.016809216_f32,
    0.01764384_f32,
    0.018502146_f32,
    0.019384334_f32,
    0.02029058_f32,
    0.02122107_f32,
    0.022175988_f32,
    0.023155512_f32,
    0.024159823_f32,
    0.025189094_f32,
    0.026243499_f32,
    0.027323212_f32,
    0.0284284_f32,
    0.02955924_f32,
    0.030715894_f32,
    0.03189852_f32,
    0.0331073_f32,
    0.034342386_f32,
    0.03560393_f32,
    0.036892105_f32,
    0.03820707_f32,
    0.039548974_f32,
    0.04091798_f32,
    0.04231424_f32,
    0.04373789_f32,
    0.045189105_f32,
    0.04666803_f32,
    0.04817481_f32,
    0.049709592_f32,
    0.051272515_f32,
    0.052863743_f32,
    0.054483414_f32,
    0.05613167_f32,
    0.05780865_f32,
    0.05951448_f32,
    0.061249338_f32,
    0.063013345_f32,
    0.06480663_f32,
    0.06662934_f32,
    0.068481594_f32,
    0.07036356_f32,
    0.072275355_f32,
    0.07421711_f32,
    0.07618896_f32,
    0.07819102_f32,
    0.080223456_f32,
    0.08228638_f32,
    0.08437992_f32,
    0.086504206_f32,
    0.088659346_f32,
    0.09084551_f32,
    0.093062796_f32,
    0.09531133_f32,
    0.09759124_f32,
    0.09990266_f32,
    0.10224568_f32,
    0.104620464_f32,
    0.10702711_f32,
    0.109465756_f32,
    0.1119365_f32,
    0.11443946_f32,
    0.116974786_f32,
    0.11954258_f32,
    0.12214295_f32,
    0.12477602_f32,
    0.1274419_f32,
    0.13014072_f32,
    0.1328726_f32,
    0.13563763_f32,
    0.13843594_f32,
    0.14126763_f32,
    0.14413282_f32,
    0.14703165_f32,
    0.1499642_f32,
    0.15293059_f32,
    0.15593089_f32,
    0.15896529_f32,
    0.16203386_f32,
    0.1651367_f32,
    0.16827393_f32,
    0.17144562_f32,
    0.17465195_f32,
    0.17789298_f32,
    0.18116882_f32,
    0.1844796_f32,
    0.18782537_f32,
    0.1912063_f32,
    0.19462249_f32,
    0.19807397_f32,
    0.2015609_f32,
    0.20508343_f32,
    0.20864154_f32,
    0.21223548_f32,
    0.21586527_f32,
    0.21953095_f32,
    0.22323275_f32,
    0.22697066_f32,
    0.23074481_f32,
    0.2345554_f32,
    0.23840237_f32,
    0.24228595_f32,
    0.24620613_f32,
    0.25016314_f32,
    0.25415692_f32,
    0.25818765_f32,
    0.26225552_f32,
    0.26636043_f32,
    0.27050266_f32,
    0.27468216_f32,
    0.27889907_f32,
    0.2831536_f32,
    0.28744566_f32,
    0.29177552_f32,
    0.2961431_f32,
    0.30054858_f32,
    0.30499217_f32,
    0.30947372_f32,
    0.31399357_f32,
    0.3185516_f32,
    0.32314798_f32,
    0.3277829_f32,
    0.33245632_f32,
    0.33716843_f32,
    0.34191918_f32,
    0.34670877_f32,
    0.35153738_f32,
    0.35640487_f32,
    0.36131153_f32,
    0.3662573_f32,
    0.37124234_f32,
    0.37626684_f32,
    0.38133067_f32,
    0.3864341_f32,
    0.39157712_f32,
    0.3967598_f32,
    0.4019824_f32,
    0.40724477_f32,
    0.4125472_f32,
    0.41788962_f32,
    0.42327216_f32,
    0.42869502_f32,
    0.4341581_f32,
    0.43966165_f32,
    0.44520563_f32,
    0.45079017_f32,
    0.4564154_f32,
    0.46208134_f32,
    0.46778816_f32,
    0.4735358_f32,
    0.47932443_f32,
    0.4851542_f32,
    0.49102503_f32,
    0.49693722_f32,
    0.5028906_f32,
    0.5088854_f32,
    0.5149218_f32,
    0.5209996_f32,
    0.52711916_f32,
    0.5332804_f32,
    0.53948337_f32,
    0.5457284_f32,
    0.55201524_f32,
    0.55834424_f32,
    0.56471527_f32,
    0.57112855_f32,
    0.57758415_f32,
    0.58408207_f32,
    0.5906225_f32,
    0.59720534_f32,
    0.6038308_f32,
    0.6104991_f32,
    0.61721_f32,
    0.62396383_f32,
    0.6307605_f32,
    0.6376001_f32,
    0.644483_f32,
    0.6514088_f32,
    0.658378_f32,
    0.6653904_f32,
    0.67244613_f32,
    0.67954546_f32,
    0.68668824_f32,
    0.6938747_f32,
    0.7011047_f32,
    0.7083785_f32,
    0.7156962_f32,
    0.72305775_f32,
    0.7304634_f32,
    0.73791295_f32,
    0.7454066_f32,
    0.75294465_f32,
    0.76052684_f32,
    0.7681535_f32,
    0.7758244_f32,
    0.7835399_f32,
    0.79130006_f32,
    0.79910475_f32,
    0.80695426_f32,
    0.8148484_f32,
    0.82278764_f32,
    0.8307716_f32,
    0.83880067_f32,
    0.8468749_f32,
    0.8549941_f32,
    0.8631587_f32,
    0.8713685_f32,
    0.87962353_f32,
    0.8879244_f32,
    0.89627033_f32,
    0.9046623_f32,
    0.9130995_f32,
    0.9215827_f32,
    0.9301116_f32,
    0.93868643_f32,
    0.9473071_f32,
    0.9559739_f32,
    0.9646866_f32,
    0.9734457_f32,
    0.9822507_f32,
    0.9911024_f32,
    1.0_f32,
];
301
/// Returns a `'static` reference to the sRGB u8 → linear f32 lookup table
/// (`SRGB_U8_TO_LINEAR_LUT`).
///
/// Index the returned array with the 8-bit sRGB code value; all 256 possible
/// `u8` values are valid indices.
#[inline]
fn get_lut() -> &'static [f32; 256] {
    &SRGB_U8_TO_LINEAR_LUT
}
306
307/// Convert a single sRGB u8 value to linear f32 using LUT lookup.
308///
309/// This is the fastest method for u8 input as it uses a precomputed lookup table
310/// embedded in the binary. For batch conversions, use [`srgb_u8_to_linear_slice`].
311///
312/// # Example
313/// ```
314/// use linear_srgb::simd::srgb_u8_to_linear;
315///
316/// let linear = srgb_u8_to_linear(128);
317/// assert!((linear - 0.2158).abs() < 0.001);
318/// ```
319#[inline]
320pub fn srgb_u8_to_linear(value: u8) -> f32 {
321    get_lut()[value as usize]
322}
323
324// ============================================================================
325// x8 Inline Functions - Always inlined, for use in caller's multiversed code
326// ============================================================================
327
328/// Convert 8 sRGB f32 values to linear (always inlined).
329///
330/// Use this variant inside your own `#[multiversed]` functions to avoid
331/// double dispatch overhead. For standalone calls, use [`srgb_to_linear_x8_dispatch`].
332///
333/// Input values are clamped to \[0, 1\].
334#[inline(always)]
335pub fn srgb_to_linear_x8_inline(srgb: f32x8) -> f32x8 {
336    let srgb = srgb.max(ZERO).min(ONE);
337    let linear_result = srgb * LINEAR_SCALE;
338    let power_result = pow_x8((srgb + SRGB_OFFSET) / SRGB_SCALE, 2.4);
339    let mask = srgb.simd_lt(SRGB_LINEAR_THRESHOLD);
340    mask.blend(linear_result, power_result)
341}
342
343/// Convert 8 linear f32 values to sRGB (always inlined).
344///
345/// Use this variant inside your own `#[multiversed]` functions to avoid
346/// double dispatch overhead. For standalone calls, use [`linear_to_srgb_x8_dispatch`].
347///
348/// Input values are clamped to \[0, 1\].
349#[inline(always)]
350pub fn linear_to_srgb_x8_inline(linear: f32x8) -> f32x8 {
351    let linear = linear.max(ZERO).min(ONE);
352    let linear_result = linear * TWELVE_92;
353    let power_result = SRGB_SCALE * pow_x8(linear, 1.0 / 2.4) - SRGB_OFFSET;
354    let mask = linear.simd_lt(LINEAR_THRESHOLD);
355    mask.blend(linear_result, power_result)
356}
357
358/// Convert 8 linear f32 values to sRGB u8 (always inlined).
359///
360/// Use this variant inside your own `#[multiversed]` functions to avoid
361/// double dispatch overhead.
362#[inline(always)]
363pub fn linear_to_srgb_u8_x8_inline(linear: f32x8) -> [u8; 8] {
364    let srgb = linear_to_srgb_x8_inline(linear);
365    let scaled = srgb * U8_MAX + HALF;
366    let arr: [f32; 8] = scaled.into();
367    [
368        arr[0] as u8,
369        arr[1] as u8,
370        arr[2] as u8,
371        arr[3] as u8,
372        arr[4] as u8,
373        arr[5] as u8,
374        arr[6] as u8,
375        arr[7] as u8,
376    ]
377}
378
379/// Convert 8 gamma-encoded f32 values to linear (always inlined).
380///
381/// Use this variant inside your own `#[multiversed]` functions to avoid
382/// double dispatch overhead.
383#[inline(always)]
384pub fn gamma_to_linear_x8_inline(encoded: f32x8, gamma: f32) -> f32x8 {
385    let encoded = encoded.max(ZERO).min(ONE);
386    pow_x8(encoded, gamma)
387}
388
389/// Convert 8 linear f32 values to gamma-encoded (always inlined).
390///
391/// Use this variant inside your own `#[multiversed]` functions to avoid
392/// double dispatch overhead.
393#[inline(always)]
394pub fn linear_to_gamma_x8_inline(linear: f32x8, gamma: f32) -> f32x8 {
395    let linear = linear.max(ZERO).min(ONE);
396    pow_x8(linear, 1.0 / gamma)
397}
398
399// ============================================================================
400// x8 Dispatch Functions - Runtime CPU feature detection
401// ============================================================================
402
/// Convert 8 sRGB f32 values to linear (with CPU dispatch).
///
/// This variant uses runtime CPU feature detection to select the optimal
/// implementation. Use [`srgb_to_linear_x8_inline`] inside your own
/// `#[multiversed]` functions to avoid double dispatch.
///
/// The body only forwards to [`srgb_to_linear_x8_inline`], so the numeric
/// behavior of the two variants is the same.
///
/// Input values are clamped to \[0, 1\].
#[multiversed]
#[inline]
pub fn srgb_to_linear_x8_dispatch(srgb: f32x8) -> f32x8 {
    srgb_to_linear_x8_inline(srgb)
}
415
/// Convert 8 linear f32 values to sRGB (with CPU dispatch).
///
/// This variant uses runtime CPU feature detection to select the optimal
/// implementation. Use [`linear_to_srgb_x8_inline`] inside your own
/// `#[multiversed]` functions to avoid double dispatch.
///
/// The body only forwards to [`linear_to_srgb_x8_inline`], so the numeric
/// behavior of the two variants is the same.
///
/// Input values are clamped to \[0, 1\].
#[multiversed]
#[inline]
pub fn linear_to_srgb_x8_dispatch(linear: f32x8) -> f32x8 {
    linear_to_srgb_x8_inline(linear)
}
428
/// Convert 8 linear f32 values to sRGB u8 (with CPU dispatch).
///
/// Forwards to [`linear_to_srgb_u8_x8_inline`], which clamps the input to
/// \[0, 1\] and rounds to the nearest u8. Use that `_inline` variant inside
/// your own `#[multiversed]` functions to avoid double dispatch.
#[multiversed]
#[inline]
pub fn linear_to_srgb_u8_x8_dispatch(linear: f32x8) -> [u8; 8] {
    linear_to_srgb_u8_x8_inline(linear)
}
435
/// Convert 8 gamma-encoded f32 values to linear (with CPU dispatch).
///
/// Forwards to [`gamma_to_linear_x8_inline`], which clamps the input to
/// \[0, 1\] and raises it to the power `gamma`. Use that `_inline` variant
/// inside your own `#[multiversed]` functions to avoid double dispatch.
#[multiversed]
#[inline]
pub fn gamma_to_linear_x8_dispatch(encoded: f32x8, gamma: f32) -> f32x8 {
    gamma_to_linear_x8_inline(encoded, gamma)
}
442
/// Convert 8 linear f32 values to gamma-encoded (with CPU dispatch).
///
/// Forwards to [`linear_to_gamma_x8_inline`], which clamps the input to
/// \[0, 1\] and raises it to the power `1 / gamma`. Use that `_inline` variant
/// inside your own `#[multiversed]` functions to avoid double dispatch.
#[multiversed]
#[inline]
pub fn linear_to_gamma_x8_dispatch(linear: f32x8, gamma: f32) -> f32x8 {
    linear_to_gamma_x8_inline(linear, gamma)
}
449
450// ============================================================================
451// x8 Default Functions - Calls inline variant, compiler decides inlining
452// ============================================================================
453
/// Convert 8 sRGB f32 values to linear.
///
/// This is the default variant that calls the inline implementation
/// ([`srgb_to_linear_x8_inline`]) without adding a dispatch layer.
/// Use `_dispatch` for guaranteed CPU feature detection, or `_inline`
/// inside your own `#[multiversed]` functions.
///
/// Input values are clamped to \[0, 1\].
///
/// # Example
/// ```
/// use linear_srgb::simd::srgb_to_linear_x8;
/// use wide::f32x8;
///
/// let srgb = f32x8::from([0.0, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.5]);
/// let linear = srgb_to_linear_x8(srgb);
/// ```
#[inline]
pub fn srgb_to_linear_x8(srgb: f32x8) -> f32x8 {
    srgb_to_linear_x8_inline(srgb)
}
474
/// Convert 8 linear f32 values to sRGB.
///
/// This is the default variant that calls the inline implementation
/// ([`linear_to_srgb_x8_inline`]) without adding a dispatch layer.
/// Use `_dispatch` for guaranteed CPU feature detection, or `_inline`
/// inside your own `#[multiversed]` functions.
///
/// Input values are clamped to \[0, 1\].
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_srgb_x8;
/// use wide::f32x8;
///
/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
/// let srgb = linear_to_srgb_x8(linear);
/// ```
#[inline]
pub fn linear_to_srgb_x8(linear: f32x8) -> f32x8 {
    linear_to_srgb_x8_inline(linear)
}
495
496/// Convert 8 sRGB u8 values to linear f32 using LUT lookup.
497///
498/// This is the fastest method for u8 input as it uses a precomputed lookup table.
499///
500/// # Example
501/// ```
502/// use linear_srgb::simd::srgb_u8_to_linear_x8;
503///
504/// let srgb = [0u8, 64, 128, 192, 255, 32, 96, 160];
505/// let linear = srgb_u8_to_linear_x8(srgb);
506/// ```
507#[inline]
508pub fn srgb_u8_to_linear_x8(srgb: [u8; 8]) -> f32x8 {
509    let lut = get_lut();
510    f32x8::from([
511        lut[srgb[0] as usize],
512        lut[srgb[1] as usize],
513        lut[srgb[2] as usize],
514        lut[srgb[3] as usize],
515        lut[srgb[4] as usize],
516        lut[srgb[5] as usize],
517        lut[srgb[6] as usize],
518        lut[srgb[7] as usize],
519    ])
520}
521
/// Convert 8 linear f32 values to sRGB u8.
///
/// Input values are clamped to \[0, 1\], output is rounded to nearest u8.
///
/// This is the default variant; it forwards to
/// [`linear_to_srgb_u8_x8_inline`] without adding a dispatch layer.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_srgb_u8_x8;
/// use wide::f32x8;
///
/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
/// let srgb = linear_to_srgb_u8_x8(linear);
/// ```
#[inline]
pub fn linear_to_srgb_u8_x8(linear: f32x8) -> [u8; 8] {
    linear_to_srgb_u8_x8_inline(linear)
}
538
/// Convert 8 gamma-encoded f32 values to linear.
///
/// This is the default variant; it forwards to [`gamma_to_linear_x8_inline`],
/// which clamps the input to \[0, 1\] and raises it to the power `gamma`.
///
/// # Example
/// ```
/// use linear_srgb::simd::gamma_to_linear_x8;
/// use wide::f32x8;
///
/// let encoded = f32x8::from([0.0, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.5]);
/// let linear = gamma_to_linear_x8(encoded, 2.2);
/// ```
#[inline]
pub fn gamma_to_linear_x8(encoded: f32x8, gamma: f32) -> f32x8 {
    gamma_to_linear_x8_inline(encoded, gamma)
}
553
/// Convert 8 linear f32 values to gamma-encoded.
///
/// This is the default variant; it forwards to [`linear_to_gamma_x8_inline`],
/// which clamps the input to \[0, 1\] and raises it to the power `1 / gamma`.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_gamma_x8;
/// use wide::f32x8;
///
/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
/// let encoded = linear_to_gamma_x8(linear, 2.2);
/// ```
#[inline]
pub fn linear_to_gamma_x8(linear: f32x8, gamma: f32) -> f32x8 {
    linear_to_gamma_x8_inline(linear, gamma)
}
568
569// ============================================================================
570// Slice Functions - Process entire slices
571// ============================================================================
572
573/// Convert sRGB f32 values to linear in-place.
574///
575/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
576///
577/// # Example
578/// ```
579/// use linear_srgb::simd::srgb_to_linear_slice;
580///
581/// let mut values = vec![0.0f32, 0.25, 0.5, 0.75, 1.0];
582/// srgb_to_linear_slice(&mut values);
583/// ```
584#[multiversed]
585#[inline]
586pub fn srgb_to_linear_slice(values: &mut [f32]) {
587    let (chunks, remainder) = values.as_chunks_mut::<8>();
588
589    for chunk in chunks {
590        let result = srgb_to_linear_x8_inline(f32x8::from(*chunk));
591        *chunk = result.into();
592    }
593
594    for v in remainder {
595        *v = crate::scalar::srgb_to_linear(*v);
596    }
597}
598
599/// Convert linear f32 values to sRGB in-place.
600///
601/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
602///
603/// # Example
604/// ```
605/// use linear_srgb::simd::linear_to_srgb_slice;
606///
607/// let mut values = vec![0.0f32, 0.1, 0.2, 0.5, 1.0];
608/// linear_to_srgb_slice(&mut values);
609/// ```
610#[multiversed]
611#[inline]
612pub fn linear_to_srgb_slice(values: &mut [f32]) {
613    let (chunks, remainder) = values.as_chunks_mut::<8>();
614
615    for chunk in chunks {
616        let result = linear_to_srgb_x8_inline(f32x8::from(*chunk));
617        *chunk = result.into();
618    }
619
620    for v in remainder {
621        *v = crate::scalar::linear_to_srgb(*v);
622    }
623}
624
625/// Convert sRGB u8 values to linear f32.
626///
627/// Uses a precomputed LUT for each u8 value, processed in SIMD batches of 8.
628///
629/// # Panics
630/// Panics if `input.len() != output.len()`.
631///
632/// # Example
633/// ```
634/// use linear_srgb::simd::srgb_u8_to_linear_slice;
635///
636/// let input: Vec<u8> = (0..=255).collect();
637/// let mut output = vec![0.0f32; 256];
638/// srgb_u8_to_linear_slice(&input, &mut output);
639/// ```
640#[inline]
641pub fn srgb_u8_to_linear_slice(input: &[u8], output: &mut [f32]) {
642    assert_eq!(input.len(), output.len());
643    let lut = get_lut();
644
645    let (in_chunks, in_remainder) = input.as_chunks::<8>();
646    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
647
648    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
649        *out = [
650            lut[inp[0] as usize],
651            lut[inp[1] as usize],
652            lut[inp[2] as usize],
653            lut[inp[3] as usize],
654            lut[inp[4] as usize],
655            lut[inp[5] as usize],
656            lut[inp[6] as usize],
657            lut[inp[7] as usize],
658        ];
659    }
660
661    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
662        *out = lut[*inp as usize];
663    }
664}
665
666/// Convert linear f32 values to sRGB u8.
667///
668/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
669///
670/// # Panics
671/// Panics if `input.len() != output.len()`.
672///
673/// # Example
674/// ```
675/// use linear_srgb::simd::linear_to_srgb_u8_slice;
676///
677/// let input: Vec<f32> = (0..=255).map(|i| i as f32 / 255.0).collect();
678/// let mut output = vec![0u8; 256];
679/// linear_to_srgb_u8_slice(&input, &mut output);
680/// ```
681#[multiversed]
682#[inline]
683pub fn linear_to_srgb_u8_slice(input: &[f32], output: &mut [u8]) {
684    assert_eq!(input.len(), output.len());
685
686    let (in_chunks, in_remainder) = input.as_chunks::<8>();
687    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
688
689    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
690        *out = linear_to_srgb_u8_x8_inline(f32x8::from(*inp));
691    }
692
693    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
694        let srgb = crate::scalar::linear_to_srgb(*inp);
695        *out = (srgb * 255.0 + 0.5) as u8;
696    }
697}
698
699// ============================================================================
700// Custom Gamma Slice Functions
701// ============================================================================
702
703/// Convert gamma-encoded f32 values to linear in-place using a custom gamma.
704///
705/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
706///
707/// # Example
708/// ```
709/// use linear_srgb::simd::gamma_to_linear_slice;
710///
711/// let mut values = vec![0.0f32, 0.25, 0.5, 0.75, 1.0];
712/// gamma_to_linear_slice(&mut values, 2.2);
713/// ```
714#[multiversed]
715#[inline]
716pub fn gamma_to_linear_slice(values: &mut [f32], gamma: f32) {
717    let (chunks, remainder) = values.as_chunks_mut::<8>();
718
719    for chunk in chunks {
720        let result = gamma_to_linear_x8_inline(f32x8::from(*chunk), gamma);
721        *chunk = result.into();
722    }
723
724    for v in remainder {
725        *v = crate::scalar::gamma_to_linear(*v, gamma);
726    }
727}
728
729/// Convert linear f32 values to gamma-encoded in-place using a custom gamma.
730///
731/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
732///
733/// # Example
734/// ```
735/// use linear_srgb::simd::linear_to_gamma_slice;
736///
737/// let mut values = vec![0.0f32, 0.1, 0.2, 0.5, 1.0];
738/// linear_to_gamma_slice(&mut values, 2.2);
739/// ```
740#[multiversed]
741#[inline]
742pub fn linear_to_gamma_slice(values: &mut [f32], gamma: f32) {
743    let (chunks, remainder) = values.as_chunks_mut::<8>();
744
745    for chunk in chunks {
746        let result = linear_to_gamma_x8_inline(f32x8::from(*chunk), gamma);
747        *chunk = result.into();
748    }
749
750    for v in remainder {
751        *v = crate::scalar::linear_to_gamma(*v, gamma);
752    }
753}
754
755// ============================================================================
756// f32x8 Slice Functions (for pre-aligned SIMD data)
757// ============================================================================
758
759/// Convert sRGB f32x8 values to linear in-place.
760///
761/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
762/// use [`srgb_to_linear_slice`] instead which handles remainders automatically.
763///
764/// # Example
765/// ```
766/// use linear_srgb::simd::srgb_to_linear_x8_slice;
767/// use wide::f32x8;
768///
769/// let mut values = vec![f32x8::splat(0.5); 100];
770/// srgb_to_linear_x8_slice(&mut values);
771/// ```
772#[multiversed]
773#[inline]
774pub fn srgb_to_linear_x8_slice(values: &mut [f32x8]) {
775    for v in values.iter_mut() {
776        *v = srgb_to_linear_x8_inline(*v);
777    }
778}
779
780/// Convert linear f32x8 values to sRGB in-place.
781///
782/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
783/// use [`linear_to_srgb_slice`] instead which handles remainders automatically.
784///
785/// # Example
786/// ```
787/// use linear_srgb::simd::linear_to_srgb_x8_slice;
788/// use wide::f32x8;
789///
790/// let mut values = vec![f32x8::splat(0.5); 100];
791/// linear_to_srgb_x8_slice(&mut values);
792/// ```
793#[multiversed]
794#[inline]
795pub fn linear_to_srgb_x8_slice(values: &mut [f32x8]) {
796    for v in values.iter_mut() {
797        *v = linear_to_srgb_x8_inline(*v);
798    }
799}
800
801/// Convert gamma-encoded f32x8 values to linear in-place using a custom gamma.
802///
803/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
804/// use [`gamma_to_linear_slice`] instead which handles remainders automatically.
805///
806/// # Example
807/// ```
808/// use linear_srgb::simd::gamma_to_linear_x8_slice;
809/// use wide::f32x8;
810///
811/// let mut values = vec![f32x8::splat(0.5); 100];
812/// gamma_to_linear_x8_slice(&mut values, 2.2);
813/// ```
814#[multiversed]
815#[inline]
816pub fn gamma_to_linear_x8_slice(values: &mut [f32x8], gamma: f32) {
817    for v in values.iter_mut() {
818        *v = gamma_to_linear_x8_inline(*v, gamma);
819    }
820}
821
822/// Convert linear f32x8 values to gamma-encoded in-place using a custom gamma.
823///
824/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
825/// use [`linear_to_gamma_slice`] instead which handles remainders automatically.
826///
827/// # Example
828/// ```
829/// use linear_srgb::simd::linear_to_gamma_x8_slice;
830/// use wide::f32x8;
831///
832/// let mut values = vec![f32x8::splat(0.2); 100];
833/// linear_to_gamma_x8_slice(&mut values, 2.2);
834/// ```
835#[multiversed]
836#[inline]
837pub fn linear_to_gamma_x8_slice(values: &mut [f32x8], gamma: f32) {
838    for v in values.iter_mut() {
839        *v = linear_to_gamma_x8_inline(*v, gamma);
840    }
841}
842
843// ============================================================================
844// f32x8 Slice Inline Functions (for use inside caller's multiversed code)
845// ============================================================================
846
847/// Convert sRGB f32x8 values to linear in-place (always inlined).
848///
849/// Use this variant inside your own `#[multiversed]` functions to avoid
850/// double dispatch overhead. For standalone calls, use [`srgb_to_linear_x8_slice`].
851#[inline(always)]
852pub fn srgb_to_linear_x8_slice_inline(values: &mut [f32x8]) {
853    for v in values.iter_mut() {
854        *v = srgb_to_linear_x8_inline(*v);
855    }
856}
857
858/// Convert linear f32x8 values to sRGB in-place (always inlined).
859///
860/// Use this variant inside your own `#[multiversed]` functions to avoid
861/// double dispatch overhead. For standalone calls, use [`linear_to_srgb_x8_slice`].
862#[inline(always)]
863pub fn linear_to_srgb_x8_slice_inline(values: &mut [f32x8]) {
864    for v in values.iter_mut() {
865        *v = linear_to_srgb_x8_inline(*v);
866    }
867}
868
869/// Convert gamma-encoded f32x8 values to linear in-place (always inlined).
870///
871/// Use this variant inside your own `#[multiversed]` functions to avoid
872/// double dispatch overhead. For standalone calls, use [`gamma_to_linear_x8_slice`].
873#[inline(always)]
874pub fn gamma_to_linear_x8_slice_inline(values: &mut [f32x8], gamma: f32) {
875    for v in values.iter_mut() {
876        *v = gamma_to_linear_x8_inline(*v, gamma);
877    }
878}
879
880/// Convert linear f32x8 values to gamma-encoded in-place (always inlined).
881///
882/// Use this variant inside your own `#[multiversed]` functions to avoid
883/// double dispatch overhead. For standalone calls, use [`linear_to_gamma_x8_slice`].
884#[inline(always)]
885pub fn linear_to_gamma_x8_slice_inline(values: &mut [f32x8], gamma: f32) {
886    for v in values.iter_mut() {
887        *v = linear_to_gamma_x8_inline(*v, gamma);
888    }
889}
890
891// ============================================================================
892// Tests
893// ============================================================================
894
895#[cfg(test)]
896mod tests {
897    use super::*;
898
899    #[cfg(not(feature = "std"))]
900    use alloc::{vec, vec::Vec};
901
902    // ---- x8 function tests ----
903
904    #[test]
905    #[allow(deprecated)]
906    fn test_srgb_to_linear_x8() {
907        let input = [0.0f32, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.04];
908        let result = srgb_to_linear_x8(f32x8::from(input));
909        let result_arr: [f32; 8] = result.into();
910
911        for (i, &inp) in input.iter().enumerate() {
912            let expected = crate::scalar::srgb_to_linear(inp);
913            assert!(
914                (result_arr[i] - expected).abs() < 1e-5,
915                "srgb_to_linear_x8 mismatch at {}: got {}, expected {}",
916                i,
917                result_arr[i],
918                expected
919            );
920        }
921    }
922
923    #[test]
924    fn test_linear_to_srgb_x8() {
925        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.001, 0.8];
926        let result = linear_to_srgb_x8(f32x8::from(input));
927        let result_arr: [f32; 8] = result.into();
928
929        for (i, &inp) in input.iter().enumerate() {
930            let expected = crate::scalar::linear_to_srgb(inp);
931            assert!(
932                (result_arr[i] - expected).abs() < 1e-5,
933                "linear_to_srgb_x8 mismatch at {}: got {}, expected {}",
934                i,
935                result_arr[i],
936                expected
937            );
938        }
939    }
940
941    #[test]
942    #[allow(deprecated)]
943    fn test_srgb_u8_to_linear_x8() {
944        let input: [u8; 8] = [0, 64, 128, 192, 255, 32, 96, 160];
945        let result = srgb_u8_to_linear_x8(input);
946        let result_arr: [f32; 8] = result.into();
947
948        for (i, &inp) in input.iter().enumerate() {
949            let expected = crate::scalar::srgb_u8_to_linear(inp);
950            assert!(
951                (result_arr[i] - expected).abs() < 1e-6,
952                "srgb_u8_to_linear_x8 mismatch at {}: got {}, expected {}",
953                i,
954                result_arr[i],
955                expected
956            );
957        }
958    }
959
960    #[test]
961    fn test_linear_to_srgb_u8_x8() {
962        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8];
963        let result = linear_to_srgb_u8_x8(f32x8::from(input));
964
965        for (i, &inp) in input.iter().enumerate() {
966            let expected = (crate::scalar::linear_to_srgb(inp) * 255.0 + 0.5) as u8;
967            assert!(
968                (result[i] as i16 - expected as i16).abs() <= 1,
969                "linear_to_srgb_u8_x8 mismatch at {}: got {}, expected {}",
970                i,
971                result[i],
972                expected
973            );
974        }
975    }
976
977    // ---- Slice function tests ----
978
979    #[test]
980    fn test_srgb_to_linear_slice() {
981        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
982        let expected: Vec<f32> = values
983            .iter()
984            .map(|&v| crate::scalar::srgb_to_linear(v))
985            .collect();
986
987        srgb_to_linear_slice(&mut values);
988
989        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
990            assert!(
991                (got - exp).abs() < 1e-5,
992                "srgb_to_linear_slice mismatch at {}: got {}, expected {}",
993                i,
994                got,
995                exp
996            );
997        }
998    }
999
1000    #[test]
1001    fn test_linear_to_srgb_slice() {
1002        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
1003        let expected: Vec<f32> = values
1004            .iter()
1005            .map(|&v| crate::scalar::linear_to_srgb(v))
1006            .collect();
1007
1008        linear_to_srgb_slice(&mut values);
1009
1010        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
1011            assert!(
1012                (got - exp).abs() < 1e-5,
1013                "linear_to_srgb_slice mismatch at {}: got {}, expected {}",
1014                i,
1015                got,
1016                exp
1017            );
1018        }
1019    }
1020
1021    #[test]
1022    #[allow(deprecated)]
1023    fn test_srgb_u8_to_linear_slice() {
1024        let input: Vec<u8> = (0..=255).collect();
1025        let mut output = vec![0.0f32; 256];
1026
1027        srgb_u8_to_linear_slice(&input, &mut output);
1028
1029        for (i, &out) in output.iter().enumerate() {
1030            let expected = crate::scalar::srgb_u8_to_linear(i as u8);
1031            assert!(
1032                (out - expected).abs() < 1e-6,
1033                "srgb_u8_to_linear_slice mismatch at {}: got {}, expected {}",
1034                i,
1035                out,
1036                expected
1037            );
1038        }
1039    }
1040
1041    #[test]
1042    fn test_linear_to_srgb_u8_slice() {
1043        let input: Vec<f32> = (0..=255).map(|i| i as f32 / 255.0).collect();
1044        let mut output = vec![0u8; 256];
1045
1046        linear_to_srgb_u8_slice(&input, &mut output);
1047
1048        for i in 0..256 {
1049            let expected = (crate::scalar::linear_to_srgb(input[i]) * 255.0 + 0.5) as u8;
1050            assert!(
1051                (output[i] as i16 - expected as i16).abs() <= 1,
1052                "linear_to_srgb_u8_slice mismatch at {}: got {}, expected {}",
1053                i,
1054                output[i],
1055                expected
1056            );
1057        }
1058    }
1059
1060    // ---- Roundtrip tests ----
1061
1062    #[test]
1063    fn test_f32_roundtrip() {
1064        let mut values: Vec<f32> = (0..1000).map(|i| i as f32 / 999.0).collect();
1065        let original = values.clone();
1066
1067        srgb_to_linear_slice(&mut values);
1068        linear_to_srgb_slice(&mut values);
1069
1070        for (i, (&orig, &conv)) in original.iter().zip(values.iter()).enumerate() {
1071            assert!(
1072                (orig - conv).abs() < 1e-4,
1073                "f32 roundtrip failed at {}: {} -> {}",
1074                i,
1075                orig,
1076                conv
1077            );
1078        }
1079    }
1080
1081    #[test]
1082    fn test_u8_roundtrip() {
1083        let input: Vec<u8> = (0..=255).collect();
1084        let mut linear = vec![0.0f32; 256];
1085        let mut back = vec![0u8; 256];
1086
1087        srgb_u8_to_linear_slice(&input, &mut linear);
1088        linear_to_srgb_u8_slice(&linear, &mut back);
1089
1090        for i in 0..256 {
1091            assert!(
1092                (input[i] as i16 - back[i] as i16).abs() <= 1,
1093                "u8 roundtrip failed at {}: {} -> {} -> {}",
1094                i,
1095                input[i],
1096                linear[i],
1097                back[i]
1098            );
1099        }
1100    }
1101
1102    // ---- Edge case tests ----
1103
1104    #[test]
1105    #[allow(deprecated)]
1106    fn test_clamping() {
1107        // Test that out-of-range values are clamped
1108        let input = f32x8::from([-0.5, -0.1, 0.0, 0.5, 1.0, 1.5, 2.0, 10.0]);
1109        let result = srgb_to_linear_x8(input);
1110        let arr: [f32; 8] = result.into();
1111
1112        assert_eq!(arr[0], 0.0, "negative should clamp to 0");
1113        assert_eq!(arr[1], 0.0, "negative should clamp to 0");
1114        assert!(arr[4] > 0.99 && arr[4] <= 1.0, "1.0 should stay ~1.0");
1115        assert!(arr[5] > 0.99 && arr[5] <= 1.0, "values > 1 should clamp");
1116    }
1117
1118    #[test]
1119    #[allow(deprecated)]
1120    fn test_linear_segment() {
1121        // Test values in the linear segment (< 0.04045)
1122        let input = f32x8::from([0.0, 0.01, 0.02, 0.03, 0.04, 0.005, 0.015, 0.035]);
1123        let result = srgb_to_linear_x8(input);
1124        let arr: [f32; 8] = result.into();
1125        let input_arr: [f32; 8] = input.into();
1126
1127        for i in 0..8 {
1128            let expected = input_arr[i] / 12.92;
1129            assert!(
1130                (arr[i] - expected).abs() < 1e-6,
1131                "linear segment mismatch at {}: got {}, expected {}",
1132                i,
1133                arr[i],
1134                expected
1135            );
1136        }
1137    }
1138
1139    /// Verify the const LUT stays in sync with the transfer function.
1140    /// Allows 1 ULP difference for cross-platform float variance (powf isn't
1141    /// perfectly deterministic across architectures).
1142    #[test]
1143    #[allow(deprecated)]
1144    fn test_lut_matches_transfer_function() {
1145        let lut = get_lut();
1146        for i in 0..=255u8 {
1147            let expected = crate::scalar::srgb_u8_to_linear(i);
1148            let got = lut[i as usize];
1149            let got_bits = got.to_bits();
1150            let expected_bits = expected.to_bits();
1151            let ulp_diff = (got_bits as i64 - expected_bits as i64).unsigned_abs();
1152            assert!(
1153                ulp_diff <= 1,
1154                "LUT[{}] = {} ({:08x}) differs by {} ULP from srgb_u8_to_linear({}) = {} ({:08x}). \
1155                 LUT needs regeneration if transfer constants changed.",
1156                i,
1157                got,
1158                got_bits,
1159                ulp_diff,
1160                i,
1161                expected,
1162                expected_bits
1163            );
1164        }
1165    }
1166
1167    #[test]
1168    fn test_empty_slice() {
1169        let mut empty: Vec<f32> = vec![];
1170        srgb_to_linear_slice(&mut empty);
1171        assert!(empty.is_empty());
1172
1173        let empty_u8: Vec<u8> = vec![];
1174        let mut empty_out: Vec<f32> = vec![];
1175        srgb_u8_to_linear_slice(&empty_u8, &mut empty_out);
1176    }
1177
1178    #[test]
1179    fn test_non_multiple_of_8() {
1180        // Test slices that aren't multiples of 8
1181        for len in [1, 3, 7, 9, 15, 17, 100] {
1182            let mut values: Vec<f32> = (0..len).map(|i| i as f32 / len as f32).collect();
1183            let expected: Vec<f32> = values
1184                .iter()
1185                .map(|&v| crate::scalar::srgb_to_linear(v))
1186                .collect();
1187
1188            srgb_to_linear_slice(&mut values);
1189
1190            for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
1191                assert!(
1192                    (got - exp).abs() < 1e-5,
1193                    "len={} mismatch at {}: got {}, expected {}",
1194                    len,
1195                    i,
1196                    got,
1197                    exp
1198                );
1199            }
1200        }
1201    }
1202
1203    // ---- Custom gamma tests ----
1204
1205    #[test]
1206    fn test_gamma_to_linear_x8() {
1207        let input = [0.0f32, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.04];
1208        let gamma = 2.2f32;
1209        let result = gamma_to_linear_x8(f32x8::from(input), gamma);
1210        let result_arr: [f32; 8] = result.into();
1211
1212        for (i, &inp) in input.iter().enumerate() {
1213            let expected = crate::scalar::gamma_to_linear(inp, gamma);
1214            assert!(
1215                (result_arr[i] - expected).abs() < 1e-5,
1216                "gamma_to_linear_x8 mismatch at {}: got {}, expected {}",
1217                i,
1218                result_arr[i],
1219                expected
1220            );
1221        }
1222    }
1223
1224    #[test]
1225    fn test_linear_to_gamma_x8() {
1226        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.001, 0.8];
1227        let gamma = 2.2f32;
1228        let result = linear_to_gamma_x8(f32x8::from(input), gamma);
1229        let result_arr: [f32; 8] = result.into();
1230
1231        for (i, &inp) in input.iter().enumerate() {
1232            let expected = crate::scalar::linear_to_gamma(inp, gamma);
1233            assert!(
1234                (result_arr[i] - expected).abs() < 1e-5,
1235                "linear_to_gamma_x8 mismatch at {}: got {}, expected {}",
1236                i,
1237                result_arr[i],
1238                expected
1239            );
1240        }
1241    }
1242
1243    #[test]
1244    fn test_gamma_roundtrip_x8() {
1245        let input = [0.0f32, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 1.0];
1246        for gamma in [1.8f32, 2.0, 2.2, 2.4] {
1247            let linear = gamma_to_linear_x8(f32x8::from(input), gamma);
1248            let back = linear_to_gamma_x8(linear, gamma);
1249            let back_arr: [f32; 8] = back.into();
1250
1251            for (i, &inp) in input.iter().enumerate() {
1252                assert!(
1253                    (inp - back_arr[i]).abs() < 1e-4,
1254                    "gamma {} roundtrip failed at {}: {} -> {}",
1255                    gamma,
1256                    i,
1257                    inp,
1258                    back_arr[i]
1259                );
1260            }
1261        }
1262    }
1263
1264    #[test]
1265    fn test_gamma_slice_functions() {
1266        let gamma = 2.2f32;
1267
1268        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
1269        let expected: Vec<f32> = values
1270            .iter()
1271            .map(|&v| crate::scalar::gamma_to_linear(v, gamma))
1272            .collect();
1273
1274        gamma_to_linear_slice(&mut values, gamma);
1275
1276        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
1277            assert!(
1278                (got - exp).abs() < 1e-5,
1279                "gamma_to_linear_slice mismatch at {}: got {}, expected {}",
1280                i,
1281                got,
1282                exp
1283            );
1284        }
1285
1286        // Test linear_to_gamma_slice
1287        let expected_back: Vec<f32> = values
1288            .iter()
1289            .map(|&v| crate::scalar::linear_to_gamma(v, gamma))
1290            .collect();
1291
1292        linear_to_gamma_slice(&mut values, gamma);
1293
1294        for (i, (&got, &exp)) in values.iter().zip(expected_back.iter()).enumerate() {
1295            assert!(
1296                (got - exp).abs() < 1e-5,
1297                "linear_to_gamma_slice mismatch at {}: got {}, expected {}",
1298                i,
1299                got,
1300                exp
1301            );
1302        }
1303    }
1304}