linear_srgb/
simd.rs

1//! SIMD-accelerated sRGB ↔ linear conversion.
2//!
3//! This module provides high-performance conversion functions using AVX2/SSE SIMD
4//! instructions via the `wide` crate with runtime CPU feature detection.
5//!
6//! # API Overview
7//!
8//! ## x8 Functions (process 8 values at once)
9//! - [`srgb_to_linear_x8`] - f32x8 sRGB → f32x8 linear
10//! - [`linear_to_srgb_x8`] - f32x8 linear → f32x8 sRGB
11//! - [`srgb_u8_to_linear_x8`] - \[u8; 8\] sRGB → f32x8 linear
12//! - [`linear_to_srgb_u8_x8`] - f32x8 linear → \[u8; 8\] sRGB
13//!
14//! ## Slice Functions (process entire slices)
15//! - [`srgb_to_linear_slice`] - &mut \[f32\] sRGB → linear in-place
16//! - [`linear_to_srgb_slice`] - &mut \[f32\] linear → sRGB in-place
17//! - [`srgb_u8_to_linear_slice`] - &\[u8\] sRGB → &mut \[f32\] linear
18//! - [`linear_to_srgb_u8_slice`] - &\[f32\] linear → &mut \[u8\] sRGB
19
20use multiversed::multiversed;
21use wide::{CmpLt, f32x8};
22
23use crate::fast_math::pow_x8;
24
// sRGB transfer function constants (C0-continuous, moxcms-derived)
// These ensure exact continuity at the linear/power segment junction.
// Standard IEC values (0.055, 1.055, 0.04045) have a tiny discontinuity.
// Each constant is pre-broadcast to all 8 lanes so it can be used directly
// in `f32x8` arithmetic.

/// Encoded (sRGB) values below this take the linear segment when decoding.
const SRGB_LINEAR_THRESHOLD: f32x8 = f32x8::splat(0.039_293_37);
/// Linear values below this take the linear segment when encoding.
const LINEAR_THRESHOLD: f32x8 = f32x8::splat(0.003_041_282_6);
/// Slope applied on the linear segment when decoding (sRGB → linear).
const LINEAR_SCALE: f32x8 = f32x8::splat(1.0 / 12.92);
/// Offset of the power segment (added when decoding, subtracted when encoding).
const SRGB_OFFSET: f32x8 = f32x8::splat(0.055_010_72);
/// Scale of the power segment (divided when decoding, multiplied when encoding).
const SRGB_SCALE: f32x8 = f32x8::splat(1.055_010_7);
/// Slope applied on the linear segment when encoding (linear → sRGB).
const TWELVE_92: f32x8 = f32x8::splat(12.92);
/// Lower clamp bound for inputs.
const ZERO: f32x8 = f32x8::splat(0.0);
/// Upper clamp bound for inputs.
const ONE: f32x8 = f32x8::splat(1.0);
/// Scale from the \[0, 1\] range to the u8 range.
const U8_MAX: f32x8 = f32x8::splat(255.0);
/// Rounding bias added before the truncating cast to u8.
const HALF: f32x8 = f32x8::splat(0.5);
38
/// Precomputed sRGB u8 → linear f32 lookup table, indexed by the 8-bit sRGB code value.
/// Uses the same constants as the transfer module (C0-continuous IEC 61966-2-1).
/// Generated by computing `srgb_u8_to_linear(i)` for each `i` in `0..=255`.
/// To regenerate: `cargo run --release --example generate_lut`
43const SRGB_U8_TO_LINEAR_LUT: [f32; 256] = [
44    0.0_f32,
45    0.000303527_f32,
46    0.000607054_f32,
47    0.000910581_f32,
48    0.001214108_f32,
49    0.001517635_f32,
50    0.001821162_f32,
51    0.0021246888_f32,
52    0.002428216_f32,
53    0.002731743_f32,
54    0.00303527_f32,
55    0.0033473307_f32,
56    0.0036773437_f32,
57    0.0040255957_f32,
58    0.004392362_f32,
59    0.004777916_f32,
60    0.0051825214_f32,
61    0.00560644_f32,
62    0.006049924_f32,
63    0.0065132244_f32,
64    0.0069965874_f32,
65    0.007500253_f32,
66    0.008024457_f32,
67    0.008569433_f32,
68    0.009135411_f32,
69    0.009722613_f32,
70    0.010331264_f32,
71    0.010961577_f32,
72    0.011613773_f32,
73    0.012288062_f32,
74    0.012984648_f32,
75    0.013703744_f32,
76    0.01444555_f32,
77    0.015210266_f32,
78    0.01599809_f32,
79    0.016809216_f32,
80    0.01764384_f32,
81    0.018502146_f32,
82    0.019384334_f32,
83    0.02029058_f32,
84    0.02122107_f32,
85    0.022175988_f32,
86    0.023155512_f32,
87    0.024159823_f32,
88    0.025189094_f32,
89    0.026243499_f32,
90    0.027323212_f32,
91    0.0284284_f32,
92    0.02955924_f32,
93    0.030715894_f32,
94    0.03189852_f32,
95    0.0331073_f32,
96    0.034342386_f32,
97    0.03560393_f32,
98    0.036892105_f32,
99    0.03820707_f32,
100    0.039548974_f32,
101    0.04091798_f32,
102    0.04231424_f32,
103    0.04373789_f32,
104    0.045189105_f32,
105    0.04666803_f32,
106    0.04817481_f32,
107    0.049709592_f32,
108    0.051272515_f32,
109    0.052863743_f32,
110    0.054483414_f32,
111    0.05613167_f32,
112    0.05780865_f32,
113    0.05951448_f32,
114    0.061249338_f32,
115    0.063013345_f32,
116    0.06480663_f32,
117    0.06662934_f32,
118    0.068481594_f32,
119    0.07036356_f32,
120    0.072275355_f32,
121    0.07421711_f32,
122    0.07618896_f32,
123    0.07819102_f32,
124    0.080223456_f32,
125    0.08228638_f32,
126    0.08437992_f32,
127    0.086504206_f32,
128    0.088659346_f32,
129    0.09084551_f32,
130    0.093062796_f32,
131    0.09531133_f32,
132    0.09759124_f32,
133    0.09990266_f32,
134    0.10224568_f32,
135    0.104620464_f32,
136    0.10702711_f32,
137    0.109465756_f32,
138    0.1119365_f32,
139    0.11443946_f32,
140    0.116974786_f32,
141    0.11954258_f32,
142    0.12214295_f32,
143    0.12477602_f32,
144    0.1274419_f32,
145    0.13014072_f32,
146    0.1328726_f32,
147    0.13563763_f32,
148    0.13843594_f32,
149    0.14126763_f32,
150    0.14413282_f32,
151    0.14703165_f32,
152    0.1499642_f32,
153    0.15293059_f32,
154    0.15593089_f32,
155    0.15896529_f32,
156    0.16203386_f32,
157    0.1651367_f32,
158    0.16827393_f32,
159    0.17144562_f32,
160    0.17465195_f32,
161    0.17789298_f32,
162    0.18116882_f32,
163    0.1844796_f32,
164    0.18782537_f32,
165    0.1912063_f32,
166    0.19462249_f32,
167    0.19807397_f32,
168    0.2015609_f32,
169    0.20508343_f32,
170    0.20864154_f32,
171    0.21223548_f32,
172    0.21586527_f32,
173    0.21953095_f32,
174    0.22323275_f32,
175    0.22697066_f32,
176    0.23074481_f32,
177    0.2345554_f32,
178    0.23840237_f32,
179    0.24228595_f32,
180    0.24620613_f32,
181    0.25016314_f32,
182    0.25415692_f32,
183    0.25818765_f32,
184    0.26225552_f32,
185    0.26636043_f32,
186    0.27050266_f32,
187    0.27468216_f32,
188    0.27889907_f32,
189    0.2831536_f32,
190    0.28744566_f32,
191    0.29177552_f32,
192    0.2961431_f32,
193    0.30054858_f32,
194    0.30499217_f32,
195    0.30947372_f32,
196    0.31399357_f32,
197    0.3185516_f32,
198    0.32314798_f32,
199    0.3277829_f32,
200    0.33245632_f32,
201    0.33716843_f32,
202    0.34191918_f32,
203    0.34670877_f32,
204    0.35153738_f32,
205    0.35640487_f32,
206    0.36131153_f32,
207    0.3662573_f32,
208    0.37124234_f32,
209    0.37626684_f32,
210    0.38133067_f32,
211    0.3864341_f32,
212    0.39157712_f32,
213    0.3967598_f32,
214    0.4019824_f32,
215    0.40724477_f32,
216    0.4125472_f32,
217    0.41788962_f32,
218    0.42327216_f32,
219    0.42869502_f32,
220    0.4341581_f32,
221    0.43966165_f32,
222    0.44520563_f32,
223    0.45079017_f32,
224    0.4564154_f32,
225    0.46208134_f32,
226    0.46778816_f32,
227    0.4735358_f32,
228    0.47932443_f32,
229    0.4851542_f32,
230    0.49102503_f32,
231    0.49693722_f32,
232    0.5028906_f32,
233    0.5088854_f32,
234    0.5149218_f32,
235    0.5209996_f32,
236    0.52711916_f32,
237    0.5332804_f32,
238    0.53948337_f32,
239    0.5457284_f32,
240    0.55201524_f32,
241    0.55834424_f32,
242    0.56471527_f32,
243    0.57112855_f32,
244    0.57758415_f32,
245    0.58408207_f32,
246    0.5906225_f32,
247    0.59720534_f32,
248    0.6038308_f32,
249    0.6104991_f32,
250    0.61721_f32,
251    0.62396383_f32,
252    0.6307605_f32,
253    0.6376001_f32,
254    0.644483_f32,
255    0.6514088_f32,
256    0.658378_f32,
257    0.6653904_f32,
258    0.67244613_f32,
259    0.67954546_f32,
260    0.68668824_f32,
261    0.6938747_f32,
262    0.7011047_f32,
263    0.7083785_f32,
264    0.7156962_f32,
265    0.72305775_f32,
266    0.7304634_f32,
267    0.73791295_f32,
268    0.7454066_f32,
269    0.75294465_f32,
270    0.76052684_f32,
271    0.7681535_f32,
272    0.7758244_f32,
273    0.7835399_f32,
274    0.79130006_f32,
275    0.79910475_f32,
276    0.80695426_f32,
277    0.8148484_f32,
278    0.82278764_f32,
279    0.8307716_f32,
280    0.83880067_f32,
281    0.8468749_f32,
282    0.8549941_f32,
283    0.8631587_f32,
284    0.8713685_f32,
285    0.87962353_f32,
286    0.8879244_f32,
287    0.89627033_f32,
288    0.9046623_f32,
289    0.9130995_f32,
290    0.9215827_f32,
291    0.9301116_f32,
292    0.93868643_f32,
293    0.9473071_f32,
294    0.9559739_f32,
295    0.9646866_f32,
296    0.9734457_f32,
297    0.9822507_f32,
298    0.9911024_f32,
299    1.0_f32,
300];
301
/// Borrow the embedded sRGB u8 → linear f32 lookup table.
///
/// Returns a `'static` reference to the 256-entry table so callers can index
/// it in place.
#[inline]
fn get_lut() -> &'static [f32; 256] {
    &SRGB_U8_TO_LINEAR_LUT
}
306
307/// Convert a single sRGB u8 value to linear f32 using LUT lookup.
308///
309/// This is the fastest method for u8 input as it uses a precomputed lookup table
310/// embedded in the binary. For batch conversions, use [`srgb_u8_to_linear_slice`].
311///
312/// # Example
313/// ```
314/// use linear_srgb::simd::srgb_u8_to_linear;
315///
316/// let linear = srgb_u8_to_linear(128);
317/// assert!((linear - 0.2158).abs() < 0.001);
318/// ```
319#[inline]
320pub fn srgb_u8_to_linear(value: u8) -> f32 {
321    get_lut()[value as usize]
322}
323
324// ============================================================================
325// x8 Inline Functions - Always inlined, for use in caller's multiversed code
326// ============================================================================
327
328/// Convert 8 sRGB f32 values to linear (always inlined).
329///
330/// Use this variant inside your own `#[multiversed]` functions to avoid
331/// double dispatch overhead. For standalone calls, use [`srgb_to_linear_x8_dispatch`].
332///
333/// Input values are clamped to \[0, 1\].
334#[inline(always)]
335pub fn srgb_to_linear_x8_inline(srgb: f32x8) -> f32x8 {
336    let srgb = srgb.max(ZERO).min(ONE);
337    let linear_result = srgb * LINEAR_SCALE;
338    let power_result = pow_x8((srgb + SRGB_OFFSET) / SRGB_SCALE, 2.4);
339    let mask = srgb.simd_lt(SRGB_LINEAR_THRESHOLD);
340    mask.blend(linear_result, power_result)
341}
342
343/// Convert 8 linear f32 values to sRGB (always inlined).
344///
345/// Use this variant inside your own `#[multiversed]` functions to avoid
346/// double dispatch overhead. For standalone calls, use [`linear_to_srgb_x8_dispatch`].
347///
348/// Input values are clamped to \[0, 1\].
349#[inline(always)]
350pub fn linear_to_srgb_x8_inline(linear: f32x8) -> f32x8 {
351    let linear = linear.max(ZERO).min(ONE);
352    let linear_result = linear * TWELVE_92;
353    let power_result = SRGB_SCALE * pow_x8(linear, 1.0 / 2.4) - SRGB_OFFSET;
354    let mask = linear.simd_lt(LINEAR_THRESHOLD);
355    mask.blend(linear_result, power_result)
356}
357
358/// Convert 8 linear f32 values to sRGB u8 (always inlined).
359///
360/// Use this variant inside your own `#[multiversed]` functions to avoid
361/// double dispatch overhead.
362#[inline(always)]
363pub fn linear_to_srgb_u8_x8_inline(linear: f32x8) -> [u8; 8] {
364    let srgb = linear_to_srgb_x8_inline(linear);
365    let scaled = srgb * U8_MAX + HALF;
366    let arr: [f32; 8] = scaled.into();
367    [
368        arr[0] as u8,
369        arr[1] as u8,
370        arr[2] as u8,
371        arr[3] as u8,
372        arr[4] as u8,
373        arr[5] as u8,
374        arr[6] as u8,
375        arr[7] as u8,
376    ]
377}
378
379/// Convert 8 gamma-encoded f32 values to linear (always inlined).
380///
381/// Use this variant inside your own `#[multiversed]` functions to avoid
382/// double dispatch overhead.
383#[inline(always)]
384pub fn gamma_to_linear_x8_inline(encoded: f32x8, gamma: f32) -> f32x8 {
385    let encoded = encoded.max(ZERO).min(ONE);
386    pow_x8(encoded, gamma)
387}
388
389/// Convert 8 linear f32 values to gamma-encoded (always inlined).
390///
391/// Use this variant inside your own `#[multiversed]` functions to avoid
392/// double dispatch overhead.
393#[inline(always)]
394pub fn linear_to_gamma_x8_inline(linear: f32x8, gamma: f32) -> f32x8 {
395    let linear = linear.max(ZERO).min(ONE);
396    pow_x8(linear, 1.0 / gamma)
397}
398
399// ============================================================================
400// x8 Dispatch Functions - Runtime CPU feature detection
401// ============================================================================
402
/// Convert 8 sRGB f32 values to linear (with CPU dispatch).
///
/// This variant uses runtime CPU feature detection to select the optimal
/// implementation. Use [`srgb_to_linear_x8_inline`] inside your own
/// `#[multiversed]` functions to avoid double dispatch.
///
/// Input values are clamped to \[0, 1\].
///
/// The conversion itself is delegated to [`srgb_to_linear_x8_inline`]; this
/// wrapper only adds the `#[multiversed]` entry point.
#[multiversed]
#[inline]
pub fn srgb_to_linear_x8_dispatch(srgb: f32x8) -> f32x8 {
    srgb_to_linear_x8_inline(srgb)
}
415
/// Convert 8 linear f32 values to sRGB (with CPU dispatch).
///
/// This variant uses runtime CPU feature detection to select the optimal
/// implementation. Use [`linear_to_srgb_x8_inline`] inside your own
/// `#[multiversed]` functions to avoid double dispatch.
///
/// Input values are clamped to \[0, 1\].
///
/// The conversion itself is delegated to [`linear_to_srgb_x8_inline`]; this
/// wrapper only adds the `#[multiversed]` entry point.
#[multiversed]
#[inline]
pub fn linear_to_srgb_x8_dispatch(linear: f32x8) -> f32x8 {
    linear_to_srgb_x8_inline(linear)
}
428
/// Convert 8 linear f32 values to sRGB u8 (with CPU dispatch).
///
/// Delegates to [`linear_to_srgb_u8_x8_inline`], which clamps input to
/// \[0, 1\] and rounds the scaled result by adding 0.5 before the cast to u8.
/// Use the `_inline` variant inside your own `#[multiversed]` functions to
/// avoid double dispatch.
#[multiversed]
#[inline]
pub fn linear_to_srgb_u8_x8_dispatch(linear: f32x8) -> [u8; 8] {
    linear_to_srgb_u8_x8_inline(linear)
}
435
/// Convert 8 gamma-encoded f32 values to linear (with CPU dispatch).
///
/// Delegates to [`gamma_to_linear_x8_inline`], which clamps input to
/// \[0, 1\] before applying the exponent `gamma`. Use the `_inline` variant
/// inside your own `#[multiversed]` functions to avoid double dispatch.
#[multiversed]
#[inline]
pub fn gamma_to_linear_x8_dispatch(encoded: f32x8, gamma: f32) -> f32x8 {
    gamma_to_linear_x8_inline(encoded, gamma)
}
442
/// Convert 8 linear f32 values to gamma-encoded (with CPU dispatch).
///
/// Delegates to [`linear_to_gamma_x8_inline`], which clamps input to
/// \[0, 1\] before applying the exponent `1 / gamma`. Use the `_inline`
/// variant inside your own `#[multiversed]` functions to avoid double dispatch.
#[multiversed]
#[inline]
pub fn linear_to_gamma_x8_dispatch(linear: f32x8, gamma: f32) -> f32x8 {
    linear_to_gamma_x8_inline(linear, gamma)
}
449
450// ============================================================================
451// x8 Default Functions - Calls inline variant, compiler decides inlining
452// ============================================================================
453
/// Convert 8 sRGB f32 values to linear.
///
/// This is the default variant that calls the inline implementation.
/// Use `_dispatch` for guaranteed CPU feature detection, or `_inline`
/// inside your own `#[multiversed]` functions.
///
/// Input values are clamped to \[0, 1\].
///
/// This function carries no `#[multiversed]` attribute of its own; it simply
/// forwards to [`srgb_to_linear_x8_inline`].
///
/// # Example
/// ```
/// use linear_srgb::simd::srgb_to_linear_x8;
/// use wide::f32x8;
///
/// let srgb = f32x8::from([0.0, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.5]);
/// let linear = srgb_to_linear_x8(srgb);
/// ```
#[inline]
pub fn srgb_to_linear_x8(srgb: f32x8) -> f32x8 {
    srgb_to_linear_x8_inline(srgb)
}
474
/// Convert 8 linear f32 values to sRGB.
///
/// This is the default variant that calls the inline implementation.
/// Use `_dispatch` for guaranteed CPU feature detection, or `_inline`
/// inside your own `#[multiversed]` functions.
///
/// Input values are clamped to \[0, 1\].
///
/// This function carries no `#[multiversed]` attribute of its own; it simply
/// forwards to [`linear_to_srgb_x8_inline`].
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_srgb_x8;
/// use wide::f32x8;
///
/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
/// let srgb = linear_to_srgb_x8(linear);
/// ```
#[inline]
pub fn linear_to_srgb_x8(linear: f32x8) -> f32x8 {
    linear_to_srgb_x8_inline(linear)
}
495
496/// Convert 8 sRGB u8 values to linear f32 using LUT lookup.
497///
498/// This is the fastest method for u8 input as it uses a precomputed lookup table.
499///
500/// # Example
501/// ```
502/// use linear_srgb::simd::srgb_u8_to_linear_x8;
503///
504/// let srgb = [0u8, 64, 128, 192, 255, 32, 96, 160];
505/// let linear = srgb_u8_to_linear_x8(srgb);
506/// ```
507#[inline]
508pub fn srgb_u8_to_linear_x8(srgb: [u8; 8]) -> f32x8 {
509    let lut = get_lut();
510    f32x8::from([
511        lut[srgb[0] as usize],
512        lut[srgb[1] as usize],
513        lut[srgb[2] as usize],
514        lut[srgb[3] as usize],
515        lut[srgb[4] as usize],
516        lut[srgb[5] as usize],
517        lut[srgb[6] as usize],
518        lut[srgb[7] as usize],
519    ])
520}
521
/// Convert 8 linear f32 values to sRGB u8.
///
/// Input values are clamped to \[0, 1\], output is rounded to nearest u8.
///
/// This function carries no `#[multiversed]` attribute of its own; it simply
/// forwards to [`linear_to_srgb_u8_x8_inline`]. Use
/// [`linear_to_srgb_u8_x8_dispatch`] for the dispatched variant.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_srgb_u8_x8;
/// use wide::f32x8;
///
/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
/// let srgb = linear_to_srgb_u8_x8(linear);
/// ```
#[inline]
pub fn linear_to_srgb_u8_x8(linear: f32x8) -> [u8; 8] {
    linear_to_srgb_u8_x8_inline(linear)
}
538
/// Convert 8 gamma-encoded f32 values to linear.
///
/// Forwards to [`gamma_to_linear_x8_inline`], which clamps input to
/// \[0, 1\] before applying the exponent `gamma`. Use
/// [`gamma_to_linear_x8_dispatch`] for the dispatched variant.
///
/// # Example
/// ```
/// use linear_srgb::simd::gamma_to_linear_x8;
/// use wide::f32x8;
///
/// let encoded = f32x8::from([0.0, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.5]);
/// let linear = gamma_to_linear_x8(encoded, 2.2);
/// ```
#[inline]
pub fn gamma_to_linear_x8(encoded: f32x8, gamma: f32) -> f32x8 {
    gamma_to_linear_x8_inline(encoded, gamma)
}
553
/// Convert 8 linear f32 values to gamma-encoded.
///
/// Forwards to [`linear_to_gamma_x8_inline`], which clamps input to
/// \[0, 1\] before applying the exponent `1 / gamma`. Use
/// [`linear_to_gamma_x8_dispatch`] for the dispatched variant.
///
/// # Example
/// ```
/// use linear_srgb::simd::linear_to_gamma_x8;
/// use wide::f32x8;
///
/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
/// let encoded = linear_to_gamma_x8(linear, 2.2);
/// ```
#[inline]
pub fn linear_to_gamma_x8(linear: f32x8, gamma: f32) -> f32x8 {
    linear_to_gamma_x8_inline(linear, gamma)
}
568
569// ============================================================================
570// Slice Functions - Process entire slices
571// ============================================================================
572
573/// Convert sRGB f32 values to linear in-place.
574///
575/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
576///
577/// # Example
578/// ```
579/// use linear_srgb::simd::srgb_to_linear_slice;
580///
581/// let mut values = vec![0.0f32, 0.25, 0.5, 0.75, 1.0];
582/// srgb_to_linear_slice(&mut values);
583/// ```
584#[multiversed]
585#[inline]
586pub fn srgb_to_linear_slice(values: &mut [f32]) {
587    let (chunks, remainder) = values.as_chunks_mut::<8>();
588
589    for chunk in chunks {
590        let result = srgb_to_linear_x8_inline(f32x8::from(*chunk));
591        *chunk = result.into();
592    }
593
594    for v in remainder {
595        *v = crate::scalar::srgb_to_linear(*v);
596    }
597}
598
599/// Convert linear f32 values to sRGB in-place.
600///
601/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
602///
603/// # Example
604/// ```
605/// use linear_srgb::simd::linear_to_srgb_slice;
606///
607/// let mut values = vec![0.0f32, 0.1, 0.2, 0.5, 1.0];
608/// linear_to_srgb_slice(&mut values);
609/// ```
610#[multiversed]
611#[inline]
612pub fn linear_to_srgb_slice(values: &mut [f32]) {
613    let (chunks, remainder) = values.as_chunks_mut::<8>();
614
615    for chunk in chunks {
616        let result = linear_to_srgb_x8_inline(f32x8::from(*chunk));
617        *chunk = result.into();
618    }
619
620    for v in remainder {
621        *v = crate::scalar::linear_to_srgb(*v);
622    }
623}
624
625/// Convert sRGB u8 values to linear f32.
626///
627/// Uses a precomputed LUT for each u8 value, processed in SIMD batches of 8.
628///
629/// # Panics
630/// Panics if `input.len() != output.len()`.
631///
632/// # Example
633/// ```
634/// use linear_srgb::simd::srgb_u8_to_linear_slice;
635///
636/// let input: Vec<u8> = (0..=255).collect();
637/// let mut output = vec![0.0f32; 256];
638/// srgb_u8_to_linear_slice(&input, &mut output);
639/// ```
640#[inline]
641pub fn srgb_u8_to_linear_slice(input: &[u8], output: &mut [f32]) {
642    assert_eq!(input.len(), output.len());
643    let lut = get_lut();
644
645    let (in_chunks, in_remainder) = input.as_chunks::<8>();
646    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
647
648    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
649        *out = [
650            lut[inp[0] as usize],
651            lut[inp[1] as usize],
652            lut[inp[2] as usize],
653            lut[inp[3] as usize],
654            lut[inp[4] as usize],
655            lut[inp[5] as usize],
656            lut[inp[6] as usize],
657            lut[inp[7] as usize],
658        ];
659    }
660
661    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
662        *out = lut[*inp as usize];
663    }
664}
665
666/// Convert linear f32 values to sRGB u8.
667///
668/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
669///
670/// # Panics
671/// Panics if `input.len() != output.len()`.
672///
673/// # Example
674/// ```
675/// use linear_srgb::simd::linear_to_srgb_u8_slice;
676///
677/// let input: Vec<f32> = (0..=255).map(|i| i as f32 / 255.0).collect();
678/// let mut output = vec![0u8; 256];
679/// linear_to_srgb_u8_slice(&input, &mut output);
680/// ```
681#[multiversed]
682#[inline]
683pub fn linear_to_srgb_u8_slice(input: &[f32], output: &mut [u8]) {
684    assert_eq!(input.len(), output.len());
685
686    let (in_chunks, in_remainder) = input.as_chunks::<8>();
687    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
688
689    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
690        *out = linear_to_srgb_u8_x8_inline(f32x8::from(*inp));
691    }
692
693    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
694        let srgb = crate::scalar::linear_to_srgb(*inp);
695        *out = (srgb * 255.0 + 0.5) as u8;
696    }
697}
698
699// ============================================================================
700// Custom Gamma Slice Functions
701// ============================================================================
702
703/// Convert gamma-encoded f32 values to linear in-place using a custom gamma.
704///
705/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
706///
707/// # Example
708/// ```
709/// use linear_srgb::simd::gamma_to_linear_slice;
710///
711/// let mut values = vec![0.0f32, 0.25, 0.5, 0.75, 1.0];
712/// gamma_to_linear_slice(&mut values, 2.2);
713/// ```
714#[multiversed]
715#[inline]
716pub fn gamma_to_linear_slice(values: &mut [f32], gamma: f32) {
717    let (chunks, remainder) = values.as_chunks_mut::<8>();
718
719    for chunk in chunks {
720        let result = gamma_to_linear_x8_inline(f32x8::from(*chunk), gamma);
721        *chunk = result.into();
722    }
723
724    for v in remainder {
725        *v = crate::scalar::gamma_to_linear(*v, gamma);
726    }
727}
728
729/// Convert linear f32 values to gamma-encoded in-place using a custom gamma.
730///
731/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
732///
733/// # Example
734/// ```
735/// use linear_srgb::simd::linear_to_gamma_slice;
736///
737/// let mut values = vec![0.0f32, 0.1, 0.2, 0.5, 1.0];
738/// linear_to_gamma_slice(&mut values, 2.2);
739/// ```
740#[multiversed]
741#[inline]
742pub fn linear_to_gamma_slice(values: &mut [f32], gamma: f32) {
743    let (chunks, remainder) = values.as_chunks_mut::<8>();
744
745    for chunk in chunks {
746        let result = linear_to_gamma_x8_inline(f32x8::from(*chunk), gamma);
747        *chunk = result.into();
748    }
749
750    for v in remainder {
751        *v = crate::scalar::linear_to_gamma(*v, gamma);
752    }
753}
754
755// ============================================================================
756// f32x8 Slice Functions (for pre-aligned SIMD data)
757// ============================================================================
758
759/// Convert linear f32x8 values to sRGB in-place.
760///
761/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
762/// use [`linear_to_srgb_slice`] instead which handles remainders automatically.
763///
764/// # Example
765/// ```
766/// use linear_srgb::simd::linear_to_srgb_x8_slice;
767/// use wide::f32x8;
768///
769/// let mut values = vec![f32x8::splat(0.5); 100];
770/// linear_to_srgb_x8_slice(&mut values);
771/// ```
772#[multiversed]
773#[inline]
774pub fn linear_to_srgb_x8_slice(values: &mut [f32x8]) {
775    for v in values.iter_mut() {
776        *v = linear_to_srgb_x8_inline(*v);
777    }
778}
779
780/// Convert gamma-encoded f32x8 values to linear in-place using a custom gamma.
781///
782/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
783/// use [`gamma_to_linear_slice`] instead which handles remainders automatically.
784///
785/// # Example
786/// ```
787/// use linear_srgb::simd::gamma_to_linear_x8_slice;
788/// use wide::f32x8;
789///
790/// let mut values = vec![f32x8::splat(0.5); 100];
791/// gamma_to_linear_x8_slice(&mut values, 2.2);
792/// ```
793#[multiversed]
794#[inline]
795pub fn gamma_to_linear_x8_slice(values: &mut [f32x8], gamma: f32) {
796    for v in values.iter_mut() {
797        *v = gamma_to_linear_x8_inline(*v, gamma);
798    }
799}
800
801/// Convert linear f32x8 values to gamma-encoded in-place using a custom gamma.
802///
803/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
804/// use [`linear_to_gamma_slice`] instead which handles remainders automatically.
805///
806/// # Example
807/// ```
808/// use linear_srgb::simd::linear_to_gamma_x8_slice;
809/// use wide::f32x8;
810///
811/// let mut values = vec![f32x8::splat(0.2); 100];
812/// linear_to_gamma_x8_slice(&mut values, 2.2);
813/// ```
814#[multiversed]
815#[inline]
816pub fn linear_to_gamma_x8_slice(values: &mut [f32x8], gamma: f32) {
817    for v in values.iter_mut() {
818        *v = linear_to_gamma_x8_inline(*v, gamma);
819    }
820}
821
822// ============================================================================
823// f32x8 Slice Inline Functions (for use inside caller's multiversed code)
824// ============================================================================
825
826/// Convert linear f32x8 values to sRGB in-place (always inlined).
827///
828/// Use this variant inside your own `#[multiversed]` functions to avoid
829/// double dispatch overhead. For standalone calls, use [`linear_to_srgb_x8_slice`].
830#[inline(always)]
831pub fn linear_to_srgb_x8_slice_inline(values: &mut [f32x8]) {
832    for v in values.iter_mut() {
833        *v = linear_to_srgb_x8_inline(*v);
834    }
835}
836
837/// Convert gamma-encoded f32x8 values to linear in-place (always inlined).
838///
839/// Use this variant inside your own `#[multiversed]` functions to avoid
840/// double dispatch overhead. For standalone calls, use [`gamma_to_linear_x8_slice`].
841#[inline(always)]
842pub fn gamma_to_linear_x8_slice_inline(values: &mut [f32x8], gamma: f32) {
843    for v in values.iter_mut() {
844        *v = gamma_to_linear_x8_inline(*v, gamma);
845    }
846}
847
848/// Convert linear f32x8 values to gamma-encoded in-place (always inlined).
849///
850/// Use this variant inside your own `#[multiversed]` functions to avoid
851/// double dispatch overhead. For standalone calls, use [`linear_to_gamma_x8_slice`].
852#[inline(always)]
853pub fn linear_to_gamma_x8_slice_inline(values: &mut [f32x8], gamma: f32) {
854    for v in values.iter_mut() {
855        *v = linear_to_gamma_x8_inline(*v, gamma);
856    }
857}
858
859// ============================================================================
860// Tests
861// ============================================================================
862
863#[cfg(test)]
864mod tests {
865    use super::*;
866
867    #[cfg(not(feature = "std"))]
868    use alloc::{vec, vec::Vec};
869
870    // ---- x8 function tests ----
871
872    #[test]
873    #[allow(deprecated)]
874    fn test_srgb_to_linear_x8() {
875        let input = [0.0f32, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.04];
876        let result = srgb_to_linear_x8(f32x8::from(input));
877        let result_arr: [f32; 8] = result.into();
878
879        for (i, &inp) in input.iter().enumerate() {
880            let expected = crate::scalar::srgb_to_linear(inp);
881            assert!(
882                (result_arr[i] - expected).abs() < 1e-5,
883                "srgb_to_linear_x8 mismatch at {}: got {}, expected {}",
884                i,
885                result_arr[i],
886                expected
887            );
888        }
889    }
890
891    #[test]
892    fn test_linear_to_srgb_x8() {
893        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.001, 0.8];
894        let result = linear_to_srgb_x8(f32x8::from(input));
895        let result_arr: [f32; 8] = result.into();
896
897        for (i, &inp) in input.iter().enumerate() {
898            let expected = crate::scalar::linear_to_srgb(inp);
899            assert!(
900                (result_arr[i] - expected).abs() < 1e-5,
901                "linear_to_srgb_x8 mismatch at {}: got {}, expected {}",
902                i,
903                result_arr[i],
904                expected
905            );
906        }
907    }
908
909    #[test]
910    #[allow(deprecated)]
911    fn test_srgb_u8_to_linear_x8() {
912        let input: [u8; 8] = [0, 64, 128, 192, 255, 32, 96, 160];
913        let result = srgb_u8_to_linear_x8(input);
914        let result_arr: [f32; 8] = result.into();
915
916        for (i, &inp) in input.iter().enumerate() {
917            let expected = crate::scalar::srgb_u8_to_linear(inp);
918            assert!(
919                (result_arr[i] - expected).abs() < 1e-6,
920                "srgb_u8_to_linear_x8 mismatch at {}: got {}, expected {}",
921                i,
922                result_arr[i],
923                expected
924            );
925        }
926    }
927
928    #[test]
929    fn test_linear_to_srgb_u8_x8() {
930        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8];
931        let result = linear_to_srgb_u8_x8(f32x8::from(input));
932
933        for (i, &inp) in input.iter().enumerate() {
934            let expected = (crate::scalar::linear_to_srgb(inp) * 255.0 + 0.5) as u8;
935            assert!(
936                (result[i] as i16 - expected as i16).abs() <= 1,
937                "linear_to_srgb_u8_x8 mismatch at {}: got {}, expected {}",
938                i,
939                result[i],
940                expected
941            );
942        }
943    }
944
945    // ---- Slice function tests ----
946
947    #[test]
948    fn test_srgb_to_linear_slice() {
949        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
950        let expected: Vec<f32> = values
951            .iter()
952            .map(|&v| crate::scalar::srgb_to_linear(v))
953            .collect();
954
955        srgb_to_linear_slice(&mut values);
956
957        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
958            assert!(
959                (got - exp).abs() < 1e-5,
960                "srgb_to_linear_slice mismatch at {}: got {}, expected {}",
961                i,
962                got,
963                exp
964            );
965        }
966    }
967
968    #[test]
969    fn test_linear_to_srgb_slice() {
970        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
971        let expected: Vec<f32> = values
972            .iter()
973            .map(|&v| crate::scalar::linear_to_srgb(v))
974            .collect();
975
976        linear_to_srgb_slice(&mut values);
977
978        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
979            assert!(
980                (got - exp).abs() < 1e-5,
981                "linear_to_srgb_slice mismatch at {}: got {}, expected {}",
982                i,
983                got,
984                exp
985            );
986        }
987    }
988
989    #[test]
990    #[allow(deprecated)]
991    fn test_srgb_u8_to_linear_slice() {
992        let input: Vec<u8> = (0..=255).collect();
993        let mut output = vec![0.0f32; 256];
994
995        srgb_u8_to_linear_slice(&input, &mut output);
996
997        for (i, &out) in output.iter().enumerate() {
998            let expected = crate::scalar::srgb_u8_to_linear(i as u8);
999            assert!(
1000                (out - expected).abs() < 1e-6,
1001                "srgb_u8_to_linear_slice mismatch at {}: got {}, expected {}",
1002                i,
1003                out,
1004                expected
1005            );
1006        }
1007    }
1008
1009    #[test]
1010    fn test_linear_to_srgb_u8_slice() {
1011        let input: Vec<f32> = (0..=255).map(|i| i as f32 / 255.0).collect();
1012        let mut output = vec![0u8; 256];
1013
1014        linear_to_srgb_u8_slice(&input, &mut output);
1015
1016        for i in 0..256 {
1017            let expected = (crate::scalar::linear_to_srgb(input[i]) * 255.0 + 0.5) as u8;
1018            assert!(
1019                (output[i] as i16 - expected as i16).abs() <= 1,
1020                "linear_to_srgb_u8_slice mismatch at {}: got {}, expected {}",
1021                i,
1022                output[i],
1023                expected
1024            );
1025        }
1026    }
1027
1028    // ---- Roundtrip tests ----
1029
1030    #[test]
1031    fn test_f32_roundtrip() {
1032        let mut values: Vec<f32> = (0..1000).map(|i| i as f32 / 999.0).collect();
1033        let original = values.clone();
1034
1035        srgb_to_linear_slice(&mut values);
1036        linear_to_srgb_slice(&mut values);
1037
1038        for (i, (&orig, &conv)) in original.iter().zip(values.iter()).enumerate() {
1039            assert!(
1040                (orig - conv).abs() < 1e-4,
1041                "f32 roundtrip failed at {}: {} -> {}",
1042                i,
1043                orig,
1044                conv
1045            );
1046        }
1047    }
1048
1049    #[test]
1050    fn test_u8_roundtrip() {
1051        let input: Vec<u8> = (0..=255).collect();
1052        let mut linear = vec![0.0f32; 256];
1053        let mut back = vec![0u8; 256];
1054
1055        srgb_u8_to_linear_slice(&input, &mut linear);
1056        linear_to_srgb_u8_slice(&linear, &mut back);
1057
1058        for i in 0..256 {
1059            assert!(
1060                (input[i] as i16 - back[i] as i16).abs() <= 1,
1061                "u8 roundtrip failed at {}: {} -> {} -> {}",
1062                i,
1063                input[i],
1064                linear[i],
1065                back[i]
1066            );
1067        }
1068    }
1069
1070    // ---- Edge case tests ----
1071
1072    #[test]
1073    #[allow(deprecated)]
1074    fn test_clamping() {
1075        // Test that out-of-range values are clamped
1076        let input = f32x8::from([-0.5, -0.1, 0.0, 0.5, 1.0, 1.5, 2.0, 10.0]);
1077        let result = srgb_to_linear_x8(input);
1078        let arr: [f32; 8] = result.into();
1079
1080        assert_eq!(arr[0], 0.0, "negative should clamp to 0");
1081        assert_eq!(arr[1], 0.0, "negative should clamp to 0");
1082        assert!(arr[4] > 0.99 && arr[4] <= 1.0, "1.0 should stay ~1.0");
1083        assert!(arr[5] > 0.99 && arr[5] <= 1.0, "values > 1 should clamp");
1084    }
1085
1086    #[test]
1087    #[allow(deprecated)]
1088    fn test_linear_segment() {
1089        // Test values in the linear segment (< 0.04045)
1090        let input = f32x8::from([0.0, 0.01, 0.02, 0.03, 0.04, 0.005, 0.015, 0.035]);
1091        let result = srgb_to_linear_x8(input);
1092        let arr: [f32; 8] = result.into();
1093        let input_arr: [f32; 8] = input.into();
1094
1095        for i in 0..8 {
1096            let expected = input_arr[i] / 12.92;
1097            assert!(
1098                (arr[i] - expected).abs() < 1e-6,
1099                "linear segment mismatch at {}: got {}, expected {}",
1100                i,
1101                arr[i],
1102                expected
1103            );
1104        }
1105    }
1106
1107    /// Verify the const LUT stays in sync with the transfer function.
1108    /// Allows 1 ULP difference for cross-platform float variance (powf isn't
1109    /// perfectly deterministic across architectures).
1110    #[test]
1111    #[allow(deprecated)]
1112    fn test_lut_matches_transfer_function() {
1113        let lut = get_lut();
1114        for i in 0..=255u8 {
1115            let expected = crate::scalar::srgb_u8_to_linear(i);
1116            let got = lut[i as usize];
1117            let got_bits = got.to_bits();
1118            let expected_bits = expected.to_bits();
1119            let ulp_diff = (got_bits as i64 - expected_bits as i64).unsigned_abs();
1120            assert!(
1121                ulp_diff <= 1,
1122                "LUT[{}] = {} ({:08x}) differs by {} ULP from srgb_u8_to_linear({}) = {} ({:08x}). \
1123                 LUT needs regeneration if transfer constants changed.",
1124                i,
1125                got,
1126                got_bits,
1127                ulp_diff,
1128                i,
1129                expected,
1130                expected_bits
1131            );
1132        }
1133    }
1134
1135    #[test]
1136    fn test_empty_slice() {
1137        let mut empty: Vec<f32> = vec![];
1138        srgb_to_linear_slice(&mut empty);
1139        assert!(empty.is_empty());
1140
1141        let empty_u8: Vec<u8> = vec![];
1142        let mut empty_out: Vec<f32> = vec![];
1143        srgb_u8_to_linear_slice(&empty_u8, &mut empty_out);
1144    }
1145
1146    #[test]
1147    fn test_non_multiple_of_8() {
1148        // Test slices that aren't multiples of 8
1149        for len in [1, 3, 7, 9, 15, 17, 100] {
1150            let mut values: Vec<f32> = (0..len).map(|i| i as f32 / len as f32).collect();
1151            let expected: Vec<f32> = values
1152                .iter()
1153                .map(|&v| crate::scalar::srgb_to_linear(v))
1154                .collect();
1155
1156            srgb_to_linear_slice(&mut values);
1157
1158            for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
1159                assert!(
1160                    (got - exp).abs() < 1e-5,
1161                    "len={} mismatch at {}: got {}, expected {}",
1162                    len,
1163                    i,
1164                    got,
1165                    exp
1166                );
1167            }
1168        }
1169    }
1170
1171    // ---- Custom gamma tests ----
1172
1173    #[test]
1174    fn test_gamma_to_linear_x8() {
1175        let input = [0.0f32, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.04];
1176        let gamma = 2.2f32;
1177        let result = gamma_to_linear_x8(f32x8::from(input), gamma);
1178        let result_arr: [f32; 8] = result.into();
1179
1180        for (i, &inp) in input.iter().enumerate() {
1181            let expected = crate::scalar::gamma_to_linear(inp, gamma);
1182            assert!(
1183                (result_arr[i] - expected).abs() < 1e-5,
1184                "gamma_to_linear_x8 mismatch at {}: got {}, expected {}",
1185                i,
1186                result_arr[i],
1187                expected
1188            );
1189        }
1190    }
1191
1192    #[test]
1193    fn test_linear_to_gamma_x8() {
1194        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.001, 0.8];
1195        let gamma = 2.2f32;
1196        let result = linear_to_gamma_x8(f32x8::from(input), gamma);
1197        let result_arr: [f32; 8] = result.into();
1198
1199        for (i, &inp) in input.iter().enumerate() {
1200            let expected = crate::scalar::linear_to_gamma(inp, gamma);
1201            assert!(
1202                (result_arr[i] - expected).abs() < 1e-5,
1203                "linear_to_gamma_x8 mismatch at {}: got {}, expected {}",
1204                i,
1205                result_arr[i],
1206                expected
1207            );
1208        }
1209    }
1210
1211    #[test]
1212    fn test_gamma_roundtrip_x8() {
1213        let input = [0.0f32, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 1.0];
1214        for gamma in [1.8f32, 2.0, 2.2, 2.4] {
1215            let linear = gamma_to_linear_x8(f32x8::from(input), gamma);
1216            let back = linear_to_gamma_x8(linear, gamma);
1217            let back_arr: [f32; 8] = back.into();
1218
1219            for (i, &inp) in input.iter().enumerate() {
1220                assert!(
1221                    (inp - back_arr[i]).abs() < 1e-4,
1222                    "gamma {} roundtrip failed at {}: {} -> {}",
1223                    gamma,
1224                    i,
1225                    inp,
1226                    back_arr[i]
1227                );
1228            }
1229        }
1230    }
1231
1232    #[test]
1233    fn test_gamma_slice_functions() {
1234        let gamma = 2.2f32;
1235
1236        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
1237        let expected: Vec<f32> = values
1238            .iter()
1239            .map(|&v| crate::scalar::gamma_to_linear(v, gamma))
1240            .collect();
1241
1242        gamma_to_linear_slice(&mut values, gamma);
1243
1244        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
1245            assert!(
1246                (got - exp).abs() < 1e-5,
1247                "gamma_to_linear_slice mismatch at {}: got {}, expected {}",
1248                i,
1249                got,
1250                exp
1251            );
1252        }
1253
1254        // Test linear_to_gamma_slice
1255        let expected_back: Vec<f32> = values
1256            .iter()
1257            .map(|&v| crate::scalar::linear_to_gamma(v, gamma))
1258            .collect();
1259
1260        linear_to_gamma_slice(&mut values, gamma);
1261
1262        for (i, (&got, &exp)) in values.iter().zip(expected_back.iter()).enumerate() {
1263            assert!(
1264                (got - exp).abs() < 1e-5,
1265                "linear_to_gamma_slice mismatch at {}: got {}, expected {}",
1266                i,
1267                got,
1268                exp
1269            );
1270        }
1271    }
1272}