linear_srgb/
simd.rs

1//! SIMD-accelerated sRGB ↔ linear conversion.
2//!
3//! This module provides high-performance conversion functions using AVX2/SSE SIMD
4//! instructions via the `wide` crate with runtime CPU feature detection.
5//!
6//! # API Overview
7//!
8//! ## x8 Functions (process 8 values at once)
9//! - [`srgb_to_linear_x8`] - f32x8 sRGB → f32x8 linear
10//! - [`linear_to_srgb_x8`] - f32x8 linear → f32x8 sRGB
11//! - [`srgb_u8_to_linear_x8`] - \[u8; 8\] sRGB → f32x8 linear
12//! - [`linear_to_srgb_u8_x8`] - f32x8 linear → \[u8; 8\] sRGB
13//!
14//! ## Slice Functions (process entire slices)
15//! - [`srgb_to_linear_slice`] - &mut \[f32\] sRGB → linear in-place
16//! - [`linear_to_srgb_slice`] - &mut \[f32\] linear → sRGB in-place
17//! - [`srgb_u8_to_linear_slice`] - &\[u8\] sRGB → &mut \[f32\] linear
18//! - [`linear_to_srgb_u8_slice`] - &\[f32\] linear → &mut \[u8\] sRGB
19
20use multiversed::multiversed;
21use wide::{CmpLt, f32x8};
22
23use crate::fast_math::pow_x8;
24
25// sRGB transfer function constants (C0-continuous, moxcms-derived)
26// These ensure exact continuity at the linear/power segment junction.
27// Standard IEC values (0.055, 1.055, 0.04045) have a tiny discontinuity.
28const SRGB_LINEAR_THRESHOLD: f32x8 = f32x8::splat(0.039_293_37);
29const LINEAR_THRESHOLD: f32x8 = f32x8::splat(0.003_041_282_6);
30const LINEAR_SCALE: f32x8 = f32x8::splat(1.0 / 12.92);
31const SRGB_OFFSET: f32x8 = f32x8::splat(0.055_010_72);
32const SRGB_SCALE: f32x8 = f32x8::splat(1.055_010_7);
33const TWELVE_92: f32x8 = f32x8::splat(12.92);
34const ZERO: f32x8 = f32x8::splat(0.0);
35const ONE: f32x8 = f32x8::splat(1.0);
36const U8_MAX: f32x8 = f32x8::splat(255.0);
37const HALF: f32x8 = f32x8::splat(0.5);
38
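/// Precomputed sRGB u8 → linear f32 lookup table.
///
/// Entry `i` is the linear value for the 8-bit sRGB code `i` (encoded input `i / 255`).
/// Uses the same constants as the transfer module: the C0-continuous variant of the
/// IEC 61966-2-1 curve, not the rounded standard figures.
/// Generated by computing `srgb_u8_to_linear(i)` for each i in 0..=255 (presumably the
/// non-LUT scalar implementation; this module's function of that name only reads the table).
/// To regenerate: `cargo run --release --example generate_lut`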
43const SRGB_U8_TO_LINEAR_LUT: [f32; 256] = [
44    0.0_f32,
45    0.000303527_f32,
46    0.000607054_f32,
47    0.000910581_f32,
48    0.001214108_f32,
49    0.001517635_f32,
50    0.001821162_f32,
51    0.0021246888_f32,
52    0.002428216_f32,
53    0.002731743_f32,
54    0.00303527_f32,
55    0.0033473307_f32,
56    0.0036773437_f32,
57    0.0040255957_f32,
58    0.004392362_f32,
59    0.004777916_f32,
60    0.0051825214_f32,
61    0.00560644_f32,
62    0.006049924_f32,
63    0.0065132244_f32,
64    0.0069965874_f32,
65    0.007500253_f32,
66    0.008024457_f32,
67    0.008569433_f32,
68    0.009135411_f32,
69    0.009722613_f32,
70    0.010331264_f32,
71    0.010961577_f32,
72    0.011613773_f32,
73    0.012288062_f32,
74    0.012984648_f32,
75    0.013703744_f32,
76    0.01444555_f32,
77    0.015210266_f32,
78    0.01599809_f32,
79    0.016809216_f32,
80    0.01764384_f32,
81    0.018502146_f32,
82    0.019384334_f32,
83    0.02029058_f32,
84    0.02122107_f32,
85    0.022175988_f32,
86    0.023155512_f32,
87    0.024159823_f32,
88    0.025189094_f32,
89    0.026243499_f32,
90    0.027323212_f32,
91    0.0284284_f32,
92    0.02955924_f32,
93    0.030715894_f32,
94    0.03189852_f32,
95    0.0331073_f32,
96    0.034342386_f32,
97    0.03560393_f32,
98    0.036892105_f32,
99    0.03820707_f32,
100    0.039548974_f32,
101    0.04091798_f32,
102    0.04231424_f32,
103    0.04373789_f32,
104    0.045189105_f32,
105    0.04666803_f32,
106    0.04817481_f32,
107    0.049709592_f32,
108    0.051272515_f32,
109    0.052863743_f32,
110    0.054483414_f32,
111    0.05613167_f32,
112    0.05780865_f32,
113    0.05951448_f32,
114    0.061249338_f32,
115    0.063013345_f32,
116    0.06480663_f32,
117    0.06662934_f32,
118    0.068481594_f32,
119    0.07036356_f32,
120    0.072275355_f32,
121    0.07421711_f32,
122    0.07618896_f32,
123    0.07819102_f32,
124    0.080223456_f32,
125    0.08228638_f32,
126    0.08437992_f32,
127    0.086504206_f32,
128    0.088659346_f32,
129    0.09084551_f32,
130    0.093062796_f32,
131    0.09531133_f32,
132    0.09759124_f32,
133    0.09990266_f32,
134    0.10224568_f32,
135    0.104620464_f32,
136    0.10702711_f32,
137    0.109465756_f32,
138    0.1119365_f32,
139    0.11443946_f32,
140    0.116974786_f32,
141    0.11954258_f32,
142    0.12214295_f32,
143    0.12477602_f32,
144    0.1274419_f32,
145    0.13014072_f32,
146    0.1328726_f32,
147    0.13563763_f32,
148    0.13843594_f32,
149    0.14126763_f32,
150    0.14413282_f32,
151    0.14703165_f32,
152    0.1499642_f32,
153    0.15293059_f32,
154    0.15593089_f32,
155    0.15896529_f32,
156    0.16203386_f32,
157    0.1651367_f32,
158    0.16827393_f32,
159    0.17144562_f32,
160    0.17465195_f32,
161    0.17789298_f32,
162    0.18116882_f32,
163    0.1844796_f32,
164    0.18782537_f32,
165    0.1912063_f32,
166    0.19462249_f32,
167    0.19807397_f32,
168    0.2015609_f32,
169    0.20508343_f32,
170    0.20864154_f32,
171    0.21223548_f32,
172    0.21586527_f32,
173    0.21953095_f32,
174    0.22323275_f32,
175    0.22697066_f32,
176    0.23074481_f32,
177    0.2345554_f32,
178    0.23840237_f32,
179    0.24228595_f32,
180    0.24620613_f32,
181    0.25016314_f32,
182    0.25415692_f32,
183    0.25818765_f32,
184    0.26225552_f32,
185    0.26636043_f32,
186    0.27050266_f32,
187    0.27468216_f32,
188    0.27889907_f32,
189    0.2831536_f32,
190    0.28744566_f32,
191    0.29177552_f32,
192    0.2961431_f32,
193    0.30054858_f32,
194    0.30499217_f32,
195    0.30947372_f32,
196    0.31399357_f32,
197    0.3185516_f32,
198    0.32314798_f32,
199    0.3277829_f32,
200    0.33245632_f32,
201    0.33716843_f32,
202    0.34191918_f32,
203    0.34670877_f32,
204    0.35153738_f32,
205    0.35640487_f32,
206    0.36131153_f32,
207    0.3662573_f32,
208    0.37124234_f32,
209    0.37626684_f32,
210    0.38133067_f32,
211    0.3864341_f32,
212    0.39157712_f32,
213    0.3967598_f32,
214    0.4019824_f32,
215    0.40724477_f32,
216    0.4125472_f32,
217    0.41788962_f32,
218    0.42327216_f32,
219    0.42869502_f32,
220    0.4341581_f32,
221    0.43966165_f32,
222    0.44520563_f32,
223    0.45079017_f32,
224    0.4564154_f32,
225    0.46208134_f32,
226    0.46778816_f32,
227    0.4735358_f32,
228    0.47932443_f32,
229    0.4851542_f32,
230    0.49102503_f32,
231    0.49693722_f32,
232    0.5028906_f32,
233    0.5088854_f32,
234    0.5149218_f32,
235    0.5209996_f32,
236    0.52711916_f32,
237    0.5332804_f32,
238    0.53948337_f32,
239    0.5457284_f32,
240    0.55201524_f32,
241    0.55834424_f32,
242    0.56471527_f32,
243    0.57112855_f32,
244    0.57758415_f32,
245    0.58408207_f32,
246    0.5906225_f32,
247    0.59720534_f32,
248    0.6038308_f32,
249    0.6104991_f32,
250    0.61721_f32,
251    0.62396383_f32,
252    0.6307605_f32,
253    0.6376001_f32,
254    0.644483_f32,
255    0.6514088_f32,
256    0.658378_f32,
257    0.6653904_f32,
258    0.67244613_f32,
259    0.67954546_f32,
260    0.68668824_f32,
261    0.6938747_f32,
262    0.7011047_f32,
263    0.7083785_f32,
264    0.7156962_f32,
265    0.72305775_f32,
266    0.7304634_f32,
267    0.73791295_f32,
268    0.7454066_f32,
269    0.75294465_f32,
270    0.76052684_f32,
271    0.7681535_f32,
272    0.7758244_f32,
273    0.7835399_f32,
274    0.79130006_f32,
275    0.79910475_f32,
276    0.80695426_f32,
277    0.8148484_f32,
278    0.82278764_f32,
279    0.8307716_f32,
280    0.83880067_f32,
281    0.8468749_f32,
282    0.8549941_f32,
283    0.8631587_f32,
284    0.8713685_f32,
285    0.87962353_f32,
286    0.8879244_f32,
287    0.89627033_f32,
288    0.9046623_f32,
289    0.9130995_f32,
290    0.9215827_f32,
291    0.9301116_f32,
292    0.93868643_f32,
293    0.9473071_f32,
294    0.9559739_f32,
295    0.9646866_f32,
296    0.9734457_f32,
297    0.9822507_f32,
298    0.9911024_f32,
299    1.0_f32,
300];
301
302#[inline]
303fn get_lut() -> &'static [f32; 256] {
304    &SRGB_U8_TO_LINEAR_LUT
305}
306
307/// Convert a single sRGB u8 value to linear f32 using LUT lookup.
308///
309/// This is the fastest method for u8 input as it uses a precomputed lookup table
310/// embedded in the binary. For batch conversions, use [`srgb_u8_to_linear_slice`].
311///
312/// # Example
313/// ```
314/// use linear_srgb::simd::srgb_u8_to_linear;
315///
316/// let linear = srgb_u8_to_linear(128);
317/// assert!((linear - 0.2158).abs() < 0.001);
318/// ```
319#[inline]
320pub fn srgb_u8_to_linear(value: u8) -> f32 {
321    get_lut()[value as usize]
322}
323
324// ============================================================================
325// x8 Inline Functions - Always inlined, for use in caller's multiversed code
326// ============================================================================
327
328/// Convert 8 sRGB f32 values to linear (always inlined).
329///
330/// Use this variant inside your own `#[multiversed]` functions to avoid
331/// double dispatch overhead. For standalone calls, use [`srgb_to_linear_x8_dispatch`].
332///
333/// Input values are clamped to \[0, 1\].
334///
335/// # Deprecation
336///
337/// This SIMD implementation is ~4x slower than scalar `powf(2.4)` due to
338/// polynomial approximation overhead. Prefer [`srgb_to_linear_slice`] or
339/// [`crate::scalar::srgb_to_linear`] in a loop. Kept for benchmarking and
340/// compatibility.
341#[deprecated(
342    since = "0.3.0",
343    note = "SIMD srgb_to_linear is ~4x slower than scalar. Use srgb_to_linear_slice or scalar::srgb_to_linear instead."
344)]
345#[inline(always)]
346pub fn srgb_to_linear_x8_inline(srgb: f32x8) -> f32x8 {
347    let srgb = srgb.max(ZERO).min(ONE);
348    let linear_result = srgb * LINEAR_SCALE;
349    let power_result = pow_x8((srgb + SRGB_OFFSET) / SRGB_SCALE, 2.4);
350    let mask = srgb.simd_lt(SRGB_LINEAR_THRESHOLD);
351    mask.blend(linear_result, power_result)
352}
353
354/// Convert 8 linear f32 values to sRGB (always inlined).
355///
356/// Use this variant inside your own `#[multiversed]` functions to avoid
357/// double dispatch overhead. For standalone calls, use [`linear_to_srgb_x8_dispatch`].
358///
359/// Input values are clamped to \[0, 1\].
360#[inline(always)]
361pub fn linear_to_srgb_x8_inline(linear: f32x8) -> f32x8 {
362    let linear = linear.max(ZERO).min(ONE);
363    let linear_result = linear * TWELVE_92;
364    let power_result = SRGB_SCALE * pow_x8(linear, 1.0 / 2.4) - SRGB_OFFSET;
365    let mask = linear.simd_lt(LINEAR_THRESHOLD);
366    mask.blend(linear_result, power_result)
367}
368
369/// Convert 8 linear f32 values to sRGB u8 (always inlined).
370///
371/// Use this variant inside your own `#[multiversed]` functions to avoid
372/// double dispatch overhead.
373#[inline(always)]
374pub fn linear_to_srgb_u8_x8_inline(linear: f32x8) -> [u8; 8] {
375    let srgb = linear_to_srgb_x8_inline(linear);
376    let scaled = srgb * U8_MAX + HALF;
377    let arr: [f32; 8] = scaled.into();
378    [
379        arr[0] as u8,
380        arr[1] as u8,
381        arr[2] as u8,
382        arr[3] as u8,
383        arr[4] as u8,
384        arr[5] as u8,
385        arr[6] as u8,
386        arr[7] as u8,
387    ]
388}
389
390/// Convert 8 gamma-encoded f32 values to linear (always inlined).
391///
392/// Use this variant inside your own `#[multiversed]` functions to avoid
393/// double dispatch overhead.
394#[inline(always)]
395pub fn gamma_to_linear_x8_inline(encoded: f32x8, gamma: f32) -> f32x8 {
396    let encoded = encoded.max(ZERO).min(ONE);
397    pow_x8(encoded, gamma)
398}
399
400/// Convert 8 linear f32 values to gamma-encoded (always inlined).
401///
402/// Use this variant inside your own `#[multiversed]` functions to avoid
403/// double dispatch overhead.
404#[inline(always)]
405pub fn linear_to_gamma_x8_inline(linear: f32x8, gamma: f32) -> f32x8 {
406    let linear = linear.max(ZERO).min(ONE);
407    pow_x8(linear, 1.0 / gamma)
408}
409
410// ============================================================================
411// x8 Dispatch Functions - Runtime CPU feature detection
412// ============================================================================
413
414/// Convert 8 sRGB f32 values to linear (with CPU dispatch).
415///
416/// This variant uses runtime CPU feature detection to select the optimal
417/// implementation. Use [`srgb_to_linear_x8_inline`] inside your own
418/// `#[multiversed]` functions to avoid double dispatch.
419///
420/// Input values are clamped to \[0, 1\].
421///
422/// # Deprecation
423///
424/// This SIMD implementation is ~4x slower than scalar `powf(2.4)`.
425/// Prefer [`srgb_to_linear_slice`] or [`crate::scalar::srgb_to_linear`].
426#[deprecated(
427    since = "0.3.0",
428    note = "SIMD srgb_to_linear is ~4x slower than scalar. Use srgb_to_linear_slice or scalar::srgb_to_linear instead."
429)]
430#[multiversed]
431#[inline]
432#[allow(deprecated)]
433pub fn srgb_to_linear_x8_dispatch(srgb: f32x8) -> f32x8 {
434    srgb_to_linear_x8_inline(srgb)
435}
436
437/// Convert 8 linear f32 values to sRGB (with CPU dispatch).
438///
439/// This variant uses runtime CPU feature detection to select the optimal
440/// implementation. Use [`linear_to_srgb_x8_inline`] inside your own
441/// `#[multiversed]` functions to avoid double dispatch.
442///
443/// Input values are clamped to \[0, 1\].
444#[multiversed]
445#[inline]
446pub fn linear_to_srgb_x8_dispatch(linear: f32x8) -> f32x8 {
447    linear_to_srgb_x8_inline(linear)
448}
449
450/// Convert 8 linear f32 values to sRGB u8 (with CPU dispatch).
451#[multiversed]
452#[inline]
453pub fn linear_to_srgb_u8_x8_dispatch(linear: f32x8) -> [u8; 8] {
454    linear_to_srgb_u8_x8_inline(linear)
455}
456
457/// Convert 8 gamma-encoded f32 values to linear (with CPU dispatch).
458#[multiversed]
459#[inline]
460pub fn gamma_to_linear_x8_dispatch(encoded: f32x8, gamma: f32) -> f32x8 {
461    gamma_to_linear_x8_inline(encoded, gamma)
462}
463
464/// Convert 8 linear f32 values to gamma-encoded (with CPU dispatch).
465#[multiversed]
466#[inline]
467pub fn linear_to_gamma_x8_dispatch(linear: f32x8, gamma: f32) -> f32x8 {
468    linear_to_gamma_x8_inline(linear, gamma)
469}
470
471// ============================================================================
472// x8 Default Functions - Calls inline variant, compiler decides inlining
473// ============================================================================
474
475/// Convert 8 sRGB f32 values to linear.
476///
477/// This is the default variant that calls the inline implementation.
478/// Use `_dispatch` for guaranteed CPU feature detection, or `_inline`
479/// inside your own `#[multiversed]` functions.
480///
481/// Input values are clamped to \[0, 1\].
482///
483/// # Deprecation
484///
485/// This SIMD implementation is ~4x slower than scalar `powf(2.4)`.
486/// Prefer [`srgb_to_linear_slice`] or [`crate::scalar::srgb_to_linear`].
487///
488/// # Example
489/// ```
490/// use linear_srgb::simd::srgb_to_linear_x8;
491/// use wide::f32x8;
492///
493/// let srgb = f32x8::from([0.0, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.5]);
494/// #[allow(deprecated)]
495/// let linear = srgb_to_linear_x8(srgb);
496/// ```
497#[deprecated(
498    since = "0.3.0",
499    note = "SIMD srgb_to_linear is ~4x slower than scalar. Use srgb_to_linear_slice or scalar::srgb_to_linear instead."
500)]
501#[inline]
502#[allow(deprecated)]
503pub fn srgb_to_linear_x8(srgb: f32x8) -> f32x8 {
504    srgb_to_linear_x8_inline(srgb)
505}
506
507/// Convert 8 linear f32 values to sRGB.
508///
509/// This is the default variant that calls the inline implementation.
510/// Use `_dispatch` for guaranteed CPU feature detection, or `_inline`
511/// inside your own `#[multiversed]` functions.
512///
513/// Input values are clamped to \[0, 1\].
514///
515/// # Example
516/// ```
517/// use linear_srgb::simd::linear_to_srgb_x8;
518/// use wide::f32x8;
519///
520/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
521/// let srgb = linear_to_srgb_x8(linear);
522/// ```
523#[inline]
524pub fn linear_to_srgb_x8(linear: f32x8) -> f32x8 {
525    linear_to_srgb_x8_inline(linear)
526}
527
528/// Convert 8 sRGB u8 values to linear f32 using LUT lookup.
529///
530/// This is the fastest method for u8 input as it uses a precomputed lookup table.
531///
532/// # Example
533/// ```
534/// use linear_srgb::simd::srgb_u8_to_linear_x8;
535///
536/// let srgb = [0u8, 64, 128, 192, 255, 32, 96, 160];
537/// let linear = srgb_u8_to_linear_x8(srgb);
538/// ```
539#[inline]
540pub fn srgb_u8_to_linear_x8(srgb: [u8; 8]) -> f32x8 {
541    let lut = get_lut();
542    f32x8::from([
543        lut[srgb[0] as usize],
544        lut[srgb[1] as usize],
545        lut[srgb[2] as usize],
546        lut[srgb[3] as usize],
547        lut[srgb[4] as usize],
548        lut[srgb[5] as usize],
549        lut[srgb[6] as usize],
550        lut[srgb[7] as usize],
551    ])
552}
553
554/// Convert 8 linear f32 values to sRGB u8.
555///
556/// Input values are clamped to \[0, 1\], output is rounded to nearest u8.
557///
558/// # Example
559/// ```
560/// use linear_srgb::simd::linear_to_srgb_u8_x8;
561/// use wide::f32x8;
562///
563/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
564/// let srgb = linear_to_srgb_u8_x8(linear);
565/// ```
566#[inline]
567pub fn linear_to_srgb_u8_x8(linear: f32x8) -> [u8; 8] {
568    linear_to_srgb_u8_x8_inline(linear)
569}
570
571/// Convert 8 gamma-encoded f32 values to linear.
572///
573/// # Example
574/// ```
575/// use linear_srgb::simd::gamma_to_linear_x8;
576/// use wide::f32x8;
577///
578/// let encoded = f32x8::from([0.0, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.5]);
579/// let linear = gamma_to_linear_x8(encoded, 2.2);
580/// ```
581#[inline]
582pub fn gamma_to_linear_x8(encoded: f32x8, gamma: f32) -> f32x8 {
583    gamma_to_linear_x8_inline(encoded, gamma)
584}
585
586/// Convert 8 linear f32 values to gamma-encoded.
587///
588/// # Example
589/// ```
590/// use linear_srgb::simd::linear_to_gamma_x8;
591/// use wide::f32x8;
592///
593/// let linear = f32x8::from([0.0, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8]);
594/// let encoded = linear_to_gamma_x8(linear, 2.2);
595/// ```
596#[inline]
597pub fn linear_to_gamma_x8(linear: f32x8, gamma: f32) -> f32x8 {
598    linear_to_gamma_x8_inline(linear, gamma)
599}
600
601// ============================================================================
602// Slice Functions - Process entire slices
603// ============================================================================
604
605/// Convert sRGB f32 values to linear in-place.
606///
607/// # Example
608/// ```
609/// use linear_srgb::simd::srgb_to_linear_slice;
610///
611/// let mut values = vec![0.0f32, 0.25, 0.5, 0.75, 1.0];
612/// srgb_to_linear_slice(&mut values);
613/// ```
614///
615/// # Performance Note
616///
617/// This function uses scalar `powf()` internally because hardware transcendentals
618/// are ~4x faster than SIMD polynomial approximation for the sRGB linearization
619/// exponent (2.4). For the inverse direction, see [`linear_to_srgb_slice`] which
620/// does benefit from SIMD.
621#[inline]
622pub fn srgb_to_linear_slice(values: &mut [f32]) {
623    // Scalar powf(2.4) is ~4x faster than SIMD polynomial approximation.
624    // See benchmarks in benches/benchmarks.rs for details.
625    for v in values.iter_mut() {
626        *v = crate::scalar::srgb_to_linear(*v);
627    }
628}
629
630/// Convert linear f32 values to sRGB in-place.
631///
632/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
633///
634/// # Example
635/// ```
636/// use linear_srgb::simd::linear_to_srgb_slice;
637///
638/// let mut values = vec![0.0f32, 0.1, 0.2, 0.5, 1.0];
639/// linear_to_srgb_slice(&mut values);
640/// ```
641#[multiversed]
642#[inline]
643pub fn linear_to_srgb_slice(values: &mut [f32]) {
644    let (chunks, remainder) = values.as_chunks_mut::<8>();
645
646    for chunk in chunks {
647        let result = linear_to_srgb_x8_inline(f32x8::from(*chunk));
648        *chunk = result.into();
649    }
650
651    for v in remainder {
652        *v = crate::scalar::linear_to_srgb(*v);
653    }
654}
655
656/// Convert sRGB u8 values to linear f32.
657///
658/// Uses a precomputed LUT for each u8 value, processed in SIMD batches of 8.
659///
660/// # Panics
661/// Panics if `input.len() != output.len()`.
662///
663/// # Example
664/// ```
665/// use linear_srgb::simd::srgb_u8_to_linear_slice;
666///
667/// let input: Vec<u8> = (0..=255).collect();
668/// let mut output = vec![0.0f32; 256];
669/// srgb_u8_to_linear_slice(&input, &mut output);
670/// ```
671#[inline]
672pub fn srgb_u8_to_linear_slice(input: &[u8], output: &mut [f32]) {
673    assert_eq!(input.len(), output.len());
674    let lut = get_lut();
675
676    let (in_chunks, in_remainder) = input.as_chunks::<8>();
677    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
678
679    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
680        *out = [
681            lut[inp[0] as usize],
682            lut[inp[1] as usize],
683            lut[inp[2] as usize],
684            lut[inp[3] as usize],
685            lut[inp[4] as usize],
686            lut[inp[5] as usize],
687            lut[inp[6] as usize],
688            lut[inp[7] as usize],
689        ];
690    }
691
692    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
693        *out = lut[*inp as usize];
694    }
695}
696
697/// Convert linear f32 values to sRGB u8.
698///
699/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
700///
701/// # Panics
702/// Panics if `input.len() != output.len()`.
703///
704/// # Example
705/// ```
706/// use linear_srgb::simd::linear_to_srgb_u8_slice;
707///
708/// let input: Vec<f32> = (0..=255).map(|i| i as f32 / 255.0).collect();
709/// let mut output = vec![0u8; 256];
710/// linear_to_srgb_u8_slice(&input, &mut output);
711/// ```
712#[multiversed]
713#[inline]
714pub fn linear_to_srgb_u8_slice(input: &[f32], output: &mut [u8]) {
715    assert_eq!(input.len(), output.len());
716
717    let (in_chunks, in_remainder) = input.as_chunks::<8>();
718    let (out_chunks, out_remainder) = output.as_chunks_mut::<8>();
719
720    for (inp, out) in in_chunks.iter().zip(out_chunks.iter_mut()) {
721        *out = linear_to_srgb_u8_x8_inline(f32x8::from(*inp));
722    }
723
724    for (inp, out) in in_remainder.iter().zip(out_remainder.iter_mut()) {
725        let srgb = crate::scalar::linear_to_srgb(*inp);
726        *out = (srgb * 255.0 + 0.5) as u8;
727    }
728}
729
730// ============================================================================
731// Custom Gamma Slice Functions
732// ============================================================================
733
734/// Convert gamma-encoded f32 values to linear in-place using a custom gamma.
735///
736/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
737///
738/// # Example
739/// ```
740/// use linear_srgb::simd::gamma_to_linear_slice;
741///
742/// let mut values = vec![0.0f32, 0.25, 0.5, 0.75, 1.0];
743/// gamma_to_linear_slice(&mut values, 2.2);
744/// ```
745#[multiversed]
746#[inline]
747pub fn gamma_to_linear_slice(values: &mut [f32], gamma: f32) {
748    let (chunks, remainder) = values.as_chunks_mut::<8>();
749
750    for chunk in chunks {
751        let result = gamma_to_linear_x8_inline(f32x8::from(*chunk), gamma);
752        *chunk = result.into();
753    }
754
755    for v in remainder {
756        *v = crate::scalar::gamma_to_linear(*v, gamma);
757    }
758}
759
760/// Convert linear f32 values to gamma-encoded in-place using a custom gamma.
761///
762/// Processes 8 values at a time using SIMD, with scalar fallback for remainder.
763///
764/// # Example
765/// ```
766/// use linear_srgb::simd::linear_to_gamma_slice;
767///
768/// let mut values = vec![0.0f32, 0.1, 0.2, 0.5, 1.0];
769/// linear_to_gamma_slice(&mut values, 2.2);
770/// ```
771#[multiversed]
772#[inline]
773pub fn linear_to_gamma_slice(values: &mut [f32], gamma: f32) {
774    let (chunks, remainder) = values.as_chunks_mut::<8>();
775
776    for chunk in chunks {
777        let result = linear_to_gamma_x8_inline(f32x8::from(*chunk), gamma);
778        *chunk = result.into();
779    }
780
781    for v in remainder {
782        *v = crate::scalar::linear_to_gamma(*v, gamma);
783    }
784}
785
786// ============================================================================
787// f32x8 Slice Functions (for pre-aligned SIMD data)
788// ============================================================================
789
790/// Convert linear f32x8 values to sRGB in-place.
791///
792/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
793/// use [`linear_to_srgb_slice`] instead which handles remainders automatically.
794///
795/// # Example
796/// ```
797/// use linear_srgb::simd::linear_to_srgb_x8_slice;
798/// use wide::f32x8;
799///
800/// let mut values = vec![f32x8::splat(0.5); 100];
801/// linear_to_srgb_x8_slice(&mut values);
802/// ```
803#[multiversed]
804#[inline]
805pub fn linear_to_srgb_x8_slice(values: &mut [f32x8]) {
806    for v in values.iter_mut() {
807        *v = linear_to_srgb_x8_inline(*v);
808    }
809}
810
811/// Convert gamma-encoded f32x8 values to linear in-place using a custom gamma.
812///
813/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
814/// use [`gamma_to_linear_slice`] instead which handles remainders automatically.
815///
816/// # Example
817/// ```
818/// use linear_srgb::simd::gamma_to_linear_x8_slice;
819/// use wide::f32x8;
820///
821/// let mut values = vec![f32x8::splat(0.5); 100];
822/// gamma_to_linear_x8_slice(&mut values, 2.2);
823/// ```
824#[multiversed]
825#[inline]
826pub fn gamma_to_linear_x8_slice(values: &mut [f32x8], gamma: f32) {
827    for v in values.iter_mut() {
828        *v = gamma_to_linear_x8_inline(*v, gamma);
829    }
830}
831
832/// Convert linear f32x8 values to gamma-encoded in-place using a custom gamma.
833///
834/// For data already structured as `f32x8` slices. If you have `&mut [f32]`,
835/// use [`linear_to_gamma_slice`] instead which handles remainders automatically.
836///
837/// # Example
838/// ```
839/// use linear_srgb::simd::linear_to_gamma_x8_slice;
840/// use wide::f32x8;
841///
842/// let mut values = vec![f32x8::splat(0.2); 100];
843/// linear_to_gamma_x8_slice(&mut values, 2.2);
844/// ```
845#[multiversed]
846#[inline]
847pub fn linear_to_gamma_x8_slice(values: &mut [f32x8], gamma: f32) {
848    for v in values.iter_mut() {
849        *v = linear_to_gamma_x8_inline(*v, gamma);
850    }
851}
852
853// ============================================================================
854// f32x8 Slice Inline Functions (for use inside caller's multiversed code)
855// ============================================================================
856
857/// Convert linear f32x8 values to sRGB in-place (always inlined).
858///
859/// Use this variant inside your own `#[multiversed]` functions to avoid
860/// double dispatch overhead. For standalone calls, use [`linear_to_srgb_x8_slice`].
861#[inline(always)]
862pub fn linear_to_srgb_x8_slice_inline(values: &mut [f32x8]) {
863    for v in values.iter_mut() {
864        *v = linear_to_srgb_x8_inline(*v);
865    }
866}
867
868/// Convert gamma-encoded f32x8 values to linear in-place (always inlined).
869///
870/// Use this variant inside your own `#[multiversed]` functions to avoid
871/// double dispatch overhead. For standalone calls, use [`gamma_to_linear_x8_slice`].
872#[inline(always)]
873pub fn gamma_to_linear_x8_slice_inline(values: &mut [f32x8], gamma: f32) {
874    for v in values.iter_mut() {
875        *v = gamma_to_linear_x8_inline(*v, gamma);
876    }
877}
878
879/// Convert linear f32x8 values to gamma-encoded in-place (always inlined).
880///
881/// Use this variant inside your own `#[multiversed]` functions to avoid
882/// double dispatch overhead. For standalone calls, use [`linear_to_gamma_x8_slice`].
883#[inline(always)]
884pub fn linear_to_gamma_x8_slice_inline(values: &mut [f32x8], gamma: f32) {
885    for v in values.iter_mut() {
886        *v = linear_to_gamma_x8_inline(*v, gamma);
887    }
888}
889
890// ============================================================================
891// Tests
892// ============================================================================
893
894#[cfg(test)]
895mod tests {
896    use super::*;
897
898    #[cfg(not(feature = "std"))]
899    use alloc::{vec, vec::Vec};
900
901    // ---- x8 function tests ----
902
903    #[test]
904    #[allow(deprecated)]
905    fn test_srgb_to_linear_x8() {
906        let input = [0.0f32, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.04];
907        let result = srgb_to_linear_x8(f32x8::from(input));
908        let result_arr: [f32; 8] = result.into();
909
910        for (i, &inp) in input.iter().enumerate() {
911            let expected = crate::scalar::srgb_to_linear(inp);
912            assert!(
913                (result_arr[i] - expected).abs() < 1e-5,
914                "srgb_to_linear_x8 mismatch at {}: got {}, expected {}",
915                i,
916                result_arr[i],
917                expected
918            );
919        }
920    }
921
922    #[test]
923    fn test_linear_to_srgb_x8() {
924        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.001, 0.8];
925        let result = linear_to_srgb_x8(f32x8::from(input));
926        let result_arr: [f32; 8] = result.into();
927
928        for (i, &inp) in input.iter().enumerate() {
929            let expected = crate::scalar::linear_to_srgb(inp);
930            assert!(
931                (result_arr[i] - expected).abs() < 1e-5,
932                "linear_to_srgb_x8 mismatch at {}: got {}, expected {}",
933                i,
934                result_arr[i],
935                expected
936            );
937        }
938    }
939
940    #[test]
941    #[allow(deprecated)]
942    fn test_srgb_u8_to_linear_x8() {
943        let input: [u8; 8] = [0, 64, 128, 192, 255, 32, 96, 160];
944        let result = srgb_u8_to_linear_x8(input);
945        let result_arr: [f32; 8] = result.into();
946
947        for (i, &inp) in input.iter().enumerate() {
948            let expected = crate::scalar::srgb_u8_to_linear(inp);
949            assert!(
950                (result_arr[i] - expected).abs() < 1e-6,
951                "srgb_u8_to_linear_x8 mismatch at {}: got {}, expected {}",
952                i,
953                result_arr[i],
954                expected
955            );
956        }
957    }
958
959    #[test]
960    fn test_linear_to_srgb_u8_x8() {
961        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.05, 0.8];
962        let result = linear_to_srgb_u8_x8(f32x8::from(input));
963
964        for (i, &inp) in input.iter().enumerate() {
965            let expected = (crate::scalar::linear_to_srgb(inp) * 255.0 + 0.5) as u8;
966            assert!(
967                (result[i] as i16 - expected as i16).abs() <= 1,
968                "linear_to_srgb_u8_x8 mismatch at {}: got {}, expected {}",
969                i,
970                result[i],
971                expected
972            );
973        }
974    }
975
976    // ---- Slice function tests ----
977
978    #[test]
979    fn test_srgb_to_linear_slice() {
980        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
981        let expected: Vec<f32> = values
982            .iter()
983            .map(|&v| crate::scalar::srgb_to_linear(v))
984            .collect();
985
986        srgb_to_linear_slice(&mut values);
987
988        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
989            assert!(
990                (got - exp).abs() < 1e-5,
991                "srgb_to_linear_slice mismatch at {}: got {}, expected {}",
992                i,
993                got,
994                exp
995            );
996        }
997    }
998
999    #[test]
1000    fn test_linear_to_srgb_slice() {
1001        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
1002        let expected: Vec<f32> = values
1003            .iter()
1004            .map(|&v| crate::scalar::linear_to_srgb(v))
1005            .collect();
1006
1007        linear_to_srgb_slice(&mut values);
1008
1009        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
1010            assert!(
1011                (got - exp).abs() < 1e-5,
1012                "linear_to_srgb_slice mismatch at {}: got {}, expected {}",
1013                i,
1014                got,
1015                exp
1016            );
1017        }
1018    }
1019
1020    #[test]
1021    #[allow(deprecated)]
1022    fn test_srgb_u8_to_linear_slice() {
1023        let input: Vec<u8> = (0..=255).collect();
1024        let mut output = vec![0.0f32; 256];
1025
1026        srgb_u8_to_linear_slice(&input, &mut output);
1027
1028        for (i, &out) in output.iter().enumerate() {
1029            let expected = crate::scalar::srgb_u8_to_linear(i as u8);
1030            assert!(
1031                (out - expected).abs() < 1e-6,
1032                "srgb_u8_to_linear_slice mismatch at {}: got {}, expected {}",
1033                i,
1034                out,
1035                expected
1036            );
1037        }
1038    }
1039
1040    #[test]
1041    fn test_linear_to_srgb_u8_slice() {
1042        let input: Vec<f32> = (0..=255).map(|i| i as f32 / 255.0).collect();
1043        let mut output = vec![0u8; 256];
1044
1045        linear_to_srgb_u8_slice(&input, &mut output);
1046
1047        for i in 0..256 {
1048            let expected = (crate::scalar::linear_to_srgb(input[i]) * 255.0 + 0.5) as u8;
1049            assert!(
1050                (output[i] as i16 - expected as i16).abs() <= 1,
1051                "linear_to_srgb_u8_slice mismatch at {}: got {}, expected {}",
1052                i,
1053                output[i],
1054                expected
1055            );
1056        }
1057    }
1058
1059    // ---- Roundtrip tests ----
1060
1061    #[test]
1062    fn test_f32_roundtrip() {
1063        let mut values: Vec<f32> = (0..1000).map(|i| i as f32 / 999.0).collect();
1064        let original = values.clone();
1065
1066        srgb_to_linear_slice(&mut values);
1067        linear_to_srgb_slice(&mut values);
1068
1069        for (i, (&orig, &conv)) in original.iter().zip(values.iter()).enumerate() {
1070            assert!(
1071                (orig - conv).abs() < 1e-4,
1072                "f32 roundtrip failed at {}: {} -> {}",
1073                i,
1074                orig,
1075                conv
1076            );
1077        }
1078    }
1079
1080    #[test]
1081    fn test_u8_roundtrip() {
1082        let input: Vec<u8> = (0..=255).collect();
1083        let mut linear = vec![0.0f32; 256];
1084        let mut back = vec![0u8; 256];
1085
1086        srgb_u8_to_linear_slice(&input, &mut linear);
1087        linear_to_srgb_u8_slice(&linear, &mut back);
1088
1089        for i in 0..256 {
1090            assert!(
1091                (input[i] as i16 - back[i] as i16).abs() <= 1,
1092                "u8 roundtrip failed at {}: {} -> {} -> {}",
1093                i,
1094                input[i],
1095                linear[i],
1096                back[i]
1097            );
1098        }
1099    }
1100
1101    // ---- Edge case tests ----
1102
1103    #[test]
1104    #[allow(deprecated)]
1105    fn test_clamping() {
1106        // Test that out-of-range values are clamped
1107        let input = f32x8::from([-0.5, -0.1, 0.0, 0.5, 1.0, 1.5, 2.0, 10.0]);
1108        let result = srgb_to_linear_x8(input);
1109        let arr: [f32; 8] = result.into();
1110
1111        assert_eq!(arr[0], 0.0, "negative should clamp to 0");
1112        assert_eq!(arr[1], 0.0, "negative should clamp to 0");
1113        assert!(arr[4] > 0.99 && arr[4] <= 1.0, "1.0 should stay ~1.0");
1114        assert!(arr[5] > 0.99 && arr[5] <= 1.0, "values > 1 should clamp");
1115    }
1116
1117    #[test]
1118    #[allow(deprecated)]
1119    fn test_linear_segment() {
1120        // Test values in the linear segment (< 0.04045)
1121        let input = f32x8::from([0.0, 0.01, 0.02, 0.03, 0.04, 0.005, 0.015, 0.035]);
1122        let result = srgb_to_linear_x8(input);
1123        let arr: [f32; 8] = result.into();
1124        let input_arr: [f32; 8] = input.into();
1125
1126        for i in 0..8 {
1127            let expected = input_arr[i] / 12.92;
1128            assert!(
1129                (arr[i] - expected).abs() < 1e-6,
1130                "linear segment mismatch at {}: got {}, expected {}",
1131                i,
1132                arr[i],
1133                expected
1134            );
1135        }
1136    }
1137
1138    /// Verify the const LUT stays in sync with the transfer function.
1139    /// Allows 1 ULP difference for cross-platform float variance (powf isn't
1140    /// perfectly deterministic across architectures).
1141    #[test]
1142    #[allow(deprecated)]
1143    fn test_lut_matches_transfer_function() {
1144        let lut = get_lut();
1145        for i in 0..=255u8 {
1146            let expected = crate::scalar::srgb_u8_to_linear(i);
1147            let got = lut[i as usize];
1148            let got_bits = got.to_bits();
1149            let expected_bits = expected.to_bits();
1150            let ulp_diff = (got_bits as i64 - expected_bits as i64).unsigned_abs();
1151            assert!(
1152                ulp_diff <= 1,
1153                "LUT[{}] = {} ({:08x}) differs by {} ULP from srgb_u8_to_linear({}) = {} ({:08x}). \
1154                 LUT needs regeneration if transfer constants changed.",
1155                i,
1156                got,
1157                got_bits,
1158                ulp_diff,
1159                i,
1160                expected,
1161                expected_bits
1162            );
1163        }
1164    }
1165
1166    #[test]
1167    fn test_empty_slice() {
1168        let mut empty: Vec<f32> = vec![];
1169        srgb_to_linear_slice(&mut empty);
1170        assert!(empty.is_empty());
1171
1172        let empty_u8: Vec<u8> = vec![];
1173        let mut empty_out: Vec<f32> = vec![];
1174        srgb_u8_to_linear_slice(&empty_u8, &mut empty_out);
1175    }
1176
1177    #[test]
1178    fn test_non_multiple_of_8() {
1179        // Test slices that aren't multiples of 8
1180        for len in [1, 3, 7, 9, 15, 17, 100] {
1181            let mut values: Vec<f32> = (0..len).map(|i| i as f32 / len as f32).collect();
1182            let expected: Vec<f32> = values
1183                .iter()
1184                .map(|&v| crate::scalar::srgb_to_linear(v))
1185                .collect();
1186
1187            srgb_to_linear_slice(&mut values);
1188
1189            for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
1190                assert!(
1191                    (got - exp).abs() < 1e-5,
1192                    "len={} mismatch at {}: got {}, expected {}",
1193                    len,
1194                    i,
1195                    got,
1196                    exp
1197                );
1198            }
1199        }
1200    }
1201
1202    // ---- Custom gamma tests ----
1203
1204    #[test]
1205    fn test_gamma_to_linear_x8() {
1206        let input = [0.0f32, 0.25, 0.5, 0.75, 1.0, 0.1, 0.9, 0.04];
1207        let gamma = 2.2f32;
1208        let result = gamma_to_linear_x8(f32x8::from(input), gamma);
1209        let result_arr: [f32; 8] = result.into();
1210
1211        for (i, &inp) in input.iter().enumerate() {
1212            let expected = crate::scalar::gamma_to_linear(inp, gamma);
1213            assert!(
1214                (result_arr[i] - expected).abs() < 1e-5,
1215                "gamma_to_linear_x8 mismatch at {}: got {}, expected {}",
1216                i,
1217                result_arr[i],
1218                expected
1219            );
1220        }
1221    }
1222
1223    #[test]
1224    fn test_linear_to_gamma_x8() {
1225        let input = [0.0f32, 0.1, 0.2, 0.5, 1.0, 0.01, 0.001, 0.8];
1226        let gamma = 2.2f32;
1227        let result = linear_to_gamma_x8(f32x8::from(input), gamma);
1228        let result_arr: [f32; 8] = result.into();
1229
1230        for (i, &inp) in input.iter().enumerate() {
1231            let expected = crate::scalar::linear_to_gamma(inp, gamma);
1232            assert!(
1233                (result_arr[i] - expected).abs() < 1e-5,
1234                "linear_to_gamma_x8 mismatch at {}: got {}, expected {}",
1235                i,
1236                result_arr[i],
1237                expected
1238            );
1239        }
1240    }
1241
1242    #[test]
1243    fn test_gamma_roundtrip_x8() {
1244        let input = [0.0f32, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 1.0];
1245        for gamma in [1.8f32, 2.0, 2.2, 2.4] {
1246            let linear = gamma_to_linear_x8(f32x8::from(input), gamma);
1247            let back = linear_to_gamma_x8(linear, gamma);
1248            let back_arr: [f32; 8] = back.into();
1249
1250            for (i, &inp) in input.iter().enumerate() {
1251                assert!(
1252                    (inp - back_arr[i]).abs() < 1e-4,
1253                    "gamma {} roundtrip failed at {}: {} -> {}",
1254                    gamma,
1255                    i,
1256                    inp,
1257                    back_arr[i]
1258                );
1259            }
1260        }
1261    }
1262
1263    #[test]
1264    fn test_gamma_slice_functions() {
1265        let gamma = 2.2f32;
1266
1267        let mut values: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
1268        let expected: Vec<f32> = values
1269            .iter()
1270            .map(|&v| crate::scalar::gamma_to_linear(v, gamma))
1271            .collect();
1272
1273        gamma_to_linear_slice(&mut values, gamma);
1274
1275        for (i, (&got, &exp)) in values.iter().zip(expected.iter()).enumerate() {
1276            assert!(
1277                (got - exp).abs() < 1e-5,
1278                "gamma_to_linear_slice mismatch at {}: got {}, expected {}",
1279                i,
1280                got,
1281                exp
1282            );
1283        }
1284
1285        // Test linear_to_gamma_slice
1286        let expected_back: Vec<f32> = values
1287            .iter()
1288            .map(|&v| crate::scalar::linear_to_gamma(v, gamma))
1289            .collect();
1290
1291        linear_to_gamma_slice(&mut values, gamma);
1292
1293        for (i, (&got, &exp)) in values.iter().zip(expected_back.iter()).enumerate() {
1294            assert!(
1295                (got - exp).abs() < 1e-5,
1296                "linear_to_gamma_slice mismatch at {}: got {}, expected {}",
1297                i,
1298                got,
1299                exp
1300            );
1301        }
1302    }
1303}