use std::sync::LazyLock;
use super::{Buffer, write_pixels};
/// Number of equal-width intervals the sRGB OETF lookup table splits `[0, 1]` into.
const OETF_LUT_SIZE: usize = 4096;

/// Lazily built table of `srgb_oetf_precise` samples.
///
/// Entry `i` holds the transfer function evaluated at `i / OETF_LUT_SIZE`, so the
/// table has `OETF_LUT_SIZE + 1` entries and includes both endpoints of `[0, 1]`.
static OETF_LUT: LazyLock<[f32; OETF_LUT_SIZE + 1]> = LazyLock::new(|| {
    std::array::from_fn(|index| srgb_oetf_precise(index as f32 / OETF_LUT_SIZE as f32))
});
/// Piecewise sRGB transfer function applied to a linear value `c`.
///
/// Inputs up to and including `0.0031308` are scaled by `12.92`; larger inputs
/// use `1.055 * c^(1/2.4) - 0.055`. No clamping is performed here.
fn srgb_oetf_precise(c: f32) -> f32 {
    const LINEAR_SEGMENT_END: f32 = 0.0031308;
    if c <= LINEAR_SEGMENT_END {
        return c * 12.92;
    }
    let gamma_term = c.powf(1.0 / 2.4);
    1.055 * gamma_term - 0.055
}
/// Table-driven sRGB transfer function with linear interpolation.
///
/// `c` is clamped to `[0, 1]`, mapped onto the `OETF_LUT` index range, and the
/// result is interpolated between the two neighbouring table entries. An input
/// that lands on (or beyond) the final entry returns that entry directly.
#[inline(always)]
fn srgb_oetf_fast(c: f32) -> f32 {
    let position = c.clamp(0.0, 1.0) * OETF_LUT_SIZE as f32;
    let lower = position as usize;
    if lower < OETF_LUT_SIZE {
        let t = position - lower as f32;
        OETF_LUT[lower] + t * (OETF_LUT[lower + 1] - OETF_LUT[lower])
    } else {
        // No upper neighbour to interpolate towards.
        OETF_LUT[OETF_LUT_SIZE]
    }
}
// Coefficients of the closed-form sRGB curve approximation used by the SIMD
// encoders in this file. For a clamped input `x` they evaluate
//     1 / (1 / sqrt(x^(1/4) - A) - B)^3 - C
// and use the result wherever `x` is not below the linear-segment threshold.
// NOTE(review): the `MINIMAX` name suggests the values come from a minimax fit;
// the derivation is not visible here, so confirm before changing them.

/// Coefficient `A`: subtracted from the fourth root of the input.
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
const SRGB_OETF_MINIMAX_A: f32 = 0.075_058_33;
/// Coefficient `B`: subtracted from the reciprocal square root term.
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
const SRGB_OETF_MINIMAX_B: f32 = 0.048_553_98;
/// Coefficient `C`: subtracted from the reciprocal of the cubed term.
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
const SRGB_OETF_MINIMAX_C: f32 = 0.027_579_91;
/// Per-lane approximation of `1 / sqrt(x)`: the hardware estimate followed by
/// one Newton-Raphson step, `y * (1.5 - (0.5 * x) * (y * y))`.
///
/// # Safety
///
/// The caller must ensure the running CPU supports SSE4.1.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
#[inline]
unsafe fn rsqrt_refined_sse4_1(x: std::arch::x86_64::__m128) -> std::arch::x86_64::__m128 {
    use std::arch::x86_64::*;
    let estimate = _mm_rsqrt_ps(x);
    let newton_factor = _mm_sub_ps(
        _mm_set1_ps(1.5),
        _mm_mul_ps(
            _mm_mul_ps(_mm_set1_ps(0.5), x),
            _mm_mul_ps(estimate, estimate),
        ),
    );
    _mm_mul_ps(estimate, newton_factor)
}
/// Per-lane approximation of `1 / x`: the hardware estimate followed by one
/// Newton-Raphson step, `y * (2 - x * y)`.
///
/// # Safety
///
/// The caller must ensure the running CPU supports SSE4.1.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
#[inline]
unsafe fn rcp_refined_sse4_1(x: std::arch::x86_64::__m128) -> std::arch::x86_64::__m128 {
    use std::arch::x86_64::*;
    let estimate = _mm_rcp_ps(x);
    let newton_factor = _mm_sub_ps(_mm_set1_ps(2.0), _mm_mul_ps(x, estimate));
    _mm_mul_ps(estimate, newton_factor)
}
/// 256-bit counterpart of the refined reciprocal square root: hardware
/// estimate plus one Newton-Raphson step, with the correction term
/// `1.5 - (0.5 * x) * (y * y)` formed by a fused negate-multiply-add.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
#[inline]
unsafe fn rsqrt_refined_avx2(x: std::arch::x86_64::__m256) -> std::arch::x86_64::__m256 {
    use std::arch::x86_64::*;
    let estimate = _mm256_rsqrt_ps(x);
    let newton_factor = _mm256_fnmadd_ps(
        _mm256_mul_ps(_mm256_set1_ps(0.5), x),
        _mm256_mul_ps(estimate, estimate),
        _mm256_set1_ps(1.5),
    );
    _mm256_mul_ps(estimate, newton_factor)
}
/// 256-bit counterpart of the refined reciprocal: hardware estimate plus one
/// Newton-Raphson step, with the correction term `2 - x * y` formed by a
/// fused negate-multiply-add.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
#[inline]
unsafe fn rcp_refined_avx2(x: std::arch::x86_64::__m256) -> std::arch::x86_64::__m256 {
    use std::arch::x86_64::*;
    let estimate = _mm256_rcp_ps(x);
    let newton_factor = _mm256_fnmadd_ps(x, estimate, _mm256_set1_ps(2.0));
    _mm256_mul_ps(estimate, newton_factor)
}
/// 512-bit counterpart of the refined reciprocal square root: the
/// `_mm512_rsqrt14_ps` estimate plus one Newton-Raphson step,
/// `y * (1.5 - (0.5 * x) * (y * y))`.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX-512F, AVX-512VL and
/// AVX-512BW.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vl,avx512bw")]
#[inline]
unsafe fn rsqrt_refined_avx512(x: std::arch::x86_64::__m512) -> std::arch::x86_64::__m512 {
    use std::arch::x86_64::*;
    let estimate = _mm512_rsqrt14_ps(x);
    let newton_factor = _mm512_fnmadd_ps(
        _mm512_mul_ps(_mm512_set1_ps(0.5), x),
        _mm512_mul_ps(estimate, estimate),
        _mm512_set1_ps(1.5),
    );
    _mm512_mul_ps(estimate, newton_factor)
}
/// 512-bit counterpart of the refined reciprocal: the `_mm512_rcp14_ps`
/// estimate plus one Newton-Raphson step, `y * (2 - x * y)`.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX-512F, AVX-512VL and
/// AVX-512BW.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vl,avx512bw")]
#[inline]
unsafe fn rcp_refined_avx512(x: std::arch::x86_64::__m512) -> std::arch::x86_64::__m512 {
    use std::arch::x86_64::*;
    let estimate = _mm512_rcp14_ps(x);
    let newton_factor = _mm512_fnmadd_ps(x, estimate, _mm512_set1_ps(2.0));
    _mm512_mul_ps(estimate, newton_factor)
}
/// Encodes a linear `f32` buffer to 8-bit sRGB, using the widest SIMD encoder
/// the running CPU supports.
///
/// The SIMD encoders are only selected when `channels == 4`. Every other
/// channel count, and any CPU without a supported feature set, is handled by
/// [`store_srgb8_f32_serial`].
pub fn store_srgb8_f32(buf: &Buffer<f32>, channels: usize) -> Vec<u8> {
    profiling::scope!("store_srgb8_f32");
    #[cfg(target_arch = "x86_64")]
    {
        if channels == 4 {
            let has_avx512 = is_x86_feature_detected!("avx512f")
                && is_x86_feature_detected!("avx512bw")
                && is_x86_feature_detected!("avx512vl");
            if has_avx512 {
                // SAFETY: the required AVX-512 features were detected just above.
                return unsafe { store_srgb8_f32_avx512::<false>(buf) };
            }
            let has_avx2_fma =
                is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma");
            if has_avx2_fma {
                // SAFETY: AVX2 and FMA were detected just above.
                return unsafe { store_srgb8_f32_avx2_fma::<false>(buf) };
            }
            if is_x86_feature_detected!("sse4.1") {
                // SAFETY: SSE4.1 was detected by the condition above.
                return unsafe { store_srgb8_f32_sse4_1::<false>(buf) };
            }
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        if channels == 4 {
            if std::arch::is_aarch64_feature_detected!("neon") {
                // SAFETY: NEON was detected by the condition above.
                return unsafe { store_srgb8_f32_neon::<false>(buf) };
            }
        }
    }
    store_srgb8_f32_serial(buf, channels)
}
/// Scalar encoder: converts each pixel lane to one output byte.
///
/// Lanes at index 0, 1 and 2 go through `srgb_oetf_fast`; any later lane is
/// only clamped to `[0, 1]`. The result is scaled by 255 and rounded.
#[doc(hidden)]
pub fn store_srgb8_f32_serial(buf: &Buffer<f32>, channels: usize) -> Vec<u8> {
    profiling::scope!("store_srgb8_f32_serial");
    write_pixels(buf, channels, 1, |lanes, bytes| {
        for (channel, (value, out)) in lanes.iter().copied().zip(bytes.iter_mut()).enumerate() {
            let unit = match channel {
                0..=2 => srgb_oetf_fast(value),
                _ => value.clamp(0.0, 1.0),
            };
            *out = (unit * 255.0).round() as u8;
        }
    })
}
/// Encodes a linear `f32` buffer to 8-bit sRGB in B, G, R, A byte order, using
/// the widest SIMD encoder the running CPU supports.
///
/// Falls back to [`store_bgra8_srgb_f32_serial`] when no supported SIMD
/// feature set is detected.
pub fn store_bgra8_srgb_f32(buf: &Buffer<f32>) -> Vec<u8> {
    profiling::scope!("store_bgra8_srgb_f32");
    #[cfg(target_arch = "x86_64")]
    {
        let has_avx512 = is_x86_feature_detected!("avx512f")
            && is_x86_feature_detected!("avx512bw")
            && is_x86_feature_detected!("avx512vl");
        if has_avx512 {
            // SAFETY: the required AVX-512 features were detected just above.
            return unsafe { store_srgb8_f32_avx512::<true>(buf) };
        }
        let has_avx2_fma = is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma");
        if has_avx2_fma {
            // SAFETY: AVX2 and FMA were detected just above.
            return unsafe { store_srgb8_f32_avx2_fma::<true>(buf) };
        }
        if is_x86_feature_detected!("sse4.1") {
            // SAFETY: SSE4.1 was detected by the condition above.
            return unsafe { store_srgb8_f32_sse4_1::<true>(buf) };
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        let has_neon = std::arch::is_aarch64_feature_detected!("neon");
        if has_neon {
            // SAFETY: NEON was detected just above.
            return unsafe { store_srgb8_f32_neon::<true>(buf) };
        }
    }
    store_bgra8_srgb_f32_serial(buf)
}
/// Scalar B, G, R, A encoder.
///
/// For each pixel, output bytes 0/1/2 receive the sRGB-encoded source lanes
/// 2/1/0, and byte 3 receives source lane 3 clamped to `[0, 1]`; all four are
/// scaled by 255 and rounded.
#[doc(hidden)]
pub fn store_bgra8_srgb_f32_serial(buf: &Buffer<f32>) -> Vec<u8> {
    profiling::scope!("store_bgra8_srgb_f32_serial");
    write_pixels(buf, 4, 1, |lanes, bytes| {
        let to_byte = |unit: f32| (unit * 255.0).round() as u8;
        let [blue, green, red, alpha] =
            <&mut [u8; 4]>::try_from(bytes).expect("4-byte pixel");
        *blue = to_byte(srgb_oetf_fast(lanes[2]));
        *green = to_byte(srgb_oetf_fast(lanes[1]));
        *red = to_byte(srgb_oetf_fast(lanes[0]));
        *alpha = to_byte(lanes[3].clamp(0.0, 1.0));
    })
}
/// Scalar three-byte B, G, R encoder.
///
/// For each pixel, output bytes 0/1/2 receive the sRGB-encoded source lanes
/// 2/1/0, scaled by 255 and rounded.
pub fn store_bgr8_srgb_f32(buf: &Buffer<f32>) -> Vec<u8> {
    profiling::scope!("store_bgr8_srgb_f32");
    write_pixels(buf, 3, 1, |lanes, bytes| {
        let to_byte = |linear: f32| (srgb_oetf_fast(linear) * 255.0).round() as u8;
        let [blue, green, red] = <&mut [u8; 3]>::try_from(bytes).expect("3-byte pixel");
        *blue = to_byte(lanes[2]);
        *green = to_byte(lanes[1]);
        *red = to_byte(lanes[0]);
    })
}
/// Encodes one pixel (four `f32` lanes) into four packed 8-bit values.
///
/// Lanes 0-2 are sRGB-encoded: the exact linear segment (`x * 12.92`) below
/// `0.0031308`, and the closed-form approximation
/// `1 / (1 / sqrt(x^(1/4) - A) - B)^3 - C` elsewhere. Lane 3 is only clamped.
/// All four lanes are then scaled by 255 and converted to integers.
///
/// When `BGRA` is `true`, lanes 0 and 2 are swapped before encoding, so the
/// output byte order is lane 2, lane 1, lane 0, lane 3.
///
/// Every lane is clamped to `[0, 1]`. NaN lanes encode as `0`, which is what
/// the scalar path produces (its NaN result becomes `0` in the `as u8` cast).
///
/// Returns the four bytes in the low 32 bits of the vector, lane 0 in the
/// least-significant byte.
///
/// # Safety
///
/// The caller must ensure the running CPU supports SSE4.1.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
#[inline]
unsafe fn encode_srgb_pixel_sse4_1<const BGRA: bool>(lanes: std::arch::x86_64::__m128) -> u32 {
    use std::arch::x86_64::*;
    // Result lanes are [src2, src1, src0, src3]: swaps lanes 0 and 2.
    let lanes = if BGRA {
        _mm_shuffle_ps::<0b11_00_01_10>(lanes, lanes)
    } else {
        lanes
    };
    let zero = _mm_setzero_ps();
    let one = _mm_set1_ps(1.0);
    // Clamp to [0, 1]. The max must run first: MAXPS/MINPS return their second
    // operand when either input is NaN, so `max(NaN, 0)` yields 0, whereas the
    // opposite nesting would turn NaN into 1.0 (and therefore 255).
    let x = _mm_min_ps(_mm_max_ps(lanes, zero), one);
    let coeff_a = _mm_set1_ps(SRGB_OETF_MINIMAX_A);
    let coeff_b = _mm_set1_ps(SRGB_OETF_MINIMAX_B);
    let coeff_c = _mm_set1_ps(SRGB_OETF_MINIMAX_C);
    let linear_scale = _mm_set1_ps(12.92);
    let threshold = _mm_set1_ps(0.003_130_8);
    let scale_255 = _mm_set1_ps(255.0);
    // All-ones in lane 3 only: selects the lane that bypasses the curve.
    let alpha_lane_mask = _mm_castsi128_ps(_mm_setr_epi32(0, 0, 0, -1));
    // x^(1/4) via two square roots.
    let quarter = _mm_sqrt_ps(_mm_sqrt_ps(x));
    // For very small x this difference is negative and the curve lanes become
    // NaN; those lanes are below the threshold and are replaced by `linear`.
    let diff = _mm_sub_ps(quarter, coeff_a);
    let r3 = unsafe { rsqrt_refined_sse4_1(diff) };
    let inner = _mm_sub_ps(r3, coeff_b);
    let cube = _mm_mul_ps(_mm_mul_ps(inner, inner), inner);
    let rcp = unsafe { rcp_refined_sse4_1(cube) };
    let curve = _mm_sub_ps(rcp, coeff_c);
    let linear = _mm_mul_ps(x, linear_scale);
    let use_linear = _mm_cmplt_ps(x, threshold);
    let rgb = _mm_blendv_ps(curve, linear, use_linear);
    // Lane 3 takes the clamped input unchanged.
    let encoded = _mm_blendv_ps(rgb, x, alpha_lane_mask);
    let scaled = _mm_mul_ps(encoded, scale_255);
    // Float -> i32 using the current MXCSR rounding mode, then saturating
    // narrowing i32 -> u16 -> u8.
    let i32s = _mm_cvtps_epi32(scaled);
    let u16s = _mm_packus_epi32(i32s, i32s);
    let u8s = _mm_packus_epi16(u16s, u16s);
    _mm_cvtsi128_si32(u8s) as u32
}
/// SSE4.1 encoder: converts the buffer one pixel at a time with
/// `encode_srgb_pixel_sse4_1`, writing four bytes per pixel.
///
/// `BGRA` selects B, G, R, A output byte order instead of R, G, B, A.
///
/// # Safety
///
/// The caller must ensure the running CPU supports SSE4.1.
#[doc(hidden)]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
pub unsafe fn store_srgb8_f32_sse4_1<const BGRA: bool>(buf: &Buffer<f32>) -> Vec<u8> {
    use std::arch::x86_64::*;
    profiling::scope!("store_srgb8_f32_sse4_1");
    let pixel_count = buf.pixels.len();
    let mut out = vec![0u8; pixel_count * 4];
    let src = buf.pixels.as_ptr() as *const f32;
    let dst = out.as_mut_ptr();
    for pixel in 0..pixel_count {
        // Four floats in, four bytes out: the same element offset addresses
        // both the source floats and the destination bytes.
        let offset = pixel * 4;
        // SAFETY: `offset + 4 <= pixel_count * 4`, which is the length of
        // `out`; the read relies on every `buf.pixels` element being four
        // contiguous `f32`s, as the pointer cast above assumes.
        unsafe {
            let lanes = _mm_loadu_ps(src.add(offset));
            let packed = encode_srgb_pixel_sse4_1::<BGRA>(lanes);
            dst.add(offset).cast::<u32>().write_unaligned(packed);
        }
    }
    out
}
/// Encodes two pixels (eight `f32` lanes) into eight 16-bit values that each
/// hold an 8-bit result.
///
/// Within each pixel, lanes 0-2 are sRGB-encoded: the exact linear segment
/// (`x * 12.92`) below `0.0031308`, and the closed-form approximation
/// `1 / (1 / sqrt(x^(1/4) - A) - B)^3 - C` elsewhere. Lane 3 is only clamped.
/// All lanes are then scaled by 255 and converted to integers.
///
/// When `BGRA` is `true`, lanes 0 and 2 of each pixel are swapped first.
///
/// Every lane is clamped to `[0, 1]`. NaN lanes encode as `0`, which is what
/// the scalar path produces (its NaN result becomes `0` in the `as u8` cast).
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
#[inline]
unsafe fn encode_srgb_pixels_avx2<const BGRA: bool>(
    lanes: std::arch::x86_64::__m256,
) -> std::arch::x86_64::__m128i {
    use std::arch::x86_64::*;
    // The shuffle works within each 128-bit half, i.e. once per pixel, and
    // swaps lanes 0 and 2.
    let lanes = if BGRA {
        _mm256_shuffle_ps::<0b11_00_01_10>(lanes, lanes)
    } else {
        lanes
    };
    let coeff_a = _mm256_set1_ps(SRGB_OETF_MINIMAX_A);
    let coeff_b = _mm256_set1_ps(SRGB_OETF_MINIMAX_B);
    let coeff_c = _mm256_set1_ps(SRGB_OETF_MINIMAX_C);
    let linear_scale = _mm256_set1_ps(12.92);
    let threshold = _mm256_set1_ps(0.003_130_8);
    let scale_255 = _mm256_set1_ps(255.0);
    let zero = _mm256_setzero_ps();
    let one = _mm256_set1_ps(1.0);
    // All-ones in lane 3 of each pixel: the lane that bypasses the curve.
    let alpha_lane_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0, 0, 0, -1, 0, 0, 0, -1));
    // Clamp to [0, 1]. The max must run first: VMAXPS/VMINPS return their
    // second operand when either input is NaN, so `max(NaN, 0)` yields 0,
    // whereas the opposite nesting would turn NaN into 1.0 (and therefore 255).
    let x = _mm256_min_ps(_mm256_max_ps(lanes, zero), one);
    // x^(1/4) via two square roots.
    let quarter = _mm256_sqrt_ps(_mm256_sqrt_ps(x));
    // For very small x this difference is negative and the curve lanes become
    // NaN; those lanes are below the threshold and are replaced by `linear`.
    let diff = _mm256_sub_ps(quarter, coeff_a);
    let r3 = unsafe { rsqrt_refined_avx2(diff) };
    let inner = _mm256_sub_ps(r3, coeff_b);
    let cube = _mm256_mul_ps(_mm256_mul_ps(inner, inner), inner);
    let rcp = unsafe { rcp_refined_avx2(cube) };
    let curve = _mm256_sub_ps(rcp, coeff_c);
    let linear = _mm256_mul_ps(x, linear_scale);
    let use_linear = _mm256_cmp_ps::<_CMP_LT_OQ>(x, threshold);
    let rgb = _mm256_blendv_ps(curve, linear, use_linear);
    // Lane 3 of each pixel takes the clamped input unchanged.
    let encoded = _mm256_blendv_ps(rgb, x, alpha_lane_mask);
    let scaled = _mm256_mul_ps(encoded, scale_255);
    // Float -> i32 using the current MXCSR rounding mode.
    let i32s = _mm256_cvtps_epi32(scaled);
    // Saturating-narrow both pixels into one vector of eight u16 values,
    // first pixel in the low half.
    let lo = _mm256_castsi256_si128(i32s);
    let hi = _mm256_extracti128_si256::<1>(i32s);
    _mm_packus_epi32(lo, hi)
}
/// AVX2+FMA encoder: processes four pixels per main-loop iteration (two
/// 256-bit loads), then a two-pixel step and a one-pixel SSE4.1 step for
/// whatever remains.
///
/// `BGRA` selects B, G, R, A output byte order instead of R, G, B, A.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[doc(hidden)]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
pub unsafe fn store_srgb8_f32_avx2_fma<const BGRA: bool>(buf: &Buffer<f32>) -> Vec<u8> {
    use std::arch::x86_64::*;
    profiling::scope!("store_srgb8_f32_avx2_fma");
    let pixel_count = buf.pixels.len();
    let mut out = vec![0u8; pixel_count * 4];
    let src = buf.pixels.as_ptr() as *const f32;
    let dst = out.as_mut_ptr();
    let full_quads = pixel_count / 4;
    let leftover = pixel_count % 4;
    // Offsets below count source floats and destination bytes alike, because
    // a pixel is four of each.
    // SAFETY: every access stays below `pixel_count * 4`; the reads rely on
    // every `buf.pixels` element being four contiguous `f32`s, as the pointer
    // cast above assumes.
    unsafe {
        for quad in 0..full_quads {
            let base = quad * 16;
            let first_pair = _mm256_loadu_ps(src.add(base));
            let second_pair = _mm256_loadu_ps(src.add(base + 8));
            let packed = _mm_packus_epi16(
                encode_srgb_pixels_avx2::<BGRA>(first_pair),
                encode_srgb_pixels_avx2::<BGRA>(second_pair),
            );
            _mm_storeu_si128(dst.add(base) as *mut __m128i, packed);
        }
        let mut cursor = full_quads * 16;
        if leftover >= 2 {
            // Two remaining pixels: one 256-bit load, store only the low 8 bytes.
            let pair = encode_srgb_pixels_avx2::<BGRA>(_mm256_loadu_ps(src.add(cursor)));
            let packed = _mm_packus_epi16(pair, pair);
            _mm_storel_epi64(dst.add(cursor) as *mut __m128i, packed);
            cursor += 8;
        }
        if leftover % 2 == 1 {
            // One last pixel goes through the 128-bit encoder.
            let single = _mm_loadu_ps(src.add(cursor));
            let packed = encode_srgb_pixel_sse4_1::<BGRA>(single);
            dst.add(cursor).cast::<u32>().write_unaligned(packed);
        }
    }
    out
}
/// Encodes four pixels (sixteen `f32` lanes) into sixteen 8-bit values.
///
/// Within each pixel, lanes 0-2 are sRGB-encoded: the exact linear segment
/// (`x * 12.92`) below `0.0031308`, and the closed-form approximation
/// `1 / (1 / sqrt(x^(1/4) - A) - B)^3 - C` elsewhere. Lane 3 is only clamped.
/// All lanes are then scaled by 255 and converted to integers.
///
/// When `BGRA` is `true`, lanes 0 and 2 of each pixel are swapped first.
///
/// Every lane is clamped to `[0, 1]`. NaN lanes encode as `0`, which is what
/// the scalar path produces (its NaN result becomes `0` in the `as u8` cast).
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX-512F, AVX-512VL and
/// AVX-512BW.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vl,avx512bw")]
#[inline]
unsafe fn encode_srgb_pixels_avx512<const BGRA: bool>(
    lanes: std::arch::x86_64::__m512,
) -> std::arch::x86_64::__m128i {
    use std::arch::x86_64::*;
    // The shuffle works within each 128-bit group, i.e. once per pixel, and
    // swaps lanes 0 and 2.
    let lanes = if BGRA {
        _mm512_shuffle_ps::<0b11_00_01_10>(lanes, lanes)
    } else {
        lanes
    };
    let coeff_a = _mm512_set1_ps(SRGB_OETF_MINIMAX_A);
    let coeff_b = _mm512_set1_ps(SRGB_OETF_MINIMAX_B);
    let coeff_c = _mm512_set1_ps(SRGB_OETF_MINIMAX_C);
    let linear_scale = _mm512_set1_ps(12.92);
    let threshold = _mm512_set1_ps(0.003_130_8);
    let scale_255 = _mm512_set1_ps(255.0);
    let zero = _mm512_setzero_ps();
    let one = _mm512_set1_ps(1.0);
    // Bit 3 of every group of four: the lane that bypasses the curve.
    let alpha_lane_mask: __mmask16 = 0b1000_1000_1000_1000;
    // Clamp to [0, 1]. The max must run first: VMAXPS/VMINPS return their
    // second operand when either input is NaN, so `max(NaN, 0)` yields 0,
    // whereas the opposite nesting would turn NaN into 1.0 (and therefore 255).
    let x = _mm512_min_ps(_mm512_max_ps(lanes, zero), one);
    // x^(1/4) via two square roots.
    let quarter = _mm512_sqrt_ps(_mm512_sqrt_ps(x));
    // For very small x this difference is negative and the curve lanes become
    // NaN; those lanes are below the threshold and are replaced by `linear`.
    let diff = _mm512_sub_ps(quarter, coeff_a);
    let r3 = unsafe { rsqrt_refined_avx512(diff) };
    let inner = _mm512_sub_ps(r3, coeff_b);
    let cube = _mm512_mul_ps(_mm512_mul_ps(inner, inner), inner);
    let rcp = unsafe { rcp_refined_avx512(cube) };
    let curve = _mm512_sub_ps(rcp, coeff_c);
    let linear = _mm512_mul_ps(x, linear_scale);
    let use_linear = _mm512_cmp_ps_mask::<_CMP_LT_OQ>(x, threshold);
    // `mask_blend` takes its second vector where the mask bit is set.
    let rgb = _mm512_mask_blend_ps(use_linear, curve, linear);
    let encoded = _mm512_mask_blend_ps(alpha_lane_mask, rgb, x);
    let scaled = _mm512_mul_ps(encoded, scale_255);
    // Float -> i32 using the current MXCSR rounding mode, then an
    // unsigned-saturating narrow to one byte per lane.
    let i32s = _mm512_cvtps_epi32(scaled);
    _mm512_cvtusepi32_epi8(i32s)
}
/// AVX-512 encoder: processes four pixels per iteration, then handles one to
/// three remaining pixels with a masked load and a masked byte store.
///
/// `BGRA` selects B, G, R, A output byte order instead of R, G, B, A.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX-512F, AVX-512VL and
/// AVX-512BW.
#[doc(hidden)]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vl,avx512bw")]
pub unsafe fn store_srgb8_f32_avx512<const BGRA: bool>(buf: &Buffer<f32>) -> Vec<u8> {
    use std::arch::x86_64::*;
    profiling::scope!("store_srgb8_f32_avx512");
    let pixel_count = buf.pixels.len();
    let mut out = vec![0u8; pixel_count * 4];
    let src = buf.pixels.as_ptr() as *const f32;
    let dst = out.as_mut_ptr();
    let (full_quads, leftover) = (pixel_count / 4, pixel_count % 4);
    // Offsets below count source floats and destination bytes alike, because
    // a pixel is four of each.
    // SAFETY: full-width accesses stay below `full_quads * 16`; the tail only
    // touches the elements enabled in its mask. The reads rely on every
    // `buf.pixels` element being four contiguous `f32`s, as the pointer cast
    // above assumes.
    unsafe {
        for quad in 0..full_quads {
            let base = quad * 16;
            let encoded = encode_srgb_pixels_avx512::<BGRA>(_mm512_loadu_ps(src.add(base)));
            _mm_storeu_si128(dst.add(base) as *mut __m128i, encoded);
        }
        if leftover > 0 {
            let base = full_quads * 16;
            // One mask bit per float on load and per byte on store: four bits
            // for each remaining pixel.
            let live: __mmask16 = (1u16 << (leftover * 4)) - 1;
            let partial = _mm512_maskz_loadu_ps(live, src.add(base));
            let encoded = encode_srgb_pixels_avx512::<BGRA>(partial);
            _mm_mask_storeu_epi8(dst.add(base) as *mut i8, live, encoded);
        }
    }
    out
}
/// Encodes one pixel (four `f32` lanes) into four 16-bit values that each hold
/// an 8-bit result.
///
/// Lanes 0-2 are sRGB-encoded: the exact linear segment (`x * 12.92`) below
/// `0.0031308`, and the closed-form approximation
/// `1 / (1 / sqrt(x^(1/4) - A) - B)^3 - C` elsewhere. Lane 3 is only clamped
/// to `[0, 1]`. All lanes are then scaled by 255 and converted to integers.
///
/// When `BGRA` is `true`, lanes 0 and 2 are swapped before encoding.
///
/// # Safety
///
/// The caller must ensure the running CPU supports NEON.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
unsafe fn encode_srgb_lanes_neon<const BGRA: bool>(
    lanes: std::arch::aarch64::float32x4_t,
) -> std::arch::aarch64::uint16x4_t {
    use std::arch::aarch64::*;
    let ordered = if BGRA {
        // Exchange lanes 0 and 2.
        let red = vgetq_lane_f32::<0>(lanes);
        let blue = vgetq_lane_f32::<2>(lanes);
        vsetq_lane_f32::<2>(red, vsetq_lane_f32::<0>(blue, lanes))
    } else {
        lanes
    };
    let zero = vdupq_n_f32(0.0);
    let one = vdupq_n_f32(1.0);
    let x = vmaxq_f32(vminq_f32(ordered, one), zero);

    // Curve branch, built step by step from x^(1/4).
    let fourth_root = vsqrtq_f32(vsqrtq_f32(x));
    let shifted = vsubq_f32(fourth_root, vdupq_n_f32(SRGB_OETF_MINIMAX_A));
    let inv_sqrt = vdivq_f32(one, vsqrtq_f32(shifted));
    let base = vsubq_f32(inv_sqrt, vdupq_n_f32(SRGB_OETF_MINIMAX_B));
    let cubed = vmulq_f32(vmulq_f32(base, base), base);
    let curve = vsubq_f32(vdivq_f32(one, cubed), vdupq_n_f32(SRGB_OETF_MINIMAX_C));

    // Linear branch and per-lane selection between the two.
    let linear = vmulq_f32(x, vdupq_n_f32(12.92));
    let below_threshold = vcltq_f32(x, vdupq_n_f32(0.003_130_8));
    let rgb = vbslq_f32(below_threshold, linear, curve);

    // Lane 3 takes the clamped input unchanged.
    let lane3_only = vsetq_lane_u32::<3>(u32::MAX, vdupq_n_u32(0));
    let encoded = vbslq_f32(lane3_only, x, rgb);

    let scaled = vmulq_f32(encoded, vdupq_n_f32(255.0));
    // Round to nearest, then saturating-narrow signed 32-bit to unsigned 16-bit.
    vqmovun_s32(vcvtnq_s32_f32(scaled))
}
/// Encodes one pixel with `encode_srgb_lanes_neon` and packs the four results
/// into a `u32`, lane 0 in the least-significant byte.
///
/// # Safety
///
/// The caller must ensure the running CPU supports NEON.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
unsafe fn encode_srgb_pixel_neon<const BGRA: bool>(lanes: std::arch::aarch64::float32x4_t) -> u32 {
    use std::arch::aarch64::*;
    let halfwords = unsafe { encode_srgb_lanes_neon::<BGRA>(lanes) };
    // Duplicate the four values to fill a 128-bit vector, narrow to bytes,
    // and keep the first four.
    let narrowed = vqmovn_u16(vcombine_u16(halfwords, halfwords));
    vget_lane_u32::<0>(vreinterpret_u32_u8(narrowed))
}
/// NEON encoder: processes four pixels per main-loop iteration and stores 16
/// bytes at once, then encodes any remaining pixels one at a time.
///
/// `BGRA` selects B, G, R, A output byte order instead of R, G, B, A.
///
/// # Safety
///
/// The caller must ensure the running CPU supports NEON.
#[doc(hidden)]
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
pub unsafe fn store_srgb8_f32_neon<const BGRA: bool>(buf: &Buffer<f32>) -> Vec<u8> {
    use std::arch::aarch64::*;
    profiling::scope!("store_srgb8_f32_neon");
    let pixel_count = buf.pixels.len();
    let mut out = vec![0u8; pixel_count * 4];
    let src = buf.pixels.as_ptr() as *const f32;
    let dst = out.as_mut_ptr();
    let (full_quads, leftover) = (pixel_count / 4, pixel_count % 4);
    // Offsets below count source floats and destination bytes alike, because
    // a pixel is four of each.
    // SAFETY: every access stays below `pixel_count * 4`; the reads rely on
    // every `buf.pixels` element being four contiguous `f32`s, as the pointer
    // cast above assumes.
    unsafe {
        for quad in 0..full_quads {
            let base = quad * 16;
            let px0 = vld1q_f32(src.add(base));
            let px1 = vld1q_f32(src.add(base + 4));
            let px2 = vld1q_f32(src.add(base + 8));
            let px3 = vld1q_f32(src.add(base + 12));
            let first_half = vqmovn_u16(vcombine_u16(
                encode_srgb_lanes_neon::<BGRA>(px0),
                encode_srgb_lanes_neon::<BGRA>(px1),
            ));
            let second_half = vqmovn_u16(vcombine_u16(
                encode_srgb_lanes_neon::<BGRA>(px2),
                encode_srgb_lanes_neon::<BGRA>(px3),
            ));
            vst1q_u8(dst.add(base), vcombine_u8(first_half, second_half));
        }
        let tail_start = full_quads * 16;
        for pixel in 0..leftover {
            let at = tail_start + pixel * 4;
            let packed = encode_srgb_pixel_neon::<BGRA>(vld1q_f32(src.add(at)));
            dst.add(at).cast::<u32>().write_unaligned(packed);
        }
    }
    out
}
#[cfg(all(test, target_arch = "x86_64"))]
mod simd_tests {
    use super::*;

    // ---- CPU feature probes -------------------------------------------------

    fn has_sse4_1() -> bool {
        is_x86_feature_detected!("sse4.1")
    }

    fn has_avx2_fma() -> bool {
        is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")
    }

    fn has_avx512() -> bool {
        is_x86_feature_detected!("avx512f")
            && is_x86_feature_detected!("avx512bw")
            && is_x86_feature_detected!("avx512vl")
    }

    // ---- Shared fixtures and assertions -------------------------------------

    /// Bytes the LUT-based scalar encoding produces for `pixels`.
    fn reference_bytes<const BGRA: bool>(pixels: &[[f32; 4]]) -> Vec<u8> {
        let quantize = |unit: f32| (unit * 255.0).round() as u8;
        pixels
            .iter()
            .flat_map(|&[r, g, b, a]| {
                let r_u8 = quantize(srgb_oetf_fast(r));
                let g_u8 = quantize(srgb_oetf_fast(g));
                let b_u8 = quantize(srgb_oetf_fast(b));
                let a_u8 = quantize(a.clamp(0.0, 1.0));
                if BGRA {
                    [b_u8, g_u8, r_u8, a_u8]
                } else {
                    [r_u8, g_u8, b_u8, a_u8]
                }
            })
            .collect()
    }

    /// Asserts every byte of `actual` is within one code value of the reference.
    fn assert_within_u8_tolerance<const BGRA: bool>(actual: &[u8], pixels: &[[f32; 4]]) {
        let expected = reference_bytes::<BGRA>(pixels);
        assert_eq!(actual.len(), expected.len());
        for (index, (&got, &want)) in actual.iter().zip(expected.iter()).enumerate() {
            let (pixel, byte) = (index / 4, index % 4);
            assert!(
                got.abs_diff(want) <= 1,
                "pixel {} byte {} got={got} want={want} (BGRA={BGRA})",
                pixel,
                byte,
            );
        }
    }

    /// One pixel per 8-bit code: the colour lanes hold the exactly decoded
    /// linear value, and lane 3 holds `code / 255`.
    fn u8_roundtrip_pixels() -> Vec<[f32; 4]> {
        fn srgb_eotf_exact(c: f32) -> f32 {
            if c <= 0.040_45 {
                c / 12.92
            } else {
                ((c + 0.055) / 1.055).powf(2.4)
            }
        }
        let mut pixels = Vec::with_capacity(256);
        for code in 0..=255u8 {
            let encoded = code as f32 / 255.0;
            let linear = srgb_eotf_exact(encoded);
            pixels.push([linear, linear, linear, encoded]);
        }
        pixels
    }

    /// Asserts that pixel `b` of `bytes` is `[b, b, b, b]` for every code `b`.
    fn assert_roundtrips(bytes: &[u8]) {
        for b in 0..=255u8 {
            let offset = usize::from(b) * 4;
            assert_eq!(bytes[offset], b, "byte 0 roundtrip failed for value {b}");
            assert_eq!(bytes[offset + 1], b, "byte 1 roundtrip failed for value {b}");
            assert_eq!(bytes[offset + 2], b, "byte 2 roundtrip failed for value {b}");
            assert_eq!(bytes[offset + 3], b, "byte 3 roundtrip failed for value {b}");
        }
    }

    /// 1024 pixels sweeping `x` over `[0, 1]`, with a different function of `x`
    /// in each lane.
    fn fine_grid_pixels() -> Vec<[f32; 4]> {
        const SAMPLES: usize = 1024;
        let last = (SAMPLES - 1) as f32;
        (0..SAMPLES)
            .map(|i| i as f32 / last)
            .map(|x| [x, (x * 0.5 + 0.2).clamp(0.0, 1.0), x * x, x])
            .collect()
    }

    /// Wraps `pixels` as a single-row buffer.
    fn buf_from(pixels: Vec<[f32; 4]>) -> Buffer<f32> {
        Buffer {
            width: pixels.len() as u32,
            height: 1,
            pixels,
        }
    }

    /// Seven pixels: one full group of four plus a three-pixel tail.
    fn seven_mixed_pixels() -> Vec<[f32; 4]> {
        vec![
            [0.0, 0.1, 0.5, 1.0],
            [0.25, 0.75, 0.9, 0.5],
            [0.123, 0.456, 0.789, 0.321],
            [0.2, 0.4, 0.6, 0.8],
            [0.01, 0.99, 0.33, 0.77],
            [0.02, 0.98, 0.66, 0.44],
            [0.05, 0.95, 0.5, 0.5],
        ]
    }

    // ---- SSE4.1 ---------------------------------------------------------------

    #[test]
    fn sse4_rgba_matches_lut_within_u8_tolerance() {
        if !has_sse4_1() {
            return;
        }
        let buf = buf_from(fine_grid_pixels());
        let encoded = unsafe { store_srgb8_f32_sse4_1::<false>(&buf) };
        assert_within_u8_tolerance::<false>(&encoded, &buf.pixels);
    }

    #[test]
    fn sse4_bgra_matches_lut_within_u8_tolerance() {
        if !has_sse4_1() {
            return;
        }
        let buf = buf_from(fine_grid_pixels());
        let encoded = unsafe { store_srgb8_f32_sse4_1::<true>(&buf) };
        assert_within_u8_tolerance::<true>(&encoded, &buf.pixels);
    }

    // ---- AVX2 + FMA -----------------------------------------------------------

    #[test]
    fn avx2_rgba_matches_lut_within_u8_tolerance() {
        if !has_avx2_fma() {
            return;
        }
        let buf = buf_from(fine_grid_pixels());
        let encoded = unsafe { store_srgb8_f32_avx2_fma::<false>(&buf) };
        assert_within_u8_tolerance::<false>(&encoded, &buf.pixels);
    }

    #[test]
    fn avx2_bgra_matches_lut_within_u8_tolerance() {
        if !has_avx2_fma() {
            return;
        }
        let buf = buf_from(fine_grid_pixels());
        let encoded = unsafe { store_srgb8_f32_avx2_fma::<true>(&buf) };
        assert_within_u8_tolerance::<true>(&encoded, &buf.pixels);
    }

    // ---- Exact 8-bit round trips ----------------------------------------------

    #[test]
    fn sse4_rgba_u8_roundtrip_is_exact() {
        if !has_sse4_1() {
            return;
        }
        let buf = buf_from(u8_roundtrip_pixels());
        let encoded = unsafe { store_srgb8_f32_sse4_1::<false>(&buf) };
        assert_roundtrips(&encoded);
    }

    #[test]
    fn sse4_bgra_u8_roundtrip_is_exact() {
        if !has_sse4_1() {
            return;
        }
        let buf = buf_from(u8_roundtrip_pixels());
        let encoded = unsafe { store_srgb8_f32_sse4_1::<true>(&buf) };
        assert_roundtrips(&encoded);
    }

    #[test]
    fn avx2_rgba_u8_roundtrip_is_exact() {
        if !has_avx2_fma() {
            return;
        }
        let buf = buf_from(u8_roundtrip_pixels());
        let encoded = unsafe { store_srgb8_f32_avx2_fma::<false>(&buf) };
        assert_roundtrips(&encoded);
    }

    #[test]
    fn avx2_bgra_u8_roundtrip_is_exact() {
        if !has_avx2_fma() {
            return;
        }
        let buf = buf_from(u8_roundtrip_pixels());
        let encoded = unsafe { store_srgb8_f32_avx2_fma::<true>(&buf) };
        assert_roundtrips(&encoded);
    }

    // ---- AVX2 tail handling ---------------------------------------------------

    #[test]
    fn avx2_odd_count_tail_matches_sse4_path() {
        if !has_avx2_fma() {
            return;
        }
        let pixels = vec![
            [0.0, 0.1, 0.5, 1.0],
            [0.25, 0.75, 0.9, 0.5],
            [0.123, 0.456, 0.789, 0.321],
        ];
        let buf = buf_from(pixels.clone());
        for bgra in [false, true] {
            // The third pixel is the odd one out, so it must match what the
            // SSE4.1 encoder produces for that pixel alone.
            let tail_buf = buf_from(vec![pixels[2]]);
            let (avx2, sse4_tail) = if bgra {
                unsafe {
                    (
                        store_srgb8_f32_avx2_fma::<true>(&buf),
                        store_srgb8_f32_sse4_1::<true>(&tail_buf),
                    )
                }
            } else {
                unsafe {
                    (
                        store_srgb8_f32_avx2_fma::<false>(&buf),
                        store_srgb8_f32_sse4_1::<false>(&tail_buf),
                    )
                }
            };
            assert_eq!(&avx2[8..12], &sse4_tail[..], "bgra={bgra}");
        }
    }

    #[test]
    fn avx2_quad_plus_tail_counts_match_lut() {
        if !has_avx2_fma() {
            return;
        }
        let pixels: Vec<[f32; 4]> = (0..11)
            .map(|i| {
                let t = i as f32 / 10.0;
                [t, (t * 0.7).clamp(0.0, 1.0), 1.0 - t, t * t]
            })
            .collect();
        for n in 4usize..=11 {
            let buf = buf_from(pixels[..n].to_vec());
            let rgba = unsafe { store_srgb8_f32_avx2_fma::<false>(&buf) };
            assert_within_u8_tolerance::<false>(&rgba, &buf.pixels);
            let bgra = unsafe { store_srgb8_f32_avx2_fma::<true>(&buf) };
            assert_within_u8_tolerance::<true>(&bgra, &buf.pixels);
        }
    }

    // ---- Channel order and clamping -------------------------------------------

    #[test]
    fn bgra_swaps_r_and_b_bytes() {
        if !has_sse4_1() {
            return;
        }
        let buf = buf_from(vec![[1.0, 0.0, 0.25, 0.5]]);
        let rgba = unsafe { store_srgb8_f32_sse4_1::<false>(&buf) };
        let bgra = unsafe { store_srgb8_f32_sse4_1::<true>(&buf) };
        assert_eq!(bgra[0], rgba[2], "byte 0: BGRA should hold former-B");
        assert_eq!(bgra[1], rgba[1], "byte 1: G unchanged");
        assert_eq!(bgra[2], rgba[0], "byte 2: BGRA should hold former-R");
        assert_eq!(bgra[3], rgba[3], "byte 3: A unchanged");
    }

    #[test]
    fn clamps_out_of_range_inputs() {
        if !has_sse4_1() {
            return;
        }
        let buf = buf_from(vec![[-0.5, 2.0, 0.5, -0.1], [1.5, -1.0, 0.0, 1.2]]);
        let got = unsafe { store_srgb8_f32_sse4_1::<false>(&buf) };
        // First pixel: lanes 0, 1 and 3.
        assert_eq!(got[0], 0);
        assert_eq!(got[1], 255);
        assert_eq!(got[3], 0);
        // Second pixel: lanes 0, 1 and 3.
        assert_eq!(got[4], 255);
        assert_eq!(got[5], 0);
        assert_eq!(got[7], 255);
    }

    // ---- AVX-512 --------------------------------------------------------------

    #[test]
    fn avx512_rgba_matches_lut_within_u8_tolerance() {
        if !has_avx512() {
            return;
        }
        let buf = buf_from(fine_grid_pixels());
        let encoded = unsafe { store_srgb8_f32_avx512::<false>(&buf) };
        assert_within_u8_tolerance::<false>(&encoded, &buf.pixels);
    }

    #[test]
    fn avx512_bgra_matches_lut_within_u8_tolerance() {
        if !has_avx512() {
            return;
        }
        let buf = buf_from(fine_grid_pixels());
        let encoded = unsafe { store_srgb8_f32_avx512::<true>(&buf) };
        assert_within_u8_tolerance::<true>(&encoded, &buf.pixels);
    }

    #[test]
    fn avx512_rgba_u8_roundtrip_is_exact() {
        if !has_avx512() {
            return;
        }
        let buf = buf_from(u8_roundtrip_pixels());
        let encoded = unsafe { store_srgb8_f32_avx512::<false>(&buf) };
        assert_roundtrips(&encoded);
    }

    #[test]
    fn avx512_bgra_u8_roundtrip_is_exact() {
        if !has_avx512() {
            return;
        }
        let buf = buf_from(u8_roundtrip_pixels());
        let encoded = unsafe { store_srgb8_f32_avx512::<true>(&buf) };
        assert_roundtrips(&encoded);
    }

    #[test]
    fn avx512_tail_matches_lut_within_u8_tolerance() {
        if !has_avx512() {
            return;
        }
        let buf = buf_from(seven_mixed_pixels());
        let rgba = unsafe { store_srgb8_f32_avx512::<false>(&buf) };
        assert_within_u8_tolerance::<false>(&rgba, &buf.pixels);
        let bgra = unsafe { store_srgb8_f32_avx512::<true>(&buf) };
        assert_within_u8_tolerance::<true>(&bgra, &buf.pixels);
    }

    #[test]
    fn avx512_multiple_main_plus_tail_matches_lut() {
        if !has_avx512() {
            return;
        }
        let pixels: Vec<[f32; 4]> = (0..15)
            .map(|i| {
                let t = i as f32 / 14.0;
                [t, (t * 0.7).clamp(0.0, 1.0), 1.0 - t, t * t]
            })
            .collect();
        let buf = buf_from(pixels);
        let rgba = unsafe { store_srgb8_f32_avx512::<false>(&buf) };
        assert_within_u8_tolerance::<false>(&rgba, &buf.pixels);
        let bgra = unsafe { store_srgb8_f32_avx512::<true>(&buf) };
        assert_within_u8_tolerance::<true>(&bgra, &buf.pixels);
    }
}
#[cfg(all(test, target_arch = "aarch64"))]
mod neon_tests {
    use super::*;

    fn has_neon() -> bool {
        std::arch::is_aarch64_feature_detected!("neon")
    }

    // ---- Shared fixtures and assertions -------------------------------------

    /// Bytes the LUT-based scalar encoding produces for `pixels`.
    fn reference_bytes<const BGRA: bool>(pixels: &[[f32; 4]]) -> Vec<u8> {
        let quantize = |unit: f32| (unit * 255.0).round() as u8;
        pixels
            .iter()
            .flat_map(|&[r, g, b, a]| {
                let r_u8 = quantize(srgb_oetf_fast(r));
                let g_u8 = quantize(srgb_oetf_fast(g));
                let b_u8 = quantize(srgb_oetf_fast(b));
                let a_u8 = quantize(a.clamp(0.0, 1.0));
                if BGRA {
                    [b_u8, g_u8, r_u8, a_u8]
                } else {
                    [r_u8, g_u8, b_u8, a_u8]
                }
            })
            .collect()
    }

    /// Asserts every byte of `actual` is within one code value of the reference.
    fn assert_within_u8_tolerance<const BGRA: bool>(actual: &[u8], pixels: &[[f32; 4]]) {
        let expected = reference_bytes::<BGRA>(pixels);
        assert_eq!(actual.len(), expected.len());
        for (index, (&got, &want)) in actual.iter().zip(expected.iter()).enumerate() {
            let (pixel, byte) = (index / 4, index % 4);
            assert!(
                got.abs_diff(want) <= 1,
                "pixel {} byte {} got={got} want={want} (BGRA={BGRA})",
                pixel,
                byte,
            );
        }
    }

    /// One pixel per 8-bit code: the colour lanes hold the exactly decoded
    /// linear value, and lane 3 holds `code / 255`.
    fn u8_roundtrip_pixels() -> Vec<[f32; 4]> {
        fn srgb_eotf_exact(c: f32) -> f32 {
            if c <= 0.040_45 {
                c / 12.92
            } else {
                ((c + 0.055) / 1.055).powf(2.4)
            }
        }
        let mut pixels = Vec::with_capacity(256);
        for code in 0..=255u8 {
            let encoded = code as f32 / 255.0;
            let linear = srgb_eotf_exact(encoded);
            pixels.push([linear, linear, linear, encoded]);
        }
        pixels
    }

    /// Asserts that pixel `b` of `bytes` is `[b, b, b, b]` for every code `b`.
    fn assert_roundtrips(bytes: &[u8]) {
        for b in 0..=255u8 {
            let offset = usize::from(b) * 4;
            assert_eq!(bytes[offset], b, "byte 0 roundtrip failed for value {b}");
            assert_eq!(bytes[offset + 1], b, "byte 1 roundtrip failed for value {b}");
            assert_eq!(bytes[offset + 2], b, "byte 2 roundtrip failed for value {b}");
            assert_eq!(bytes[offset + 3], b, "byte 3 roundtrip failed for value {b}");
        }
    }

    /// 1024 pixels sweeping `x` over `[0, 1]`, with a different function of `x`
    /// in each lane.
    fn fine_grid_pixels() -> Vec<[f32; 4]> {
        const SAMPLES: usize = 1024;
        let last = (SAMPLES - 1) as f32;
        (0..SAMPLES)
            .map(|i| i as f32 / last)
            .map(|x| [x, (x * 0.5 + 0.2).clamp(0.0, 1.0), x * x, x])
            .collect()
    }

    /// Wraps `pixels` as a single-row buffer.
    fn buf_from(pixels: Vec<[f32; 4]>) -> Buffer<f32> {
        Buffer {
            width: pixels.len() as u32,
            height: 1,
            pixels,
        }
    }

    // ---- Tests ----------------------------------------------------------------

    #[test]
    fn neon_rgba_matches_lut_within_u8_tolerance() {
        if !has_neon() {
            return;
        }
        let buf = buf_from(fine_grid_pixels());
        let encoded = unsafe { store_srgb8_f32_neon::<false>(&buf) };
        assert_within_u8_tolerance::<false>(&encoded, &buf.pixels);
    }

    #[test]
    fn neon_bgra_matches_lut_within_u8_tolerance() {
        if !has_neon() {
            return;
        }
        let buf = buf_from(fine_grid_pixels());
        let encoded = unsafe { store_srgb8_f32_neon::<true>(&buf) };
        assert_within_u8_tolerance::<true>(&encoded, &buf.pixels);
    }

    #[test]
    fn neon_rgba_u8_roundtrip_is_exact() {
        if !has_neon() {
            return;
        }
        let buf = buf_from(u8_roundtrip_pixels());
        let encoded = unsafe { store_srgb8_f32_neon::<false>(&buf) };
        assert_roundtrips(&encoded);
    }

    #[test]
    fn neon_bgra_u8_roundtrip_is_exact() {
        if !has_neon() {
            return;
        }
        let buf = buf_from(u8_roundtrip_pixels());
        let encoded = unsafe { store_srgb8_f32_neon::<true>(&buf) };
        assert_roundtrips(&encoded);
    }

    #[test]
    fn neon_tail_matches_lut_within_u8_tolerance() {
        if !has_neon() {
            return;
        }
        // Seven pixels: one full group of four plus a three-pixel tail.
        let buf = buf_from(vec![
            [0.0, 0.1, 0.5, 1.0],
            [0.25, 0.75, 0.9, 0.5],
            [0.123, 0.456, 0.789, 0.321],
            [0.2, 0.4, 0.6, 0.8],
            [0.01, 0.99, 0.33, 0.77],
            [0.02, 0.98, 0.66, 0.44],
            [0.05, 0.95, 0.5, 0.5],
        ]);
        let rgba = unsafe { store_srgb8_f32_neon::<false>(&buf) };
        assert_within_u8_tolerance::<false>(&rgba, &buf.pixels);
        let bgra = unsafe { store_srgb8_f32_neon::<true>(&buf) };
        assert_within_u8_tolerance::<true>(&bgra, &buf.pixels);
    }

    #[test]
    fn neon_bgra_swaps_r_and_b_bytes() {
        if !has_neon() {
            return;
        }
        let buf = buf_from(vec![[1.0, 0.0, 0.25, 0.5]]);
        let rgba = unsafe { store_srgb8_f32_neon::<false>(&buf) };
        let bgra = unsafe { store_srgb8_f32_neon::<true>(&buf) };
        // Bytes 0 and 2 trade places; bytes 1 and 3 stay put.
        assert_eq!(bgra[0], rgba[2]);
        assert_eq!(bgra[1], rgba[1]);
        assert_eq!(bgra[2], rgba[0]);
        assert_eq!(bgra[3], rgba[3]);
    }
}