#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
/// Converts a row of interleaved RGBA bytes into one gray byte per pixel using SSE2.
///
/// `output.len()` determines how many pixels are converted. For each pixel, four
/// bytes are consumed from `input` in R, G, B, A order; the fourth byte is ignored.
/// Full groups of four pixels are computed as `(77 * R + 150 * G + 29 * B) >> 8`.
/// Any leftover pixels (fewer than four) are delegated to
/// `super::color::rgba_to_gray`.
///
/// # Panics
///
/// Panics if `input` holds fewer than `output.len()` whole 4-byte pixels.
///
/// # Safety
///
/// The caller must ensure the executing CPU supports SSE2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
pub unsafe fn row_rgba_to_gray_sse2(input: &[u8], output: &mut [u8]) {
    let pixel_count = output.len();
    // Check once up front so a short `input` is a panic rather than an
    // out-of-bounds vector load. Dividing avoids overflow in `pixel_count * 4`.
    assert!(
        input.len() / 4 >= pixel_count,
        "input holds {} RGBA pixels but output expects {}",
        input.len() / 4,
        pixel_count
    );

    // Loop-invariant constants.
    let zero = _mm_setzero_si128();
    let coeff_r = _mm_set1_epi16(77);
    let coeff_g = _mm_set1_epi16(150);
    let coeff_b = _mm_set1_epi16(29);
    let mask_byte0 = _mm_set1_epi32(0xFF);

    let simd_pixels = pixel_count - pixel_count % 4;
    let (simd_out, tail_out) = output.split_at_mut(simd_pixels);

    // `zip` stops after the last full output group; the assert above guarantees
    // `input` has at least that many 16-byte groups.
    for (src, dst) in input.chunks_exact(16).zip(simd_out.chunks_exact_mut(4)) {
        // SAFETY: `chunks_exact(16)` yields slices of exactly 16 bytes, and
        // `_mm_loadu_si128` has no alignment requirement.
        let rgba = _mm_loadu_si128(src.as_ptr() as *const __m128i);

        // Isolate each channel as the low byte of every 32-bit pixel lane.
        let r_32 = _mm_and_si128(rgba, mask_byte0);
        let g_32 = _mm_and_si128(_mm_srli_epi32(rgba, 8), mask_byte0);
        let b_32 = _mm_and_si128(_mm_srli_epi32(rgba, 16), mask_byte0);

        // Narrow to 16-bit lanes; values are 0..=255 so the signed saturation
        // never triggers. The four pixels end up in the low four lanes.
        let r_16 = _mm_packs_epi32(r_32, zero);
        let g_16 = _mm_packs_epi32(g_32, zero);
        let b_16 = _mm_packs_epi32(b_32, zero);

        // The weights sum to 256, so the weighted sum is at most 255 * 256 and
        // fits in an unsigned 16-bit lane. Individual products may exceed
        // `i16::MAX`, but only the low 16 bits are kept and the final shift is
        // logical, so the unsigned interpretation is preserved.
        let r_prod = _mm_mullo_epi16(r_16, coeff_r);
        let g_prod = _mm_mullo_epi16(g_16, coeff_g);
        let b_prod = _mm_mullo_epi16(b_16, coeff_b);
        let sum = _mm_add_epi16(_mm_add_epi16(r_prod, g_prod), b_prod);
        let shifted = _mm_srli_epi16(sum, 8);

        // Pack the four 0..=255 results into the low four bytes and store them
        // together instead of extracting one lane at a time.
        let packed = _mm_packus_epi16(shifted, zero);
        dst.copy_from_slice(&_mm_cvtsi128_si32(packed).to_le_bytes());
    }

    // Leftover pixels (at most three) go through the scalar helper.
    for (offset, gray) in tail_out.iter_mut().enumerate() {
        let idx = (simd_pixels + offset) * 4;
        *gray = super::color::rgba_to_gray(input[idx], input[idx + 1], input[idx + 2]);
    }
}
/// Scales the first three bytes of every 4-byte pixel in `row` by `factor`, in place.
///
/// Each scaled value is clamped to `0.0..=255.0` and then truncated to `u8`.
/// The fourth byte of every pixel is left untouched, as are any trailing bytes
/// that do not form a whole 4-byte pixel. Pixels are processed four at a time
/// with SSE2 float arithmetic; leftover pixels use plain scalar arithmetic.
///
/// # Safety
///
/// The caller must ensure the executing CPU supports SSE2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
pub unsafe fn row_brightness_sse2(row: &mut [u8], factor: f32) {
    const BYTES_PER_PIXEL: usize = 4;
    const GROUP_BYTES: usize = 4 * BYTES_PER_PIXEL;

    let whole_pixels = row.len() / BYTES_PER_PIXEL;
    let grouped_bytes = (whole_pixels / 4) * GROUP_BYTES;

    let gain = _mm_set1_ps(factor);
    let floor = _mm_setzero_ps();
    let ceiling = _mm_set1_ps(255.0);

    let (grouped, rest) = row.split_at_mut(grouped_bytes);

    // Vector path: one channel of four consecutive pixels per SSE register.
    for group in grouped.chunks_exact_mut(GROUP_BYTES) {
        for channel in 0..3usize {
            // `_mm_set_ps` takes lanes from highest to lowest, so pixel 0 is last.
            let lanes = _mm_set_ps(
                f32::from(group[12 + channel]),
                f32::from(group[8 + channel]),
                f32::from(group[4 + channel]),
                f32::from(group[channel]),
            );
            let limited = _mm_min_ps(_mm_max_ps(_mm_mul_ps(lanes, gain), floor), ceiling);

            let mut lanes_out = [0.0f32; 4];
            _mm_storeu_ps(lanes_out.as_mut_ptr(), limited);

            for (pixel, value) in lanes_out.iter().enumerate() {
                group[pixel * BYTES_PER_PIXEL + channel] = *value as u8;
            }
        }
    }

    // Scalar path: the remaining whole pixels. `chunks_exact_mut` skips any
    // trailing bytes that do not make up a full pixel.
    for pixel in rest.chunks_exact_mut(BYTES_PER_PIXEL) {
        for sample in &mut pixel[..3] {
            *sample = (f32::from(*sample) * factor).clamp(0.0, 255.0) as u8;
        }
    }
}
#[cfg(all(test, target_arch = "x86_64"))]
mod tests {
    use super::*;

    /// Compares the SSE2 grayscale conversion against the scalar row helper on
    /// a 16-pixel gradient, allowing a difference of at most 1 per pixel.
    #[test]
    fn test_sse2_rgba_to_gray() {
        const PIXELS: usize = 16;

        let mut input = vec![0u8; PIXELS * 4];
        for (p, px) in input.chunks_exact_mut(4).enumerate() {
            px.copy_from_slice(&[(p * 16) as u8, (255 - p * 16) as u8, (p * 8) as u8, 255]);
        }

        let mut output_simd = vec![0u8; PIXELS];
        let mut output_scalar = vec![0u8; PIXELS];

        // SAFETY: the function only requires SSE2 support from the CPU.
        unsafe {
            row_rgba_to_gray_sse2(&input, &mut output_simd);
        }
        super::super::color::row_rgba_to_gray_scalar(&input, &mut output_scalar);

        for (i, (&simd, &scalar)) in output_simd.iter().zip(&output_scalar).enumerate() {
            let delta = (i32::from(simd) - i32::from(scalar)).abs();
            assert!(delta <= 1, "Pixel {}: SSE2={} Scalar={}", i, simd, scalar);
        }
    }

    /// Compares the SSE2 brightness routine against the scalar row helper on
    /// four pixels with factor 1.5, allowing a difference of at most 1 per byte.
    #[test]
    fn test_sse2_brightness() {
        let mut row_simd = vec![
            100u8, 150, 200, 255, // pixel 0
            50, 100, 200, 255, // pixel 1
            0, 0, 0, 255, // pixel 2
            255, 255, 255, 255, // pixel 3
        ];
        let mut row_scalar = row_simd.clone();

        // SAFETY: the function only requires SSE2 support from the CPU.
        unsafe {
            row_brightness_sse2(&mut row_simd, 1.5);
        }
        super::super::color::row_brightness(&mut row_scalar, 1.5);

        for (i, (&simd, &scalar)) in row_simd.iter().zip(&row_scalar).enumerate() {
            let delta = (i32::from(simd) - i32::from(scalar)).abs();
            assert!(delta <= 1, "Byte {}: SSE2={} Scalar={}", i, simd, scalar);
        }
    }
}