#[cfg(target_arch = "wasm32")]
use core::arch::wasm32::*;
/// Converts one row of interleaved RGBA bytes to 8-bit grayscale with WebAssembly SIMD.
///
/// One gray byte is written for every element of `output`; pixel `p` is read from
/// `input[4 * p]`, `input[4 * p + 1]` and `input[4 * p + 2]` (the fourth byte of each pixel is
/// never used). Bytes of `input` beyond `4 * output.len()` are ignored.
///
/// Pixels are processed four at a time as `(77 * R + 150 * G + 29 * B) >> 8`. The last
/// `output.len() % 4` pixels are delegated to `super::color::rgba_to_gray`.
/// NOTE(review): that helper is not visible here — confirm it uses the same weights so the tail
/// of a row matches the vectorized part.
///
/// # Panics
///
/// Panics if `input` holds fewer than `4 * output.len()` bytes.
///
/// # Safety
///
/// The function is compiled with the `simd128` target feature enabled; the caller must make
/// sure that feature is available where the code runs.
#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
pub unsafe fn row_rgba_to_gray_wasm(input: &[u8], output: &mut [u8]) {
    const BYTES_PER_PIXEL: usize = 4;
    const PIXELS_PER_VECTOR: usize = 4;

    let pixel_count = output.len();
    // Validate once, up front: the vector loop below uses a raw 16-byte load that would
    // otherwise read past the end of a too-short `input`. Dividing (rather than multiplying
    // `pixel_count`) keeps the comparison free of `usize` overflow.
    assert!(
        input.len() / BYTES_PER_PIXEL >= pixel_count,
        "input has {} bytes but {} output pixels need 4 bytes each",
        input.len(),
        pixel_count
    );
    let input = &input[..pixel_count * BYTES_PER_PIXEL];

    let coeff_r = i16x8_splat(77);
    let coeff_g = i16x8_splat(150);
    let coeff_b = i16x8_splat(29);
    let zero = i8x16_splat(0);

    let mut rgba_groups = input.chunks_exact(BYTES_PER_PIXEL * PIXELS_PER_VECTOR);
    let mut gray_groups = output.chunks_exact_mut(PIXELS_PER_VECTOR);

    // Both iterators yield exactly `pixel_count / 4` items, so the zip never drops a chunk.
    for (rgba_bytes, gray) in (&mut rgba_groups).zip(&mut gray_groups) {
        // SAFETY: `rgba_bytes` is a 16-byte slice produced by `chunks_exact(16)`, so the whole
        // 128-bit read is in bounds; `v128_load` has no alignment requirement.
        let rgba = unsafe { v128_load(rgba_bytes.as_ptr() as *const v128) };

        // Gather one channel of the four pixels into the low four 16-bit lanes. Indices >= 16
        // select bytes of `zero`, which zero-extends each byte and clears the upper lanes.
        let r_bytes = i8x16_shuffle::<0, 16, 4, 16, 8, 16, 12, 16, 16, 16, 16, 16, 16, 16, 16, 16>(
            rgba, zero,
        );
        let g_bytes = i8x16_shuffle::<1, 16, 5, 16, 9, 16, 13, 16, 16, 16, 16, 16, 16, 16, 16, 16>(
            rgba, zero,
        );
        let b_bytes = i8x16_shuffle::<2, 16, 6, 16, 10, 16, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16>(
            rgba, zero,
        );

        // The weights sum to 256, so the largest possible total is 255 * 256 = 65280. That
        // exceeds `i16::MAX`, but the lane arithmetic wraps modulo 2^16, so the bit pattern is
        // the correct unsigned value; the logical (unsigned) shift then divides by 256.
        let r_prod = i16x8_mul(r_bytes, coeff_r);
        let g_prod = i16x8_mul(g_bytes, coeff_g);
        let b_prod = i16x8_mul(b_bytes, coeff_b);
        let sum = i16x8_add(i16x8_add(r_prod, g_prod), b_prod);
        let shifted = u16x8_shr(sum, 8);

        gray[0] = u16x8_extract_lane::<0>(shifted) as u8;
        gray[1] = u16x8_extract_lane::<1>(shifted) as u8;
        gray[2] = u16x8_extract_lane::<2>(shifted) as u8;
        gray[3] = u16x8_extract_lane::<3>(shifted) as u8;
    }

    // Scalar tail: the `pixel_count % 4` pixels that do not fill a whole vector.
    let tail_pixels = rgba_groups.remainder().chunks_exact(BYTES_PER_PIXEL);
    for (pixel, gray) in tail_pixels.zip(gray_groups.into_remainder()) {
        *gray = super::color::rgba_to_gray(pixel[0], pixel[1], pixel[2]);
    }
}
/// Scales the brightness of one row of interleaved 4-byte pixels in place.
///
/// For every complete pixel, the first three bytes are multiplied by `factor`, clamped to
/// `0.0..=255.0` and converted back to `u8` with an `as` cast; the fourth byte is left as is.
/// Any bytes after the last complete pixel (`row.len() % 4` of them) are not modified.
///
/// Complete groups of four pixels go through `f32x4` arithmetic, one channel at a time; the
/// up-to-three pixels left over are handled with scalar `f32` math.
///
/// # Safety
///
/// The function is compiled with the `simd128` target feature enabled; the caller must make
/// sure that feature is available where the code runs.
#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
pub unsafe fn row_brightness_wasm(row: &mut [u8], factor: f32) {
    const PIXEL_BYTES: usize = 4;
    const GROUP_BYTES: usize = 4 * PIXEL_BYTES;
    // Only the first three bytes of a pixel are scaled.
    const SCALED_CHANNELS: usize = 3;

    let scale = f32x4_splat(factor);
    let floor = f32x4_splat(0.0);
    let ceiling = f32x4_splat(255.0);

    let mut groups = row.chunks_exact_mut(GROUP_BYTES);
    for group in &mut groups {
        for channel in 0..SCALED_CHANNELS {
            // The same channel of four consecutive pixels sits 4 bytes apart.
            let lanes = f32x4(
                group[channel] as f32,
                group[channel + PIXEL_BYTES] as f32,
                group[channel + 2 * PIXEL_BYTES] as f32,
                group[channel + 3 * PIXEL_BYTES] as f32,
            );
            let scaled = f32x4_mul(lanes, scale);
            let clamped = f32x4_max(floor, f32x4_min(ceiling, scaled));

            group[channel] = f32x4_extract_lane::<0>(clamped) as u8;
            group[channel + PIXEL_BYTES] = f32x4_extract_lane::<1>(clamped) as u8;
            group[channel + 2 * PIXEL_BYTES] = f32x4_extract_lane::<2>(clamped) as u8;
            group[channel + 3 * PIXEL_BYTES] = f32x4_extract_lane::<3>(clamped) as u8;
        }
    }

    // Scalar tail: whole pixels that did not fill a group of four.
    for pixel in groups.into_remainder().chunks_exact_mut(PIXEL_BYTES) {
        for value in &mut pixel[..SCALED_CHANNELS] {
            *value = (*value as f32 * factor).clamp(0.0, 255.0) as u8;
        }
    }
}