#[cfg(target_arch = "x86_64")]
use archmage::{SimdToken, arcane};
#[cfg(target_arch = "x86_64")]
use safe_unaligned_simd::x86_64 as safe_simd;
pub fn upsample_h2v2_i16_nearest(
input: &[i16],
in_width: usize,
in_height: usize,
output: &mut [i16],
out_width: usize,
out_height: usize,
) {
for out_y in 0..out_height {
let in_y = (out_y / 2).min(in_height.saturating_sub(1));
let out_row = out_y * out_width;
let in_row = in_y * in_width;
for out_x in 0..out_width {
let in_x = (out_x / 2).min(in_width.saturating_sub(1));
output[out_row + out_x] = input[in_row + in_x];
}
}
}
/// Nearest-neighbor 2x horizontal upsample for tightly packed buffers.
///
/// Thin wrapper around [`upsample_h2v1_i16_nearest_strided`] with both
/// strides equal to the respective row widths.
pub fn upsample_h2v1_i16_nearest(
    input: &[i16],
    in_width: usize,
    in_height: usize,
    output: &mut [i16],
    out_width: usize,
    out_height: usize,
) {
    // Packed layout: stride == width on both sides.
    upsample_h2v1_i16_nearest_strided(
        input,
        in_width,
        in_width,
        in_height,
        output,
        out_width,
        out_width,
        out_height,
    );
}
/// Nearest-neighbor 2x vertical upsample for tightly packed buffers.
///
/// Thin wrapper around [`upsample_h1v2_i16_nearest_strided`] with both
/// strides equal to the respective row widths.
pub fn upsample_h1v2_i16_nearest(
    input: &[i16],
    in_width: usize,
    in_height: usize,
    output: &mut [i16],
    out_width: usize,
    out_height: usize,
) {
    // Packed layout: stride == width on both sides.
    upsample_h1v2_i16_nearest_strided(
        input,
        in_width,
        in_width,
        in_height,
        output,
        out_width,
        out_width,
        out_height,
    );
}
/// Nearest-neighbor 2x horizontal upsample on strided buffers.
///
/// Rows map 1:1 vertically; each output column reads input column
/// `out_x / 2`, clamped to the last input column.
pub fn upsample_h2v1_i16_nearest_strided(
    input: &[i16],
    in_width: usize,
    in_stride: usize,
    in_height: usize,
    output: &mut [i16],
    out_width: usize,
    out_stride: usize,
    out_height: usize,
) {
    let max_x = in_width.saturating_sub(1);
    let max_y = in_height.saturating_sub(1);
    for dst_y in 0..out_height {
        let src_base = dst_y.min(max_y) * in_stride;
        let dst_base = dst_y * out_stride;
        for dst_x in 0..out_width {
            // Halve the horizontal coordinate, clamping at the right edge.
            let src_x = (dst_x / 2).min(max_x);
            output[dst_base + dst_x] = input[src_base + src_x];
        }
    }
}
/// Nearest-neighbor 2x vertical upsample on strided buffers.
///
/// Columns map 1:1 horizontally; each output row reads input row
/// `out_y / 2`, clamped to the last input row.
pub fn upsample_h1v2_i16_nearest_strided(
    input: &[i16],
    in_width: usize,
    in_stride: usize,
    in_height: usize,
    output: &mut [i16],
    out_width: usize,
    out_stride: usize,
    out_height: usize,
) {
    let max_x = in_width.saturating_sub(1);
    let max_y = in_height.saturating_sub(1);
    for dst_y in 0..out_height {
        // Halve the vertical coordinate, clamping at the bottom edge.
        let src_base = (dst_y / 2).min(max_y) * in_stride;
        let dst_base = dst_y * out_stride;
        for dst_x in 0..out_width {
            output[dst_base + dst_x] = input[src_base + dst_x.min(max_x)];
        }
    }
}
/// Nearest-neighbor 2x upsample in both axes on strided buffers.
///
/// Each output pixel reads input pixel `(out_x / 2, out_y / 2)`, clamped to
/// the image edges.
pub fn upsample_h2v2_i16_nearest_strided(
    input: &[i16],
    in_width: usize,
    in_stride: usize,
    in_height: usize,
    output: &mut [i16],
    out_width: usize,
    out_stride: usize,
    out_height: usize,
) {
    let max_x = in_width.saturating_sub(1);
    let max_y = in_height.saturating_sub(1);
    for dst_y in 0..out_height {
        let src_base = (dst_y / 2).min(max_y) * in_stride;
        let dst_base = dst_y * out_stride;
        for dst_x in 0..out_width {
            output[dst_base + dst_x] = input[src_base + (dst_x / 2).min(max_x)];
        }
    }
}
/// h2v1 libjpeg-style ("fancy") upsample for tightly packed buffers.
///
/// Thin wrapper around [`upsample_h2v1_i16_libjpeg_strided`] with both
/// strides equal to the respective row widths.
pub fn upsample_h2v1_i16_libjpeg(
    input: &[i16],
    in_width: usize,
    in_height: usize,
    output: &mut [i16],
    out_width: usize,
    out_height: usize,
) {
    // Packed layout: stride == width on both sides.
    upsample_h2v1_i16_libjpeg_strided(
        input,
        in_width,
        in_width,
        in_height,
        output,
        out_width,
        out_width,
        out_height,
    );
}
/// h1v2 libjpeg-style ("fancy") upsample for tightly packed buffers.
///
/// Thin wrapper around [`upsample_h1v2_i16_libjpeg_strided`] with both
/// strides equal to the respective row widths.
pub fn upsample_h1v2_i16_libjpeg(
    input: &[i16],
    in_width: usize,
    in_height: usize,
    output: &mut [i16],
    out_width: usize,
    out_height: usize,
) {
    // Packed layout: stride == width on both sides.
    upsample_h1v2_i16_libjpeg_strided(
        input,
        in_width,
        in_width,
        in_height,
        output,
        out_width,
        out_width,
        out_height,
    );
}
/// h2v1 (horizontal 2x) upsample with libjpeg-style "fancy" filtering on
/// strided buffers.
///
/// Interior output pixels are `(3 * nearest + neighbour + bias) >> 2` with
/// alternating rounding biases 1/2; the first and last output columns copy
/// the edge samples. Rows map 1:1 vertically (clamped if `out_height`
/// exceeds `in_height`).
pub fn upsample_h2v1_i16_libjpeg_strided(
    input: &[i16],
    in_width: usize,
    in_stride: usize,
    in_height: usize,
    output: &mut [i16],
    out_width: usize,
    out_stride: usize,
    out_height: usize,
) {
    // Degenerate sizes: nothing to read or nothing to write. Guarding
    // `out_width == 0` also protects the unconditional first-column write
    // below, which previously panicked on an empty output row; the h2v2
    // entry points guard out_width the same way.
    if in_width == 0 || in_height == 0 || out_width == 0 {
        return;
    }
    for out_y in 0..out_height {
        let in_y = out_y.min(in_height - 1);
        let out_row = out_y * out_stride;
        let in_row = in_y * in_stride;
        // Single input column: replicate it into (up to) two outputs.
        if in_width == 1 {
            let val = input[in_row];
            output[out_row] = val;
            if out_width > 1 {
                output[out_row + 1] = val;
            }
            continue;
        }
        // First input pixel: left output copies it, right output blends
        // toward the next input pixel.
        let curr = input[in_row] as i32;
        let next = input[in_row + 1] as i32;
        output[out_row] = curr as i16;
        if out_width > 1 {
            output[out_row + 1] = ((curr * 3 + next + 2) >> 2) as i16;
        }
        // Interior input pixels each produce a left and a right output.
        for in_x in 1..in_width - 1 {
            let prev = input[in_row + in_x - 1] as i32;
            let curr = input[in_row + in_x] as i32;
            let next = input[in_row + in_x + 1] as i32;
            let left_out = in_x * 2;
            let right_out = left_out + 1;
            if left_out < out_width {
                output[out_row + left_out] = ((curr * 3 + prev + 1) >> 2) as i16;
            }
            if right_out < out_width {
                output[out_row + right_out] = ((curr * 3 + next + 2) >> 2) as i16;
            }
        }
        // Last input pixel: right output copies the edge sample.
        let last = in_width - 1;
        let prev = input[in_row + last - 1] as i32;
        let curr = input[in_row + last] as i32;
        let left_out = last * 2;
        let right_out = left_out + 1;
        if left_out < out_width {
            output[out_row + left_out] = ((curr * 3 + prev + 1) >> 2) as i16;
        }
        if right_out < out_width {
            output[out_row + right_out] = curr as i16;
        }
    }
}
/// h1v2 (vertical 2x) upsample with libjpeg-style "fancy" filtering on
/// strided buffers.
///
/// Each output row blends a "near" input row (weight 3) with a "far"
/// neighbouring row (weight 1): even (upper) output rows look one row up,
/// odd (lower) rows one row down, both clamped at the image edges. The
/// rounding bias alternates 1/2 between upper and lower rows.
pub fn upsample_h1v2_i16_libjpeg_strided(
    input: &[i16],
    in_width: usize,
    in_stride: usize,
    in_height: usize,
    output: &mut [i16],
    out_width: usize,
    out_stride: usize,
    out_height: usize,
) {
    if in_width == 0 || in_height == 0 {
        return;
    }
    for out_y in 0..out_height {
        // Nearest source row and its vertical neighbour.
        let in_y = out_y / 2;
        let in_y_clamped = in_y.min(in_height.saturating_sub(1));
        let is_upper = out_y % 2 == 0;
        let out_row = out_y * out_stride;
        let far_y = if is_upper {
            in_y_clamped.saturating_sub(1)
        } else {
            (in_y + 1).min(in_height.saturating_sub(1))
        };
        let near_row = in_y_clamped * in_stride;
        let far_row = far_y * in_stride;
        // Rounding bias: 1 for upper rows, 2 for lower rows.
        let bias = if is_upper { 1i16 } else { 2i16 };
        // The SIMD row kernel computes only `w` leading pixels; any extra
        // output columns are handled by edge replication below.
        let w = out_width.min(in_width);
        #[cfg(target_arch = "x86_64")]
        {
            // AVX2 fast path, taken only when the CPU token is available at
            // runtime.
            if let Some(token) = archmage::X64V3Token::summon() {
                upsample_h1v2_row_avx2(
                    token,
                    &input[near_row..],
                    &input[far_row..],
                    &mut output[out_row..],
                    w,
                    bias,
                );
                // Replicate the last computed pixel across any columns past
                // the input width (the scalar path gets this for free via
                // its horizontal clamp).
                if out_width > in_width && in_width > 0 {
                    let edge = output[out_row + in_width - 1];
                    for x in in_width..out_width {
                        output[out_row + x] = edge;
                    }
                }
                continue;
            }
        }
        // Scalar fallback: (near * 3 + far + bias) >> 2 per pixel, clamping
        // the source column at the right edge.
        for out_x in 0..out_width {
            let in_x = out_x.min(in_width.saturating_sub(1));
            let near = input[near_row + in_x] as i32;
            let far = input[far_row + in_x] as i32;
            output[out_row + out_x] = ((near * 3 + far + bias as i32) >> 2) as i16;
        }
    }
}
/// AVX2 kernel for one h1v2 libjpeg-filtered row:
/// `output[i] = (near[i] * 3 + far[i] + bias) >> 2` for `i < width`.
///
/// NOTE(review): `#[arcane]` is archmage's attribute macro; presumably it
/// enables the target features implied by `X64V3Token` for this function —
/// confirm against the archmage documentation.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn upsample_h1v2_row_avx2(
    _token: archmage::X64V3Token,
    near: &[i16],
    far: &[i16],
    output: &mut [i16],
    width: usize,
    bias: i16,
) {
    use core::arch::x86_64::*;
    let v_three = _mm256_set1_epi16(3);
    let v_bias = _mm256_set1_epi16(bias);
    let mut x = 0;
    // Main loop: 16 i16 lanes per 256-bit iteration.
    while x + 16 <= width {
        let v_near =
            safe_simd::_mm256_loadu_si256(<&[i16; 16]>::try_from(&near[x..x + 16]).unwrap());
        let v_far = safe_simd::_mm256_loadu_si256(<&[i16; 16]>::try_from(&far[x..x + 16]).unwrap());
        // (near * 3 + far + bias) >> 2, matching the scalar formula.
        let v_result = _mm256_srai_epi16(
            _mm256_add_epi16(
                _mm256_add_epi16(_mm256_mullo_epi16(v_near, v_three), v_far),
                v_bias,
            ),
            2,
        );
        safe_simd::_mm256_storeu_si256(
            <&mut [i16; 16]>::try_from(&mut output[x..x + 16]).unwrap(),
            v_result,
        );
        x += 16;
    }
    // Scalar tail for the remaining (width % 16) pixels.
    for x in x..width {
        let n = near[x] as i32;
        let f = far[x] as i32;
        output[x] = ((n * 3 + f + bias as i32) >> 2) as i16;
    }
}
pub fn upsample_h2v2_i16_libjpeg(
input: &[i16],
in_width: usize,
in_height: usize,
output: &mut [i16],
out_width: usize,
out_height: usize,
) {
if in_width == 0 || in_height == 0 || out_width == 0 || out_height == 0 {
return;
}
for out_y in 0..out_height {
let in_y = out_y / 2;
let in_y_clamped = in_y.min(in_height.saturating_sub(1));
let is_upper = out_y % 2 == 0;
let far_y = if is_upper {
in_y_clamped.saturating_sub(1)
} else {
(in_y + 1).min(in_height.saturating_sub(1))
};
let near_row = in_y_clamped * in_width;
let far_row = far_y * in_width;
let out_row = out_y * out_width;
upsample_h2v2_libjpeg_row(
&input[near_row..near_row + in_width],
&input[far_row..far_row + in_width],
&mut output[out_row..],
in_width,
out_width,
is_upper,
);
}
}
/// Compute one h2v2 libjpeg-filtered output row, dispatching to the AVX2
/// kernel when the required CPU features are present at runtime and to the
/// scalar implementation otherwise.
#[inline]
pub(super) fn upsample_h2v2_libjpeg_row(
    near: &[i16],
    far: &[i16],
    output: &mut [i16],
    in_width: usize,
    out_width: usize,
    is_upper: bool,
) {
    #[cfg(target_arch = "x86_64")]
    {
        // Fast path only when the V3 (AVX2-capable) token can be summoned.
        if let Some(token) = archmage::X64V3Token::summon() {
            return upsample_h2v2_libjpeg_row_avx2(
                token, near, far, output, in_width, out_width, is_upper,
            );
        }
    }
    upsample_h2v2_libjpeg_row_scalar(near, far, output, in_width, out_width, is_upper)
}
/// Scalar h2v2 libjpeg-filtered row: every output pixel mixes a weighted
/// column sum (3x near row + 1x far row) of the nearest input column with
/// that of its horizontal neighbour, using 3:1 weights and a /16 rounding
/// shift. Edge columns reuse their own column sum in place of the missing
/// neighbour.
#[inline]
fn upsample_h2v2_libjpeg_row_scalar(
    near: &[i16],
    far: &[i16],
    output: &mut [i16],
    in_width: usize,
    out_width: usize,
    is_upper: bool,
) {
    // Vertically pre-mixed column sum for input column `x`.
    let colsum = |x: usize| near[x] as i32 * 3 + far[x] as i32;
    if in_width == 1 {
        // Single input column: both outputs get the plain vertical average.
        let v = ((colsum(0) * 4 + 8) >> 4) as i16;
        if out_width > 0 {
            output[0] = v;
        }
        if out_width > 1 {
            output[1] = v;
        }
        return;
    }
    // Rounding biases swap between upper and lower output rows.
    let (wa, wb) = if is_upper { (8i32, 7i32) } else { (7i32, 8i32) };
    let mut west = colsum(0);
    output[0] = ((west * 4 + 8) >> 4) as i16;
    if out_width > 1 {
        output[1] = ((west * 3 + colsum(1) + wb) >> 4) as i16;
    }
    // Interior columns: each yields a left- and a right-leaning output.
    for sx in 1..in_width - 1 {
        let center = colsum(sx);
        let lo = sx * 2;
        if lo < out_width {
            output[lo] = ((center * 3 + west + wa) >> 4) as i16;
        }
        if lo + 1 < out_width {
            output[lo + 1] = ((center * 3 + colsum(sx + 1) + wb) >> 4) as i16;
        }
        west = center;
    }
    // Final column: the right output has no neighbour, so it reuses its own
    // column sum (weight 4).
    let edge = colsum(in_width - 1);
    let lo = (in_width - 1) * 2;
    if lo < out_width {
        output[lo] = ((edge * 3 + west + wa) >> 4) as i16;
    }
    if lo + 1 < out_width {
        output[lo + 1] = ((edge * 4 + wb) >> 4) as i16;
    }
}
/// AVX2 kernel for one h2v2 libjpeg-filtered row, bit-exact with
/// `upsample_h2v2_libjpeg_row_scalar`.
///
/// NOTE(review): `#[arcane]` is archmage's attribute macro; presumably it
/// enables the target features implied by `X64V3Token` — confirm against the
/// archmage documentation.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn upsample_h2v2_libjpeg_row_avx2(
    _token: archmage::X64V3Token,
    near: &[i16],
    far: &[i16],
    output: &mut [i16],
    in_width: usize,
    out_width: usize,
    is_upper: bool,
) {
    use core::arch::x86_64::*;
    // Narrow rows can't feed a full SIMD iteration (each chunk loads the
    // x-1..x+17 window); hand them to the scalar version.
    if in_width < 18 {
        upsample_h2v2_libjpeg_row_scalar(near, far, output, in_width, out_width, is_upper);
        return;
    }
    // Rounding biases swap between upper and lower output rows.
    let (bias_left, bias_right) = if is_upper { (8i16, 7i16) } else { (7i16, 8i16) };
    let v_three = _mm256_set1_epi16(3);
    let v_bias_left = _mm256_set1_epi16(bias_left);
    let v_bias_right = _mm256_set1_epi16(bias_right);
    // First output pair is special-cased in scalar code (no left neighbour).
    let colsum_0 = near[0] as i32 * 3 + far[0] as i32;
    let colsum_1 = near[1] as i32 * 3 + far[1] as i32;
    output[0] = ((colsum_0 * 4 + 8) >> 4) as i16;
    if out_width > 1 {
        output[1] = ((colsum_0 * 3 + colsum_1 + bias_right as i32) >> 4) as i16;
    }
    // SIMD coverage: full 16-column chunks starting at column 1, capped so
    // the x+1..x+17 load stays in bounds (hence in_width - 17).
    let simd_start = 1usize;
    let simd_end_exclusive = if in_width >= 17 {
        let max_start = in_width - 17;
        let num_chunks = (max_start - simd_start + 16) / 16;
        simd_start + num_chunks * 16
    } else {
        simd_start
    };
    let mut x = simd_start;
    while x + 16 <= simd_end_exclusive {
        let out_base = x * 2;
        // Each chunk stores 32 output pixels; stop if they would overflow
        // the requested output width and finish in the scalar tail.
        if out_base + 32 > out_width {
            break;
        }
        // Column sums for the current window and its left/right neighbours.
        let v_near =
            safe_simd::_mm256_loadu_si256(<&[i16; 16]>::try_from(&near[x..x + 16]).unwrap());
        let v_far = safe_simd::_mm256_loadu_si256(<&[i16; 16]>::try_from(&far[x..x + 16]).unwrap());
        let v_colsum = _mm256_add_epi16(_mm256_mullo_epi16(v_near, v_three), v_far);
        let v_near_prev =
            safe_simd::_mm256_loadu_si256(<&[i16; 16]>::try_from(&near[x - 1..x + 15]).unwrap());
        let v_far_prev =
            safe_simd::_mm256_loadu_si256(<&[i16; 16]>::try_from(&far[x - 1..x + 15]).unwrap());
        let v_colsum_prev = _mm256_add_epi16(_mm256_mullo_epi16(v_near_prev, v_three), v_far_prev);
        let v_near_next =
            safe_simd::_mm256_loadu_si256(<&[i16; 16]>::try_from(&near[x + 1..x + 17]).unwrap());
        let v_far_next =
            safe_simd::_mm256_loadu_si256(<&[i16; 16]>::try_from(&far[x + 1..x + 17]).unwrap());
        let v_colsum_next = _mm256_add_epi16(_mm256_mullo_epi16(v_near_next, v_three), v_far_next);
        // Left/right outputs: (3 * this + neighbour + bias) >> 4.
        let v_colsum3 = _mm256_mullo_epi16(v_colsum, v_three);
        let v_left = _mm256_srai_epi16(
            _mm256_add_epi16(_mm256_add_epi16(v_colsum3, v_colsum_prev), v_bias_left),
            4,
        );
        let v_right = _mm256_srai_epi16(
            _mm256_add_epi16(_mm256_add_epi16(v_colsum3, v_colsum_next), v_bias_right),
            4,
        );
        // Interleave left/right pairs. unpack works per 128-bit lane, so a
        // cross-lane permute restores sequential order before storing.
        let lo = _mm256_unpacklo_epi16(v_left, v_right);
        let hi = _mm256_unpackhi_epi16(v_left, v_right);
        let out0 = _mm256_permute2x128_si256(lo, hi, 0x20);
        let out1 = _mm256_permute2x128_si256(lo, hi, 0x31);
        safe_simd::_mm256_storeu_si256(
            <&mut [i16; 16]>::try_from(&mut output[out_base..out_base + 16]).unwrap(),
            out0,
        );
        safe_simd::_mm256_storeu_si256(
            <&mut [i16; 16]>::try_from(&mut output[out_base + 16..out_base + 32]).unwrap(),
            out1,
        );
        x += 16;
    }
    // Scalar tail for interior columns the SIMD loop didn't cover.
    let mut last_colsum_i32 = if x > 1 {
        near[x - 1] as i32 * 3 + far[x - 1] as i32
    } else {
        colsum_0
    };
    for in_x in x..in_width.saturating_sub(1) {
        let this_colsum = near[in_x] as i32 * 3 + far[in_x] as i32;
        let next_colsum = near[in_x + 1] as i32 * 3 + far[in_x + 1] as i32;
        let left_out = in_x * 2;
        let right_out = left_out + 1;
        if left_out < out_width {
            output[left_out] = ((this_colsum * 3 + last_colsum_i32 + bias_left as i32) >> 4) as i16;
        }
        if right_out < out_width {
            output[right_out] = ((this_colsum * 3 + next_colsum + bias_right as i32) >> 4) as i16;
        }
        last_colsum_i32 = this_colsum;
    }
    // Final column: the right output reuses its own column sum (weight 4).
    let last = in_width - 1;
    if last >= x || x == simd_start {
        let this_colsum = near[last] as i32 * 3 + far[last] as i32;
        let prev_colsum = if last > 0 {
            near[last - 1] as i32 * 3 + far[last - 1] as i32
        } else {
            this_colsum
        };
        let left_out = last * 2;
        let right_out = left_out + 1;
        if left_out < out_width {
            output[left_out] = ((this_colsum * 3 + prev_colsum + bias_left as i32) >> 4) as i16;
        }
        if right_out < out_width {
            output[right_out] = ((this_colsum * 4 + bias_right as i32) >> 4) as i16;
        }
    }
}
/// h2v2 (2x in both axes) upsample with libjpeg-style "fancy" filtering on
/// strided buffers.
///
/// For each output row, picks the nearest input row plus a vertical
/// neighbour (one row up for even output rows, one down for odd, clamped at
/// the edges) and hands the pair to [`upsample_h2v2_libjpeg_row`].
pub fn upsample_h2v2_i16_libjpeg_strided(
    input: &[i16],
    in_width: usize,
    in_stride: usize,
    in_height: usize,
    output: &mut [i16],
    out_width: usize,
    out_stride: usize,
    out_height: usize,
) {
    if in_width == 0 || in_height == 0 || out_width == 0 || out_height == 0 {
        return;
    }
    let last_row = in_height.saturating_sub(1);
    for dst_y in 0..out_height {
        let src_y = (dst_y / 2).min(last_row);
        let upper = dst_y % 2 == 0;
        let adj_y = if upper {
            src_y.saturating_sub(1)
        } else {
            (dst_y / 2 + 1).min(last_row)
        };
        let near = src_y * in_stride;
        let far = adj_y * in_stride;
        upsample_h2v2_libjpeg_row(
            &input[near..near + in_width],
            &input[far..far + in_width],
            &mut output[dst_y * out_stride..],
            in_width,
            out_width,
            upper,
        );
    }
}
/// Nearest-neighbor upsample of packed `f32` samples by integer factors
/// `scale_x`/`scale_y`; source coordinates are clamped to the image edges.
pub fn upsample_nearest_f32(
    input: &[f32],
    in_width: usize,
    in_height: usize,
    output: &mut [f32],
    out_width: usize,
    out_height: usize,
    scale_x: usize,
    scale_y: usize,
) {
    let last_x = in_width.saturating_sub(1);
    let last_y = in_height.saturating_sub(1);
    for dst_y in 0..out_height {
        let src_off = (dst_y / scale_y).min(last_y) * in_width;
        let dst_off = dst_y * out_width;
        for dst_x in 0..out_width {
            output[dst_off + dst_x] = input[src_off + (dst_x / scale_x).min(last_x)];
        }
    }
}
/// Upsample packed `f32` samples, using libjpeg-style "fancy" filtering for
/// the common JPEG chroma factors (2x2, 2x1, 1x2), a clamped copy for 1x1,
/// and nearest-neighbor for anything else. Returns a freshly allocated
/// `out_width * out_height` buffer.
pub fn upsample_libjpeg_f32(
    input: &[f32],
    in_width: usize,
    in_height: usize,
    out_width: usize,
    out_height: usize,
    scale_x: usize,
    scale_y: usize,
) -> Vec<f32> {
    let mut output = vec![0.0f32; out_width * out_height];
    match (scale_x, scale_y) {
        (2, 2) => {
            upsample_h2v2_f32_libjpeg(input, in_width, in_height, &mut output, out_width, out_height)
        }
        (2, 1) => {
            upsample_h2v1_f32_libjpeg(input, in_width, in_height, &mut output, out_width, out_height)
        }
        (1, 2) => {
            upsample_h1v2_f32_libjpeg(input, in_width, in_height, &mut output, out_width, out_height)
        }
        (1, 1) => {
            // No scaling: plain copy with edge clamping.
            let last_x = in_width.saturating_sub(1);
            let last_y = in_height.saturating_sub(1);
            for y in 0..out_height {
                let src = y.min(last_y) * in_width;
                let dst = y * out_width;
                for x in 0..out_width {
                    output[dst + x] = input[src + x.min(last_x)];
                }
            }
        }
        _ => upsample_nearest_f32(
            input, in_width, in_height, &mut output, out_width, out_height, scale_x, scale_y,
        ),
    }
    output
}
/// h2v1 (horizontal 2x) upsample of `f32` samples with libjpeg-style "fancy"
/// filtering: interior outputs are `0.75 * nearest + 0.25 * neighbour`,
/// edge outputs copy the edge samples. Rows map 1:1 vertically (clamped).
fn upsample_h2v1_f32_libjpeg(
    input: &[f32],
    in_width: usize,
    in_height: usize,
    output: &mut [f32],
    out_width: usize,
    out_height: usize,
) {
    // Degenerate sizes: nothing to read or nothing to write. Guarding
    // `out_width == 0` stops the unconditional first-column write below from
    // touching (or panicking on) an empty output row; the h2v2 variant
    // already guards out_width the same way.
    if in_width == 0 || in_height == 0 || out_width == 0 {
        return;
    }
    for out_y in 0..out_height {
        let in_y = out_y.min(in_height - 1);
        let out_row = out_y * out_width;
        let in_row = in_y * in_width;
        // Single input column: replicate it into (up to) two outputs.
        if in_width == 1 {
            let val = input[in_row];
            output[out_row] = val;
            if out_width > 1 {
                output[out_row + 1] = val;
            }
            continue;
        }
        // First pixel: copy on the left, blend toward `next` on the right.
        let curr = input[in_row];
        let next = input[in_row + 1];
        output[out_row] = curr;
        if out_width > 1 {
            output[out_row + 1] = curr * 0.75 + next * 0.25;
        }
        // Interior pixels each produce a left and a right output.
        for in_x in 1..in_width - 1 {
            let prev = input[in_row + in_x - 1];
            let curr = input[in_row + in_x];
            let next = input[in_row + in_x + 1];
            let left_out = in_x * 2;
            let right_out = left_out + 1;
            if left_out < out_width {
                output[out_row + left_out] = curr * 0.75 + prev * 0.25;
            }
            if right_out < out_width {
                output[out_row + right_out] = curr * 0.75 + next * 0.25;
            }
        }
        // Last pixel: blend on the left, copy the edge on the right.
        let last = in_width - 1;
        let prev = input[in_row + last - 1];
        let curr = input[in_row + last];
        let left_out = last * 2;
        let right_out = left_out + 1;
        if left_out < out_width {
            output[out_row + left_out] = curr * 0.75 + prev * 0.25;
        }
        if right_out < out_width {
            output[out_row + right_out] = curr;
        }
    }
}
/// h1v2 (vertical 2x) upsample of `f32` samples with libjpeg-style "fancy"
/// filtering: each output row is `0.75 * nearest row + 0.25 * neighbour
/// row`, with the neighbour one row up for even outputs and one row down
/// for odd outputs, clamped at the edges.
fn upsample_h1v2_f32_libjpeg(
    input: &[f32],
    in_width: usize,
    in_height: usize,
    output: &mut [f32],
    out_width: usize,
    out_height: usize,
) {
    if in_width == 0 || in_height == 0 {
        return;
    }
    let last_x = in_width.saturating_sub(1);
    let last_y = in_height.saturating_sub(1);
    for dst_y in 0..out_height {
        let src_y = (dst_y / 2).min(last_y);
        // Vertical neighbour: up for even (upper) rows, down for odd rows.
        let adj_y = if dst_y % 2 == 0 {
            src_y.saturating_sub(1)
        } else {
            (dst_y / 2 + 1).min(last_y)
        };
        let near_base = src_y * in_width;
        let far_base = adj_y * in_width;
        let dst_base = dst_y * out_width;
        for dst_x in 0..out_width {
            let sx = dst_x.min(last_x);
            output[dst_base + dst_x] = input[near_base + sx] * 0.75 + input[far_base + sx] * 0.25;
        }
    }
}
/// h2v2 (2x in both axes) upsample of `f32` samples with libjpeg-style
/// "fancy" filtering: vertical 3:1 column sums are mixed 3:1 horizontally
/// and normalized by 1/16 (edge columns reuse their own sum).
fn upsample_h2v2_f32_libjpeg(
    input: &[f32],
    in_width: usize,
    in_height: usize,
    output: &mut [f32],
    out_width: usize,
    out_height: usize,
) {
    if in_width == 0 || in_height == 0 || out_width == 0 || out_height == 0 {
        return;
    }
    let last_x = in_width - 1;
    let last_y = in_height.saturating_sub(1);
    for dst_y in 0..out_height {
        let src_y = (dst_y / 2).min(last_y);
        // Vertical neighbour: up for even (upper) rows, down for odd rows.
        let adj_y = if dst_y % 2 == 0 {
            src_y.saturating_sub(1)
        } else {
            (dst_y / 2 + 1).min(last_y)
        };
        let near = &input[src_y * in_width..];
        let far = &input[adj_y * in_width..];
        let row = &mut output[dst_y * out_width..];
        // Vertically pre-mixed column sum for input column `x`.
        let colsum = |x: usize| near[x] * 3.0 + far[x];
        if in_width == 1 {
            // Single input column: both outputs get the vertical average.
            let v = colsum(0) * 0.25;
            row[0] = v;
            if out_width > 1 {
                row[1] = v;
            }
            continue;
        }
        let mut left_cs = colsum(0);
        row[0] = left_cs * 0.25;
        if out_width > 1 {
            row[1] = (left_cs * 3.0 + colsum(1)) / 16.0;
        }
        // Interior columns: each yields a left- and a right-leaning output.
        for sx in 1..in_width - 1 {
            let center = colsum(sx);
            let lo = sx * 2;
            if lo < out_width {
                row[lo] = (center * 3.0 + left_cs) / 16.0;
            }
            if lo + 1 < out_width {
                row[lo + 1] = (center * 3.0 + colsum(sx + 1)) / 16.0;
            }
            left_cs = center;
        }
        // Final column: the right output reuses its own column sum.
        let tail = colsum(last_x);
        let lo = last_x * 2;
        if lo < out_width {
            row[lo] = (tail * 3.0 + left_cs) / 16.0;
        }
        if lo + 1 < out_width {
            row[lo + 1] = tail * 0.25;
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Constant input must stay constant under nearest upsampling.
    #[test]
    fn h2v2_i16_nearest_basic() {
        let input: Vec<i16> = vec![750; 4 * 4];
        let mut output = vec![0i16; 8 * 8];
        upsample_h2v2_i16_nearest(&input, 4, 4, &mut output, 8, 8);
        for &v in &output {
            assert_eq!(v, 750);
        }
    }

    #[test]
    fn h2v1_i16_nearest_basic() {
        let input: Vec<i16> = vec![300; 4];
        let mut output = vec![0i16; 8];
        upsample_h2v1_i16_nearest(&input, 4, 1, &mut output, 8, 1);
        for &v in &output {
            assert_eq!(v, 300);
        }
    }

    #[test]
    fn h1v2_i16_nearest_basic() {
        let input: Vec<i16> = vec![200; 4 * 4];
        let mut output = vec![0i16; 4 * 8];
        upsample_h1v2_i16_nearest(&input, 4, 4, &mut output, 4, 8);
        for &v in &output {
            assert_eq!(v, 200);
        }
    }

    // Fancy filtering of a flat row should reproduce the flat value (allow
    // +/-1 for the integer rounding biases).
    #[test]
    fn h2v1_i16_libjpeg_basic() {
        let input: Vec<i16> = vec![400; 8];
        let mut output = vec![0i16; 16];
        upsample_h2v1_i16_libjpeg(&input, 8, 1, &mut output, 16, 1);
        for (i, &v) in output.iter().enumerate() {
            assert!((v - 400).abs() <= 1, "libjpeg h2v1 pixel {i}: {v} != ~400");
        }
    }

    // Deterministic pseudo-gradient fixture in roughly [-250, 250).
    fn gradient_test_data(width: usize, height: usize) -> Vec<i16> {
        (0..width * height)
            .map(|i| {
                let x = i % width;
                let y = i / width;
                ((x as i32 * 37 + y as i32 * 53) % 500 - 250) as i16
            })
            .collect()
    }

    // High-contrast 4x4 checkerboard fixture (+/-2000) to stress rounding.
    fn extreme_test_data(width: usize, height: usize) -> Vec<i16> {
        (0..width * height)
            .map(|i| {
                let x = i % width;
                let y = i / width;
                if (x / 4 + y / 4) % 2 == 0 {
                    2000
                } else {
                    -2000
                }
            })
            .collect()
    }

    // Verify the dispatched row kernel (scalar or AVX2, per token
    // permutation) matches the scalar reference bit-for-bit.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn h2v2_libjpeg_row_dispatch_parity() {
        use archmage::testing::{CompileTimePolicy, for_each_token_permutation};
        let widths: &[usize] = &[2, 4, 8, 15, 16, 17, 18, 32, 33, 64, 128, 255, 256, 512];
        for &in_width in widths {
            let out_width = in_width * 2;
            for (label, near_data, far_data) in [
                (
                    "gradient",
                    (0..in_width)
                        .map(|x| (x as i32 * 37 % 500 - 250) as i16)
                        .collect::<Vec<_>>(),
                    (0..in_width)
                        .map(|x| (x as i32 * 53 % 500 - 250) as i16)
                        .collect::<Vec<_>>(),
                ),
                (
                    "extreme",
                    (0..in_width)
                        .map(|x| if x % 2 == 0 { 2000i16 } else { -2000 })
                        .collect(),
                    (0..in_width)
                        .map(|x| if x % 2 == 0 { -2000i16 } else { 2000 })
                        .collect(),
                ),
                ("constant", vec![1000i16; in_width], vec![500i16; in_width]),
            ] {
                for is_upper in [true, false] {
                    let mut reference = vec![0i16; out_width];
                    upsample_h2v2_libjpeg_row_scalar(
                        &near_data,
                        &far_data,
                        &mut reference,
                        in_width,
                        out_width,
                        is_upper,
                    );
                    let report = for_each_token_permutation(CompileTimePolicy::Warn, |perm| {
                        let mut result = vec![0i16; out_width];
                        upsample_h2v2_libjpeg_row(
                            &near_data,
                            &far_data,
                            &mut result,
                            in_width,
                            out_width,
                            is_upper,
                        );
                        assert_eq!(
                            result, reference,
                            "h2v2_libjpeg_row mismatch: {label} width={in_width} \
is_upper={is_upper} at {perm}"
                        );
                    });
                    // Spot-check (once) that the harness actually exercised
                    // more than one token permutation.
                    if label == "gradient" && in_width == 32 && is_upper {
                        eprintln!("h2v2_libjpeg_row dispatch: {report}");
                        assert!(
                            report.permutations_run >= 2,
                            "expected at least 2 permutations"
                        );
                    }
                }
            }
        }
    }

    // Same parity check for the full-image entry point: a scalar
    // row-by-row reference vs. the dispatched implementation.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn h2v2_libjpeg_full_dispatch_parity() {
        use archmage::testing::{CompileTimePolicy, for_each_token_permutation};
        let sizes: &[(usize, usize)] = &[
            (4, 4),
            (8, 8),
            (16, 16),
            (17, 9),
            (32, 32),
            (64, 16),
            (33, 33),
            (128, 64),
        ];
        for &(in_w, in_h) in sizes {
            let out_w = in_w * 2;
            let out_h = in_h * 2;
            for (label, input) in [
                ("gradient", gradient_test_data(in_w, in_h)),
                ("extreme", extreme_test_data(in_w, in_h)),
                ("constant", vec![1000i16; in_w * in_h]),
            ] {
                // Build the reference with the same row-selection logic as
                // the implementation, but always through the scalar kernel.
                let mut reference = vec![0i16; out_w * out_h];
                for out_y in 0..out_h {
                    let in_y = out_y / 2;
                    let in_y_clamped = in_y.min(in_h.saturating_sub(1));
                    let is_upper = out_y % 2 == 0;
                    let far_y = if is_upper {
                        in_y_clamped.saturating_sub(1)
                    } else {
                        (in_y + 1).min(in_h.saturating_sub(1))
                    };
                    let near_row = in_y_clamped * in_w;
                    let far_row = far_y * in_w;
                    let out_row = out_y * out_w;
                    upsample_h2v2_libjpeg_row_scalar(
                        &input[near_row..near_row + in_w],
                        &input[far_row..far_row + in_w],
                        &mut reference[out_row..],
                        in_w,
                        out_w,
                        is_upper,
                    );
                }
                let report = for_each_token_permutation(CompileTimePolicy::Warn, |perm| {
                    let mut result = vec![0i16; out_w * out_h];
                    upsample_h2v2_i16_libjpeg(&input, in_w, in_h, &mut result, out_w, out_h);
                    assert_eq!(
                        result, reference,
                        "h2v2_libjpeg_full mismatch: {label} {in_w}x{in_h} at {perm}"
                    );
                });
                if label == "gradient" && in_w == 32 {
                    eprintln!("h2v2_libjpeg_full dispatch: {report}");
                    assert!(
                        report.permutations_run >= 2,
                        "expected at least 2 permutations"
                    );
                }
            }
        }
    }
}