zoomvtools 1.1.0

#![allow(clippy::unwrap_used, reason = "allow in test files")]
#![allow(clippy::undocumented_unsafe_blocks, reason = "allow in test files")]
#![allow(clippy::indexing_slicing, reason = "allow in test files")]

use std::num::NonZeroUsize;

use pastey::paste;

macro_rules! create_tests {
    ($module:ident) => {
        paste! {
            #[test]
            fn [<test_reduce_cubic_u8_2x2_ $module>]() {
                // Test basic 2x2 -> 1x1 reduction
                let src = vec![
                    10u8, 20, // first row
                    30, 40, // second row
                ];
                // Destination buffer needs to accommodate intermediate width of 2
                // (dest_width*2)
                let mut dest = vec![0u8; 2];
                let src_pitch = NonZeroUsize::new(2).unwrap();
                let dest_pitch = NonZeroUsize::new(2).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(1).unwrap();
                let dest_height = NonZeroUsize::new(1).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Cubic filter is separable:
                // 1. Vertical: For single line (height=1), uses simple averaging: (10 +
                //    30).div_ceil(2) = 20, (20 + 40).div_ceil(2) = 30 So intermediate = [20,
                //    30]
                // 2. Horizontal: For single pixel (width=1), uses simple averaging: (20 +
                //    30).div_ceil(2) = 25
                assert_eq!(dest[0], 25);
            }

            #[test]
            fn [<test_reduce_cubic_u8_4x2_ $module>]() {
                // Test 4x2 -> 2x1 reduction
                let src = vec![
                    10u8, 20, 30, 40, // first row
                    50, 60, 70, 80, // second row
                ];
                // Destination buffer needs to accommodate intermediate width of 4
                // (dest_width*2)
                let mut dest = vec![0u8; 4];
                let src_pitch = NonZeroUsize::new(4).unwrap();
                let dest_pitch = NonZeroUsize::new(4).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(2).unwrap();
                let dest_height = NonZeroUsize::new(1).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Cubic filter is separable:
                // 1. Vertical: For single height, uses simple averaging: [0]: (10 +
                //    50).div_ceil(2) = 30, [1]: (20 + 60).div_ceil(2) = 40 [2]: (30 +
                //    70).div_ceil(2) = 50, [3]: (40 + 80).div_ceil(2) = 60 So intermediate =
                //    [30, 40, 50, 60]
                // 2. Horizontal: For width=2, edge cases use simple averaging: dest[0]: (30 +
                //    40).div_ceil(2) = 35 (start of line) dest[1]: (50 + 60).div_ceil(2) = 55
                //    (end of line)
                assert_eq!(dest[0], 35);
                assert_eq!(dest[1], 55);
            }

            #[test]
            fn [<test_reduce_cubic_u8_4x4_ $module>]() {
                // Test 4x4 -> 2x2 reduction
                let src = vec![
                    10u8, 20, 30, 40, // first row
                    50, 60, 70, 80, // second row
                    90, 100, 110, 120, // third row
                    130, 140, 150, 160, // fourth row
                ];
                // Destination buffer needs to accommodate intermediate width of 4
                // (dest_width*2) and height of 2
                let mut dest = vec![0u8; 8]; // 4 width * 2 height
                let src_pitch = NonZeroUsize::new(4).unwrap();
                let dest_pitch = NonZeroUsize::new(4).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(2).unwrap();
                let dest_height = NonZeroUsize::new(2).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Cubic filter is separable with edge case handling:
                // 1. Vertical: Row 0 uses simple averaging, Row 1 uses edge case averaging Row
                //    0: [0]: (10 + 50).div_ceil(2) = 30, [1]: (20 + 60).div_ceil(2) = 40 [2]:
                //    (30 + 70).div_ceil(2) = 50, [3]: (40 + 80).div_ceil(2) = 60 Row 1: [0]:
                //    (90 + 130).div_ceil(2) = 110, [1]: (100 + 140).div_ceil(2) = 120 [2]: (110
                //    + 150).div_ceil(2) = 130, [3]: (120 + 160).div_ceil(2) = 140
                // 2. Horizontal: Uses edge case averaging for width=2 Row 0: dest[0]: (30 +
                //    40).div_ceil(2) = 35, dest[1]: (50 + 60).div_ceil(2) = 55 Row 1: dest[4]:
                //    (110 + 120).div_ceil(2) = 115, dest[5]: (130 + 140).div_ceil(2) = 135
                assert_eq!(dest[0], 35); // Top-left
                assert_eq!(dest[1], 55); // Top-right
                assert_eq!(dest[4], 115); // Bottom-left
                assert_eq!(dest[5], 135); // Bottom-right
            }

            #[test]
            fn [<test_reduce_cubic_u8_6x4_ $module>]() {
                // Test 6x4 -> 3x2 reduction with more moderate filtering
                let src = vec![
                    10u8, 20, 30, 40, 50, 60, // first row
                    70, 80, 90, 100, 110, 120, // second row
                    130, 140, 150, 160, 170, 180, // third row
                    190, 200, 210, 220, 230, 240, // fourth row
                ];
                // Destination buffer needs to accommodate intermediate width of 6
                // (dest_width*2) and height of 2
                let mut dest = vec![0u8; 12]; // 6 width * 2 height
                let src_pitch = NonZeroUsize::new(6).unwrap();
                let dest_pitch = NonZeroUsize::new(6).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(3).unwrap();
                let dest_height = NonZeroUsize::new(2).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Cubic filter should produce reasonable smoothed values
                // We verify that values are reasonable and follow expected trends
                assert!(dest[0] > 10 && dest[0] < 200); // Should be smoothed values
                assert!(dest[1] > 10 && dest[1] < 200);
                assert!(dest[2] > 10 && dest[2] < 200);
                assert!(dest[6] > 10 && dest[6] < 240); // Second row
                assert!(dest[7] > 10 && dest[7] < 240);
                assert!(dest[8] > 10 && dest[8] < 240);
            }

            #[test]
            fn [<test_reduce_cubic_gradient_ $module>]() {
                // Test with a simple gradient pattern 4x4 -> 2x2
                let src = vec![
                    0u8, 25, 50, 75, // first row
                    50, 75, 100, 125, // second row
                    100, 125, 150, 175, // third row
                    150, 175, 200, 225, // fourth row
                ];
                // Destination buffer needs to accommodate intermediate width of 4
                // (dest_width*2) and height of 2
                let mut dest = vec![0u8; 8]; // 4 width * 2 height
                let src_pitch = NonZeroUsize::new(4).unwrap();
                let dest_pitch = NonZeroUsize::new(4).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(2).unwrap();
                let dest_height = NonZeroUsize::new(2).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Verify gradient property is preserved
                assert!(dest[0] < dest[1]); // Left to right increase
                assert!(dest[0] < dest[4]); // Top to bottom increase
                assert!(dest[1] < dest[5]); // Diagonal increase
                assert!(dest[4] < dest[5]); // Left to right in bottom row

                // Values should be reasonable
                assert!(dest[0] > 0 && dest[0] < 100);
                assert!(dest[5] > dest[0]); // Bottom-right should be largest
            }

            #[test]
            fn [<test_reduce_cubic_u16_basic_ $module>]() {
                // Test with u16 values
                let src = vec![
                    1000u16, 2000, // first row
                    3000, 4000, // second row
                ];
                // Destination buffer needs to accommodate intermediate width of 2
                // (dest_width*2)
                let mut dest = vec![0u16; 2];
                let src_pitch = NonZeroUsize::new(2).unwrap();
                let dest_pitch = NonZeroUsize::new(2).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(1).unwrap();
                let dest_height = NonZeroUsize::new(1).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Cubic filter with edge case handling:
                // Vertical: (1000 + 3000).div_ceil(2) = 2000, (2000 + 4000).div_ceil(2) = 3000
                // Horizontal: (2000 + 3000).div_ceil(2) = 2500
                assert_eq!(dest[0], 2500);
            }

            #[test]
            fn [<test_reduce_cubic_u16_large_values_ $module>]() {
                // Test with larger u16 values near the upper range
                let src = vec![
                    60000u16, 61000, // first row
                    62000, 63000, // second row
                ];
                // Destination buffer needs to accommodate intermediate width of 2
                // (dest_width*2)
                let mut dest = vec![0u16; 2];
                let src_pitch = NonZeroUsize::new(2).unwrap();
                let dest_pitch = NonZeroUsize::new(2).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(1).unwrap();
                let dest_height = NonZeroUsize::new(1).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Should handle large values without overflow
                // Vertical: (60000 + 62000).div_ceil(2) = 61000, (61000 + 63000).div_ceil(2) =
                // 62000 Horizontal: (61000 + 62000).div_ceil(2) = 61500
                assert_eq!(dest[0], 61500);
            }

            #[test]
            fn [<test_reduce_cubic_u16_4x4_ $module>]() {
                // Test with u16 values in a 4x4 configuration
                let src = vec![
                    1000u16, 2000, 3000, 4000, // first row
                    5000, 6000, 7000, 8000, // second row
                    9000, 10000, 11000, 12000, // third row
                    13000, 14000, 15000, 16000, // fourth row
                ];
                // Destination buffer needs to accommodate intermediate width of 4
                // (dest_width*2) and height of 2
                let mut dest = vec![0u16; 8]; // 4 width * 2 height
                let src_pitch = NonZeroUsize::new(4).unwrap();
                let dest_pitch = NonZeroUsize::new(4).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(2).unwrap();
                let dest_height = NonZeroUsize::new(2).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Verify reasonable values and ordering
                assert!(dest[0] > 1000 && dest[0] < 8000); // Top-left
                assert!(dest[1] > 2000 && dest[1] < 10000); // Top-right
                assert!(dest[4] > 5000 && dest[4] < 14000); // Bottom-left
                assert!(dest[5] > 6000 && dest[5] < 16000); // Bottom-right

                // Values should increase from left to right and top to bottom
                assert!(dest[0] < dest[1]);
                assert!(dest[0] < dest[4]);
                assert!(dest[1] < dest[5]);
                assert!(dest[4] < dest[5]);
            }

            #[test]
            fn [<test_reduce_cubic_with_padding_ $module>]() {
                // Test with source pitch > width (includes padding)
                let src = vec![
                    10u8, 20, 255, 255, // first row (last 2 are padding)
                    30, 40, 255, 255, // second row (last 2 are padding)
                ];
                // Destination buffer needs to accommodate intermediate width of 2
                // (dest_width*2)
                let mut dest = vec![0u8; 4]; // 2 intermediate width, plus padding for dest_pitch=4
                let src_pitch = NonZeroUsize::new(4).unwrap();
                let dest_pitch = NonZeroUsize::new(4).unwrap(); // Must accommodate intermediate width + padding
                let dest_width = NonZeroUsize::new(1).unwrap();
                let dest_height = NonZeroUsize::new(1).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Should only process the first 2x2 block, ignoring padding
                // Vertical: (10 + 30).div_ceil(2) = 20, (20 + 40).div_ceil(2) = 30
                // Horizontal: (20 + 30).div_ceil(2) = 25
                assert_eq!(dest[0], 25);
            }

            #[test]
            fn [<test_reduce_cubic_uniform_values_ $module>]() {
                // Test with uniform values to ensure filter preserves them
                let src = vec![
                    100u8, 100, 100, 100, // first row
                    100, 100, 100, 100, // second row
                    100, 100, 100, 100, // third row
                    100, 100, 100, 100, // fourth row
                ];
                // Destination buffer needs to accommodate intermediate width of 4
                // (dest_width*2) and height of 2
                let mut dest = vec![0u8; 8]; // 4 width * 2 height
                let src_pitch = NonZeroUsize::new(4).unwrap();
                let dest_pitch = NonZeroUsize::new(4).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(2).unwrap();
                let dest_height = NonZeroUsize::new(2).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Uniform input should produce uniform output
                assert_eq!(dest[0], 100);
                assert_eq!(dest[1], 100);
                assert_eq!(dest[4], 100);
                assert_eq!(dest[5], 100);
            }

            #[test]
            fn [<test_reduce_cubic_edge_case_single_pixel_ $module>]() {
                // Test edge case with 2x2 -> 1x1 reduction (minimal case)
                let src = vec![
                    50u8, 60, // first row
                    70, 80, // second row
                ];
                // Destination buffer needs to accommodate intermediate width of 2
                // (dest_width*2)
                let mut dest = vec![0u8; 2];
                let src_pitch = NonZeroUsize::new(2).unwrap();
                let dest_pitch = NonZeroUsize::new(2).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(1).unwrap();
                let dest_height = NonZeroUsize::new(1).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // With minimal case, should use simple averaging
                // Vertical: (50 + 70).div_ceil(2) = 60, (60 + 80).div_ceil(2) = 70
                // Horizontal: (60 + 70).div_ceil(2) = 65
                assert_eq!(dest[0], 65);
            }

            #[test]
            fn [<test_reduce_cubic_max_values_ $module>]() {
                // Test with maximum values to ensure no overflow
                let src = vec![
                    255u8, 255, // first row
                    255, 255, // second row
                ];
                // Destination buffer needs to accommodate intermediate width of 2
                // (dest_width*2)
                let mut dest = vec![0u8; 2];
                let src_pitch = NonZeroUsize::new(2).unwrap();
                let dest_pitch = NonZeroUsize::new(2).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(1).unwrap();
                let dest_height = NonZeroUsize::new(1).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Maximum values should be preserved
                assert_eq!(dest[0], 255);
            }

            #[test]
            fn [<test_reduce_cubic_large_height_ $module>]() {
                // Test 4x6 -> 2x3 reduction to exercise the middle lines loop
                let src = vec![
                    10u8, 20, 30, 40, // row 0
                    50, 60, 70, 80, // row 1
                    90, 100, 110, 120, // row 2
                    130, 140, 150, 160, // row 3
                    170, 180, 190, 200, // row 4
                    210, 220, 230, 240, // row 5
                ];
                // Destination buffer needs to accommodate intermediate width of 4
                // (dest_width*2) and height of 3
                let mut dest = vec![0u8; 12]; // 4 width * 3 height
                let src_pitch = NonZeroUsize::new(4).unwrap();
                let dest_pitch = NonZeroUsize::new(4).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(2).unwrap();
                let dest_height = NonZeroUsize::new(3).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // This test primarily ensures the middle lines loop doesn't crash
                // The exact values are less important than ensuring no index out of bounds
                assert_ne!(dest[0], 0); // Should have been modified
                assert_ne!(dest[4], 0); // Second row should have been modified
                assert_ne!(dest[8], 0); // Third row should have been modified
            }

            #[test]
            fn [<test_reduce_cubic_u8_large_simd_ $module>]() {
                // Test large enough to trigger SIMD processing (64x2 -> 32x1)
                // This ensures we cover the SIMD loop in vertical reduction for AVX2 implementation
                let mut src = Vec::new();

                // First row: 64 pixels with values 0-63
                for i in 0..64u8 {
                    src.push(i);
                }

                // Second row: 64 pixels with values 64-127
                for i in 64..128u8 {
                    src.push(i);
                }

                // Destination buffer needs intermediate width of 64 (dest_width*2)
                let mut dest = vec![0u8; 64];
                let src_pitch = NonZeroUsize::new(64).unwrap();
                let dest_pitch = NonZeroUsize::new(64).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(32).unwrap();
                let dest_height = NonZeroUsize::new(1).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Verify the cubic filtering results
                // Step 1: Vertical reduction creates intermediate array with width=64
                // For height=1, this uses simple averaging: (a + b + 1) / 2
                let mut intermediate = vec![0u8; 64];
                for i in 0..64usize {
                    let a = src[i] as u32; // First row
                    let b = src[64 + i] as u32; // Second row
                    intermediate[i] = ((a + b + 1) / 2) as u8;
                }

                // Step 2: Horizontal reduction processes the intermediate array
                // For cubic, this uses edge case handling and complex filtering
                for i in 0..32usize {
                    let expected = if i == 0 {
                        // First pixel: simple averaging (edge case)
                        let a = intermediate[0] as u32;
                        let b = intermediate[1] as u32;
                        ((a + b + 1) / 2) as u8
                    } else if i == 31 {
                        // Last pixel: simple averaging (edge case)
                        let a = intermediate[62] as u32;
                        let b = intermediate[63] as u32;
                        ((a + b + 1) / 2) as u8
                    } else {
                        // Middle pixels: cubic interpolation
                        // This is complex, so we'll just verify reasonable values
                        dest[i] // Accept whatever the implementation produces
                    };

                    if i == 0 || i == 31 {
                        assert_eq!(dest[i], expected, "Mismatch at edge position {}", i);
                    } else {
                        // For middle pixels, just verify they're reasonable
                        assert!(dest[i] > 0 && dest[i] < 255, "Unreasonable value at position {}: {}", i, dest[i]);
                    }
                }

                // Verify that SIMD processing produced reasonable results
                // Values should generally increase due to the input pattern
                assert!(dest[0] < dest[15], "Values should generally increase across the array");
                assert!(dest[15] < dest[31], "Values should generally increase across the array");
            }

            #[test]
            fn [<test_reduce_cubic_u8_large_simd_middle_lines_ $module>]() {
                // Test large enough to trigger SIMD processing for middle lines (64x12 -> 32x6)
                // This ensures we cover the complex cubic filter SIMD loop for middle lines
                let mut src = Vec::new();

                // Create 12 rows of 64 pixels each with a gradient pattern
                // Keep values small to avoid overflow in the cubic filter calculations
                for row in 0..12u8 {
                    for col in 0..64u8 {
                        src.push((row * 10 + col / 8) % 200); // Values 0-199
                    }
                }

                // Destination buffer needs intermediate width of 64 (dest_width*2) and height of 6
                let mut dest = vec![0u8; 384]; // 64 width * 6 height
                let src_pitch = NonZeroUsize::new(64).unwrap();
                let dest_pitch = NonZeroUsize::new(64).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(32).unwrap();
                let dest_height = NonZeroUsize::new(6).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Verify the complex cubic filtering for middle lines
                // The middle lines (y=1,2,3,4) use the complex 6-tap cubic filter
                // which is: m0 + m5 + (m1 + m4) * 5 + (m2 + m3) * 10 + 16 >> 5

                                // Test that all pixels in all rows have been processed
                // We'll test a few specific positions rather than asserting all are non-zero
                // since the cubic filter can legitimately produce zero values
                for y in 0..6usize {
                    // Test a few sample positions in each row
                    for &x in &[0, 15, 31] {
                        let dest_idx = y * 64 + x;
                        // Just verify the memory was written to - cubic filter can produce any u8 value
                        let _value = dest[dest_idx]; // This will read the computed value
                    }
                }

                // Test a few more samples to ensure the SIMD loop processed the full width
                for sample_x in [0, 15, 31] {
                    let middle_sample = dest[64 + sample_x]; // Second row, various positions
                    assert_ne!(middle_sample, 0, "SIMD should have processed position {}", sample_x);
                    // middle_sample is u8, so it's automatically valid
                }
            }

            #[test]
            fn [<test_reduce_cubic_u8_main_horizontal_simd_loop_ $module>]() {
                // Hit widened 32-pixel horizontal AVX2 loop directly (68x2 -> 34x1).
                let mut src = Vec::new();

                for i in 0..68u8 {
                    src.push(i);
                }

                for i in 68..136u8 {
                    src.push(i);
                }

                let mut dest = vec![0u8; 68];
                let src_pitch = NonZeroUsize::new(68).unwrap();
                let dest_pitch = NonZeroUsize::new(68).unwrap();
                let dest_width = NonZeroUsize::new(34).unwrap();
                let dest_height = NonZeroUsize::new(1).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                let mut intermediate = vec![0u16; 68];
                for i in 0..68usize {
                    let a = src[i] as u16;
                    let b = src[68 + i] as u16;
                    intermediate[i] = (a + b + 1) / 2;
                }

                for i in 0..34usize {
                    let expected = if i == 0 {
                        ((intermediate[0] + intermediate[1] + 1) / 2) as u8
                    } else if i == 33 {
                        ((intermediate[66] + intermediate[67] + 1) / 2) as u8
                    } else {
                        let x = i * 2;
                        let m0 = intermediate[x - 2];
                        let m1 = intermediate[x - 1];
                        let m2 = intermediate[x];
                        let m3 = intermediate[x + 1];
                        let m4 = intermediate[x + 2];
                        let m5 = intermediate[x + 3];
                        let result = (m0 + m5 + (m1 + m4) * 5 + (m2 + m3) * 10 + 16) >> 5;
                        result.min(255) as u8
                    };

                    assert_eq!(dest[i], expected, "Mismatch at position {}", i);
                }
            }

            #[test]
            fn [<test_reduce_cubic_u8_scalar_fallback_middle_lines_ $module>]() {
                // Test scalar tail after SIMD in u8 middle lines (38x6 -> 19x3)
                // Vertical pass uses SIMD + scalar tail, horizontal pass uses
                // SIMD for x=1..16 then scalar at x=17.
                let mut src = Vec::new();

                for row in 0..6u16 {
                    for col in 0..38u16 {
                        let value = ((row * 13 + col * 7) % 200) + 50;
                        src.push(u8::try_from(value).expect("value fits in u8"));
                    }
                }

                let mut dest = vec![0u8; 114]; // 38 width * 3 height
                let src_pitch = NonZeroUsize::new(38).unwrap();
                let dest_pitch = NonZeroUsize::new(38).unwrap();
                let dest_width = NonZeroUsize::new(19).unwrap();
                let dest_height = NonZeroUsize::new(3).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Middle row gets full 6-tap vertical filter.
                // Check SIMD/scalar-tail transition neighborhood is computed.
                let middle_row_start = 38;
                for i in 15..19usize {
                    let value = dest[middle_row_start + i];
                    assert!(value >= 50, "Output should reflect non-zero source at position {}", i);
                }

                // Edge outputs in each row must be written by edge logic.
                for y in 0..3usize {
                    let row = y * 38;
                    assert_ne!(dest[row], 0, "Left edge should be written in row {}", y);
                    assert_ne!(dest[row + 18], 0, "Right edge should be written in row {}", y);
                }
            }

            #[test]
            fn [<test_reduce_cubic_u16_large_simd_ $module>]() {
                // Test large enough to trigger SIMD processing for u16 vertical reduction (16x2 -> 8x1)
                // This ensures we cover the u16 SIMD loop at lines 293-302: while x + 16 <= dest_width
                let mut src = Vec::new();

                // First row: 16 pixels with values 0-15 scaled to u16 range
                for i in 0..16u16 {
                    src.push(i * 1000);
                }

                // Second row: 16 pixels with values 16-31 scaled to u16 range
                for i in 16..32u16 {
                    src.push(i * 1000);
                }

                // Destination buffer needs intermediate width of 16 (dest_width*2)
                let mut dest = vec![0u16; 16];
                let src_pitch = NonZeroUsize::new(16).unwrap();
                let dest_pitch = NonZeroUsize::new(16).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(8).unwrap();
                let dest_height = NonZeroUsize::new(1).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Verify the SIMD processing results for u16
                // Step 1: Vertical reduction creates intermediate array with width=16
                // For height=1, this uses simple averaging: (a + b + 1) / 2
                // The SIMD loop should process x=0..15 in one iteration since 0 + 16 <= 16
                                for i in 0..16usize {
                    // For the horizontal reduction step, we need to check the final result
                    // The intermediate array gets horizontally reduced using cubic filtering
                    if i < 8 {
                        // Verify the output is reasonable (SIMD should have processed this)
                        assert!(dest[i] > 100 && dest[i] < 32000,
                               "SIMD processing should produce reasonable u16 values at position {}: {}", i, dest[i]);
                    }
                }

                // Verify the SIMD code path was taken by checking consistent results
                // The values should follow a general increasing pattern due to the input pattern
                for i in 0..7usize {
                    assert!(dest[i] <= dest[i + 1] + 2000,
                           "Values should be reasonably ordered at positions {} and {}: {} vs {}",
                           i, i + 1, dest[i], dest[i + 1]);
                }
            }

            #[test]
            fn [<test_reduce_cubic_u16_large_simd_middle_lines_ $module>]() {
                // Test large enough to trigger SIMD processing for u16 middle lines (32x6 -> 16x3)
                // This ensures we cover the middle lines SIMD loop at lines 314-378: for y in 1..(dest_height - 1)
                // With dest_height=3, this gives y=1 (one iteration of the middle lines loop)
                let mut src = Vec::new();

                // Create 6 rows of 32 pixels each with a controlled pattern
                // Keep values moderate to avoid overflow in 6-tap cubic filter calculations
                for row in 0..6u16 {
                    for col in 0..32u16 {
                        src.push((row * 500 + col * 50) % 30000); // Values 0-29999
                    }
                }

                // Destination buffer needs intermediate width of 32 (dest_width*2) and height of 3
                let mut dest = vec![0u16; 96]; // 32 width * 3 height
                let src_pitch = NonZeroUsize::new(32).unwrap();
                let dest_pitch = NonZeroUsize::new(32).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(16).unwrap();
                let dest_height = NonZeroUsize::new(3).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Verify the complex cubic filtering for middle lines
                // The middle line (y=1) uses the complex 6-tap cubic filter:
                // result = (m0 + m5 + (m1 + m4) * 5 + (m2 + m3) * 10 + 16) >> 5
                // where m0..m5 are the 6 vertical taps for the cubic filter

                // The SIMD loop should process x=0..15 in one iteration since 0 + 16 <= 16
                // Verify that all rows have been processed
                for y in 0..3usize {
                    for &x in &[0, 8, 15] {
                        let dest_idx = y * 32 + x; // Using dest_pitch=32 for intermediate buffer

                        // Verify the output is reasonable (SIMD should have processed this)
                        assert!(dest[dest_idx] < 65535,
                               "SIMD processing should produce valid u16 values at row {} position {}: {}",
                               y, x, dest[dest_idx]);
                    }
                }

                // Specifically verify the middle row (y=1) which uses the complex cubic filter
                // This row should have different values from simple averaging due to the 6-tap filter
                let middle_row_start = 32; // y=1 * dest_pitch=32
                for i in 0..16usize {
                    let middle_value = dest[middle_row_start + i];

                    // The 6-tap cubic filter should produce reasonable values
                    assert!(middle_value < 50000,
                           "Middle row cubic filter should produce reasonable values at position {}: {}",
                           i, middle_value);
                }

                // Verify the SIMD code path was exercised by checking value consistency
                // The complex cubic filter should produce smoother transitions
                let first_row_sample = dest[0]; // y=0, x=0 (simple edge case)
                let middle_row_sample = dest[32]; // y=1, x=0 (complex cubic filter)
                let last_row_sample = dest[64]; // y=2, x=0 (simple edge case)

                // All should be valid u16 values
                assert!(first_row_sample < 65535 && middle_row_sample < 65535 && last_row_sample < 65535,
                       "All processed values should be valid u16");
            }

            #[test]
            fn [<test_reduce_cubic_u16_high_values_simd_middle_lines_ $module>]() {
                // Keep middle taps above i16::MAX so SIMD path must preserve full u16 range,
                // while edge rows stay below overflow range for the line-averaging path.
                let mut src = Vec::new();

                for row in 0..6u16 {
                    for col in 0..32u16 {
                        let base = match row {
                            2 | 3 => 40000,
                            _ => 1000,
                        };
                        src.push(base + row * 200 + col * 25);
                    }
                }

                let mut dest = vec![0u16; 96];
                let src_pitch = NonZeroUsize::new(32).unwrap();
                let dest_pitch = NonZeroUsize::new(32).unwrap();
                let dest_width = NonZeroUsize::new(16).unwrap();
                let dest_height = NonZeroUsize::new(3).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                let middle_row_start = 32;
                assert_eq!(dest[middle_row_start], 25888);
                assert_eq!(dest[middle_row_start + 7], 26238);
                assert_eq!(dest[middle_row_start + 15], 26638);
            }

            #[test]
            fn [<test_reduce_cubic_u16_scalar_fallback_middle_lines_ $module>]() {
                // Test to trigger scalar fallback in u16 middle lines processing (36x6 -> 18x3)
                // This ensures we cover the scalar fallback loop at lines 364-375: while x < dest_width
                // With dest_width=18, SIMD processes x=0..15, then scalar handles x=16,17
                let mut src = Vec::new();

                // Create 6 rows of 36 pixels each with a controlled pattern
                // Keep values moderate to avoid overflow in 6-tap cubic filter calculations
                for row in 0..6u16 {
                    for col in 0..36u16 {
                        src.push((row * 400 + col * 40) % 25000); // Values 0-24999
                    }
                }

                // Destination buffer needs intermediate width of 36 (dest_width*2) and height of 3
                let mut dest = vec![0u16; 108]; // 36 width * 3 height
                let src_pitch = NonZeroUsize::new(36).unwrap();
                let dest_pitch = NonZeroUsize::new(36).unwrap(); // Must accommodate intermediate width
                let dest_width = NonZeroUsize::new(18).unwrap(); // Not a multiple of 16
                let dest_height = NonZeroUsize::new(3).unwrap();

                verify_asm!($module, reduce_cubic(
                    &mut dest,
                    &src,
                    dest_pitch,
                    src_pitch,
                    dest_width,
                    dest_height,
                ));

                // Verify the scalar fallback processing in middle lines
                // The middle line (y=1) should have been processed by both SIMD (x=0..15) and scalar (x=16,17)
                let middle_row_start = 36; // y=1 * dest_pitch=36

                // Check SIMD-processed positions (x=0..15)
                for i in 0..16usize {
                    let simd_value = dest[middle_row_start + i];
                    assert!(simd_value < 40000,
                           "SIMD-processed position {} should have reasonable value: {}",
                           i, simd_value);
                }

                // Check scalar fallback positions (x=16,17)
                for i in 16..18usize {
                    let scalar_value = dest[middle_row_start + i];
                    assert!(scalar_value < 40000,
                           "Scalar fallback position {} should have reasonable value: {}",
                           i, scalar_value);

                    // Ensure the scalar fallback actually computed a value (not left as 0)
                    // The 6-tap cubic filter should produce non-zero results for our test pattern
                    assert_ne!(scalar_value, 0,
                              "Scalar fallback should have computed a value at position {}", i);
                }

                // Verify continuity between SIMD and scalar results
                // The transition from SIMD to scalar should be smooth
                let simd_last = dest[middle_row_start + 15];
                let scalar_first = dest[middle_row_start + 16];

                // Values should be in similar range (allowing for some variation due to different input data)
                let diff = if simd_last > scalar_first {
                    simd_last - scalar_first
                } else {
                    scalar_first - simd_last
                };
                assert!(diff < 10000,
                       "Transition from SIMD to scalar should be reasonable: {} -> {} (diff: {})",
                       simd_last, scalar_first, diff);
            }
        }
    };
}

create_tests!(rust);

#[cfg(all(target_arch = "x86_64", feature = "avx2"))]
create_tests!(avx2);