// oxigdal_algorithms/simd/resampling.rs
//! SIMD-accelerated resampling operations
//!
//! This module provides high-performance image resampling using SIMD instructions
//! for bilinear and bicubic interpolation. The implementations use cache-friendly
//! blocking strategies for optimal performance on large rasters.
//!
//! # Architecture Support
//!
//! - **aarch64**: NEON (128-bit, 4 pixels/iteration)
//! - **x86-64**: AVX2+FMA (256-bit, 8 pixels/iteration) with scalar fallback
//! - **Other**: Scalar fallback with auto-vectorization hints
//!
//! # Supported Methods
//!
//! - **Bilinear**: Fast, smooth interpolation (2x2 kernel)
//! - **Bicubic**: High-quality interpolation (4x4 kernel, Catmull-Rom)
//! - **Nearest**: No interpolation (fastest)
//! - **Downsample Average**: Area averaging for antialiasing
//!
//! # Performance
//!
//! Expected speedup over scalar: 3-6x depending on interpolation method.
//!
//! # Example
//!
//! ```rust
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! use oxigdal_algorithms::simd::resampling::bilinear_f32;
//!
//! let src = vec![1.0_f32; 100 * 100];
//! let mut dst = vec![0.0_f32; 50 * 50];
//!
//! bilinear_f32(&src, 100, 100, &mut dst, 50, 50)?;
//! # Ok(())
//! # }
//! ```

#![allow(unsafe_code)]

use crate::error::{AlgorithmError, Result};

// ============================================================================
// Validation helpers
// ============================================================================

46/// Validate bilinear resampling parameters
47fn validate_bilinear(
48    src: &[f32],
49    src_width: usize,
50    src_height: usize,
51    dst: &[f32],
52    dst_width: usize,
53    dst_height: usize,
54) -> Result<()> {
55    if src.len() != src_width * src_height {
56        return Err(AlgorithmError::InvalidParameter {
57            parameter: "input",
58            message: "Source buffer size doesn't match dimensions".to_string(),
59        });
60    }
61    if dst.len() != dst_width * dst_height {
62        return Err(AlgorithmError::InvalidParameter {
63            parameter: "input",
64            message: "Destination buffer size doesn't match dimensions".to_string(),
65        });
66    }
67    if src_width == 0 || src_height == 0 || dst_width == 0 || dst_height == 0 {
68        return Err(AlgorithmError::InvalidParameter {
69            parameter: "input",
70            message: "Dimensions must be greater than 0".to_string(),
71        });
72    }
73    Ok(())
74}
75
76/// Validate bicubic resampling parameters
77fn validate_bicubic(
78    src: &[f32],
79    src_width: usize,
80    src_height: usize,
81    dst: &[f32],
82    dst_width: usize,
83    dst_height: usize,
84) -> Result<()> {
85    if src.len() != src_width * src_height {
86        return Err(AlgorithmError::InvalidParameter {
87            parameter: "input",
88            message: "Source buffer size doesn't match dimensions".to_string(),
89        });
90    }
91    if dst.len() != dst_width * dst_height {
92        return Err(AlgorithmError::InvalidParameter {
93            parameter: "input",
94            message: "Destination buffer size doesn't match dimensions".to_string(),
95        });
96    }
97    if src_width < 4 || src_height < 4 {
98        return Err(AlgorithmError::InvalidParameter {
99            parameter: "input",
100            message: "Source dimensions must be at least 4x4 for bicubic".to_string(),
101        });
102    }
103    if dst_width == 0 || dst_height == 0 {
104        return Err(AlgorithmError::InvalidParameter {
105            parameter: "input",
106            message: "Destination dimensions must be greater than 0".to_string(),
107        });
108    }
109    Ok(())
110}

// ============================================================================
// Cubic kernel (Catmull-Rom) - shared across all implementations
// ============================================================================

/// Catmull-Rom cubic interpolation weights.
///
/// Given a fractional sample position `t` in `[0, 1]`, returns the four
/// filter weights for the taps at offsets -1, 0, +1, +2. The polynomial
/// coefficients cancel per power of `t`, so the weights always sum to 1.
#[inline]
fn cubic_kernel(t: f32) -> [f32; 4] {
    let t2 = t * t;
    let t3 = t2 * t;
    let w0 = -0.5 * t3 + t2 - 0.5 * t;
    let w1 = 1.5 * t3 - 2.5 * t2 + 1.0;
    let w2 = -1.5 * t3 + 2.0 * t2 + 0.5 * t;
    let w3 = 0.5 * t3 - 0.5 * t2;
    [w0, w1, w2, w3]
}

// ============================================================================
// Scalar fallback implementation
// ============================================================================

135mod scalar_impl {
136    use super::cubic_kernel;
137
138    /// Scalar bilinear interpolation inner loop
139    pub(crate) fn bilinear_f32(
140        src: &[f32],
141        src_width: usize,
142        src_height: usize,
143        dst: &mut [f32],
144        dst_width: usize,
145        dst_height: usize,
146    ) {
147        let x_scale = src_width as f32 / dst_width as f32;
148        let y_scale = src_height as f32 / dst_height as f32;
149        const TILE_SIZE: usize = 64;
150
151        for tile_y in (0..dst_height).step_by(TILE_SIZE) {
152            let tile_h = TILE_SIZE.min(dst_height - tile_y);
153            for tile_x in (0..dst_width).step_by(TILE_SIZE) {
154                let tile_w = TILE_SIZE.min(dst_width - tile_x);
155                for y in tile_y..(tile_y + tile_h) {
156                    let src_y = (y as f32 + 0.5) * y_scale - 0.5;
157                    let src_y0 = src_y.max(0.0) as usize;
158                    let src_y1 = (src_y0 + 1).min(src_height - 1);
159                    let y_frac = (src_y - src_y0 as f32).max(0.0).min(1.0);
160
161                    for x in tile_x..(tile_x + tile_w) {
162                        let src_x = (x as f32 + 0.5) * x_scale - 0.5;
163                        let src_x0 = src_x.max(0.0) as usize;
164                        let src_x1 = (src_x0 + 1).min(src_width - 1);
165                        let x_frac = (src_x - src_x0 as f32).max(0.0).min(1.0);
166
167                        let p00 = src[src_y0 * src_width + src_x0];
168                        let p10 = src[src_y0 * src_width + src_x1];
169                        let p01 = src[src_y1 * src_width + src_x0];
170                        let p11 = src[src_y1 * src_width + src_x1];
171
172                        let p0 = p00 + (p10 - p00) * x_frac;
173                        let p1 = p01 + (p11 - p01) * x_frac;
174                        let value = p0 + (p1 - p0) * y_frac;
175
176                        dst[y * dst_width + x] = value;
177                    }
178                }
179            }
180        }
181    }
182
183    /// Scalar bicubic interpolation inner loop
184    pub(crate) fn bicubic_f32(
185        src: &[f32],
186        src_width: usize,
187        src_height: usize,
188        dst: &mut [f32],
189        dst_width: usize,
190        dst_height: usize,
191    ) {
192        let x_scale = src_width as f32 / dst_width as f32;
193        let y_scale = src_height as f32 / dst_height as f32;
194        const TILE_SIZE: usize = 32;
195
196        for tile_y in (0..dst_height).step_by(TILE_SIZE) {
197            let tile_h = TILE_SIZE.min(dst_height - tile_y);
198            for tile_x in (0..dst_width).step_by(TILE_SIZE) {
199                let tile_w = TILE_SIZE.min(dst_width - tile_x);
200                for y in tile_y..(tile_y + tile_h) {
201                    let src_y = (y as f32 + 0.5) * y_scale - 0.5;
202                    let src_y_base = src_y.floor() as isize;
203                    let y_frac = (src_y - src_y_base as f32).max(0.0).min(1.0);
204                    let y_weights = cubic_kernel(y_frac);
205
206                    for x in tile_x..(tile_x + tile_w) {
207                        let src_x = (x as f32 + 0.5) * x_scale - 0.5;
208                        let src_x_base = src_x.floor() as isize;
209                        let x_frac = (src_x - src_x_base as f32).max(0.0).min(1.0);
210                        let x_weights = cubic_kernel(x_frac);
211
212                        let mut value = 0.0_f32;
213                        for ky in 0..4 {
214                            let sy = (src_y_base - 1 + ky as isize)
215                                .clamp(0, src_height as isize - 1)
216                                as usize;
217                            let mut row_sum = 0.0_f32;
218                            for kx in 0..4 {
219                                let sx = (src_x_base - 1 + kx as isize)
220                                    .clamp(0, src_width as isize - 1)
221                                    as usize;
222                                row_sum += src[sy * src_width + sx] * x_weights[kx];
223                            }
224                            value += row_sum * y_weights[ky];
225                        }
226                        dst[y * dst_width + x] = value;
227                    }
228                }
229            }
230        }
231    }
232}

// ============================================================================
// NEON implementation (aarch64)
// ============================================================================

/// NEON (aarch64) SIMD implementations of bilinear and bicubic resampling.
#[cfg(target_arch = "aarch64")]
mod neon_impl {
    use std::arch::aarch64::*;

    use super::cubic_kernel;

    /// NEON bilinear interpolation - processes 4 output pixels per iteration
    ///
    /// For each row of output pixels, src_y0/src_y1/y_frac are constant.
    /// We process 4 x-positions simultaneously using NEON 128-bit registers.
    ///
    /// SAFETY: Caller must ensure all indices are within bounds of src/dst slices
    /// and that slice dimensions match the specified widths/heights.
    #[target_feature(enable = "neon")]
    pub(crate) unsafe fn bilinear_f32(
        src: &[f32],
        src_width: usize,
        src_height: usize,
        dst: &mut [f32],
        dst_width: usize,
        dst_height: usize,
    ) {
        unsafe {
            let x_scale = src_width as f32 / dst_width as f32;
            let y_scale = src_height as f32 / dst_height as f32;
            let src_ptr = src.as_ptr();
            let dst_ptr = dst.as_mut_ptr();
            const TILE_SIZE: usize = 64;
            // Highest valid source coordinates; used to clamp before casting to usize.
            let max_sx = (src_width - 1) as f32;
            let max_sy = (src_height - 1) as f32;

            // 64x64 destination tiles keep the touched source rows cache-resident.
            for tile_y in (0..dst_height).step_by(TILE_SIZE) {
                let tile_h = TILE_SIZE.min(dst_height - tile_y);
                for tile_x in (0..dst_width).step_by(TILE_SIZE) {
                    let tile_w = TILE_SIZE.min(dst_width - tile_x);
                    for y in tile_y..(tile_y + tile_h) {
                        // Pixel-center mapping: (y + 0.5) * scale - 0.5.
                        let src_y = (y as f32 + 0.5) * y_scale - 0.5;
                        let src_y_clamped = src_y.max(0.0).min(max_sy);
                        let src_y0 = src_y_clamped as usize;
                        let src_y1 = (src_y0 + 1).min(src_height - 1);
                        let y_frac = (src_y - src_y0 as f32).max(0.0).min(1.0);
                        // Vertical blend factors broadcast once per row.
                        let vy_frac = vdupq_n_f32(y_frac);
                        let vy_frac_inv = vdupq_n_f32(1.0 - y_frac);

                        let row0_base = src_y0 * src_width;
                        let row1_base = src_y1 * src_width;
                        let dst_row = y * dst_width;

                        // Process 4 pixels at a time
                        let simd_end = tile_x + (tile_w / 4) * 4;
                        let mut x = tile_x;
                        while x < simd_end {
                            // Pre-compute source x coordinates and fractions for 4 pixels
                            let mut sx0 = [0_usize; 4];
                            let mut sx1 = [0_usize; 4];
                            let mut xf = [0.0_f32; 4];

                            for i in 0..4 {
                                let src_x = ((x + i) as f32 + 0.5) * x_scale - 0.5;
                                let src_x_clamped = src_x.max(0.0).min(max_sx);
                                sx0[i] = src_x_clamped as usize;
                                sx1[i] = (sx0[i] + 1).min(src_width - 1);
                                xf[i] = (src_x - sx0[i] as f32).max(0.0).min(1.0);
                            }

                            // Manual gather: load 4 pixels from each of the 4 source positions
                            // (NEON has no gather instruction, so stage through a stack array).
                            let vp00 = vld1q_f32(
                                [
                                    *src_ptr.add(row0_base + sx0[0]),
                                    *src_ptr.add(row0_base + sx0[1]),
                                    *src_ptr.add(row0_base + sx0[2]),
                                    *src_ptr.add(row0_base + sx0[3]),
                                ]
                                .as_ptr(),
                            );
                            let vp10 = vld1q_f32(
                                [
                                    *src_ptr.add(row0_base + sx1[0]),
                                    *src_ptr.add(row0_base + sx1[1]),
                                    *src_ptr.add(row0_base + sx1[2]),
                                    *src_ptr.add(row0_base + sx1[3]),
                                ]
                                .as_ptr(),
                            );
                            let vp01 = vld1q_f32(
                                [
                                    *src_ptr.add(row1_base + sx0[0]),
                                    *src_ptr.add(row1_base + sx0[1]),
                                    *src_ptr.add(row1_base + sx0[2]),
                                    *src_ptr.add(row1_base + sx0[3]),
                                ]
                                .as_ptr(),
                            );
                            let vp11 = vld1q_f32(
                                [
                                    *src_ptr.add(row1_base + sx1[0]),
                                    *src_ptr.add(row1_base + sx1[1]),
                                    *src_ptr.add(row1_base + sx1[2]),
                                    *src_ptr.add(row1_base + sx1[3]),
                                ]
                                .as_ptr(),
                            );

                            let vxf = vld1q_f32(xf.as_ptr());
                            let vxf_inv = vsubq_f32(vdupq_n_f32(1.0), vxf);

                            // Bilinear interpolation using FMA:
                            // top = p00 * (1-xf) + p10 * xf
                            let top = vfmaq_f32(vmulq_f32(vp00, vxf_inv), vp10, vxf);
                            // bot = p01 * (1-xf) + p11 * xf
                            let bot = vfmaq_f32(vmulq_f32(vp01, vxf_inv), vp11, vxf);
                            // result = top * (1-yf) + bot * yf
                            let result = vfmaq_f32(vmulq_f32(top, vy_frac_inv), bot, vy_frac);

                            vst1q_f32(dst_ptr.add(dst_row + x), result);
                            x += 4;
                        }

                        // Scalar remainder
                        while x < tile_x + tile_w {
                            let src_x = (x as f32 + 0.5) * x_scale - 0.5;
                            let src_x0 = src_x.max(0.0) as usize;
                            let src_x1 = (src_x0 + 1).min(src_width - 1);
                            let x_frac = (src_x - src_x0 as f32).max(0.0).min(1.0);

                            let p00 = *src_ptr.add(row0_base + src_x0);
                            let p10 = *src_ptr.add(row0_base + src_x1);
                            let p01 = *src_ptr.add(row1_base + src_x0);
                            let p11 = *src_ptr.add(row1_base + src_x1);

                            let p0 = p00 + (p10 - p00) * x_frac;
                            let p1 = p01 + (p11 - p01) * x_frac;
                            *dst_ptr.add(dst_row + x) = p0 + (p1 - p0) * y_frac;
                            x += 1;
                        }
                    }
                }
            }
        }
    }

    /// NEON bicubic interpolation - uses NEON for the 4-tap dot product per row
    ///
    /// For each output pixel, we compute a separable 4x4 Catmull-Rom interpolation.
    /// Each row of 4 source pixels is loaded into a NEON float32x4, multiplied by
    /// the x-direction weights, and horizontally summed using vaddvq_f32.
    /// The 4 row results are then combined with y-direction weights.
    ///
    /// SAFETY: Caller must ensure all indices are within bounds.
    #[target_feature(enable = "neon")]
    pub(crate) unsafe fn bicubic_f32(
        src: &[f32],
        src_width: usize,
        src_height: usize,
        dst: &mut [f32],
        dst_width: usize,
        dst_height: usize,
    ) {
        unsafe {
            let x_scale = src_width as f32 / dst_width as f32;
            let y_scale = src_height as f32 / dst_height as f32;
            let src_ptr = src.as_ptr();
            let dst_ptr = dst.as_mut_ptr();
            // Signed copies of the dimensions for edge clamping of tap indices.
            let iw = src_width as isize;
            let ih = src_height as isize;
            // Smaller tiles than bilinear: the 4-row kernel footprint is wider.
            const TILE_SIZE: usize = 32;

            for tile_y in (0..dst_height).step_by(TILE_SIZE) {
                let tile_h = TILE_SIZE.min(dst_height - tile_y);
                for tile_x in (0..dst_width).step_by(TILE_SIZE) {
                    let tile_w = TILE_SIZE.min(dst_width - tile_x);
                    for y in tile_y..(tile_y + tile_h) {
                        // Pixel-center mapping: (y + 0.5) * scale - 0.5.
                        let src_y = (y as f32 + 0.5) * y_scale - 0.5;
                        let src_y_base = src_y.floor() as isize;
                        let y_frac = (src_y - src_y_base as f32).max(0.0).min(1.0);
                        let y_weights = cubic_kernel(y_frac);
                        let vy_weights = vld1q_f32(y_weights.as_ptr());

                        // Pre-clamp the 4 source row indices
                        let sy = [
                            (src_y_base - 1).clamp(0, ih - 1) as usize,
                            (src_y_base).clamp(0, ih - 1) as usize,
                            (src_y_base + 1).clamp(0, ih - 1) as usize,
                            (src_y_base + 2).clamp(0, ih - 1) as usize,
                        ];

                        let dst_row = y * dst_width;

                        for x in tile_x..(tile_x + tile_w) {
                            let src_x = (x as f32 + 0.5) * x_scale - 0.5;
                            let src_x_base = src_x.floor() as isize;
                            let x_frac = (src_x - src_x_base as f32).max(0.0).min(1.0);
                            let x_weights = cubic_kernel(x_frac);
                            let vx_weights = vld1q_f32(x_weights.as_ptr());

                            // Pre-clamp the 4 source column indices
                            let sx = [
                                (src_x_base - 1).clamp(0, iw - 1) as usize,
                                (src_x_base).clamp(0, iw - 1) as usize,
                                (src_x_base + 1).clamp(0, iw - 1) as usize,
                                (src_x_base + 2).clamp(0, iw - 1) as usize,
                            ];

                            // For each of 4 rows: load 4 pixels, dot with x_weights
                            let mut row_sums = [0.0_f32; 4];
                            for ky in 0..4 {
                                let row_off = sy[ky] * src_width;
                                // Manual gather through a stack array (columns may be
                                // non-contiguous at the clamped edges).
                                let pixels = vld1q_f32(
                                    [
                                        *src_ptr.add(row_off + sx[0]),
                                        *src_ptr.add(row_off + sx[1]),
                                        *src_ptr.add(row_off + sx[2]),
                                        *src_ptr.add(row_off + sx[3]),
                                    ]
                                    .as_ptr(),
                                );
                                // Dot product: sum of (pixels * x_weights)
                                let prod = vmulq_f32(pixels, vx_weights);
                                row_sums[ky] = vaddvq_f32(prod);
                            }

                            // Combine row sums with y_weights using NEON
                            let vrow_sums = vld1q_f32(row_sums.as_ptr());
                            let vprod = vmulq_f32(vrow_sums, vy_weights);
                            let value = vaddvq_f32(vprod);

                            *dst_ptr.add(dst_row + x) = value;
                        }
                    }
                }
            }
        }
    }
}

// ============================================================================
// AVX2 implementation (x86-64)
// ============================================================================

#[cfg(target_arch = "x86_64")]
mod avx2_impl {
    use std::arch::x86_64::*;

    use super::cubic_kernel;

    /// AVX2 bilinear interpolation - processes 8 output pixels per iteration
    ///
    /// Manual gather of 8 pixel quartets, then FMA-based lerp in both X and Y.
    ///
    /// SAFETY: Caller must ensure AVX2+FMA are available (runtime detection)
    /// and all indices are within bounds.
    #[target_feature(enable = "avx2", enable = "fma")]
    pub(crate) unsafe fn bilinear_f32(
        src: &[f32],
        src_width: usize,
        src_height: usize,
        dst: &mut [f32],
        dst_width: usize,
        dst_height: usize,
    ) {
        unsafe {
            let x_scale = src_width as f32 / dst_width as f32;
            let y_scale = src_height as f32 / dst_height as f32;
            let src_ptr = src.as_ptr();
            let dst_ptr = dst.as_mut_ptr();
            const TILE_SIZE: usize = 64;
            // Highest valid source coordinates; used to clamp before casting to usize.
            let max_sx = (src_width - 1) as f32;
            let max_sy = (src_height - 1) as f32;

            // 64x64 destination tiles keep the touched source rows cache-resident.
            for tile_y in (0..dst_height).step_by(TILE_SIZE) {
                let tile_h = TILE_SIZE.min(dst_height - tile_y);
                for tile_x in (0..dst_width).step_by(TILE_SIZE) {
                    let tile_w = TILE_SIZE.min(dst_width - tile_x);
                    for y in tile_y..(tile_y + tile_h) {
                        // Pixel-center mapping: (y + 0.5) * scale - 0.5.
                        let src_y = (y as f32 + 0.5) * y_scale - 0.5;
                        let src_y_clamped = src_y.max(0.0).min(max_sy);
                        let src_y0 = src_y_clamped as usize;
                        let src_y1 = (src_y0 + 1).min(src_height - 1);
                        let y_frac = (src_y - src_y0 as f32).max(0.0).min(1.0);
                        // Vertical blend factors broadcast once per row.
                        let vy_frac = _mm256_set1_ps(y_frac);
                        let vy_frac_inv = _mm256_set1_ps(1.0 - y_frac);

                        let row0_base = src_y0 * src_width;
                        let row1_base = src_y1 * src_width;
                        let dst_row = y * dst_width;

                        // Process 8 pixels at a time
                        let simd_end = tile_x + (tile_w / 8) * 8;
                        let mut x = tile_x;
                        while x < simd_end {
                            // Pre-compute source coordinates for 8 pixels
                            let mut sx0 = [0_usize; 8];
                            let mut sx1 = [0_usize; 8];
                            let mut xf = [0.0_f32; 8];

                            for i in 0..8 {
                                let src_x = ((x + i) as f32 + 0.5) * x_scale - 0.5;
                                let src_x_clamped = src_x.max(0.0).min(max_sx);
                                sx0[i] = src_x_clamped as usize;
                                sx1[i] = (sx0[i] + 1).min(src_width - 1);
                                xf[i] = (src_x - sx0[i] as f32).max(0.0).min(1.0);
                            }

                            // Manual gather: 8 scalar loads per register.
                            // Note _mm256_set_ps takes lanes high-to-low, hence the
                            // reversed index order (7 down to 0).
                            let vp00 = _mm256_set_ps(
                                *src_ptr.add(row0_base + sx0[7]),
                                *src_ptr.add(row0_base + sx0[6]),
                                *src_ptr.add(row0_base + sx0[5]),
                                *src_ptr.add(row0_base + sx0[4]),
                                *src_ptr.add(row0_base + sx0[3]),
                                *src_ptr.add(row0_base + sx0[2]),
                                *src_ptr.add(row0_base + sx0[1]),
                                *src_ptr.add(row0_base + sx0[0]),
                            );
                            let vp10 = _mm256_set_ps(
                                *src_ptr.add(row0_base + sx1[7]),
                                *src_ptr.add(row0_base + sx1[6]),
                                *src_ptr.add(row0_base + sx1[5]),
                                *src_ptr.add(row0_base + sx1[4]),
                                *src_ptr.add(row0_base + sx1[3]),
                                *src_ptr.add(row0_base + sx1[2]),
                                *src_ptr.add(row0_base + sx1[1]),
                                *src_ptr.add(row0_base + sx1[0]),
                            );
                            let vp01 = _mm256_set_ps(
                                *src_ptr.add(row1_base + sx0[7]),
                                *src_ptr.add(row1_base + sx0[6]),
                                *src_ptr.add(row1_base + sx0[5]),
                                *src_ptr.add(row1_base + sx0[4]),
                                *src_ptr.add(row1_base + sx0[3]),
                                *src_ptr.add(row1_base + sx0[2]),
                                *src_ptr.add(row1_base + sx0[1]),
                                *src_ptr.add(row1_base + sx0[0]),
                            );
                            let vp11 = _mm256_set_ps(
                                *src_ptr.add(row1_base + sx1[7]),
                                *src_ptr.add(row1_base + sx1[6]),
                                *src_ptr.add(row1_base + sx1[5]),
                                *src_ptr.add(row1_base + sx1[4]),
                                *src_ptr.add(row1_base + sx1[3]),
                                *src_ptr.add(row1_base + sx1[2]),
                                *src_ptr.add(row1_base + sx1[1]),
                                *src_ptr.add(row1_base + sx1[0]),
                            );

                            let vxf = _mm256_loadu_ps(xf.as_ptr());
                            let vxf_inv = _mm256_sub_ps(_mm256_set1_ps(1.0), vxf);

                            // Bilinear with FMA:
                            // top = p00 * (1-xf) + p10 * xf
                            let top = _mm256_fmadd_ps(vp10, vxf, _mm256_mul_ps(vp00, vxf_inv));
                            // bot = p01 * (1-xf) + p11 * xf
                            let bot = _mm256_fmadd_ps(vp11, vxf, _mm256_mul_ps(vp01, vxf_inv));
                            // result = top * (1-yf) + bot * yf
                            let result =
                                _mm256_fmadd_ps(bot, vy_frac, _mm256_mul_ps(top, vy_frac_inv));

                            _mm256_storeu_ps(dst_ptr.add(dst_row + x), result);
                            x += 8;
                        }

                        // Scalar remainder
                        while x < tile_x + tile_w {
                            let src_x = (x as f32 + 0.5) * x_scale - 0.5;
                            let src_x0 = src_x.max(0.0) as usize;
                            let src_x1 = (src_x0 + 1).min(src_width - 1);
                            let x_frac = (src_x - src_x0 as f32).max(0.0).min(1.0);

                            let p00 = *src_ptr.add(row0_base + src_x0);
                            let p10 = *src_ptr.add(row0_base + src_x1);
                            let p01 = *src_ptr.add(row1_base + src_x0);
                            let p11 = *src_ptr.add(row1_base + src_x1);

                            let p0 = p00 + (p10 - p00) * x_frac;
                            let p1 = p01 + (p11 - p01) * x_frac;
                            *dst_ptr.add(dst_row + x) = p0 + (p1 - p0) * y_frac;
                            x += 1;
                        }
                    }
                }
            }
        }
    }
621
622    /// AVX2 bicubic interpolation - processes 8 output pixels in parallel
623    ///
624    /// For each of 4 kernel rows, gathers 8 sets of 4-tap values, computes
625    /// weighted sums with x_weights using FMA, then combines with y_weights.
626    ///
627    /// SAFETY: Caller must ensure AVX2+FMA are available and all indices in bounds.
628    #[target_feature(enable = "avx2", enable = "fma")]
629    pub(crate) unsafe fn bicubic_f32(
630        src: &[f32],
631        src_width: usize,
632        src_height: usize,
633        dst: &mut [f32],
634        dst_width: usize,
635        dst_height: usize,
636    ) {
637        unsafe {
638            let x_scale = src_width as f32 / dst_width as f32;
639            let y_scale = src_height as f32 / dst_height as f32;
640            let src_ptr = src.as_ptr();
641            let dst_ptr = dst.as_mut_ptr();
642            let iw = src_width as isize;
643            let ih = src_height as isize;
644            const TILE_SIZE: usize = 32;
645
646            for tile_y in (0..dst_height).step_by(TILE_SIZE) {
647                let tile_h = TILE_SIZE.min(dst_height - tile_y);
648                for tile_x in (0..dst_width).step_by(TILE_SIZE) {
649                    let tile_w = TILE_SIZE.min(dst_width - tile_x);
650                    for y in tile_y..(tile_y + tile_h) {
651                        let src_y = (y as f32 + 0.5) * y_scale - 0.5;
652                        let src_y_base = src_y.floor() as isize;
653                        let y_frac = (src_y - src_y_base as f32).max(0.0).min(1.0);
654                        let y_weights = cubic_kernel(y_frac);
655
656                        let sy = [
657                            (src_y_base - 1).clamp(0, ih - 1) as usize,
658                            (src_y_base).clamp(0, ih - 1) as usize,
659                            (src_y_base + 1).clamp(0, ih - 1) as usize,
660                            (src_y_base + 2).clamp(0, ih - 1) as usize,
661                        ];
662
663                        let dst_row = y * dst_width;
664
665                        // Process 8 pixels at a time
666                        let simd_end = tile_x + (tile_w / 8) * 8;
667                        let mut x = tile_x;
668                        while x < simd_end {
669                            // Pre-compute x coords and weights for 8 pixels
670                            let mut all_sx = [[0_usize; 4]; 8];
671                            let mut all_xw = [[0.0_f32; 4]; 8];
672
673                            for i in 0..8 {
674                                let src_x = ((x + i) as f32 + 0.5) * x_scale - 0.5;
675                                let src_x_base = src_x.floor() as isize;
676                                let x_frac = (src_x - src_x_base as f32).max(0.0).min(1.0);
677                                all_xw[i] = cubic_kernel(x_frac);
678                                all_sx[i] = [
679                                    (src_x_base - 1).clamp(0, iw - 1) as usize,
680                                    (src_x_base).clamp(0, iw - 1) as usize,
681                                    (src_x_base + 1).clamp(0, iw - 1) as usize,
682                                    (src_x_base + 2).clamp(0, iw - 1) as usize,
683                                ];
684                            }
685
686                            // Accumulate: for each kernel row ky, compute weighted x-sum for
687                            // all 8 pixels, then multiply by y_weight[ky]
688                            let mut vaccum = _mm256_setzero_ps();
689
690                            for ky in 0..4 {
691                                let row_off = sy[ky] * src_width;
692                                let vy_w = _mm256_set1_ps(y_weights[ky]);
693
694                                // For each of 4 x-taps, gather 8 values and FMA
695                                let mut vrow_sum = _mm256_setzero_ps();
696
697                                for kx in 0..4 {
698                                    // Gather 8 pixels at tap kx
699                                    let vp = _mm256_set_ps(
700                                        *src_ptr.add(row_off + all_sx[7][kx]),
701                                        *src_ptr.add(row_off + all_sx[6][kx]),
702                                        *src_ptr.add(row_off + all_sx[5][kx]),
703                                        *src_ptr.add(row_off + all_sx[4][kx]),
704                                        *src_ptr.add(row_off + all_sx[3][kx]),
705                                        *src_ptr.add(row_off + all_sx[2][kx]),
706                                        *src_ptr.add(row_off + all_sx[1][kx]),
707                                        *src_ptr.add(row_off + all_sx[0][kx]),
708                                    );
709                                    // Load the x_weight[kx] for each of 8 pixels
710                                    let vxw = _mm256_set_ps(
711                                        all_xw[7][kx],
712                                        all_xw[6][kx],
713                                        all_xw[5][kx],
714                                        all_xw[4][kx],
715                                        all_xw[3][kx],
716                                        all_xw[2][kx],
717                                        all_xw[1][kx],
718                                        all_xw[0][kx],
719                                    );
720                                    // FMA: row_sum += pixel * x_weight
721                                    vrow_sum = _mm256_fmadd_ps(vp, vxw, vrow_sum);
722                                }
723
724                                // Accumulate: total += row_sum * y_weight[ky]
725                                vaccum = _mm256_fmadd_ps(vrow_sum, vy_w, vaccum);
726                            }
727
728                            _mm256_storeu_ps(dst_ptr.add(dst_row + x), vaccum);
729                            x += 8;
730                        }
731
732                        // Scalar remainder
733                        while x < tile_x + tile_w {
734                            let src_x = (x as f32 + 0.5) * x_scale - 0.5;
735                            let src_x_base = src_x.floor() as isize;
736                            let x_frac = (src_x - src_x_base as f32).max(0.0).min(1.0);
737                            let x_weights = cubic_kernel(x_frac);
738                            let sx = [
739                                (src_x_base - 1).clamp(0, iw - 1) as usize,
740                                (src_x_base).clamp(0, iw - 1) as usize,
741                                (src_x_base + 1).clamp(0, iw - 1) as usize,
742                                (src_x_base + 2).clamp(0, iw - 1) as usize,
743                            ];
744                            let mut value = 0.0_f32;
745                            for ky in 0..4 {
746                                let row_off = sy[ky] * src_width;
747                                let mut row_sum = 0.0_f32;
748                                for kx in 0..4 {
749                                    row_sum += *src_ptr.add(row_off + sx[kx]) * x_weights[kx];
750                                }
751                                value += row_sum * y_weights[ky];
752                            }
753                            *dst_ptr.add(dst_row + x) = value;
754                            x += 1;
755                        }
756                    }
757                }
758            }
759        }
760    }
761}
762
763// ============================================================================
764// Public API with dispatch
765// ============================================================================
766
767/// Bilinear interpolation with SIMD optimization
768///
769/// Resample a source image to a destination size using bilinear interpolation.
770/// Automatically dispatches to the best available SIMD instruction set:
771///
772/// - **aarch64**: NEON (4 pixels/iteration)
773/// - **x86-64**: AVX2+FMA (8 pixels/iteration) with scalar fallback
774/// - **Other**: Scalar with cache-friendly tiling
775///
776/// # Arguments
777///
778/// * `src` - Source image data (row-major)
779/// * `src_width` - Source image width
780/// * `src_height` - Source image height
781/// * `dst` - Destination buffer (must be dst_width * dst_height)
782/// * `dst_width` - Destination width
783/// * `dst_height` - Destination height
784///
785/// # Errors
786///
787/// Returns an error if buffer sizes don't match dimensions
788pub fn bilinear_f32(
789    src: &[f32],
790    src_width: usize,
791    src_height: usize,
792    dst: &mut [f32],
793    dst_width: usize,
794    dst_height: usize,
795) -> Result<()> {
796    validate_bilinear(src, src_width, src_height, dst, dst_width, dst_height)?;
797
798    #[cfg(target_arch = "aarch64")]
799    {
800        // SAFETY: NEON is always available on aarch64, dimensions validated above
801        unsafe {
802            neon_impl::bilinear_f32(src, src_width, src_height, dst, dst_width, dst_height);
803        }
804    }
805
806    #[cfg(target_arch = "x86_64")]
807    {
808        if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
809            // SAFETY: AVX2+FMA runtime detected, dimensions validated above
810            unsafe {
811                avx2_impl::bilinear_f32(src, src_width, src_height, dst, dst_width, dst_height);
812            }
813        } else {
814            scalar_impl::bilinear_f32(src, src_width, src_height, dst, dst_width, dst_height);
815        }
816    }
817
818    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
819    {
820        scalar_impl::bilinear_f32(src, src_width, src_height, dst, dst_width, dst_height);
821    }
822
823    Ok(())
824}
825
826/// Bicubic interpolation with SIMD optimization
827///
828/// Resample a source image to a destination size using bicubic (Catmull-Rom)
829/// interpolation. Provides higher quality than bilinear at the cost of speed.
830/// Dispatches to the best available SIMD instruction set.
831///
832/// # Arguments
833///
834/// * `src` - Source image data (row-major)
835/// * `src_width` - Source image width
836/// * `src_height` - Source image height
837/// * `dst` - Destination buffer (must be dst_width * dst_height)
838/// * `dst_width` - Destination width
839/// * `dst_height` - Destination height
840///
841/// # Errors
842///
843/// Returns an error if buffer sizes don't match dimensions
844pub fn bicubic_f32(
845    src: &[f32],
846    src_width: usize,
847    src_height: usize,
848    dst: &mut [f32],
849    dst_width: usize,
850    dst_height: usize,
851) -> Result<()> {
852    validate_bicubic(src, src_width, src_height, dst, dst_width, dst_height)?;
853
854    #[cfg(target_arch = "aarch64")]
855    {
856        // SAFETY: NEON is always available on aarch64, dimensions validated above
857        unsafe {
858            neon_impl::bicubic_f32(src, src_width, src_height, dst, dst_width, dst_height);
859        }
860    }
861
862    #[cfg(target_arch = "x86_64")]
863    {
864        if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
865            // SAFETY: AVX2+FMA runtime detected, dimensions validated above
866            unsafe {
867                avx2_impl::bicubic_f32(src, src_width, src_height, dst, dst_width, dst_height);
868            }
869        } else {
870            scalar_impl::bicubic_f32(src, src_width, src_height, dst, dst_width, dst_height);
871        }
872    }
873
874    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
875    {
876        scalar_impl::bicubic_f32(src, src_width, src_height, dst, dst_width, dst_height);
877    }
878
879    Ok(())
880}
881
882/// Nearest neighbor resampling (fast, no interpolation)
883///
884/// This is the fastest resampling method but produces blocky results.
885/// Useful for categorical data or when speed is critical.
886///
887/// # Arguments
888///
889/// * `src` - Source image data (row-major)
890/// * `src_width` - Source image width
891/// * `src_height` - Source image height
892/// * `dst` - Destination buffer (must be dst_width * dst_height)
893/// * `dst_width` - Destination width
894/// * `dst_height` - Destination height
895///
896/// # Errors
897///
898/// Returns an error if buffer sizes don't match dimensions
899pub fn nearest_f32(
900    src: &[f32],
901    src_width: usize,
902    src_height: usize,
903    dst: &mut [f32],
904    dst_width: usize,
905    dst_height: usize,
906) -> Result<()> {
907    if src.len() != src_width * src_height {
908        return Err(AlgorithmError::InvalidParameter {
909            parameter: "input",
910            message: "Source buffer size doesn't match dimensions".to_string(),
911        });
912    }
913    if dst.len() != dst_width * dst_height {
914        return Err(AlgorithmError::InvalidParameter {
915            parameter: "input",
916            message: "Destination buffer size doesn't match dimensions".to_string(),
917        });
918    }
919    if src_width == 0 || src_height == 0 || dst_width == 0 || dst_height == 0 {
920        return Err(AlgorithmError::InvalidParameter {
921            parameter: "input",
922            message: "Dimensions must be greater than 0".to_string(),
923        });
924    }
925
926    let x_ratio = src_width as f32 / dst_width as f32;
927    let y_ratio = src_height as f32 / dst_height as f32;
928
929    for y in 0..dst_height {
930        let src_y = ((y as f32 * y_ratio) as usize).min(src_height - 1);
931        for x in 0..dst_width {
932            let src_x = ((x as f32 * x_ratio) as usize).min(src_width - 1);
933            dst[y * dst_width + x] = src[src_y * src_width + src_x];
934        }
935    }
936
937    Ok(())
938}
939
940/// Downsample using area averaging (for antialiasing)
941///
942/// When downsampling, this method averages pixels in the source region
943/// to produce smoother results with less aliasing.
944///
945/// # Arguments
946///
947/// * `src` - Source image data (row-major)
948/// * `src_width` - Source image width
949/// * `src_height` - Source image height
950/// * `dst` - Destination buffer (must be dst_width * dst_height)
951/// * `dst_width` - Destination width
952/// * `dst_height` - Destination height
953///
954/// # Errors
955///
956/// Returns an error if buffer sizes don't match dimensions or upsampling is attempted
957pub fn downsample_average_f32(
958    src: &[f32],
959    src_width: usize,
960    src_height: usize,
961    dst: &mut [f32],
962    dst_width: usize,
963    dst_height: usize,
964) -> Result<()> {
965    if src.len() != src_width * src_height {
966        return Err(AlgorithmError::InvalidParameter {
967            parameter: "input",
968            message: "Source buffer size doesn't match dimensions".to_string(),
969        });
970    }
971    if dst.len() != dst_width * dst_height {
972        return Err(AlgorithmError::InvalidParameter {
973            parameter: "input",
974            message: "Destination buffer size doesn't match dimensions".to_string(),
975        });
976    }
977    if dst_width > src_width || dst_height > src_height {
978        return Err(AlgorithmError::InvalidParameter {
979            parameter: "input",
980            message: "This method is only for downsampling".to_string(),
981        });
982    }
983
984    let x_ratio = src_width as f32 / dst_width as f32;
985    let y_ratio = src_height as f32 / dst_height as f32;
986
987    for dst_y in 0..dst_height {
988        let src_y_start = (dst_y as f32 * y_ratio) as usize;
989        let src_y_end = (((dst_y + 1) as f32 * y_ratio) as usize).min(src_height);
990
991        for dst_x in 0..dst_width {
992            let src_x_start = (dst_x as f32 * x_ratio) as usize;
993            let src_x_end = (((dst_x + 1) as f32 * x_ratio) as usize).min(src_width);
994
995            let mut sum = 0.0_f32;
996            let mut count = 0;
997
998            for src_y in src_y_start..src_y_end {
999                for src_x in src_x_start..src_x_end {
1000                    sum += src[src_y * src_width + src_x];
1001                    count += 1;
1002                }
1003            }
1004
1005            dst[dst_y * dst_width + dst_x] = if count > 0 { sum / count as f32 } else { 0.0 };
1006        }
1007    }
1008
1009    Ok(())
1010}
1011
1012// ============================================================================
1013// Tests
1014// ============================================================================
1015
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;

    // ---- Bilinear tests ----

    #[test]
    fn test_bilinear_identity() {
        // Same-size resampling must reproduce the input (within f32 tolerance).
        let src = vec![1.0, 2.0, 3.0, 4.0];
        let mut dst = vec![0.0; 4];
        bilinear_f32(&src, 2, 2, &mut dst, 2, 2)
            .expect("bilinear_f32 identity resampling should succeed in test");
        for i in 0..4 {
            assert_relative_eq!(dst[i], src[i], epsilon = 1e-5);
        }
    }

    #[test]
    fn test_bilinear_downsample() {
        // 4x4 source made of four 2x2 constant quadrants (1/2 on top, 3/4 on
        // bottom); only the relative ordering of the outputs is asserted.
        let src = vec![
            1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 4.0, 4.0,
        ];
        let mut dst = vec![0.0; 4];
        bilinear_f32(&src, 4, 4, &mut dst, 2, 2)
            .expect("bilinear_f32 downsampling should succeed in test");
        assert!(dst[0] < dst[1]);
        assert!(dst[2] < dst[3]);
    }

    #[test]
    fn test_bilinear_upsample() {
        // Corner output pixels of a 2x2 -> 4x4 upsample should keep the
        // corner source values (edge clamping).
        let src = vec![1.0, 2.0, 3.0, 4.0];
        let mut dst = vec![0.0; 16];
        bilinear_f32(&src, 2, 2, &mut dst, 4, 4)
            .expect("bilinear_f32 upsampling should succeed in test");
        assert_relative_eq!(dst[0], 1.0, epsilon = 1e-5);
        assert_relative_eq!(dst[15], 4.0, epsilon = 1e-5);
    }

    // ---- Bicubic tests ----

    #[test]
    fn test_bicubic_identity() {
        // Catmull-Rom can overshoot near edges, hence the loose 0.1 epsilon.
        let src = vec![
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ];
        let mut dst = vec![0.0; 16];
        bicubic_f32(&src, 4, 4, &mut dst, 4, 4)
            .expect("bicubic_f32 identity resampling should succeed in test");
        for i in 0..16 {
            assert_relative_eq!(dst[i], src[i], epsilon = 0.1);
        }
    }

    // ---- Nearest tests ----

    #[test]
    fn test_nearest() {
        // Identity resampling must copy pixels through exactly.
        let src = vec![1.0, 2.0, 3.0, 4.0];
        let mut dst = vec![0.0; 4];
        nearest_f32(&src, 2, 2, &mut dst, 2, 2)
            .expect("nearest_f32 identity resampling should succeed in test");
        for i in 0..4 {
            assert_relative_eq!(dst[i], src[i]);
        }
    }

    #[test]
    fn test_nearest_downsample() {
        // With a 2x reduction, each output pixel is the top-left pixel of its
        // 2x2 quadrant — exact values, no interpolation.
        let src = vec![
            1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 4.0, 4.0,
        ];
        let mut dst = vec![0.0; 4];
        nearest_f32(&src, 4, 4, &mut dst, 2, 2)
            .expect("nearest_f32 downsampling should succeed in test");
        assert_relative_eq!(dst[0], 1.0);
        assert_relative_eq!(dst[1], 2.0);
        assert_relative_eq!(dst[2], 3.0);
        assert_relative_eq!(dst[3], 4.0);
    }

    // ---- Downsample average tests ----

    #[test]
    fn test_downsample_average() {
        // Each 2x2 quadrant is constant, so its average equals that constant.
        let src = vec![
            1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 4.0, 4.0,
        ];
        let mut dst = vec![0.0; 4];
        downsample_average_f32(&src, 4, 4, &mut dst, 2, 2)
            .expect("downsample_average_f32 should succeed in test");
        assert_relative_eq!(dst[0], 1.0);
        assert_relative_eq!(dst[1], 2.0);
        assert_relative_eq!(dst[2], 3.0);
        assert_relative_eq!(dst[3], 4.0);
    }

    // ---- Validation tests ----

    #[test]
    fn test_invalid_dimensions() {
        // Source buffer length (10) disagrees with claimed 4x4 dimensions.
        let src = vec![1.0; 10];
        let mut dst = vec![0.0; 4];
        assert!(bilinear_f32(&src, 4, 4, &mut dst, 2, 2).is_err());

        // Destination buffer length (4) disagrees with claimed 3x3 dimensions.
        let src = vec![1.0; 16];
        assert!(bilinear_f32(&src, 4, 4, &mut dst, 3, 3).is_err());
    }

    #[test]
    fn test_bicubic_too_small() {
        // A 3x3 source is too small for the 4x4 bicubic kernel.
        let src = vec![1.0; 9];
        let mut dst = vec![0.0; 4];
        assert!(bicubic_f32(&src, 3, 3, &mut dst, 2, 2).is_err());
    }

    #[test]
    fn test_cubic_kernel() {
        // Catmull-Rom weights form a partition of unity: they sum to 1 at any
        // fractional offset.
        let weights = cubic_kernel(0.5);
        let sum: f32 = weights.iter().sum();
        assert_relative_eq!(sum, 1.0, epsilon = 1e-6);
    }

    // ---- Large test ----

    #[test]
    fn test_large_downsample() {
        // Constant input must stay constant through resampling, regardless of
        // which SIMD path and tiling the dispatcher chooses.
        let src = vec![1.0_f32; 1000 * 1000];
        let mut dst = vec![0.0_f32; 100 * 100];
        bilinear_f32(&src, 1000, 1000, &mut dst, 100, 100)
            .expect("bilinear_f32 large downsampling should succeed in test");
        for &val in &dst {
            assert_relative_eq!(val, 1.0);
        }
    }

    // ---- SIMD vs Scalar accuracy tests ----

    /// Helper: run the scalar implementation for bilinear and return results
    fn scalar_bilinear(src: &[f32], sw: usize, sh: usize, dw: usize, dh: usize) -> Vec<f32> {
        let mut dst = vec![0.0_f32; dw * dh];
        scalar_impl::bilinear_f32(src, sw, sh, &mut dst, dw, dh);
        dst
    }

    /// Helper: run the scalar implementation for bicubic and return results
    fn scalar_bicubic(src: &[f32], sw: usize, sh: usize, dw: usize, dh: usize) -> Vec<f32> {
        let mut dst = vec![0.0_f32; dw * dh];
        scalar_impl::bicubic_f32(src, sw, sh, &mut dst, dw, dh);
        dst
    }

    /// Compare SIMD-dispatched bilinear against scalar reference.
    /// Tolerance is 1e-4 to account for FMA rounding differences (FMA computes
    /// a*b+c with a single rounding vs two roundings in separate mul+add).
    fn assert_bilinear_matches_scalar(
        src: &[f32],
        sw: usize,
        sh: usize,
        dw: usize,
        dh: usize,
        label: &str,
    ) {
        let scalar = scalar_bilinear(src, sw, sh, dw, dh);
        let mut simd_dst = vec![0.0_f32; dw * dh];
        bilinear_f32(src, sw, sh, &mut simd_dst, dw, dh)
            .expect("bilinear_f32 should succeed for SIMD vs scalar comparison");

        for (i, (&s, &d)) in scalar.iter().zip(simd_dst.iter()).enumerate() {
            assert!(
                (s - d).abs() < 1e-4,
                "bilinear mismatch at index {i} for {label}: scalar={s}, simd={d}, diff={}",
                (s - d).abs()
            );
        }
    }

    /// Compare SIMD-dispatched bicubic against scalar reference.
    /// Tolerance is 1e-4 to account for FMA rounding differences.
    fn assert_bicubic_matches_scalar(
        src: &[f32],
        sw: usize,
        sh: usize,
        dw: usize,
        dh: usize,
        label: &str,
    ) {
        let scalar = scalar_bicubic(src, sw, sh, dw, dh);
        let mut simd_dst = vec![0.0_f32; dw * dh];
        bicubic_f32(src, sw, sh, &mut simd_dst, dw, dh)
            .expect("bicubic_f32 should succeed for SIMD vs scalar comparison");

        for (i, (&s, &d)) in scalar.iter().zip(simd_dst.iter()).enumerate() {
            assert!(
                (s - d).abs() < 1e-4,
                "bicubic mismatch at index {i} for {label}: scalar={s}, simd={d}, diff={}",
                (s - d).abs()
            );
        }
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_exact_4() {
        // Exactly 4 output columns (NEON width)
        let src: Vec<f32> = (0..64).map(|i| i as f32 * 0.1).collect();
        assert_bilinear_matches_scalar(&src, 8, 8, 4, 4, "exact_4");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_exact_8() {
        // Exactly 8 output columns (AVX2 width)
        let src: Vec<f32> = (0..256).map(|i| (i as f32).sin()).collect();
        assert_bilinear_matches_scalar(&src, 16, 16, 8, 8, "exact_8");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_non_multiple_7() {
        // 7 output columns - not a multiple of 4 or 8
        let src: Vec<f32> = (0..196).map(|i| i as f32 * 0.05).collect();
        assert_bilinear_matches_scalar(&src, 14, 14, 7, 7, "non_multiple_7");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_non_multiple_13() {
        // 13 output columns - exercises both SIMD and scalar remainder
        let src: Vec<f32> = (0..400).map(|i| (i as f32 * 0.1).cos()).collect();
        assert_bilinear_matches_scalar(&src, 20, 20, 13, 13, "non_multiple_13");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_large_256() {
        // 256x256 -> 128x128 - large enough to exercise multiple tiles
        let src: Vec<f32> = (0..256 * 256)
            .map(|i| {
                let x = (i % 256) as f32;
                let y = (i / 256) as f32;
                (x * 0.01).sin() + (y * 0.01).cos()
            })
            .collect();
        assert_bilinear_matches_scalar(&src, 256, 256, 128, 128, "large_256");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_upsample() {
        // 10x10 -> 37x37 - upsampling with odd dimensions
        let src: Vec<f32> = (0..100).map(|i| i as f32).collect();
        assert_bilinear_matches_scalar(&src, 10, 10, 37, 37, "upsample_37");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_identity() {
        // Same size - identity resampling
        let src: Vec<f32> = (0..64).map(|i| i as f32 * 0.5).collect();
        assert_bilinear_matches_scalar(&src, 8, 8, 8, 8, "identity_8x8");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_exact_4() {
        let src: Vec<f32> = (0..64).map(|i| i as f32 * 0.1).collect();
        assert_bicubic_matches_scalar(&src, 8, 8, 4, 4, "exact_4");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_exact_8() {
        let src: Vec<f32> = (0..256).map(|i| (i as f32).sin()).collect();
        assert_bicubic_matches_scalar(&src, 16, 16, 8, 8, "exact_8");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_non_multiple_7() {
        let src: Vec<f32> = (0..196).map(|i| i as f32 * 0.05).collect();
        assert_bicubic_matches_scalar(&src, 14, 14, 7, 7, "non_multiple_7");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_non_multiple_13() {
        let src: Vec<f32> = (0..400).map(|i| (i as f32 * 0.1).cos()).collect();
        assert_bicubic_matches_scalar(&src, 20, 20, 13, 13, "non_multiple_13");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_large_128() {
        // 128x128 -> 64x64 bicubic
        let src: Vec<f32> = (0..128 * 128)
            .map(|i| {
                let x = (i % 128) as f32;
                let y = (i / 128) as f32;
                (x * 0.02).sin() + (y * 0.02).cos()
            })
            .collect();
        assert_bicubic_matches_scalar(&src, 128, 128, 64, 64, "large_128");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_upsample() {
        // 8x8 -> 19x19 - upsampling with odd target
        let src: Vec<f32> = (0..64).map(|i| i as f32).collect();
        assert_bicubic_matches_scalar(&src, 8, 8, 19, 19, "upsample_19");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_identity() {
        let src: Vec<f32> = (0..64).map(|i| i as f32 * 0.5).collect();
        assert_bicubic_matches_scalar(&src, 8, 8, 8, 8, "identity_8x8");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_asymmetric() {
        // Non-square: 20x10 -> 7x15
        let src: Vec<f32> = (0..200).map(|i| (i as f32 * 0.1).sin()).collect();
        assert_bilinear_matches_scalar(&src, 20, 10, 7, 15, "asymmetric_20x10_to_7x15");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_asymmetric() {
        // Non-square: 16x8 -> 5x11
        let src: Vec<f32> = (0..128).map(|i| (i as f32 * 0.1).cos()).collect();
        assert_bicubic_matches_scalar(&src, 16, 8, 5, 11, "asymmetric_16x8_to_5x11");
    }

    #[test]
    fn test_bilinear_constant_gradient() {
        // Linear gradient: bilinear should reproduce it exactly
        // NOTE(review): only finiteness is asserted below, so "exactly" is not
        // actually verified; tightening this requires pinning down the sampling
        // convention (pixel-center offset) of the bilinear implementations.
        let w = 32_usize;
        let h = 32_usize;
        let src: Vec<f32> = (0..w * h)
            .map(|i| {
                let x = (i % w) as f32;
                let y = (i / w) as f32;
                x * 2.0 + y * 3.0
            })
            .collect();
        let dw = 16_usize;
        let dh = 16_usize;
        let mut dst = vec![0.0_f32; dw * dh];
        bilinear_f32(&src, w, h, &mut dst, dw, dh).expect("bilinear gradient test should succeed");

        // Verify reasonable gradient reconstruction
        // For a linear function, bilinear should give close values
        for dy in 0..dh {
            for dx in 0..dw {
                let val = dst[dy * dw + dx];
                assert!(val.is_finite(), "Non-finite at ({dx},{dy}): {val}");
            }
        }
    }

    #[test]
    fn test_bicubic_monotonicity() {
        // Monotonically increasing input should produce monotonically increasing output
        // within each row/column (approximately, given Catmull-Rom overshoot is small)
        let w = 16_usize;
        let h = 4_usize;
        let src: Vec<f32> = (0..w * h).map(|i| i as f32).collect();
        let dw = 8_usize;
        let dh = 4_usize;
        let mut dst = vec![0.0_f32; dw * dh];
        bicubic_f32(&src, w, h, &mut dst, dw, dh)
            .expect("bicubic monotonicity test should succeed");

        // Values should generally increase left to right within a row
        // (only the row endpoints are compared, tolerating mid-row overshoot)
        for dy in 0..dh {
            let first = dst[dy * dw];
            let last = dst[dy * dw + dw - 1];
            assert!(
                last >= first,
                "Row {dy}: first={first}, last={last} - should increase"
            );
        }
    }
}