1#![allow(unsafe_code)]
39
40use crate::error::{AlgorithmError, Result};
41
42fn validate_bilinear(
48 src: &[f32],
49 src_width: usize,
50 src_height: usize,
51 dst: &[f32],
52 dst_width: usize,
53 dst_height: usize,
54) -> Result<()> {
55 if src.len() != src_width * src_height {
56 return Err(AlgorithmError::InvalidParameter {
57 parameter: "input",
58 message: "Source buffer size doesn't match dimensions".to_string(),
59 });
60 }
61 if dst.len() != dst_width * dst_height {
62 return Err(AlgorithmError::InvalidParameter {
63 parameter: "input",
64 message: "Destination buffer size doesn't match dimensions".to_string(),
65 });
66 }
67 if src_width == 0 || src_height == 0 || dst_width == 0 || dst_height == 0 {
68 return Err(AlgorithmError::InvalidParameter {
69 parameter: "input",
70 message: "Dimensions must be greater than 0".to_string(),
71 });
72 }
73 Ok(())
74}
75
76fn validate_bicubic(
78 src: &[f32],
79 src_width: usize,
80 src_height: usize,
81 dst: &[f32],
82 dst_width: usize,
83 dst_height: usize,
84) -> Result<()> {
85 if src.len() != src_width * src_height {
86 return Err(AlgorithmError::InvalidParameter {
87 parameter: "input",
88 message: "Source buffer size doesn't match dimensions".to_string(),
89 });
90 }
91 if dst.len() != dst_width * dst_height {
92 return Err(AlgorithmError::InvalidParameter {
93 parameter: "input",
94 message: "Destination buffer size doesn't match dimensions".to_string(),
95 });
96 }
97 if src_width < 4 || src_height < 4 {
98 return Err(AlgorithmError::InvalidParameter {
99 parameter: "input",
100 message: "Source dimensions must be at least 4x4 for bicubic".to_string(),
101 });
102 }
103 if dst_width == 0 || dst_height == 0 {
104 return Err(AlgorithmError::InvalidParameter {
105 parameter: "input",
106 message: "Destination dimensions must be greater than 0".to_string(),
107 });
108 }
109 Ok(())
110}
111
/// Catmull-Rom cubic convolution weights for a fractional offset `t` in
/// `[0, 1]`.
///
/// Returns the weights for the four taps at offsets -1, 0, +1 and +2
/// relative to the base sample; the four weights always sum to 1.
#[inline]
fn cubic_kernel(t: f32) -> [f32; 4] {
    let t_sq = t * t;
    let t_cu = t_sq * t;
    // Catmull-Rom basis (a = -0.5), one polynomial per tap.
    let w_m1 = -0.5 * t_cu + t_sq - 0.5 * t;
    let w_0 = 1.5 * t_cu - 2.5 * t_sq + 1.0;
    let w_p1 = -1.5 * t_cu + 2.0 * t_sq + 0.5 * t;
    let w_p2 = 0.5 * t_cu - 0.5 * t_sq;
    [w_m1, w_0, w_p1, w_p2]
}
130
/// Portable (non-SIMD) reference implementations.
///
/// The SIMD backends are validated against these in the test suite, so the
/// exact floating-point evaluation order here is load-bearing.
mod scalar_impl {
    use super::cubic_kernel;

    /// Scalar bilinear resampling from `src` into `dst`.
    ///
    /// Uses pixel-center alignment: destination pixel `x` maps to source
    /// coordinate `(x + 0.5) * scale - 0.5`. Work is tiled in 64x64 blocks
    /// for cache locality. Callers must guarantee
    /// `src.len() == src_width * src_height`,
    /// `dst.len() == dst_width * dst_height`, and non-zero dimensions
    /// (see `validate_bilinear`); otherwise indexing may panic.
    pub(crate) fn bilinear_f32(
        src: &[f32],
        src_width: usize,
        src_height: usize,
        dst: &mut [f32],
        dst_width: usize,
        dst_height: usize,
    ) {
        let x_scale = src_width as f32 / dst_width as f32;
        let y_scale = src_height as f32 / dst_height as f32;
        const TILE_SIZE: usize = 64;

        for tile_y in (0..dst_height).step_by(TILE_SIZE) {
            let tile_h = TILE_SIZE.min(dst_height - tile_y);
            for tile_x in (0..dst_width).step_by(TILE_SIZE) {
                let tile_w = TILE_SIZE.min(dst_width - tile_x);
                for y in tile_y..(tile_y + tile_h) {
                    // Pixel-center mapping; can be slightly negative near the
                    // top/left edges, hence the max(0.0) clamps below.
                    let src_y = (y as f32 + 0.5) * y_scale - 0.5;
                    let src_y0 = src_y.max(0.0) as usize;
                    let src_y1 = (src_y0 + 1).min(src_height - 1);
                    let y_frac = (src_y - src_y0 as f32).max(0.0).min(1.0);

                    for x in tile_x..(tile_x + tile_w) {
                        let src_x = (x as f32 + 0.5) * x_scale - 0.5;
                        let src_x0 = src_x.max(0.0) as usize;
                        let src_x1 = (src_x0 + 1).min(src_width - 1);
                        let x_frac = (src_x - src_x0 as f32).max(0.0).min(1.0);

                        // Four neighbors of the sample position.
                        let p00 = src[src_y0 * src_width + src_x0];
                        let p10 = src[src_y0 * src_width + src_x1];
                        let p01 = src[src_y1 * src_width + src_x0];
                        let p11 = src[src_y1 * src_width + src_x1];

                        // Lerp horizontally on both rows, then vertically.
                        let p0 = p00 + (p10 - p00) * x_frac;
                        let p1 = p01 + (p11 - p01) * x_frac;
                        let value = p0 + (p1 - p0) * y_frac;

                        dst[y * dst_width + x] = value;
                    }
                }
            }
        }
    }

    /// Scalar bicubic resampling from `src` into `dst` using the
    /// Catmull-Rom weights from `cubic_kernel`.
    ///
    /// Samples a 4x4 neighborhood around each mapped position; neighbor
    /// indices are clamped to the image edges (border replication).
    /// Callers must guarantee matching buffer sizes and a source of at
    /// least 4x4 (see `validate_bicubic`); otherwise indexing may panic.
    pub(crate) fn bicubic_f32(
        src: &[f32],
        src_width: usize,
        src_height: usize,
        dst: &mut [f32],
        dst_width: usize,
        dst_height: usize,
    ) {
        let x_scale = src_width as f32 / dst_width as f32;
        let y_scale = src_height as f32 / dst_height as f32;
        // Smaller tiles than bilinear: each output touches a 4x4 input
        // window, so the working set per tile is larger.
        const TILE_SIZE: usize = 32;

        for tile_y in (0..dst_height).step_by(TILE_SIZE) {
            let tile_h = TILE_SIZE.min(dst_height - tile_y);
            for tile_x in (0..dst_width).step_by(TILE_SIZE) {
                let tile_w = TILE_SIZE.min(dst_width - tile_x);
                for y in tile_y..(tile_y + tile_h) {
                    let src_y = (y as f32 + 0.5) * y_scale - 0.5;
                    let src_y_base = src_y.floor() as isize;
                    let y_frac = (src_y - src_y_base as f32).max(0.0).min(1.0);
                    let y_weights = cubic_kernel(y_frac);

                    for x in tile_x..(tile_x + tile_w) {
                        let src_x = (x as f32 + 0.5) * x_scale - 0.5;
                        let src_x_base = src_x.floor() as isize;
                        let x_frac = (src_x - src_x_base as f32).max(0.0).min(1.0);
                        let x_weights = cubic_kernel(x_frac);

                        // Separable 4x4 convolution: weight each row
                        // horizontally, then combine rows vertically.
                        let mut value = 0.0_f32;
                        for ky in 0..4 {
                            // Taps run from base-1 to base+2, clamped to the
                            // image so edge pixels are replicated.
                            let sy = (src_y_base - 1 + ky as isize)
                                .clamp(0, src_height as isize - 1)
                                as usize;
                            let mut row_sum = 0.0_f32;
                            for kx in 0..4 {
                                let sx = (src_x_base - 1 + kx as isize)
                                    .clamp(0, src_width as isize - 1)
                                    as usize;
                                row_sum += src[sy * src_width + sx] * x_weights[kx];
                            }
                            value += row_sum * y_weights[ky];
                        }
                        dst[y * dst_width + x] = value;
                    }
                }
            }
        }
    }
}
233
#[cfg(target_arch = "aarch64")]
/// NEON (aarch64) backends. NEON is mandatory on aarch64, so these are
/// dispatched unconditionally on that target.
mod neon_impl {
    use std::arch::aarch64::*;

    use super::cubic_kernel;

    /// NEON bilinear resampling; 4 destination pixels per SIMD iteration
    /// with a scalar tail for the remainder.
    ///
    /// Mirrors `scalar_impl::bilinear_f32` (same pixel-center mapping and
    /// 64x64 tiling). Results may differ from the scalar path by a few
    /// ULPs because `vfmaq_f32` fuses the multiply-add.
    ///
    /// # Safety
    ///
    /// Caller must guarantee `src.len() == src_width * src_height`,
    /// `dst.len() == dst_width * dst_height`, and all dimensions non-zero.
    #[target_feature(enable = "neon")]
    pub(crate) unsafe fn bilinear_f32(
        src: &[f32],
        src_width: usize,
        src_height: usize,
        dst: &mut [f32],
        dst_width: usize,
        dst_height: usize,
    ) {
        // SAFETY: every gathered index below is clamped into
        // [0, src_width - 1] / [0, src_height - 1], and the destination
        // index never exceeds dst_width * dst_height - 1, so all raw
        // pointer arithmetic stays within the buffers whose sizes the
        // caller guaranteed.
        unsafe {
            let x_scale = src_width as f32 / dst_width as f32;
            let y_scale = src_height as f32 / dst_height as f32;
            let src_ptr = src.as_ptr();
            let dst_ptr = dst.as_mut_ptr();
            const TILE_SIZE: usize = 64;
            let max_sx = (src_width - 1) as f32;
            let max_sy = (src_height - 1) as f32;

            for tile_y in (0..dst_height).step_by(TILE_SIZE) {
                let tile_h = TILE_SIZE.min(dst_height - tile_y);
                for tile_x in (0..dst_width).step_by(TILE_SIZE) {
                    let tile_w = TILE_SIZE.min(dst_width - tile_x);
                    for y in tile_y..(tile_y + tile_h) {
                        let src_y = (y as f32 + 0.5) * y_scale - 0.5;
                        let src_y_clamped = src_y.max(0.0).min(max_sy);
                        let src_y0 = src_y_clamped as usize;
                        let src_y1 = (src_y0 + 1).min(src_height - 1);
                        let y_frac = (src_y - src_y0 as f32).max(0.0).min(1.0);
                        let vy_frac = vdupq_n_f32(y_frac);
                        let vy_frac_inv = vdupq_n_f32(1.0 - y_frac);

                        let row0_base = src_y0 * src_width;
                        let row1_base = src_y1 * src_width;
                        let dst_row = y * dst_width;

                        // Largest multiple of 4 inside the tile row; the
                        // leftover pixels go through the scalar tail below.
                        let simd_end = tile_x + (tile_w / 4) * 4;
                        let mut x = tile_x;
                        while x < simd_end {
                            let mut sx0 = [0_usize; 4];
                            let mut sx1 = [0_usize; 4];
                            let mut xf = [0.0_f32; 4];

                            for i in 0..4 {
                                let src_x = ((x + i) as f32 + 0.5) * x_scale - 0.5;
                                let src_x_clamped = src_x.max(0.0).min(max_sx);
                                sx0[i] = src_x_clamped as usize;
                                sx1[i] = (sx0[i] + 1).min(src_width - 1);
                                xf[i] = (src_x - sx0[i] as f32).max(0.0).min(1.0);
                            }

                            // Manual gather: NEON has no gather instruction,
                            // so the four neighbors are staged through a
                            // stack array and loaded as one vector.
                            let vp00 = vld1q_f32(
                                [
                                    *src_ptr.add(row0_base + sx0[0]),
                                    *src_ptr.add(row0_base + sx0[1]),
                                    *src_ptr.add(row0_base + sx0[2]),
                                    *src_ptr.add(row0_base + sx0[3]),
                                ]
                                .as_ptr(),
                            );
                            let vp10 = vld1q_f32(
                                [
                                    *src_ptr.add(row0_base + sx1[0]),
                                    *src_ptr.add(row0_base + sx1[1]),
                                    *src_ptr.add(row0_base + sx1[2]),
                                    *src_ptr.add(row0_base + sx1[3]),
                                ]
                                .as_ptr(),
                            );
                            let vp01 = vld1q_f32(
                                [
                                    *src_ptr.add(row1_base + sx0[0]),
                                    *src_ptr.add(row1_base + sx0[1]),
                                    *src_ptr.add(row1_base + sx0[2]),
                                    *src_ptr.add(row1_base + sx0[3]),
                                ]
                                .as_ptr(),
                            );
                            let vp11 = vld1q_f32(
                                [
                                    *src_ptr.add(row1_base + sx1[0]),
                                    *src_ptr.add(row1_base + sx1[1]),
                                    *src_ptr.add(row1_base + sx1[2]),
                                    *src_ptr.add(row1_base + sx1[3]),
                                ]
                                .as_ptr(),
                            );

                            let vxf = vld1q_f32(xf.as_ptr());
                            let vxf_inv = vsubq_f32(vdupq_n_f32(1.0), vxf);

                            // Horizontal lerps on both rows, then the
                            // vertical lerp, all as fused multiply-adds.
                            let top = vfmaq_f32(vmulq_f32(vp00, vxf_inv), vp10, vxf);
                            let bot = vfmaq_f32(vmulq_f32(vp01, vxf_inv), vp11, vxf);
                            let result = vfmaq_f32(vmulq_f32(top, vy_frac_inv), bot, vy_frac);

                            vst1q_f32(dst_ptr.add(dst_row + x), result);
                            x += 4;
                        }

                        // Scalar tail for the last (tile_w % 4) pixels.
                        while x < tile_x + tile_w {
                            let src_x = (x as f32 + 0.5) * x_scale - 0.5;
                            let src_x0 = src_x.max(0.0) as usize;
                            let src_x1 = (src_x0 + 1).min(src_width - 1);
                            let x_frac = (src_x - src_x0 as f32).max(0.0).min(1.0);

                            let p00 = *src_ptr.add(row0_base + src_x0);
                            let p10 = *src_ptr.add(row0_base + src_x1);
                            let p01 = *src_ptr.add(row1_base + src_x0);
                            let p11 = *src_ptr.add(row1_base + src_x1);

                            let p0 = p00 + (p10 - p00) * x_frac;
                            let p1 = p01 + (p11 - p01) * x_frac;
                            *dst_ptr.add(dst_row + x) = p0 + (p1 - p0) * y_frac;
                            x += 1;
                        }
                    }
                }
            }
        }
    }

    /// NEON bicubic (Catmull-Rom) resampling.
    ///
    /// Vectorizes the 4-tap horizontal and vertical reductions of the 4x4
    /// neighborhood per output pixel; edge taps are clamped (border
    /// replication), matching `scalar_impl::bicubic_f32`.
    ///
    /// # Safety
    ///
    /// Caller must guarantee `src.len() == src_width * src_height`,
    /// `dst.len() == dst_width * dst_height`, a source of at least 4x4,
    /// and non-zero destination dimensions.
    #[target_feature(enable = "neon")]
    pub(crate) unsafe fn bicubic_f32(
        src: &[f32],
        src_width: usize,
        src_height: usize,
        dst: &mut [f32],
        dst_width: usize,
        dst_height: usize,
    ) {
        // SAFETY: all row/column tap indices are clamped into
        // [0, ih - 1] / [0, iw - 1] before use, so every pointer add stays
        // within the buffers whose sizes the caller guaranteed.
        unsafe {
            let x_scale = src_width as f32 / dst_width as f32;
            let y_scale = src_height as f32 / dst_height as f32;
            let src_ptr = src.as_ptr();
            let dst_ptr = dst.as_mut_ptr();
            let iw = src_width as isize;
            let ih = src_height as isize;
            const TILE_SIZE: usize = 32;

            for tile_y in (0..dst_height).step_by(TILE_SIZE) {
                let tile_h = TILE_SIZE.min(dst_height - tile_y);
                for tile_x in (0..dst_width).step_by(TILE_SIZE) {
                    let tile_w = TILE_SIZE.min(dst_width - tile_x);
                    for y in tile_y..(tile_y + tile_h) {
                        let src_y = (y as f32 + 0.5) * y_scale - 0.5;
                        let src_y_base = src_y.floor() as isize;
                        let y_frac = (src_y - src_y_base as f32).max(0.0).min(1.0);
                        let y_weights = cubic_kernel(y_frac);
                        let vy_weights = vld1q_f32(y_weights.as_ptr());

                        // Row taps base-1..base+2, clamped to the image.
                        let sy = [
                            (src_y_base - 1).clamp(0, ih - 1) as usize,
                            (src_y_base).clamp(0, ih - 1) as usize,
                            (src_y_base + 1).clamp(0, ih - 1) as usize,
                            (src_y_base + 2).clamp(0, ih - 1) as usize,
                        ];

                        let dst_row = y * dst_width;

                        for x in tile_x..(tile_x + tile_w) {
                            let src_x = (x as f32 + 0.5) * x_scale - 0.5;
                            let src_x_base = src_x.floor() as isize;
                            let x_frac = (src_x - src_x_base as f32).max(0.0).min(1.0);
                            let x_weights = cubic_kernel(x_frac);
                            let vx_weights = vld1q_f32(x_weights.as_ptr());

                            // Column taps base-1..base+2, clamped.
                            let sx = [
                                (src_x_base - 1).clamp(0, iw - 1) as usize,
                                (src_x_base).clamp(0, iw - 1) as usize,
                                (src_x_base + 1).clamp(0, iw - 1) as usize,
                                (src_x_base + 2).clamp(0, iw - 1) as usize,
                            ];

                            // Horizontal reduction of each of the 4 rows:
                            // weighted 4-tap dot product via multiply +
                            // across-lane add.
                            let mut row_sums = [0.0_f32; 4];
                            for ky in 0..4 {
                                let row_off = sy[ky] * src_width;
                                let pixels = vld1q_f32(
                                    [
                                        *src_ptr.add(row_off + sx[0]),
                                        *src_ptr.add(row_off + sx[1]),
                                        *src_ptr.add(row_off + sx[2]),
                                        *src_ptr.add(row_off + sx[3]),
                                    ]
                                    .as_ptr(),
                                );
                                let prod = vmulq_f32(pixels, vx_weights);
                                row_sums[ky] = vaddvq_f32(prod);
                            }

                            // Vertical reduction of the 4 row sums.
                            let vrow_sums = vld1q_f32(row_sums.as_ptr());
                            let vprod = vmulq_f32(vrow_sums, vy_weights);
                            let value = vaddvq_f32(vprod);

                            *dst_ptr.add(dst_row + x) = value;
                        }
                    }
                }
            }
        }
    }
}
472
#[cfg(target_arch = "x86_64")]
/// AVX2 + FMA (x86_64) backends. Dispatched only after runtime feature
/// detection; the scalar path is used otherwise.
mod avx2_impl {
    use std::arch::x86_64::*;

    use super::cubic_kernel;

    /// AVX2 bilinear resampling; 8 destination pixels per SIMD iteration
    /// with a scalar tail for the remainder.
    ///
    /// Mirrors `scalar_impl::bilinear_f32` (same pixel-center mapping and
    /// 64x64 tiling). Results may differ from the scalar path by a few
    /// ULPs because `_mm256_fmadd_ps` fuses the multiply-add.
    ///
    /// # Safety
    ///
    /// Caller must guarantee AVX2 and FMA are available, and that
    /// `src.len() == src_width * src_height`,
    /// `dst.len() == dst_width * dst_height`, with all dimensions non-zero.
    #[target_feature(enable = "avx2", enable = "fma")]
    pub(crate) unsafe fn bilinear_f32(
        src: &[f32],
        src_width: usize,
        src_height: usize,
        dst: &mut [f32],
        dst_width: usize,
        dst_height: usize,
    ) {
        // SAFETY: every gathered index below is clamped into
        // [0, src_width - 1] / [0, src_height - 1], and destination writes
        // never exceed dst_width * dst_height, so all raw pointer
        // arithmetic stays within the caller-guaranteed buffers.
        unsafe {
            let x_scale = src_width as f32 / dst_width as f32;
            let y_scale = src_height as f32 / dst_height as f32;
            let src_ptr = src.as_ptr();
            let dst_ptr = dst.as_mut_ptr();
            const TILE_SIZE: usize = 64;
            let max_sx = (src_width - 1) as f32;
            let max_sy = (src_height - 1) as f32;

            for tile_y in (0..dst_height).step_by(TILE_SIZE) {
                let tile_h = TILE_SIZE.min(dst_height - tile_y);
                for tile_x in (0..dst_width).step_by(TILE_SIZE) {
                    let tile_w = TILE_SIZE.min(dst_width - tile_x);
                    for y in tile_y..(tile_y + tile_h) {
                        let src_y = (y as f32 + 0.5) * y_scale - 0.5;
                        let src_y_clamped = src_y.max(0.0).min(max_sy);
                        let src_y0 = src_y_clamped as usize;
                        let src_y1 = (src_y0 + 1).min(src_height - 1);
                        let y_frac = (src_y - src_y0 as f32).max(0.0).min(1.0);
                        let vy_frac = _mm256_set1_ps(y_frac);
                        let vy_frac_inv = _mm256_set1_ps(1.0 - y_frac);

                        let row0_base = src_y0 * src_width;
                        let row1_base = src_y1 * src_width;
                        let dst_row = y * dst_width;

                        // Largest multiple of 8 inside the tile row; the
                        // leftover pixels go through the scalar tail below.
                        let simd_end = tile_x + (tile_w / 8) * 8;
                        let mut x = tile_x;
                        while x < simd_end {
                            let mut sx0 = [0_usize; 8];
                            let mut sx1 = [0_usize; 8];
                            let mut xf = [0.0_f32; 8];

                            for i in 0..8 {
                                let src_x = ((x + i) as f32 + 0.5) * x_scale - 0.5;
                                let src_x_clamped = src_x.max(0.0).min(max_sx);
                                sx0[i] = src_x_clamped as usize;
                                sx1[i] = (sx0[i] + 1).min(src_width - 1);
                                xf[i] = (src_x - sx0[i] as f32).max(0.0).min(1.0);
                            }

                            // Manual gather via _mm256_set_ps, whose
                            // arguments go from the HIGHEST lane to the
                            // lowest — hence indices [7]..[0].
                            let vp00 = _mm256_set_ps(
                                *src_ptr.add(row0_base + sx0[7]),
                                *src_ptr.add(row0_base + sx0[6]),
                                *src_ptr.add(row0_base + sx0[5]),
                                *src_ptr.add(row0_base + sx0[4]),
                                *src_ptr.add(row0_base + sx0[3]),
                                *src_ptr.add(row0_base + sx0[2]),
                                *src_ptr.add(row0_base + sx0[1]),
                                *src_ptr.add(row0_base + sx0[0]),
                            );
                            let vp10 = _mm256_set_ps(
                                *src_ptr.add(row0_base + sx1[7]),
                                *src_ptr.add(row0_base + sx1[6]),
                                *src_ptr.add(row0_base + sx1[5]),
                                *src_ptr.add(row0_base + sx1[4]),
                                *src_ptr.add(row0_base + sx1[3]),
                                *src_ptr.add(row0_base + sx1[2]),
                                *src_ptr.add(row0_base + sx1[1]),
                                *src_ptr.add(row0_base + sx1[0]),
                            );
                            let vp01 = _mm256_set_ps(
                                *src_ptr.add(row1_base + sx0[7]),
                                *src_ptr.add(row1_base + sx0[6]),
                                *src_ptr.add(row1_base + sx0[5]),
                                *src_ptr.add(row1_base + sx0[4]),
                                *src_ptr.add(row1_base + sx0[3]),
                                *src_ptr.add(row1_base + sx0[2]),
                                *src_ptr.add(row1_base + sx0[1]),
                                *src_ptr.add(row1_base + sx0[0]),
                            );
                            let vp11 = _mm256_set_ps(
                                *src_ptr.add(row1_base + sx1[7]),
                                *src_ptr.add(row1_base + sx1[6]),
                                *src_ptr.add(row1_base + sx1[5]),
                                *src_ptr.add(row1_base + sx1[4]),
                                *src_ptr.add(row1_base + sx1[3]),
                                *src_ptr.add(row1_base + sx1[2]),
                                *src_ptr.add(row1_base + sx1[1]),
                                *src_ptr.add(row1_base + sx1[0]),
                            );

                            let vxf = _mm256_loadu_ps(xf.as_ptr());
                            let vxf_inv = _mm256_sub_ps(_mm256_set1_ps(1.0), vxf);

                            // Horizontal lerps on both rows, then the
                            // vertical lerp, all as fused multiply-adds.
                            let top = _mm256_fmadd_ps(vp10, vxf, _mm256_mul_ps(vp00, vxf_inv));
                            let bot = _mm256_fmadd_ps(vp11, vxf, _mm256_mul_ps(vp01, vxf_inv));
                            let result =
                                _mm256_fmadd_ps(bot, vy_frac, _mm256_mul_ps(top, vy_frac_inv));

                            _mm256_storeu_ps(dst_ptr.add(dst_row + x), result);
                            x += 8;
                        }

                        // Scalar tail for the last (tile_w % 8) pixels.
                        while x < tile_x + tile_w {
                            let src_x = (x as f32 + 0.5) * x_scale - 0.5;
                            let src_x0 = src_x.max(0.0) as usize;
                            let src_x1 = (src_x0 + 1).min(src_width - 1);
                            let x_frac = (src_x - src_x0 as f32).max(0.0).min(1.0);

                            let p00 = *src_ptr.add(row0_base + src_x0);
                            let p10 = *src_ptr.add(row0_base + src_x1);
                            let p01 = *src_ptr.add(row1_base + src_x0);
                            let p11 = *src_ptr.add(row1_base + src_x1);

                            let p0 = p00 + (p10 - p00) * x_frac;
                            let p1 = p01 + (p11 - p01) * x_frac;
                            *dst_ptr.add(dst_row + x) = p0 + (p1 - p0) * y_frac;
                            x += 1;
                        }
                    }
                }
            }
        }
    }

    /// AVX2 bicubic (Catmull-Rom) resampling; 8 destination pixels per
    /// SIMD iteration with a scalar tail for the remainder.
    ///
    /// Edge taps are clamped (border replication), matching
    /// `scalar_impl::bicubic_f32`.
    ///
    /// # Safety
    ///
    /// Caller must guarantee AVX2 and FMA are available, matching buffer
    /// sizes, a source of at least 4x4, and non-zero destination
    /// dimensions.
    #[target_feature(enable = "avx2", enable = "fma")]
    pub(crate) unsafe fn bicubic_f32(
        src: &[f32],
        src_width: usize,
        src_height: usize,
        dst: &mut [f32],
        dst_width: usize,
        dst_height: usize,
    ) {
        // SAFETY: all row/column tap indices are clamped into
        // [0, ih - 1] / [0, iw - 1] before use, so every pointer add stays
        // within the caller-guaranteed buffers.
        unsafe {
            let x_scale = src_width as f32 / dst_width as f32;
            let y_scale = src_height as f32 / dst_height as f32;
            let src_ptr = src.as_ptr();
            let dst_ptr = dst.as_mut_ptr();
            let iw = src_width as isize;
            let ih = src_height as isize;
            const TILE_SIZE: usize = 32;

            for tile_y in (0..dst_height).step_by(TILE_SIZE) {
                let tile_h = TILE_SIZE.min(dst_height - tile_y);
                for tile_x in (0..dst_width).step_by(TILE_SIZE) {
                    let tile_w = TILE_SIZE.min(dst_width - tile_x);
                    for y in tile_y..(tile_y + tile_h) {
                        let src_y = (y as f32 + 0.5) * y_scale - 0.5;
                        let src_y_base = src_y.floor() as isize;
                        let y_frac = (src_y - src_y_base as f32).max(0.0).min(1.0);
                        let y_weights = cubic_kernel(y_frac);

                        // Row taps base-1..base+2, clamped to the image.
                        let sy = [
                            (src_y_base - 1).clamp(0, ih - 1) as usize,
                            (src_y_base).clamp(0, ih - 1) as usize,
                            (src_y_base + 1).clamp(0, ih - 1) as usize,
                            (src_y_base + 2).clamp(0, ih - 1) as usize,
                        ];

                        let dst_row = y * dst_width;

                        let simd_end = tile_x + (tile_w / 8) * 8;
                        let mut x = tile_x;
                        while x < simd_end {
                            // Per-lane column taps and horizontal weights
                            // for the 8 output pixels of this iteration.
                            let mut all_sx = [[0_usize; 4]; 8];
                            let mut all_xw = [[0.0_f32; 4]; 8];

                            for i in 0..8 {
                                let src_x = ((x + i) as f32 + 0.5) * x_scale - 0.5;
                                let src_x_base = src_x.floor() as isize;
                                let x_frac = (src_x - src_x_base as f32).max(0.0).min(1.0);
                                all_xw[i] = cubic_kernel(x_frac);
                                all_sx[i] = [
                                    (src_x_base - 1).clamp(0, iw - 1) as usize,
                                    (src_x_base).clamp(0, iw - 1) as usize,
                                    (src_x_base + 1).clamp(0, iw - 1) as usize,
                                    (src_x_base + 2).clamp(0, iw - 1) as usize,
                                ];
                            }

                            let mut vaccum = _mm256_setzero_ps();

                            for ky in 0..4 {
                                let row_off = sy[ky] * src_width;
                                let vy_w = _mm256_set1_ps(y_weights[ky]);

                                let mut vrow_sum = _mm256_setzero_ps();

                                for kx in 0..4 {
                                    // _mm256_set_ps takes lanes highest
                                    // first — hence indices [7]..[0].
                                    let vp = _mm256_set_ps(
                                        *src_ptr.add(row_off + all_sx[7][kx]),
                                        *src_ptr.add(row_off + all_sx[6][kx]),
                                        *src_ptr.add(row_off + all_sx[5][kx]),
                                        *src_ptr.add(row_off + all_sx[4][kx]),
                                        *src_ptr.add(row_off + all_sx[3][kx]),
                                        *src_ptr.add(row_off + all_sx[2][kx]),
                                        *src_ptr.add(row_off + all_sx[1][kx]),
                                        *src_ptr.add(row_off + all_sx[0][kx]),
                                    );
                                    let vxw = _mm256_set_ps(
                                        all_xw[7][kx],
                                        all_xw[6][kx],
                                        all_xw[5][kx],
                                        all_xw[4][kx],
                                        all_xw[3][kx],
                                        all_xw[2][kx],
                                        all_xw[1][kx],
                                        all_xw[0][kx],
                                    );
                                    vrow_sum = _mm256_fmadd_ps(vp, vxw, vrow_sum);
                                }

                                vaccum = _mm256_fmadd_ps(vrow_sum, vy_w, vaccum);
                            }

                            _mm256_storeu_ps(dst_ptr.add(dst_row + x), vaccum);
                            x += 8;
                        }

                        // Scalar tail for the last (tile_w % 8) pixels.
                        while x < tile_x + tile_w {
                            let src_x = (x as f32 + 0.5) * x_scale - 0.5;
                            let src_x_base = src_x.floor() as isize;
                            let x_frac = (src_x - src_x_base as f32).max(0.0).min(1.0);
                            let x_weights = cubic_kernel(x_frac);
                            let sx = [
                                (src_x_base - 1).clamp(0, iw - 1) as usize,
                                (src_x_base).clamp(0, iw - 1) as usize,
                                (src_x_base + 1).clamp(0, iw - 1) as usize,
                                (src_x_base + 2).clamp(0, iw - 1) as usize,
                            ];
                            let mut value = 0.0_f32;
                            for ky in 0..4 {
                                let row_off = sy[ky] * src_width;
                                let mut row_sum = 0.0_f32;
                                for kx in 0..4 {
                                    row_sum += *src_ptr.add(row_off + sx[kx]) * x_weights[kx];
                                }
                                value += row_sum * y_weights[ky];
                            }
                            *dst_ptr.add(dst_row + x) = value;
                            x += 1;
                        }
                    }
                }
            }
        }
    }
}
762
/// Bilinear resampling of a single-channel `f32` image from
/// `src_width x src_height` into `dst_width x dst_height`.
///
/// Dispatches to the fastest available backend: NEON on aarch64, AVX2+FMA
/// on x86_64 when detected at runtime, and the portable scalar path
/// otherwise. SIMD and scalar backends may differ by small floating-point
/// rounding (fused multiply-add).
///
/// # Errors
///
/// Returns `AlgorithmError::InvalidParameter` when a buffer length does not
/// match its dimensions or when any dimension is zero.
pub fn bilinear_f32(
    src: &[f32],
    src_width: usize,
    src_height: usize,
    dst: &mut [f32],
    dst_width: usize,
    dst_height: usize,
) -> Result<()> {
    validate_bilinear(src, src_width, src_height, dst, dst_width, dst_height)?;

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: NEON is mandatory on aarch64, and buffer sizes /
        // dimensions were validated above.
        unsafe {
            neon_impl::bilinear_f32(src, src_width, src_height, dst, dst_width, dst_height);
        }
    }

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
            // SAFETY: AVX2 and FMA were just detected at runtime, and
            // inputs were validated above.
            unsafe {
                avx2_impl::bilinear_f32(src, src_width, src_height, dst, dst_width, dst_height);
            }
        } else {
            scalar_impl::bilinear_f32(src, src_width, src_height, dst, dst_width, dst_height);
        }
    }

    // Portable fallback for every other architecture.
    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
    {
        scalar_impl::bilinear_f32(src, src_width, src_height, dst, dst_width, dst_height);
    }

    Ok(())
}
825
/// Bicubic (Catmull-Rom) resampling of a single-channel `f32` image from
/// `src_width x src_height` into `dst_width x dst_height`.
///
/// Dispatches to the fastest available backend: NEON on aarch64, AVX2+FMA
/// on x86_64 when detected at runtime, and the portable scalar path
/// otherwise. SIMD and scalar backends may differ by small floating-point
/// rounding (fused multiply-add).
///
/// # Errors
///
/// Returns `AlgorithmError::InvalidParameter` when a buffer length does not
/// match its dimensions, when the source is smaller than 4x4, or when a
/// destination dimension is zero.
pub fn bicubic_f32(
    src: &[f32],
    src_width: usize,
    src_height: usize,
    dst: &mut [f32],
    dst_width: usize,
    dst_height: usize,
) -> Result<()> {
    validate_bicubic(src, src_width, src_height, dst, dst_width, dst_height)?;

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: NEON is mandatory on aarch64, and buffer sizes /
        // dimensions were validated above.
        unsafe {
            neon_impl::bicubic_f32(src, src_width, src_height, dst, dst_width, dst_height);
        }
    }

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
            // SAFETY: AVX2 and FMA were just detected at runtime, and
            // inputs were validated above.
            unsafe {
                avx2_impl::bicubic_f32(src, src_width, src_height, dst, dst_width, dst_height);
            }
        } else {
            scalar_impl::bicubic_f32(src, src_width, src_height, dst, dst_width, dst_height);
        }
    }

    // Portable fallback for every other architecture.
    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
    {
        scalar_impl::bicubic_f32(src, src_width, src_height, dst, dst_width, dst_height);
    }

    Ok(())
}
881
882pub fn nearest_f32(
900 src: &[f32],
901 src_width: usize,
902 src_height: usize,
903 dst: &mut [f32],
904 dst_width: usize,
905 dst_height: usize,
906) -> Result<()> {
907 if src.len() != src_width * src_height {
908 return Err(AlgorithmError::InvalidParameter {
909 parameter: "input",
910 message: "Source buffer size doesn't match dimensions".to_string(),
911 });
912 }
913 if dst.len() != dst_width * dst_height {
914 return Err(AlgorithmError::InvalidParameter {
915 parameter: "input",
916 message: "Destination buffer size doesn't match dimensions".to_string(),
917 });
918 }
919 if src_width == 0 || src_height == 0 || dst_width == 0 || dst_height == 0 {
920 return Err(AlgorithmError::InvalidParameter {
921 parameter: "input",
922 message: "Dimensions must be greater than 0".to_string(),
923 });
924 }
925
926 let x_ratio = src_width as f32 / dst_width as f32;
927 let y_ratio = src_height as f32 / dst_height as f32;
928
929 for y in 0..dst_height {
930 let src_y = ((y as f32 * y_ratio) as usize).min(src_height - 1);
931 for x in 0..dst_width {
932 let src_x = ((x as f32 * x_ratio) as usize).min(src_width - 1);
933 dst[y * dst_width + x] = src[src_y * src_width + src_x];
934 }
935 }
936
937 Ok(())
938}
939
940pub fn downsample_average_f32(
958 src: &[f32],
959 src_width: usize,
960 src_height: usize,
961 dst: &mut [f32],
962 dst_width: usize,
963 dst_height: usize,
964) -> Result<()> {
965 if src.len() != src_width * src_height {
966 return Err(AlgorithmError::InvalidParameter {
967 parameter: "input",
968 message: "Source buffer size doesn't match dimensions".to_string(),
969 });
970 }
971 if dst.len() != dst_width * dst_height {
972 return Err(AlgorithmError::InvalidParameter {
973 parameter: "input",
974 message: "Destination buffer size doesn't match dimensions".to_string(),
975 });
976 }
977 if dst_width > src_width || dst_height > src_height {
978 return Err(AlgorithmError::InvalidParameter {
979 parameter: "input",
980 message: "This method is only for downsampling".to_string(),
981 });
982 }
983
984 let x_ratio = src_width as f32 / dst_width as f32;
985 let y_ratio = src_height as f32 / dst_height as f32;
986
987 for dst_y in 0..dst_height {
988 let src_y_start = (dst_y as f32 * y_ratio) as usize;
989 let src_y_end = (((dst_y + 1) as f32 * y_ratio) as usize).min(src_height);
990
991 for dst_x in 0..dst_width {
992 let src_x_start = (dst_x as f32 * x_ratio) as usize;
993 let src_x_end = (((dst_x + 1) as f32 * x_ratio) as usize).min(src_width);
994
995 let mut sum = 0.0_f32;
996 let mut count = 0;
997
998 for src_y in src_y_start..src_y_end {
999 for src_x in src_x_start..src_x_end {
1000 sum += src[src_y * src_width + src_x];
1001 count += 1;
1002 }
1003 }
1004
1005 dst[dst_y * dst_width + dst_x] = if count > 0 { sum / count as f32 } else { 0.0 };
1006 }
1007 }
1008
1009 Ok(())
1010}
1011
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;

    // Identity resampling (same size in and out) must reproduce the input.
    #[test]
    fn test_bilinear_identity() {
        let src = vec![1.0, 2.0, 3.0, 4.0];
        let mut dst = vec![0.0; 4];
        bilinear_f32(&src, 2, 2, &mut dst, 2, 2)
            .expect("bilinear_f32 identity resampling should succeed in test");
        for i in 0..4 {
            assert_relative_eq!(dst[i], src[i], epsilon = 1e-5);
        }
    }

    // Quadrant structure of the source must survive a 4x4 -> 2x2 shrink.
    #[test]
    fn test_bilinear_downsample() {
        let src = vec![
            1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 4.0, 4.0,
        ];
        let mut dst = vec![0.0; 4];
        bilinear_f32(&src, 4, 4, &mut dst, 2, 2)
            .expect("bilinear_f32 downsampling should succeed in test");
        assert!(dst[0] < dst[1]);
        assert!(dst[2] < dst[3]);
    }

    // Corner pixels must be preserved exactly when upsampling 2x2 -> 4x4.
    #[test]
    fn test_bilinear_upsample() {
        let src = vec![1.0, 2.0, 3.0, 4.0];
        let mut dst = vec![0.0; 16];
        bilinear_f32(&src, 2, 2, &mut dst, 4, 4)
            .expect("bilinear_f32 upsampling should succeed in test");
        assert_relative_eq!(dst[0], 1.0, epsilon = 1e-5);
        assert_relative_eq!(dst[15], 4.0, epsilon = 1e-5);
    }

    // Bicubic identity is only approximate near edges (clamped taps),
    // hence the loose 0.1 epsilon.
    #[test]
    fn test_bicubic_identity() {
        let src = vec![
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ];
        let mut dst = vec![0.0; 16];
        bicubic_f32(&src, 4, 4, &mut dst, 4, 4)
            .expect("bicubic_f32 identity resampling should succeed in test");
        for i in 0..16 {
            assert_relative_eq!(dst[i], src[i], epsilon = 0.1);
        }
    }

    // Nearest-neighbor identity must be exact (pure copies).
    #[test]
    fn test_nearest() {
        let src = vec![1.0, 2.0, 3.0, 4.0];
        let mut dst = vec![0.0; 4];
        nearest_f32(&src, 2, 2, &mut dst, 2, 2)
            .expect("nearest_f32 identity resampling should succeed in test");
        for i in 0..4 {
            assert_relative_eq!(dst[i], src[i]);
        }
    }

    // 2x shrink picks the top-left pixel of each 2x2 quadrant.
    #[test]
    fn test_nearest_downsample() {
        let src = vec![
            1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 4.0, 4.0,
        ];
        let mut dst = vec![0.0; 4];
        nearest_f32(&src, 4, 4, &mut dst, 2, 2)
            .expect("nearest_f32 downsampling should succeed in test");
        assert_relative_eq!(dst[0], 1.0);
        assert_relative_eq!(dst[1], 2.0);
        assert_relative_eq!(dst[2], 3.0);
        assert_relative_eq!(dst[3], 4.0);
    }

    // Box average of each uniform 2x2 quadrant equals its value.
    #[test]
    fn test_downsample_average() {
        let src = vec![
            1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 4.0, 4.0,
        ];
        let mut dst = vec![0.0; 4];
        downsample_average_f32(&src, 4, 4, &mut dst, 2, 2)
            .expect("downsample_average_f32 should succeed in test");
        assert_relative_eq!(dst[0], 1.0);
        assert_relative_eq!(dst[1], 2.0);
        assert_relative_eq!(dst[2], 3.0);
        assert_relative_eq!(dst[3], 4.0);
    }

    // Mismatched buffer sizes must be rejected for both src and dst.
    #[test]
    fn test_invalid_dimensions() {
        let src = vec![1.0; 10];
        let mut dst = vec![0.0; 4];
        assert!(bilinear_f32(&src, 4, 4, &mut dst, 2, 2).is_err());

        let src = vec![1.0; 16];
        assert!(bilinear_f32(&src, 4, 4, &mut dst, 3, 3).is_err());
    }

    // Bicubic requires a source of at least 4x4.
    #[test]
    fn test_bicubic_too_small() {
        let src = vec![1.0; 9];
        let mut dst = vec![0.0; 4];
        assert!(bicubic_f32(&src, 3, 3, &mut dst, 2, 2).is_err());
    }

    // Catmull-Rom weights form a partition of unity.
    #[test]
    fn test_cubic_kernel() {
        let weights = cubic_kernel(0.5);
        let sum: f32 = weights.iter().sum();
        assert_relative_eq!(sum, 1.0, epsilon = 1e-6);
    }

    // A constant image must stay constant through a large 10x shrink.
    #[test]
    fn test_large_downsample() {
        let src = vec![1.0_f32; 1000 * 1000];
        let mut dst = vec![0.0_f32; 100 * 100];
        bilinear_f32(&src, 1000, 1000, &mut dst, 100, 100)
            .expect("bilinear_f32 large downsampling should succeed in test");
        for &val in &dst {
            assert_relative_eq!(val, 1.0);
        }
    }

    // Helper: run the scalar bilinear reference implementation.
    fn scalar_bilinear(src: &[f32], sw: usize, sh: usize, dw: usize, dh: usize) -> Vec<f32> {
        let mut dst = vec![0.0_f32; dw * dh];
        scalar_impl::bilinear_f32(src, sw, sh, &mut dst, dw, dh);
        dst
    }

    // Helper: run the scalar bicubic reference implementation.
    fn scalar_bicubic(src: &[f32], sw: usize, sh: usize, dw: usize, dh: usize) -> Vec<f32> {
        let mut dst = vec![0.0_f32; dw * dh];
        scalar_impl::bicubic_f32(src, sw, sh, &mut dst, dw, dh);
        dst
    }

    // Helper: assert the dispatched (possibly SIMD) bilinear path agrees
    // with the scalar reference within 1e-4 (FMA rounding slack).
    fn assert_bilinear_matches_scalar(
        src: &[f32],
        sw: usize,
        sh: usize,
        dw: usize,
        dh: usize,
        label: &str,
    ) {
        let scalar = scalar_bilinear(src, sw, sh, dw, dh);
        let mut simd_dst = vec![0.0_f32; dw * dh];
        bilinear_f32(src, sw, sh, &mut simd_dst, dw, dh)
            .expect("bilinear_f32 should succeed for SIMD vs scalar comparison");

        for (i, (&s, &d)) in scalar.iter().zip(simd_dst.iter()).enumerate() {
            assert!(
                (s - d).abs() < 1e-4,
                "bilinear mismatch at index {i} for {label}: scalar={s}, simd={d}, diff={}",
                (s - d).abs()
            );
        }
    }

    // Helper: same as above for the bicubic path.
    fn assert_bicubic_matches_scalar(
        src: &[f32],
        sw: usize,
        sh: usize,
        dw: usize,
        dh: usize,
        label: &str,
    ) {
        let scalar = scalar_bicubic(src, sw, sh, dw, dh);
        let mut simd_dst = vec![0.0_f32; dw * dh];
        bicubic_f32(src, sw, sh, &mut simd_dst, dw, dh)
            .expect("bicubic_f32 should succeed for SIMD vs scalar comparison");

        for (i, (&s, &d)) in scalar.iter().zip(simd_dst.iter()).enumerate() {
            assert!(
                (s - d).abs() < 1e-4,
                "bicubic mismatch at index {i} for {label}: scalar={s}, simd={d}, diff={}",
                (s - d).abs()
            );
        }
    }

    // Destination widths below exercise both the full SIMD lanes
    // (multiples of 4/8) and the scalar tail loops (7, 13, 37, ...).
    #[test]
    fn test_bilinear_simd_vs_scalar_exact_4() {
        let src: Vec<f32> = (0..64).map(|i| i as f32 * 0.1).collect();
        assert_bilinear_matches_scalar(&src, 8, 8, 4, 4, "exact_4");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_exact_8() {
        let src: Vec<f32> = (0..256).map(|i| (i as f32).sin()).collect();
        assert_bilinear_matches_scalar(&src, 16, 16, 8, 8, "exact_8");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_non_multiple_7() {
        let src: Vec<f32> = (0..196).map(|i| i as f32 * 0.05).collect();
        assert_bilinear_matches_scalar(&src, 14, 14, 7, 7, "non_multiple_7");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_non_multiple_13() {
        let src: Vec<f32> = (0..400).map(|i| (i as f32 * 0.1).cos()).collect();
        assert_bilinear_matches_scalar(&src, 20, 20, 13, 13, "non_multiple_13");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_large_256() {
        let src: Vec<f32> = (0..256 * 256)
            .map(|i| {
                let x = (i % 256) as f32;
                let y = (i / 256) as f32;
                (x * 0.01).sin() + (y * 0.01).cos()
            })
            .collect();
        assert_bilinear_matches_scalar(&src, 256, 256, 128, 128, "large_256");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_upsample() {
        let src: Vec<f32> = (0..100).map(|i| i as f32).collect();
        assert_bilinear_matches_scalar(&src, 10, 10, 37, 37, "upsample_37");
    }

    #[test]
    fn test_bilinear_simd_vs_scalar_identity() {
        let src: Vec<f32> = (0..64).map(|i| i as f32 * 0.5).collect();
        assert_bilinear_matches_scalar(&src, 8, 8, 8, 8, "identity_8x8");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_exact_4() {
        let src: Vec<f32> = (0..64).map(|i| i as f32 * 0.1).collect();
        assert_bicubic_matches_scalar(&src, 8, 8, 4, 4, "exact_4");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_exact_8() {
        let src: Vec<f32> = (0..256).map(|i| (i as f32).sin()).collect();
        assert_bicubic_matches_scalar(&src, 16, 16, 8, 8, "exact_8");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_non_multiple_7() {
        let src: Vec<f32> = (0..196).map(|i| i as f32 * 0.05).collect();
        assert_bicubic_matches_scalar(&src, 14, 14, 7, 7, "non_multiple_7");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_non_multiple_13() {
        let src: Vec<f32> = (0..400).map(|i| (i as f32 * 0.1).cos()).collect();
        assert_bicubic_matches_scalar(&src, 20, 20, 13, 13, "non_multiple_13");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_large_128() {
        let src: Vec<f32> = (0..128 * 128)
            .map(|i| {
                let x = (i % 128) as f32;
                let y = (i / 128) as f32;
                (x * 0.02).sin() + (y * 0.02).cos()
            })
            .collect();
        assert_bicubic_matches_scalar(&src, 128, 128, 64, 64, "large_128");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_upsample() {
        let src: Vec<f32> = (0..64).map(|i| i as f32).collect();
        assert_bicubic_matches_scalar(&src, 8, 8, 19, 19, "upsample_19");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_identity() {
        let src: Vec<f32> = (0..64).map(|i| i as f32 * 0.5).collect();
        assert_bicubic_matches_scalar(&src, 8, 8, 8, 8, "identity_8x8");
    }

    // Non-square source and destination with different aspect ratios.
    #[test]
    fn test_bilinear_simd_vs_scalar_asymmetric() {
        let src: Vec<f32> = (0..200).map(|i| (i as f32 * 0.1).sin()).collect();
        assert_bilinear_matches_scalar(&src, 20, 10, 7, 15, "asymmetric_20x10_to_7x15");
    }

    #[test]
    fn test_bicubic_simd_vs_scalar_asymmetric() {
        let src: Vec<f32> = (0..128).map(|i| (i as f32 * 0.1).cos()).collect();
        assert_bicubic_matches_scalar(&src, 16, 8, 5, 11, "asymmetric_16x8_to_5x11");
    }

    // Sanity check: resampling a linear gradient produces finite values.
    #[test]
    fn test_bilinear_constant_gradient() {
        let w = 32_usize;
        let h = 32_usize;
        let src: Vec<f32> = (0..w * h)
            .map(|i| {
                let x = (i % w) as f32;
                let y = (i / w) as f32;
                x * 2.0 + y * 3.0
            })
            .collect();
        let dw = 16_usize;
        let dh = 16_usize;
        let mut dst = vec![0.0_f32; dw * dh];
        bilinear_f32(&src, w, h, &mut dst, dw, dh).expect("bilinear gradient test should succeed");

        for dy in 0..dh {
            for dx in 0..dw {
                let val = dst[dy * dw + dx];
                assert!(val.is_finite(), "Non-finite at ({dx},{dy}): {val}");
            }
        }
    }

    // Bicubic on a monotonically increasing ramp should stay increasing
    // across each output row (first <= last).
    #[test]
    fn test_bicubic_monotonicity() {
        let w = 16_usize;
        let h = 4_usize;
        let src: Vec<f32> = (0..w * h).map(|i| i as f32).collect();
        let dw = 8_usize;
        let dh = 4_usize;
        let mut dst = vec![0.0_f32; dw * dh];
        bicubic_f32(&src, w, h, &mut dst, dw, dh)
            .expect("bicubic monotonicity test should succeed");

        for dy in 0..dh {
            let first = dst[dy * dw];
            let last = dst[dy * dw + dw - 1];
            assert!(
                last >= first,
                "Row {dy}: first={first}, last={last} - should increase"
            );
        }
    }
}