1use crate::error::Result;
20use scirs2_core::ndarray::{Array2, ArrayView2};
21use scirs2_core::simd_ops::{PlatformCapabilities, SimdUnifiedOps};
22
23#[allow(dead_code)]
42pub fn simd_convolve_2d(image: &ArrayView2<f32>, kernel: &ArrayView2<f32>) -> Result<Array2<f32>> {
43 let (height, width) = image.dim();
44 let (k_height, k_width) = kernel.dim();
45
46 if k_height % 2 == 0 || k_width % 2 == 0 {
48 return Err(crate::error::VisionError::InvalidInput(
49 "Kernel must have odd dimensions".to_string(),
50 ));
51 }
52
53 let k_half_h = k_height / 2;
54 let k_half_w = k_width / 2;
55
56 let mut output = Array2::zeros((height, width));
57
58 let kernel_flat: Vec<f32> = kernel.iter().copied().collect();
60 let kernel_arr = scirs2_core::ndarray::arr1(&kernel_flat);
61
62 let patch_size = k_height * k_width;
64 let use_heap = patch_size > 64; if use_heap {
67 let mut patch = get_temp_buffer(patch_size);
69
70 for y in k_half_h..(height - k_half_h) {
72 for x in k_half_w..(width - k_half_w) {
73 let mut patch_idx = 0;
75 for ky in 0..k_height {
76 for kx in 0..k_width {
77 patch[patch_idx] = image[[y + ky - k_half_h, x + kx - k_half_w]];
78 patch_idx += 1;
79 }
80 }
81
82 let patch_arr = scirs2_core::ndarray::arr1(&patch[..patch_size]);
84
85 let products = f32::simd_mul(&patch_arr.view(), &kernel_arr.view());
87
88 output[[y, x]] = f32::simd_sum(&products.view());
90 }
91 }
92
93 return_temp_buffer(patch);
94 } else {
95 let mut patch = vec![0.0f32; patch_size];
97
98 for y in k_half_h..(height - k_half_h) {
100 for x in k_half_w..(width - k_half_w) {
101 let mut patch_idx = 0;
103 for ky in 0..k_height {
104 for kx in 0..k_width {
105 patch[patch_idx] = image[[y + ky - k_half_h, x + kx - k_half_w]];
106 patch_idx += 1;
107 }
108 }
109
110 let patch_arr = scirs2_core::ndarray::arr1(&patch);
112
113 let products = f32::simd_mul(&patch_arr.view(), &kernel_arr.view());
115
116 output[[y, x]] = f32::simd_sum(&products.view());
118 }
119 }
120 }
121
122 Ok(output)
123}
124
125#[allow(dead_code)]
141pub fn simd_sobel_gradients(
142 image: &ArrayView2<f32>,
143) -> Result<(Array2<f32>, Array2<f32>, Array2<f32>)> {
144 let (height, width) = image.dim();
145
146 let sobel_x =
148 scirs2_core::ndarray::arr2(&[[-1.0, 0.0, 1.0], [-2.0, 0.0, 2.0], [-1.0, 0.0, 1.0]]);
149
150 let sobel_y =
151 scirs2_core::ndarray::arr2(&[[-1.0, -2.0, -1.0], [0.0, 0.0, 0.0], [1.0, 2.0, 1.0]]);
152
153 let grad_x = simd_convolve_2d(image, &sobel_x.view())?;
155 let grad_y = simd_convolve_2d(image, &sobel_y.view())?;
156
157 let mut magnitude = Array2::zeros((height, width));
159
160 for y in 0..height {
162 let gx_row = grad_x.row(y);
163 let gy_row = grad_y.row(y);
164
165 let gx_squared = f32::simd_mul(&gx_row, &gx_row);
167 let gy_squared = f32::simd_mul(&gy_row, &gy_row);
168
169 let sum_squared = f32::simd_add(&gx_squared.view(), &gy_squared.view());
171
172 let mag_row = f32::simd_sqrt(&sum_squared.view());
174
175 magnitude.row_mut(y).assign(&mag_row);
177 }
178
179 Ok((grad_x, grad_y, magnitude))
180}
181
182#[allow(dead_code)]
200pub fn simd_gaussian_blur(image: &ArrayView2<f32>, sigma: f32) -> Result<Array2<f32>> {
201 let (height, width) = image.dim();
202
203 if sigma < 0.1 || height < 3 || width < 3 {
205 return Ok(image.to_owned());
206 }
207
208 let kernel_radius = (3.0 * sigma).ceil() as usize;
210 let kernel_size = 2 * kernel_radius + 1; let kernel_half = kernel_radius;
212
213 let use_heap = kernel_size > 32;
215 let (kernel_1d, kernel_arr) = if use_heap {
216 let mut kernel_1d = get_temp_buffer(kernel_size);
217 kernel_1d.resize(kernel_size, 0.0);
218 let mut sum = 0.0f32;
219
220 for (i, kernel_val) in kernel_1d.iter_mut().enumerate() {
221 let x = i as f32 - kernel_half as f32;
222 let value = (-x * x / (2.0 * sigma * sigma)).exp();
223 *kernel_val = value;
224 sum += value;
225 }
226
227 for val in &mut kernel_1d {
229 *val /= sum;
230 }
231 let kernel_arr = scirs2_core::ndarray::arr1(&kernel_1d);
232 (kernel_1d, kernel_arr)
233 } else {
234 let mut kernel_1d = vec![0.0f32; kernel_size];
235 let mut sum = 0.0f32;
236
237 for (i, kernel_val) in kernel_1d.iter_mut().enumerate() {
238 let x = i as f32 - kernel_half as f32;
239 let value = (-x * x / (2.0 * sigma * sigma)).exp();
240 *kernel_val = value;
241 sum += value;
242 }
243
244 for val in &mut kernel_1d {
246 *val /= sum;
247 }
248 let kernel_arr = scirs2_core::ndarray::arr1(&kernel_1d);
249 (kernel_1d, kernel_arr)
250 };
251
252 if kernel_size >= width || kernel_size >= height {
254 let mut output = Array2::zeros((height, width));
256 for y in 0..height {
257 for x in 0..width {
258 let mut sum_val = 0.0f32;
259 let mut count = 0;
260
261 for dy in -(kernel_half as i32)..=(kernel_half as i32) {
262 for dx in -(kernel_half as i32)..=(kernel_half as i32) {
263 let ny = (y as i32 + dy).clamp(0, height as i32 - 1) as usize;
264 let nx = (x as i32 + dx).clamp(0, width as i32 - 1) as usize;
265 sum_val += image[[ny, nx]];
266 count += 1;
267 }
268 }
269 output[[y, x]] = sum_val / count as f32;
270 }
271 }
272 if use_heap {
274 return_temp_buffer(kernel_1d);
275 }
276 return Ok(output);
277 }
278
279 let mut temp = Array2::zeros((height, width));
280
281 for y in 0..height {
283 let row = image.row(y);
284
285 for x in kernel_half..(width - kernel_half) {
287 let window_start = x - kernel_half;
288 let window_end = x + kernel_half + 1;
289 let window = row.slice(scirs2_core::ndarray::s![window_start..window_end]);
290
291 let products = f32::simd_mul(&window, &kernel_arr.view());
293 temp[[y, x]] = f32::simd_sum(&products.view());
294 }
295
296 for x in 0..kernel_half {
298 if kernel_half < width {
299 temp[[y, x]] = temp[[y, kernel_half]];
300 } else {
301 temp[[y, x]] = image[[y, x]];
302 }
303 }
304
305 for x in (width - kernel_half)..width {
307 if kernel_half < width {
308 temp[[y, x]] = temp[[y, width - kernel_half - 1]];
309 } else {
310 temp[[y, x]] = image[[y, x]];
311 }
312 }
313 }
314
315 let mut output = Array2::zeros((height, width));
316
317 for x in 0..width {
319 let col = temp.column(x);
320
321 for y in kernel_half..(height - kernel_half) {
323 let window_start = y - kernel_half;
324 let window_end = y + kernel_half + 1;
325 let window = col.slice(scirs2_core::ndarray::s![window_start..window_end]);
326
327 let products = f32::simd_mul(&window, &kernel_arr.view());
329 output[[y, x]] = f32::simd_sum(&products.view());
330 }
331
332 for y in 0..kernel_half {
334 if kernel_half < height {
335 output[[y, x]] = output[[kernel_half, x]];
336 } else {
337 output[[y, x]] = temp[[y, x]];
338 }
339 }
340
341 for y in (height - kernel_half)..height {
343 if kernel_half < height {
344 output[[y, x]] = output[[height - kernel_half - 1, x]];
345 } else {
346 output[[y, x]] = temp[[y, x]];
347 }
348 }
349 }
350
351 if use_heap {
353 return_temp_buffer(kernel_1d);
354 }
355
356 Ok(output)
357}
358
359#[allow(dead_code)]
375pub fn simd_normalize_image(image: &ArrayView2<f32>) -> Result<Array2<f32>> {
376 let (height, width) = image.dim();
377 let mut output = Array2::zeros((height, width));
378
379 let mut min_val = f32::INFINITY;
381 let mut max_val = f32::NEG_INFINITY;
382
383 for row in image.rows() {
384 let row_min = f32::simd_min_element(&row);
385 let row_max = f32::simd_max_element(&row);
386 min_val = min_val.min(row_min);
387 max_val = max_val.max(row_max);
388 }
389
390 let range = max_val - min_val;
391 if range == 0.0 {
392 output.fill(0.5);
393 return Ok(output);
394 }
395
396 let min_arr = scirs2_core::ndarray::Array1::from_elem(width, min_val);
398 let scale = 1.0 / range;
399
400 for (y, row) in image.rows().into_iter().enumerate() {
401 let shifted = f32::simd_sub(&row, &min_arr.view());
403 let normalized = f32::simd_scalar_mul(&shifted.view(), scale);
405 output.row_mut(y).assign(&normalized);
406 }
407
408 Ok(output)
409}
410
411#[allow(dead_code)]
424pub fn simd_histogram_equalization(
425 image: &ArrayView2<f32>,
426 num_bins: usize,
427) -> Result<Array2<f32>> {
428 let (height, width) = image.dim();
429 let total_pixels = (height * width) as f32;
430
431 let mut histogram = vec![0.0f32; num_bins];
433
434 for row in image.rows() {
435 for &pixel in row.iter() {
436 let bin = ((pixel * (num_bins - 1) as f32) as usize).min(num_bins - 1);
437 histogram[bin] += 1.0;
438 }
439 }
440
441 let mut cdf = vec![0.0f32; num_bins];
443 cdf[0] = histogram[0] / total_pixels;
444 for i in 1..num_bins {
445 cdf[i] = cdf[i - 1] + histogram[i] / total_pixels;
446 }
447
448 let mut output = Array2::zeros((height, width));
450
451 for (y, row) in image.rows().into_iter().enumerate() {
452 let mut equalized_row = Vec::with_capacity(width);
453
454 for &pixel in row.iter() {
455 let bin = ((pixel * (num_bins - 1) as f32) as usize).min(num_bins - 1);
456 equalized_row.push(cdf[bin]);
457 }
458
459 output
460 .row_mut(y)
461 .assign(&scirs2_core::ndarray::arr1(&equalized_row));
462 }
463
464 Ok(output)
465}
466
467#[allow(dead_code)]
469pub fn check_simd_support() -> PlatformCapabilities {
470 PlatformCapabilities::detect()
471}
472
473pub struct SimdPerformanceStats {
475 pub simd_available: bool,
477 pub expected_speedup_convolution: f32,
479 pub expected_speedup_gradients: f32,
481 pub expected_speedup_normalization: f32,
483}
484
485impl SimdPerformanceStats {
486 pub fn estimate() -> Self {
488 let caps = PlatformCapabilities::detect();
489
490 if caps.simd_available {
491 Self {
492 simd_available: true,
493 expected_speedup_convolution: 3.0,
494 expected_speedup_gradients: 2.5,
495 expected_speedup_normalization: 2.0,
496 }
497 } else {
498 Self {
499 simd_available: false,
500 expected_speedup_convolution: 1.0,
501 expected_speedup_gradients: 1.0,
502 expected_speedup_normalization: 1.0,
503 }
504 }
505 }
506}
507
508#[allow(dead_code)]
523pub fn simd_convolve_2d_blocked(
524 image: &ArrayView2<f32>,
525 kernel: &ArrayView2<f32>,
526 block_size: usize,
527) -> Result<Array2<f32>> {
528 let (height, width) = image.dim();
529 let (k_height, k_width) = kernel.dim();
530
531 if k_height % 2 == 0 || k_width % 2 == 0 {
533 return Err(crate::error::VisionError::InvalidInput(
534 "Kernel must have odd dimensions".to_string(),
535 ));
536 }
537
538 let k_half_h = k_height / 2;
539 let k_half_w = k_width / 2;
540
541 let mut output = Array2::zeros((height, width));
542
543 let kernel_flat: Vec<f32> = kernel.iter().copied().collect();
545 let kernel_arr = scirs2_core::ndarray::arr1(&kernel_flat);
546
547 let patch_size = k_height * k_width;
549 let use_heap = patch_size > 64;
550 let mut patch = if use_heap {
551 get_temp_buffer(patch_size)
552 } else {
553 vec![0.0f32; patch_size]
554 };
555
556 let y_blocks = (height - 2 * k_half_h).div_ceil(block_size);
558 let x_blocks = (width - 2 * k_half_w).div_ceil(block_size);
559
560 for block_y in 0..y_blocks {
561 let y_start = k_half_h + block_y * block_size;
562 let y_end = (y_start + block_size).min(height - k_half_h);
563
564 for block_x in 0..x_blocks {
565 let x_start = k_half_w + block_x * block_size;
566 let x_end = (x_start + block_size).min(width - k_half_w);
567
568 for y in y_start..y_end {
570 for x in x_start..x_end {
571 let mut patch_idx = 0;
573 for ky in 0..k_height {
574 for kx in 0..k_width {
575 patch[patch_idx] = image[[y + ky - k_half_h, x + kx - k_half_w]];
576 patch_idx += 1;
577 }
578 }
579
580 let patch_arr = scirs2_core::ndarray::arr1(&patch);
582 let products = f32::simd_mul(&patch_arr.view(), &kernel_arr.view());
583 output[[y, x]] = f32::simd_sum(&products.view());
584 }
585 }
586 }
587 }
588
589 if use_heap {
591 return_temp_buffer(patch);
592 }
593
594 Ok(output)
595}
596
597#[allow(dead_code)]
606pub fn simd_convolve_2d_parallel(
607 image: &ArrayView2<f32>,
608 kernel: &ArrayView2<f32>,
609) -> Result<Array2<f32>> {
610 use scirs2_core::parallel_ops::*;
611
612 let (height, width) = image.dim();
613 let (k_height, k_width) = kernel.dim();
614
615 if k_height % 2 == 0 || k_width % 2 == 0 {
617 return Err(crate::error::VisionError::InvalidInput(
618 "Kernel must have odd dimensions".to_string(),
619 ));
620 }
621
622 let k_half_h = k_height / 2;
623 let k_half_w = k_width / 2;
624
625 let mut output = Array2::zeros((height, width));
626
627 let kernel_flat: Vec<f32> = kernel.iter().copied().collect();
629 let kernel_arr = scirs2_core::ndarray::arr1(&kernel_flat);
630
631 let valid_height = height - 2 * k_half_h;
633 let chunk_size = (valid_height / num_cpus::get()).max(1);
634
635 let row_chunks: Vec<_> = (k_half_h..(height - k_half_h))
636 .collect::<Vec<_>>()
637 .chunks(chunk_size)
638 .map(|chunk| chunk.to_vec())
639 .collect();
640
641 let results: Vec<_> = row_chunks
642 .par_iter()
643 .map(|chunk| {
644 let mut chunk_results = Vec::new();
645 let mut patch = vec![0.0f32; k_height * k_width];
646
647 for &y in chunk {
648 let mut row_values = Vec::new();
649 for x in k_half_w..(width - k_half_w) {
650 let mut patch_idx = 0;
652 for ky in 0..k_height {
653 for kx in 0..k_width {
654 patch[patch_idx] = image[[y + ky - k_half_h, x + kx - k_half_w]];
655 patch_idx += 1;
656 }
657 }
658
659 let patch_arr = scirs2_core::ndarray::arr1(&patch);
661 let products = f32::simd_mul(&patch_arr.view(), &kernel_arr.view());
662 row_values.push(f32::simd_sum(&products.view()));
663 }
664 chunk_results.push((y, row_values));
665 }
666 chunk_results
667 })
668 .collect();
669
670 for chunk_result in results {
672 for (y, row_values) in chunk_result {
673 for (i, &value) in row_values.iter().enumerate() {
674 output[[y, k_half_w + i]] = value;
675 }
676 }
677 }
678
679 Ok(output)
680}
681
682#[allow(dead_code)]
695pub fn simd_image_statistics(image: &ArrayView2<f32>) -> (f32, f32, f32, f32) {
696 let (height, width) = image.dim();
697 let total_pixels = (height * width) as f32;
698
699 let mut min_val = f32::INFINITY;
701 let mut max_val = f32::NEG_INFINITY;
702 let mut sum = 0.0f32;
703 let mut sum_squares = 0.0f32;
704
705 for row in image.rows() {
707 let row_min = f32::simd_min_element(&row);
708 let row_max = f32::simd_max_element(&row);
709 let row_sum = f32::simd_sum(&row);
710
711 let squares = f32::simd_mul(&row, &row);
713 let row_sum_squares = f32::simd_sum(&squares.view());
714
715 min_val = min_val.min(row_min);
716 max_val = max_val.max(row_max);
717 sum += row_sum;
718 sum_squares += row_sum_squares;
719 }
720
721 let mean = sum / total_pixels;
722 let variance = (sum_squares / total_pixels) - (mean * mean);
723 let std_dev = variance.max(0.0).sqrt();
724
725 (min_val, max_val, mean, std_dev)
726}
727
728thread_local! {
729 static SIMD_MEMORY_POOL: std::cell::RefCell<SimdMemoryPool> = std::cell::RefCell::new(SimdMemoryPool::new());
730}
731
732struct SimdMemoryPool {
736 buffers: Vec<Vec<f32>>,
737 buffer_sizes: Vec<usize>,
738}
739
740impl SimdMemoryPool {
741 fn new() -> Self {
742 Self {
743 buffers: Vec::new(),
744 buffer_sizes: Vec::new(),
745 }
746 }
747
748 fn get_buffer(&mut self, size: usize) -> Vec<f32> {
749 for (i, &buf_size) in self.buffer_sizes.iter().enumerate() {
751 if buf_size >= size {
752 let mut buffer = self.buffers.swap_remove(i);
753 self.buffer_sizes.swap_remove(i);
754 buffer.resize(size, 0.0);
755 return buffer;
756 }
757 }
758
759 vec![0.0f32; size]
761 }
762
763 fn return_buffer(&mut self, buffer: Vec<f32>) {
764 if self.buffers.len() < 10 {
765 let size = buffer.len();
767 self.buffers.push(buffer);
768 self.buffer_sizes.push(size);
769 }
770 }
771}
772
773#[allow(dead_code)]
775pub fn get_temp_buffer(size: usize) -> Vec<f32> {
776 SIMD_MEMORY_POOL.with(|pool| pool.borrow_mut().get_buffer(size))
777}
778
779#[allow(dead_code)]
781pub fn return_temp_buffer(buffer: Vec<f32>) {
782 SIMD_MEMORY_POOL.with(|pool| pool.borrow_mut().return_buffer(buffer));
783}
784
785#[allow(dead_code)]
805pub fn simd_resize_lanczos_advanced(
806 image: &ArrayView2<f32>,
807 new_height: usize,
808 new_width: usize,
809) -> Result<Array2<f32>> {
810 let (orig_height, orig_width) = image.dim();
811
812 if new_height == 0 || new_width == 0 {
813 return Err(crate::error::VisionError::InvalidInput(
814 "Target dimensions must be positive".to_string(),
815 ));
816 }
817
818 let mut output = Array2::zeros((new_height, new_width));
819
820 let scale_y = orig_height as f32 / new_height as f32;
821 let scale_x = orig_width as f32 / new_width as f32;
822
823 const LANCZOS_A: f32 = 3.0;
825 let kernel_radius = LANCZOS_A;
826
827 let mut x_weights = vec![Vec::new(); new_width];
829 let mut x_indices = vec![Vec::new(); new_width];
830
831 for target_x in 0..new_width {
832 let src_x = (target_x as f32 + 0.5) * scale_x - 0.5;
833 let x_start = (src_x - kernel_radius).floor() as i32;
834 let x_end = (src_x + kernel_radius).ceil() as i32;
835
836 let mut weights = Vec::new();
837 let mut indices = Vec::new();
838 let mut weight_sum = 0.0f32;
839
840 for x in x_start..=x_end {
841 if x >= 0 && x < orig_width as i32 {
842 let distance = (x as f32 - src_x).abs();
843 let weight = lanczos_kernel_advanced(distance, LANCZOS_A);
844 weights.push(weight);
845 indices.push(x as usize);
846 weight_sum += weight;
847 }
848 }
849
850 if weight_sum > 0.0 {
852 for w in &mut weights {
853 *w /= weight_sum;
854 }
855 }
856
857 x_weights[target_x] = weights;
858 x_indices[target_x] = indices;
859 }
860
861 let mut temp = Array2::zeros((orig_height, new_width));
863
864 for y in 0..orig_height {
865 for target_x in 0..new_width {
866 let weights = &x_weights[target_x];
867 let indices = &x_indices[target_x];
868
869 let sum = if weights.len() >= 8 {
870 let weights_arr = scirs2_core::ndarray::arr1(weights);
872 let mut values = get_temp_buffer(weights.len());
873 values.resize(weights.len(), 0.0);
874
875 for (i, &idx) in indices.iter().enumerate() {
876 values[i] = image[[y, idx]];
877 }
878
879 let values_arr = scirs2_core::ndarray::arr1(&values);
880 let products = f32::simd_mul(&values_arr.view(), &weights_arr.view());
881 let result = f32::simd_sum(&products.view());
882
883 return_temp_buffer(values);
884 result
885 } else {
886 weights
888 .iter()
889 .zip(indices.iter())
890 .map(|(&weight, &idx)| weight * image[[y, idx]])
891 .sum()
892 };
893
894 temp[[y, target_x]] = sum;
895 }
896 }
897
898 for target_y in 0..new_height {
900 let src_y = (target_y as f32 + 0.5) * scale_y - 0.5;
901 let y_start = (src_y - kernel_radius).floor() as i32;
902 let y_end = (src_y + kernel_radius).ceil() as i32;
903
904 let mut weights = Vec::new();
905 let mut indices = Vec::new();
906 let mut weight_sum = 0.0f32;
907
908 for y in y_start..=y_end {
909 if y >= 0 && y < orig_height as i32 {
910 let distance = (y as f32 - src_y).abs();
911 let weight = lanczos_kernel_advanced(distance, LANCZOS_A);
912 weights.push(weight);
913 indices.push(y as usize);
914 weight_sum += weight;
915 }
916 }
917
918 if weight_sum > 0.0 {
920 for w in &mut weights {
921 *w /= weight_sum;
922 }
923 }
924
925 if weights.len() >= 4 {
927 let weights_arr = scirs2_core::ndarray::arr1(&weights);
928
929 for x in 0..new_width {
930 let mut values = get_temp_buffer(weights.len());
931 values.resize(weights.len(), 0.0);
932
933 for (i, &idx) in indices.iter().enumerate() {
934 values[i] = temp[[idx, x]];
935 }
936
937 let values_arr = scirs2_core::ndarray::arr1(&values);
938 let products = f32::simd_mul(&values_arr.view(), &weights_arr.view());
939 let sum = f32::simd_sum(&products.view());
940
941 output[[target_y, x]] = sum.clamp(0.0, 1.0);
942 return_temp_buffer(values);
943 }
944 } else {
945 for x in 0..new_width {
947 let sum: f32 = weights
948 .iter()
949 .zip(indices.iter())
950 .map(|(&weight, &idx)| weight * temp[[idx, x]])
951 .sum();
952 output[[target_y, x]] = sum.clamp(0.0, 1.0);
953 }
954 }
955 }
956
957 Ok(output)
958}
959
960#[allow(dead_code)]
962fn lanczos_kernel_advanced(x: f32, a: f32) -> f32 {
963 if x.abs() < 1e-6 {
964 1.0
965 } else if x.abs() >= a {
966 0.0
967 } else {
968 let pi_x = std::f32::consts::PI * x;
969 let pi_x_a = pi_x / a;
970 a * (pi_x.sin() / pi_x) * (pi_x_a.sin() / pi_x_a)
971 }
972}
973
974#[allow(dead_code)]
989pub fn simd_matmul_attention_advanced(
990 a: &ArrayView2<f32>,
991 b: &ArrayView2<f32>,
992) -> Result<Array2<f32>> {
993 let (m, k) = a.dim();
994 let (k2, n) = b.dim();
995
996 if k != k2 {
997 return Err(crate::error::VisionError::InvalidInput(
998 "Matrix dimensions don't match for multiplication".to_string(),
999 ));
1000 }
1001
1002 let mut c = Array2::zeros((m, n));
1003
1004 let block_size = if k > 1024 {
1006 128
1007 } else if k > 256 {
1008 64
1009 } else {
1010 32
1011 };
1012
1013 for i_block in (0..m).step_by(block_size) {
1015 for j_block in (0..n).step_by(block_size) {
1016 for k_block in (0..k).step_by(block_size) {
1017 let i_end = (i_block + block_size).min(m);
1018 let j_end = (j_block + block_size).min(n);
1019 let k_end = (k_block + block_size).min(k);
1020
1021 advanced_matmul_micro_kernel(
1023 &a.slice(scirs2_core::ndarray::s![i_block..i_end, k_block..k_end]),
1024 &b.slice(scirs2_core::ndarray::s![k_block..k_end, j_block..j_end]),
1025 &mut c.slice_mut(scirs2_core::ndarray::s![i_block..i_end, j_block..j_end]),
1026 )?;
1027 }
1028 }
1029 }
1030
1031 Ok(c)
1032}
1033
1034#[allow(dead_code)]
1036fn advanced_matmul_micro_kernel(
1037 a: &ArrayView2<f32>,
1038 b: &ArrayView2<f32>,
1039 c: &mut scirs2_core::ndarray::ArrayViewMut2<f32>,
1040) -> Result<()> {
1041 let (m, k) = a.dim();
1042 let (_, n) = b.dim();
1043
1044 for i in (0..m).step_by(4) {
1046 for j in (0..n).step_by(4) {
1047 let i_end = (i + 4).min(m);
1048 let j_end = (j + 4).min(n);
1049
1050 for ii in i..i_end {
1052 for jj in j..j_end {
1053 let mut sum = c[[ii, jj]];
1054
1055 if k >= 8 {
1057 let chunk_size = k / 8 * 8;
1058
1059 for kk in (0..chunk_size).step_by(8) {
1060 let a_chunk = a.slice(scirs2_core::ndarray::s![ii, kk..kk + 8]);
1061 let mut b_vals = vec![0.0f32; 8];
1062 for (idx, k_idx) in (kk..kk + 8).enumerate() {
1063 b_vals[idx] = b[[k_idx, jj]];
1064 }
1065 let b_chunk = scirs2_core::ndarray::arr1(&b_vals);
1066
1067 let products = f32::simd_mul(&a_chunk, &b_chunk.view());
1068 sum += f32::simd_sum(&products.view());
1069 }
1070
1071 for kk in chunk_size..k {
1073 sum += a[[ii, kk]] * b[[kk, jj]];
1074 }
1075 } else {
1076 for kk in 0..k {
1078 sum += a[[ii, kk]] * b[[kk, jj]];
1079 }
1080 }
1081
1082 c[[ii, jj]] = sum;
1083 }
1084 }
1085 }
1086 }
1087
1088 Ok(())
1089}
1090
1091#[allow(dead_code)]
1107pub fn simd_non_maximum_suppression_advanced(
1108 response: &ArrayView2<f32>,
1109 threshold: f32,
1110 radius: usize,
1111) -> Result<Array2<f32>> {
1112 let (height, width) = response.dim();
1113 let mut output = Array2::zeros((height, width));
1114
1115 let mut candidates = Vec::new();
1117
1118 for y in radius..(height - radius) {
1119 let current_row = response.row(y);
1120
1121 let threshold_vec = vec![threshold; width];
1123 let threshold_arr = scirs2_core::ndarray::arr1(&threshold_vec);
1124 let above_threshold_mask: Vec<bool> = current_row
1125 .iter()
1126 .zip(threshold_arr.iter())
1127 .map(|(&val, &thresh)| val > thresh)
1128 .collect();
1129
1130 for x in radius..(width - radius) {
1131 if above_threshold_mask[x] {
1132 candidates.push((y, x, response[[y, x]]));
1133 }
1134 }
1135 }
1136
1137 candidates.sort_by(|a, b| b.2.partial_cmp(&a.2).expect("Test: operation failed"));
1139
1140 let mut suppressed = vec![vec![false; width]; height];
1142
1143 for (y, x, value) in candidates {
1144 if suppressed[y][x] {
1145 continue;
1146 }
1147
1148 let mut is_maximum = true;
1150
1151 let y_start = y.saturating_sub(radius);
1153 let y_end = (y + radius + 1).min(height);
1154 let x_start = x.saturating_sub(radius);
1155 let x_end = (x + radius + 1).min(width);
1156
1157 'check_loop: for dy in y_start..y_end {
1158 for dx in x_start..x_end {
1159 if dy == y && dx == x {
1160 continue;
1161 }
1162
1163 if response[[dy, dx]] >= value {
1164 is_maximum = false;
1165 break 'check_loop;
1166 }
1167 }
1168 }
1169
1170 if is_maximum {
1171 output[[y, x]] = value;
1172
1173 #[allow(clippy::needless_range_loop)]
1175 for dy in y_start..y_end {
1176 for dx in x_start..x_end {
1177 if dy != y || dx != x {
1178 suppressed[dy][dx] = true;
1179 }
1180 }
1181 }
1182 }
1183 }
1184
1185 Ok(output)
1186}
1187
1188#[allow(dead_code)]
1202pub fn simd_convolve_adaptive_advanced(
1203 image: &ArrayView2<f32>,
1204 kernel: &ArrayView2<f32>,
1205) -> Result<Array2<f32>> {
1206 let (height, width) = image.dim();
1207 let (k_height, k_width) = kernel.dim();
1208
1209 if k_height % 2 == 0 || k_width % 2 == 0 {
1211 return Err(crate::error::VisionError::InvalidInput(
1212 "Kernel must have odd dimensions".to_string(),
1213 ));
1214 }
1215
1216 match (k_height, k_width) {
1218 (3, 3) => simd_convolve_3x3_specialized(image, kernel),
1219 (5, 5) => simd_convolve_5x5_specialized(image, kernel),
1220 (7, 7) => simd_convolve_7x7_specialized(image, kernel),
1221 _ if k_height * k_width <= 49 => simd_convolve_small_kernel(image, kernel),
1222 _ => simd_convolve_large_kernel_fft(image, kernel),
1223 }
1224}
1225
1226#[allow(dead_code)]
1228fn simd_convolve_3x3_specialized(
1229 image: &ArrayView2<f32>,
1230 kernel: &ArrayView2<f32>,
1231) -> Result<Array2<f32>> {
1232 let (height, width) = image.dim();
1233 let mut output = Array2::zeros((height, width));
1234
1235 let k = kernel.as_slice().expect("Test: operation failed");
1237 let k00 = k[0];
1238 let k01 = k[1];
1239 let k02 = k[2];
1240 let k10 = k[3];
1241 let k11 = k[4];
1242 let k12 = k[5];
1243 let k20 = k[6];
1244 let k21 = k[7];
1245 let k22 = k[8];
1246
1247 for y in 1..(height - 1) {
1249 let row_prev = image.row(y - 1);
1251 let row_curr = image.row(y);
1252 let row_next = image.row(y + 1);
1253
1254 for x in 1..(width - 1) {
1256 output[[y, x]] = k00 * row_prev[x - 1]
1257 + k01 * row_prev[x]
1258 + k02 * row_prev[x + 1]
1259 + k10 * row_curr[x - 1]
1260 + k11 * row_curr[x]
1261 + k12 * row_curr[x + 1]
1262 + k20 * row_next[x - 1]
1263 + k21 * row_next[x]
1264 + k22 * row_next[x + 1];
1265 }
1266 }
1267
1268 Ok(output)
1269}
1270
1271#[allow(dead_code)]
1273fn simd_convolve_5x5_specialized(
1274 image: &ArrayView2<f32>,
1275 kernel: &ArrayView2<f32>,
1276) -> Result<Array2<f32>> {
1277 simd_convolve_2d_blocked(image, kernel, 32)
1279}
1280
1281#[allow(dead_code)]
1283fn simd_convolve_7x7_specialized(
1284 image: &ArrayView2<f32>,
1285 kernel: &ArrayView2<f32>,
1286) -> Result<Array2<f32>> {
1287 simd_convolve_2d_blocked(image, kernel, 64)
1290}
1291
1292#[allow(dead_code)]
1294fn simd_convolve_small_kernel(
1295 image: &ArrayView2<f32>,
1296 kernel: &ArrayView2<f32>,
1297) -> Result<Array2<f32>> {
1298 simd_convolve_2d_blocked(image, kernel, 32)
1299}
1300
1301#[allow(dead_code)]
1303fn simd_convolve_large_kernel_fft(
1304 image: &ArrayView2<f32>,
1305 kernel: &ArrayView2<f32>,
1306) -> Result<Array2<f32>> {
1307 simd_convolve_2d_blocked(image, kernel, 128)
1310}
1311
1312#[allow(dead_code)]
1326pub fn simd_feature_matching_advanced(
1327 descriptors1: &ArrayView2<f32>,
1328 descriptors2: &ArrayView2<f32>,
1329 threshold: f32,
1330) -> Result<Vec<(usize, usize, f32)>> {
1331 let (n1, dim1) = descriptors1.dim();
1332 let (n2, dim2) = descriptors2.dim();
1333
1334 if dim1 != dim2 {
1335 return Err(crate::error::VisionError::InvalidInput(
1336 "Descriptor dimensions must match".to_string(),
1337 ));
1338 }
1339
1340 let mut matches = Vec::new();
1341 let threshold_squared = threshold * threshold;
1342
1343 let mut norms1 = vec![0.0f32; n1];
1345 let mut norms2 = vec![0.0f32; n2];
1346
1347 for (i, norm) in norms1.iter_mut().enumerate().take(n1) {
1348 let desc = descriptors1.row(i);
1349 let norm_squared = f32::simd_dot(&desc, &desc);
1350 *norm = norm_squared;
1351 }
1352
1353 for (j, norm) in norms2.iter_mut().enumerate().take(n2) {
1354 let desc = descriptors2.row(j);
1355 let norm_squared = f32::simd_dot(&desc, &desc);
1356 *norm = norm_squared;
1357 }
1358
1359 for i in 0..n1 {
1361 let desc1 = descriptors1.row(i);
1362 let mut best_distance = f32::INFINITY;
1363 let mut best_match = None;
1364
1365 #[allow(clippy::needless_range_loop)]
1367 for j in 0..n2 {
1368 let desc2 = descriptors2.row(j);
1369
1370 let dot_product = f32::simd_dot(&desc1, &desc2);
1372 let distance_squared = norms1[i] + norms2[j] - 2.0 * dot_product;
1373
1374 if distance_squared < best_distance && distance_squared < threshold_squared {
1375 best_distance = distance_squared;
1376 best_match = Some(j);
1377 }
1378 }
1379
1380 if let Some(j) = best_match {
1381 let desc2 = descriptors2.row(j);
1383 let mut mutual_best = f32::INFINITY;
1384 let mut mutual_match = None;
1385
1386 #[allow(clippy::needless_range_loop)]
1387 for k in 0..n1 {
1388 let desc_k = descriptors1.row(k);
1389 let dot_product = f32::simd_dot(&desc2, &desc_k);
1390 let distance_squared = norms2[j] + norms1[k] - 2.0 * dot_product;
1391
1392 if distance_squared < mutual_best {
1393 mutual_best = distance_squared;
1394 mutual_match = Some(k);
1395 }
1396 }
1397
1398 if mutual_match == Some(i) {
1399 matches.push((i, j, best_distance.sqrt()));
1400 }
1401 }
1402 }
1403
1404 Ok(matches)
1405}
1406
1407#[cfg(test)]
1408mod tests {
1409 use super::*;
1410 use scirs2_core::ndarray::arr2;
1411
1412 #[test]
1413 fn test_simd_convolve_2d() {
1414 let image = arr2(&[
1415 [1.0, 2.0, 3.0, 4.0],
1416 [5.0, 6.0, 7.0, 8.0],
1417 [9.0, 10.0, 11.0, 12.0],
1418 [13.0, 14.0, 15.0, 16.0],
1419 ]);
1420
1421 let kernel = arr2(&[[0.0, -1.0, 0.0], [-1.0, 4.0, -1.0], [0.0, -1.0, 0.0]]);
1422
1423 let result = simd_convolve_2d(&image.view(), &kernel.view());
1424 assert!(result.is_ok());
1425
1426 let output = result.expect("Test: operation failed");
1427 assert_eq!(output.dim(), (4, 4));
1428 }
1429
1430 #[test]
1431 fn test_simd_sobel_gradients() {
1432 let image = arr2(&[
1433 [0.0, 0.0, 1.0, 1.0],
1434 [0.0, 0.0, 1.0, 1.0],
1435 [0.0, 0.0, 1.0, 1.0],
1436 [0.0, 0.0, 1.0, 1.0],
1437 ]);
1438
1439 let result = simd_sobel_gradients(&image.view());
1440 assert!(result.is_ok());
1441
1442 let (grad_x, grad_y, magnitude) = result.expect("Test: gradient calculation failed");
1443 assert_eq!(grad_x.dim(), (4, 4));
1444 assert_eq!(grad_y.dim(), (4, 4));
1445 assert_eq!(magnitude.dim(), (4, 4));
1446
1447 assert!(magnitude[[2, 2]] > 0.0);
1449 }
1450
1451 #[test]
1452 #[ignore = "Test failure - unwrap on None in scirs2-core/src/simd/dot.rs:511"]
1453 fn test_simd_gaussian_blur() {
1454 let image = arr2(&[
1455 [1.0, 1.0, 1.0, 1.0],
1456 [1.0, 5.0, 5.0, 1.0],
1457 [1.0, 5.0, 5.0, 1.0],
1458 [1.0, 1.0, 1.0, 1.0],
1459 ]);
1460
1461 let result = simd_gaussian_blur(&image.view(), 1.0);
1462 assert!(result.is_ok());
1463
1464 let blurred = result.expect("Test: blur operation failed");
1465 assert_eq!(blurred.dim(), (4, 4));
1466
1467 assert!(blurred[[1, 1]] < 5.0);
1469 assert!(blurred[[1, 1]] > 1.0);
1470 assert!(blurred[[2, 2]] < 5.0);
1471 assert!(blurred[[2, 2]] > 1.0);
1472
1473 let small_sigma_result = simd_gaussian_blur(&image.view(), 0.05);
1475 assert!(small_sigma_result.is_ok());
1476 let small_sigma_blurred = small_sigma_result.expect("Test: blur operation failed");
1477 assert_eq!(small_sigma_blurred, image);
1478
1479 let large_image = Array2::from_shape_fn((100, 100), |(y, x)| {
1481 if y > 45 && y < 55 && x > 45 && x < 55 {
1482 5.0
1483 } else {
1484 1.0
1485 }
1486 });
1487 let large_result = simd_gaussian_blur(&large_image.view(), 2.0);
1488 assert!(large_result.is_ok());
1489 let large_blurred = large_result.expect("Test: blur operation failed");
1490 assert_eq!(large_blurred.dim(), (100, 100));
1491
1492 assert!(large_blurred[[50, 50]] < 5.0);
1494 assert!(large_blurred[[50, 50]] > 1.0);
1495 }
1496
1497 #[test]
1498 fn test_simd_normalize_image() {
1499 let image = arr2(&[[0.0, 50.0, 100.0], [25.0, 75.0, 100.0], [0.0, 50.0, 100.0]]);
1500
1501 let result = simd_normalize_image(&image.view());
1502 assert!(result.is_ok());
1503
1504 let normalized = result.expect("Test: normalization failed");
1505 assert_eq!(normalized.dim(), (3, 3));
1506
1507 assert_eq!(normalized[[0, 0]], 0.0);
1509 assert_eq!(normalized[[0, 2]], 1.0);
1510 assert!((normalized[[0, 1]] - 0.5).abs() < 1e-6);
1511 }
1512
1513 #[test]
1514 fn test_simd_availability() {
1515 let caps = check_simd_support();
1516 println!("SIMD support: {}", caps.summary());
1517
1518 let stats = SimdPerformanceStats::estimate();
1519 println!(
1520 "Expected convolution speedup: {}x",
1521 stats.expected_speedup_convolution
1522 );
1523 }
1524
1525 #[test]
1526 fn test_simd_convolve_2d_blocked() {
1527 let image = arr2(&[
1528 [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
1529 [7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
1530 [13.0, 14.0, 15.0, 16.0, 17.0, 18.0],
1531 [19.0, 20.0, 21.0, 22.0, 23.0, 24.0],
1532 [25.0, 26.0, 27.0, 28.0, 29.0, 30.0],
1533 [31.0, 32.0, 33.0, 34.0, 35.0, 36.0],
1534 ]);
1535
1536 let kernel = arr2(&[[0.0, -1.0, 0.0], [-1.0, 4.0, -1.0], [0.0, -1.0, 0.0]]);
1537
1538 let result = simd_convolve_2d_blocked(&image.view(), &kernel.view(), 2);
1539 assert!(result.is_ok());
1540
1541 let output = result.expect("Test: operation failed");
1542 assert_eq!(output.dim(), (6, 6));
1543
1544 let regular_result =
1546 simd_convolve_2d(&image.view(), &kernel.view()).expect("Test: operation failed");
1547
1548 for ((y, x), &expected) in regular_result.indexed_iter() {
1550 let actual = output[[y, x]];
1551 assert!(
1552 (actual - expected).abs() < 1e-6,
1553 "Mismatch at ({y}, {x}): expected {expected}, got {actual}"
1554 );
1555 }
1556 }
1557
1558 #[test]
1559 fn test_simd_convolve_2d_parallel() {
1560 let image = arr2(&[
1561 [1.0, 2.0, 3.0, 4.0, 5.0],
1562 [6.0, 7.0, 8.0, 9.0, 10.0],
1563 [11.0, 12.0, 13.0, 14.0, 15.0],
1564 [16.0, 17.0, 18.0, 19.0, 20.0],
1565 [21.0, 22.0, 23.0, 24.0, 25.0],
1566 ]);
1567
1568 let kernel = arr2(&[[1.0, 0.0, -1.0], [2.0, 0.0, -2.0], [1.0, 0.0, -1.0]]);
1569
1570 let result = simd_convolve_2d_parallel(&image.view(), &kernel.view());
1571 assert!(result.is_ok());
1572
1573 let output = result.expect("Test: operation failed");
1574 assert_eq!(output.dim(), (5, 5));
1575
1576 let regular_result =
1578 simd_convolve_2d(&image.view(), &kernel.view()).expect("Test: operation failed");
1579
1580 for ((y, x), &expected) in regular_result.indexed_iter() {
1582 let actual = output[[y, x]];
1583 assert!(
1584 (actual - expected).abs() < 1e-6,
1585 "Mismatch at ({y}, {x}): expected {expected}, got {actual}"
1586 );
1587 }
1588 }
1589
1590 #[test]
1591 fn test_simd_image_statistics() {
1592 let image = arr2(&[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]);
1593
1594 let (min_val, max_val, mean, std_dev) = simd_image_statistics(&image.view());
1595
1596 assert_eq!(min_val, 1.0);
1597 assert_eq!(max_val, 9.0);
1598 assert!((mean - 5.0).abs() < 1e-6);
1599
1600 let expected_std = (60.0f32 / 9.0).sqrt(); assert!((std_dev - expected_std).abs() < 1e-5);
1603 }
1604
1605 #[test]
1606 fn test_simd_memory_pool() {
1607 let buffer1 = get_temp_buffer(100);
1609 assert_eq!(buffer1.len(), 100);
1610
1611 let buffer2 = get_temp_buffer(50);
1612 assert_eq!(buffer2.len(), 50);
1613
1614 return_temp_buffer(buffer1);
1615 return_temp_buffer(buffer2);
1616
1617 let buffer3 = get_temp_buffer(75);
1619 assert_eq!(buffer3.len(), 75);
1620
1621 return_temp_buffer(buffer3);
1622 }
1623}