Skip to main content

oxigdal_algorithms/simd/
mod.rs

1//! SIMD (Single Instruction Multiple Data) optimizations for performance-critical operations
2//!
3//! This module provides SIMD-accelerated implementations of raster operations, statistics,
4//! resampling, and mathematical functions. It uses architecture-specific intrinsics
5//! (`std::arch`) with runtime CPU feature detection and graceful scalar fallback.
6//!
7//! # Architecture Support
8//!
9//! - **aarch64**: NEON (128-bit, baseline on all aarch64 CPUs)
10//! - **x86-64**: SSE2 (baseline), AVX2 (runtime detected), AVX-512 (runtime detected)
11//! - **Other**: Scalar fallback with auto-vectorization hints
12//!
13//! # Runtime Detection
14//!
15//! On x86-64, the module uses `is_x86_feature_detected!` to select the best available
16//! instruction set at runtime. On aarch64, NEON is always available as part of the
17//! base architecture. See [`detect`] module for details.
18//!
19//! # Performance
20//!
21//! Expected speedups over scalar implementations:
22//!
23//! - Raster operations: 2-8x (element-wise ops, min/max, thresholds)
24//! - Statistics: 4-8x (sum, mean, histograms)
25//! - Resampling: 2-4x (bilinear, bicubic interpolation)
26//! - Math functions: 3-6x (sqrt, log, exp, trig)
27//!
28//! # Usage
29//!
30//! Enable SIMD optimizations with the `simd` feature (enabled by default):
31//!
32//! ```toml
33//! [dependencies]
34//! oxigdal-algorithms = { version = "0.1", features = ["simd"] }
35//! ```
36//!
37//! # Example
38//!
39//! ```rust
40//! use oxigdal_algorithms::simd::raster::add_f32;
41//!
42//! let a = vec![1.0_f32; 1000];
43//! let b = vec![2.0_f32; 1000];
44//! let mut result = vec![0.0_f32; 1000];
45//!
46//! // Automatically uses best available SIMD instruction set
47//! add_f32(&a, &b, &mut result);
48//!
49//! assert_eq!(result[0], 3.0);
50//! ```
51//!
52//! # Safety
53//!
54//! All public SIMD operations are safe. Unsafe intrinsics are encapsulated internally
55//! with proper bounds checking, alignment handling, and error propagation.
56//! Pure Rust implementation (no C/Fortran dependencies).
57
58#![allow(clippy::module_name_repetitions)]
59#![allow(unsafe_code)]
60
61pub mod colorspace;
62pub mod cost_distance_simd;
63pub mod filters;
64pub mod focal_simd;
65pub mod histogram;
66pub mod hydrology_simd;
67pub mod math;
68pub mod morphology;
69pub mod projection;
70pub mod raster;
71pub mod resampling;
72pub mod statistics;
73pub mod terrain_simd;
74pub mod texture_simd;
75pub mod threshold;
76
77/// Runtime SIMD capability detection
78///
79/// This module provides runtime detection of available SIMD instruction sets.
80/// On x86-64, it uses `is_x86_feature_detected!` to probe CPU features.
81/// On aarch64, NEON is always available.
82pub mod detect {
83    use std::sync::OnceLock;
84
85    /// Available SIMD instruction set levels
86    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
87    pub enum SimdLevel {
88        /// No SIMD - pure scalar operations
89        Scalar,
90        /// 128-bit SIMD (SSE2 on x86-64, NEON on aarch64)
91        Simd128,
92        /// 256-bit SIMD (AVX2 on x86-64)
93        Simd256,
94        /// 512-bit SIMD (AVX-512 on x86-64)
95        Simd512,
96    }
97
98    impl SimdLevel {
99        /// Get the lane width for f32 operations at this SIMD level
100        #[must_use]
101        pub const fn lanes_f32(self) -> usize {
102            match self {
103                Self::Scalar => 1,
104                Self::Simd128 => 4,  // 128 / 32
105                Self::Simd256 => 8,  // 256 / 32
106                Self::Simd512 => 16, // 512 / 32
107            }
108        }
109
110        /// Get the lane width for f64 operations at this SIMD level
111        #[must_use]
112        pub const fn lanes_f64(self) -> usize {
113            match self {
114                Self::Scalar => 1,
115                Self::Simd128 => 2, // 128 / 64
116                Self::Simd256 => 4, // 256 / 64
117                Self::Simd512 => 8, // 512 / 64
118            }
119        }
120
121        /// Get the lane width for u8 operations at this SIMD level
122        #[must_use]
123        pub const fn lanes_u8(self) -> usize {
124            match self {
125                Self::Scalar => 1,
126                Self::Simd128 => 16, // 128 / 8
127                Self::Simd256 => 32, // 256 / 8
128                Self::Simd512 => 64, // 512 / 8
129            }
130        }
131
132        /// Get the preferred memory alignment in bytes
133        #[must_use]
134        pub const fn preferred_alignment(self) -> usize {
135            match self {
136                Self::Scalar => 8,
137                Self::Simd128 => 16,
138                Self::Simd256 => 32,
139                Self::Simd512 => 64,
140            }
141        }
142
143        /// Get the register width in bits
144        #[must_use]
145        pub const fn width_bits(self) -> usize {
146            match self {
147                Self::Scalar => 64,
148                Self::Simd128 => 128,
149                Self::Simd256 => 256,
150                Self::Simd512 => 512,
151            }
152        }
153    }
154
155    impl core::fmt::Display for SimdLevel {
156        fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
157            match self {
158                Self::Scalar => write!(f, "Scalar"),
159                Self::Simd128 => {
160                    #[cfg(target_arch = "x86_64")]
161                    {
162                        write!(f, "SSE2 (128-bit)")
163                    }
164                    #[cfg(target_arch = "aarch64")]
165                    {
166                        write!(f, "NEON (128-bit)")
167                    }
168                    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
169                    {
170                        write!(f, "SIMD128")
171                    }
172                }
173                Self::Simd256 => write!(f, "AVX2 (256-bit)"),
174                Self::Simd512 => write!(f, "AVX-512 (512-bit)"),
175            }
176        }
177    }
178
179    /// Runtime SIMD capabilities for the current CPU
180    #[derive(Debug, Clone)]
181    pub struct SimdCapabilities {
182        /// Highest available SIMD level
183        pub max_level: SimdLevel,
184        /// Whether SSE2 is available (x86-64 only, always true on x86-64)
185        pub has_sse2: bool,
186        /// Whether SSE4.1 is available (x86-64 only)
187        pub has_sse41: bool,
188        /// Whether AVX2 is available (x86-64 only)
189        pub has_avx2: bool,
190        /// Whether FMA is available (x86-64 only)
191        pub has_fma: bool,
192        /// Whether AVX-512F is available (x86-64 only)
193        pub has_avx512f: bool,
194        /// Whether NEON is available (aarch64 only, always true on aarch64)
195        pub has_neon: bool,
196    }
197
198    impl SimdCapabilities {
199        /// Detect SIMD capabilities of the current CPU at runtime
200        #[must_use]
201        pub fn detect() -> Self {
202            #[cfg(target_arch = "x86_64")]
203            {
204                let has_sse2 = is_x86_feature_detected!("sse2");
205                let has_sse41 = is_x86_feature_detected!("sse4.1");
206                let has_avx2 = is_x86_feature_detected!("avx2");
207                let has_fma = is_x86_feature_detected!("fma");
208                let has_avx512f = is_x86_feature_detected!("avx512f");
209
210                let max_level = if has_avx512f {
211                    SimdLevel::Simd512
212                } else if has_avx2 {
213                    SimdLevel::Simd256
214                } else if has_sse2 {
215                    SimdLevel::Simd128
216                } else {
217                    SimdLevel::Scalar
218                };
219
220                Self {
221                    max_level,
222                    has_sse2,
223                    has_sse41,
224                    has_avx2,
225                    has_fma,
226                    has_avx512f,
227                    has_neon: false,
228                }
229            }
230
231            #[cfg(target_arch = "aarch64")]
232            {
233                // NEON is always available on aarch64
234                Self {
235                    max_level: SimdLevel::Simd128,
236                    has_sse2: false,
237                    has_sse41: false,
238                    has_avx2: false,
239                    has_fma: false,
240                    has_avx512f: false,
241                    has_neon: true,
242                }
243            }
244
245            #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
246            {
247                Self {
248                    max_level: SimdLevel::Scalar,
249                    has_sse2: false,
250                    has_sse41: false,
251                    has_avx2: false,
252                    has_fma: false,
253                    has_avx512f: false,
254                    has_neon: false,
255                }
256            }
257        }
258    }
259
260    /// Get the cached SIMD capabilities for the current CPU
261    ///
262    /// This is initialized once on first call and cached for subsequent calls.
263    #[must_use]
264    pub fn capabilities() -> &'static SimdCapabilities {
265        static CAPS: OnceLock<SimdCapabilities> = OnceLock::new();
266        CAPS.get_or_init(SimdCapabilities::detect)
267    }
268
269    /// Get the best available SIMD level for the current CPU
270    #[must_use]
271    pub fn best_level() -> SimdLevel {
272        capabilities().max_level
273    }
274
275    /// Check if a specific SIMD level is available
276    #[must_use]
277    pub fn is_available(level: SimdLevel) -> bool {
278        best_level() >= level
279    }
280}
281
282/// Platform-specific SIMD configuration (compile-time constants)
283pub mod platform {
284    /// Check if AVX2 is available at compile time
285    #[cfg(target_feature = "avx2")]
286    pub const HAS_AVX2: bool = true;
287
288    /// Check if AVX2 is available at compile time
289    #[cfg(not(target_feature = "avx2"))]
290    pub const HAS_AVX2: bool = false;
291
292    /// Check if AVX-512 is available at compile time
293    #[cfg(target_feature = "avx512f")]
294    pub const HAS_AVX512: bool = true;
295
296    /// Check if AVX-512 is available at compile time
297    #[cfg(not(target_feature = "avx512f"))]
298    pub const HAS_AVX512: bool = false;
299
300    /// Check if NEON is available at compile time
301    #[cfg(target_feature = "neon")]
302    pub const HAS_NEON: bool = true;
303
304    /// Check if NEON is available at compile time
305    #[cfg(not(target_feature = "neon"))]
306    pub const HAS_NEON: bool = false;
307
308    /// Get the SIMD lane width for f32 operations
309    #[must_use]
310    pub const fn lane_width_f32() -> usize {
311        #[cfg(target_feature = "avx512f")]
312        {
313            16
314        }
315        #[cfg(all(target_feature = "avx2", not(target_feature = "avx512f")))]
316        {
317            8
318        }
319        #[cfg(all(
320            target_feature = "neon",
321            not(target_feature = "avx2"),
322            not(target_feature = "avx512f")
323        ))]
324        {
325            4
326        }
327        #[cfg(not(any(
328            target_feature = "avx512f",
329            target_feature = "avx2",
330            target_feature = "neon"
331        )))]
332        {
333            4
334        }
335    }
336
337    /// Get the SIMD lane width for f64 operations
338    #[must_use]
339    pub const fn lane_width_f64() -> usize {
340        #[cfg(target_feature = "avx512f")]
341        {
342            8
343        }
344        #[cfg(all(target_feature = "avx2", not(target_feature = "avx512f")))]
345        {
346            4
347        }
348        #[cfg(not(any(target_feature = "avx512f", target_feature = "avx2")))]
349        {
350            2
351        }
352    }
353
354    /// Get the SIMD lane width for u8 operations
355    #[must_use]
356    pub const fn lane_width_u8() -> usize {
357        #[cfg(target_feature = "avx512f")]
358        {
359            64
360        }
361        #[cfg(all(target_feature = "avx2", not(target_feature = "avx512f")))]
362        {
363            32
364        }
365        #[cfg(not(any(target_feature = "avx512f", target_feature = "avx2")))]
366        {
367            16
368        }
369    }
370
371    /// Get the preferred alignment for SIMD operations
372    #[must_use]
373    pub const fn preferred_alignment() -> usize {
374        #[cfg(target_feature = "avx512f")]
375        {
376            64
377        }
378        #[cfg(all(target_feature = "avx2", not(target_feature = "avx512f")))]
379        {
380            32
381        }
382        #[cfg(not(any(target_feature = "avx512f", target_feature = "avx2")))]
383        {
384            16
385        }
386    }
387}
388
389/// Utilities for working with SIMD operations
390pub mod util {
391    use std::alloc::Layout;
392
393    /// Calculate the number of SIMD chunks for a given length and lane width
394    #[must_use]
395    pub const fn chunks(len: usize, lane_width: usize) -> usize {
396        len / lane_width
397    }
398
399    /// Calculate the remainder after SIMD chunks
400    #[must_use]
401    pub const fn remainder(len: usize, lane_width: usize) -> usize {
402        len % lane_width
403    }
404
405    /// Check if a pointer is aligned to the given boundary
406    #[must_use]
407    pub fn is_aligned<T>(ptr: *const T, align: usize) -> bool {
408        (ptr as usize) % align == 0
409    }
410
411    /// Round up to the nearest multiple of alignment
412    #[must_use]
413    pub const fn align_up(value: usize, align: usize) -> usize {
414        (value + align - 1) & !(align - 1)
415    }
416
417    /// Round down to the nearest multiple of alignment
418    #[must_use]
419    pub const fn align_down(value: usize, align: usize) -> usize {
420        value & !(align - 1)
421    }
422
423    /// Allocate a SIMD-aligned `Vec<f32>`
424    ///
425    /// Returns a Vec whose data pointer is aligned to the specified boundary.
426    /// This ensures optimal SIMD load/store performance.
427    ///
428    /// # Errors
429    ///
430    /// Returns None if allocation fails.
431    pub fn aligned_vec_f32(len: usize, align: usize) -> Option<Vec<f32>> {
432        if len == 0 {
433            return Some(Vec::new());
434        }
435        let layout = Layout::from_size_align(len * std::mem::size_of::<f32>(), align).ok()?;
436        // SAFETY: layout is valid (checked by from_size_align), and we properly construct a Vec
437        unsafe {
438            let ptr = std::alloc::alloc_zeroed(layout);
439            if ptr.is_null() {
440                return None;
441            }
442            let slice = std::slice::from_raw_parts_mut(ptr.cast::<f32>(), len);
443            Some(Vec::from_raw_parts(slice.as_mut_ptr(), len, len))
444        }
445    }
446
447    /// Allocate a SIMD-aligned `Vec<f64>`
448    ///
449    /// Returns a Vec whose data pointer is aligned to the specified boundary.
450    ///
451    /// # Errors
452    ///
453    /// Returns None if allocation fails.
454    pub fn aligned_vec_f64(len: usize, align: usize) -> Option<Vec<f64>> {
455        if len == 0 {
456            return Some(Vec::new());
457        }
458        let layout = Layout::from_size_align(len * std::mem::size_of::<f64>(), align).ok()?;
459        // SAFETY: layout is valid (checked by from_size_align), and we properly construct a Vec
460        unsafe {
461            let ptr = std::alloc::alloc_zeroed(layout);
462            if ptr.is_null() {
463                return None;
464            }
465            let slice = std::slice::from_raw_parts_mut(ptr.cast::<f64>(), len);
466            Some(Vec::from_raw_parts(slice.as_mut_ptr(), len, len))
467        }
468    }
469
470    /// Process a slice in SIMD-width chunks, calling a function on each chunk
471    /// and handling the scalar remainder.
472    ///
473    /// Returns the index where the remainder begins.
474    #[inline]
475    pub fn process_chunks(len: usize, lanes: usize) -> (usize, usize) {
476        let n_chunks = len / lanes;
477        let remainder_start = n_chunks * lanes;
478        (n_chunks, remainder_start)
479    }
480}
481
482/// Internal macros for reducing SIMD boilerplate
483///
484/// These macros generate architecture-dispatched function bodies for common patterns.
485#[macro_export]
486#[doc(hidden)]
487macro_rules! simd_validate_binary {
488    ($a:expr, $b:expr, $out:expr) => {
489        if $a.len() != $b.len() || $a.len() != $out.len() {
490            return Err($crate::error::AlgorithmError::InvalidParameter {
491                parameter: "input",
492                message: format!(
493                    "Slice length mismatch: a={}, b={}, out={}",
494                    $a.len(),
495                    $b.len(),
496                    $out.len()
497                ),
498            });
499        }
500    };
501}
502
503/// Validate that two slices have the same length (unary operation)
504#[macro_export]
505#[doc(hidden)]
506macro_rules! simd_validate_unary {
507    ($data:expr, $out:expr) => {
508        if $data.len() != $out.len() {
509            return Err($crate::error::AlgorithmError::InvalidParameter {
510                parameter: "input",
511                message: format!(
512                    "Slice length mismatch: data={}, out={}",
513                    $data.len(),
514                    $out.len()
515                ),
516            });
517        }
518    };
519}
520
521#[cfg(test)]
522mod tests {
523    use super::*;
524
525    #[test]
526    fn test_platform_detection() {
527        let _avx2 = platform::HAS_AVX2;
528        let _avx512 = platform::HAS_AVX512;
529        let _neon = platform::HAS_NEON;
530
531        assert!(platform::lane_width_f32() >= 2);
532        assert!(platform::lane_width_f64() >= 2);
533        assert!(platform::lane_width_u8() >= 8);
534
535        let align = platform::preferred_alignment();
536        assert!(align.is_power_of_two());
537        assert!(align >= 16);
538    }
539
540    #[test]
541    fn test_runtime_detection() {
542        let caps = detect::SimdCapabilities::detect();
543
544        // On aarch64, NEON should be available
545        #[cfg(target_arch = "aarch64")]
546        {
547            assert!(caps.has_neon);
548            assert_eq!(caps.max_level, detect::SimdLevel::Simd128);
549        }
550
551        // On x86-64, at least SSE2 should be available
552        #[cfg(target_arch = "x86_64")]
553        {
554            assert!(caps.has_sse2);
555            assert!(caps.max_level >= detect::SimdLevel::Simd128);
556        }
557
558        // max_level should be at least Scalar on any platform
559        assert!(caps.max_level >= detect::SimdLevel::Scalar);
560    }
561
562    #[test]
563    fn test_cached_capabilities() {
564        let caps1 = detect::capabilities();
565        let caps2 = detect::capabilities();
566
567        // Should return the same cached instance
568        assert_eq!(caps1.max_level, caps2.max_level);
569        assert_eq!(caps1.has_neon, caps2.has_neon);
570    }
571
572    #[test]
573    fn test_simd_level_properties() {
574        assert_eq!(detect::SimdLevel::Scalar.lanes_f32(), 1);
575        assert_eq!(detect::SimdLevel::Simd128.lanes_f32(), 4);
576        assert_eq!(detect::SimdLevel::Simd256.lanes_f32(), 8);
577        assert_eq!(detect::SimdLevel::Simd512.lanes_f32(), 16);
578
579        assert_eq!(detect::SimdLevel::Simd128.lanes_f64(), 2);
580        assert_eq!(detect::SimdLevel::Simd256.lanes_f64(), 4);
581
582        assert_eq!(detect::SimdLevel::Simd128.lanes_u8(), 16);
583        assert_eq!(detect::SimdLevel::Simd256.lanes_u8(), 32);
584
585        assert_eq!(detect::SimdLevel::Simd128.preferred_alignment(), 16);
586        assert_eq!(detect::SimdLevel::Simd256.preferred_alignment(), 32);
587        assert_eq!(detect::SimdLevel::Simd512.preferred_alignment(), 64);
588    }
589
590    #[test]
591    fn test_simd_level_display() {
592        let level = detect::best_level();
593        let display = format!("{level}");
594        assert!(!display.is_empty());
595    }
596
597    #[test]
598    fn test_simd_level_ordering() {
599        assert!(detect::SimdLevel::Scalar < detect::SimdLevel::Simd128);
600        assert!(detect::SimdLevel::Simd128 < detect::SimdLevel::Simd256);
601        assert!(detect::SimdLevel::Simd256 < detect::SimdLevel::Simd512);
602    }
603
604    #[test]
605    fn test_is_available() {
606        // Scalar should always be available
607        assert!(detect::is_available(detect::SimdLevel::Scalar));
608
609        // The best level should be available
610        let best = detect::best_level();
611        assert!(detect::is_available(best));
612    }
613
614    #[test]
615    fn test_util_functions() {
616        assert_eq!(util::chunks(100, 8), 12);
617        assert_eq!(util::remainder(100, 8), 4);
618
619        assert_eq!(util::align_up(15, 16), 16);
620        assert_eq!(util::align_up(16, 16), 16);
621        assert_eq!(util::align_up(17, 16), 32);
622
623        assert_eq!(util::align_down(15, 16), 0);
624        assert_eq!(util::align_down(16, 16), 16);
625        assert_eq!(util::align_down(31, 16), 16);
626    }
627
628    #[test]
629    fn test_process_chunks() {
630        let (n_chunks, remainder_start) = util::process_chunks(100, 8);
631        assert_eq!(n_chunks, 12);
632        assert_eq!(remainder_start, 96);
633
634        let (n_chunks, remainder_start) = util::process_chunks(7, 4);
635        assert_eq!(n_chunks, 1);
636        assert_eq!(remainder_start, 4);
637    }
638
639    #[test]
640    fn test_pointer_alignment() {
641        let data = vec![1.0_f32; 100];
642        let ptr = data.as_ptr();
643        assert!(util::is_aligned(ptr, std::mem::align_of::<f32>()));
644    }
645
646    #[test]
647    fn test_aligned_vec_f32() {
648        if let Some(v) = util::aligned_vec_f32(100, 64) {
649            assert_eq!(v.len(), 100);
650            assert!(util::is_aligned(v.as_ptr(), 64));
651            for &val in &v {
652                assert_eq!(val, 0.0);
653            }
654        }
655    }
656
657    #[test]
658    fn test_aligned_vec_f64() {
659        if let Some(v) = util::aligned_vec_f64(100, 64) {
660            assert_eq!(v.len(), 100);
661            assert!(util::is_aligned(v.as_ptr(), 64));
662            for &val in &v {
663                assert_eq!(val, 0.0);
664            }
665        }
666    }
667
668    #[test]
669    fn test_aligned_vec_empty() {
670        let v = util::aligned_vec_f32(0, 64);
671        assert!(v.is_some());
672        if let Some(v) = v {
673            assert!(v.is_empty());
674        }
675    }
676}