Skip to main content

scirs2_core/simd/
detect.rs

1//! CPU feature detection and SIMD capability management
2//!
3//! This module provides runtime detection of SIMD capabilities and manages
4//! CPU feature information through a cached singleton pattern for optimal performance.
5
6use std::sync::OnceLock;
7
/// Snapshot of the SIMD-related CPU features found at runtime.
///
/// Every field is a plain boolean answering "is this instruction-set
/// extension usable on the running CPU?". The snapshot is intended to be
/// computed a single time and then shared, so SIMD code paths can branch on
/// cheap field reads instead of repeating feature detection.
#[derive(Clone, Debug)]
pub struct CpuFeatures {
    /// `true` when AVX-512 Foundation (512-bit vectors) is available.
    pub has_avx512f: bool,
    /// `true` when AVX2 (256-bit vectors) is available.
    pub has_avx2: bool,
    /// `true` when SSE (128-bit vectors) is available.
    pub has_sse: bool,
    /// `true` when the x86 FMA (fused multiply-add) extension is available.
    pub has_fma: bool,
    /// `true` when NEON (ARM Advanced SIMD) is available.
    pub has_neon: bool,
    /// `true` when the ARM Scalable Vector Extension (SVE) is available.
    ///
    /// SVE is an AArch64-only extension (implemented by e.g. Neoverse V1/V2).
    pub has_sve: bool,
    /// `true` when the ARM Scalable Vector Extension 2 (SVE2) is available.
    pub has_sve2: bool,
    /// `true` when the ARM integer dot-product instructions (`UDOT`/`SDOT`)
    /// are available.
    ///
    /// These speed up i8 matrix multiplication; they are found on e.g.
    /// Cortex-A55, A76, A78, Neoverse N1/V1/V2 and Apple M-series cores.
    pub has_dotprod: bool,
    /// `true` when the ARM BFloat16 arithmetic extension is available.
    ///
    /// Found on e.g. Neoverse V1/V2 and Apple M2 or later.
    pub has_bf16: bool,
}
40
41static CPU_FEATURES: OnceLock<CpuFeatures> = OnceLock::new();
42
43/// Get CPU features with lazy initialization
44///
45/// This function returns a static reference to CPU features, initializing
46/// them on first call. Subsequent calls return the cached result.
47///
48/// # Returns
49///
50/// A static reference to `CpuFeatures` containing detected CPU capabilities.
51pub fn get_cpu_features() -> &'static CpuFeatures {
52    CPU_FEATURES.get_or_init(|| {
53        #[cfg(target_arch = "x86_64")]
54        {
55            CpuFeatures {
56                has_avx512f: std::arch::is_x86_feature_detected!("avx512f"),
57                has_avx2: std::arch::is_x86_feature_detected!("avx2"),
58                has_sse: std::arch::is_x86_feature_detected!("sse"),
59                has_fma: std::arch::is_x86_feature_detected!("fma"),
60                has_neon: false,
61                has_sve: false,
62                has_sve2: false,
63                has_dotprod: false,
64                has_bf16: false,
65            }
66        }
67        #[cfg(target_arch = "aarch64")]
68        {
69            CpuFeatures {
70                has_avx512f: false,
71                has_avx2: false,
72                has_sse: false,
73                has_fma: false, // ARM uses vfmaq_f32 / vfmaq_f64 — not the x86 FMA extension
74                has_neon: std::arch::is_aarch64_feature_detected!("neon"),
75                has_sve: std::arch::is_aarch64_feature_detected!("sve"),
76                has_sve2: std::arch::is_aarch64_feature_detected!("sve2"),
77                has_dotprod: std::arch::is_aarch64_feature_detected!("dotprod"),
78                has_bf16: std::arch::is_aarch64_feature_detected!("bf16"),
79            }
80        }
81        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
82        {
83            CpuFeatures {
84                has_avx512f: false,
85                has_avx2: false,
86                has_sse: false,
87                has_fma: false,
88                has_neon: false,
89                has_sve: false,
90                has_sve2: false,
91                has_dotprod: false,
92                has_bf16: false,
93            }
94        }
95    })
96}
97
/// Summary of SIMD capabilities together with memory-hierarchy parameters.
///
/// Besides the per-extension availability flags, this struct carries the
/// usable vector widths, cache sizes and a prefetch distance, i.e. the
/// figures needed to size and tune SIMD kernels.
#[derive(Clone, Debug)]
pub struct SimdCapabilities {
    /// `true` when AVX2 (256-bit vectors) is available.
    pub has_avx2: bool,
    /// `true` when AVX-512 (512-bit vectors) is available.
    pub has_avx512: bool,
    /// `true` when FMA (fused multiply-add) is available.
    pub has_fma: bool,
    /// `true` when SSE4.2 is available.
    pub has_sse42: bool,
    /// `true` when BMI2 (Bit Manipulation Instructions 2) is available.
    pub has_bmi2: bool,
    /// `true` when ARM NEON (128-bit vectors) is available.
    pub has_neon: bool,
    /// `true` when the ARM Scalable Vector Extension (SVE) is available.
    pub has_sve: bool,
    /// `true` when ARM SVE2 is available.
    pub has_sve2: bool,
    /// `true` when the ARM integer dot-product instructions (`UDOT`/`SDOT`)
    /// are available.
    pub has_dotprod: bool,
    /// `true` when the ARM BFloat16 arithmetic extension is available.
    pub has_bf16: bool,
    /// How many `f32` lanes can be processed in parallel.
    pub vector_width_f32: usize,
    /// How many `f64` lanes can be processed in parallel.
    pub vector_width_f64: usize,
    /// Size of one CPU cache line, in bytes.
    pub cache_line_size: usize,
    /// Size of the L1 cache, in bytes.
    pub l1_cache_size: usize,
    /// Size of the L2 cache, in bytes.
    pub l2_cache_size: usize,
    /// How far ahead to prefetch, measured in cache lines.
    pub prefetch_distance: usize,
}
137
138impl Default for SimdCapabilities {
139    fn default() -> Self {
140        let cpu_features = get_cpu_features();
141
142        Self {
143            // x86 features
144            has_avx2: cpu_features.has_avx2,
145            has_avx512: cpu_features.has_avx512f,
146            has_fma: cpu_features.has_fma,
147            has_sse42: cpu_features.has_sse,
148            has_bmi2: false, // Conservative default, would need specific detection
149            // ARM features
150            has_neon: cpu_features.has_neon,
151            has_sve: cpu_features.has_sve,
152            has_sve2: cpu_features.has_sve2,
153            has_dotprod: cpu_features.has_dotprod,
154            has_bf16: cpu_features.has_bf16,
155            vector_width_f32: if cpu_features.has_avx512f {
156                16 // AVX-512 can process 16 f32s
157            } else if cpu_features.has_avx2 {
158                8 // AVX2 can process 8 f32s
159            } else if cpu_features.has_sse || cpu_features.has_neon {
160                4 // SSE/NEON can process 4 f32s
161            } else {
162                1 // Scalar fallback
163            },
164            vector_width_f64: if cpu_features.has_avx512f {
165                8 // AVX-512 can process 8 f64s
166            } else if cpu_features.has_avx2 {
167                4 // AVX2 can process 4 f64s
168            } else if cpu_features.has_sse || cpu_features.has_neon {
169                2 // SSE/NEON can process 2 f64s
170            } else {
171                1 // Scalar fallback
172            },
173            cache_line_size: 64,   // Typical cache line size
174            l1_cache_size: 32768,  // 32KB typical L1 cache
175            l2_cache_size: 262144, // 256KB typical L2 cache
176            prefetch_distance: 16, // Prefetch 16 cache lines ahead
177        }
178    }
179}
180
181/// Detect SIMD capabilities for the current system
182///
183/// This function returns detailed SIMD capabilities including vector widths,
184/// cache information, and supported instruction sets.
185///
186/// # Returns
187///
188/// A `SimdCapabilities` struct containing detailed system capabilities.
189///
190/// # Examples
191///
192/// ```ignore
193/// use scirs2_core::simd::detect::detect_simd_capabilities;
194///
195/// let caps = detect_simd_capabilities();
196/// println!("Vector width for f32: {}", caps.vector_width_f32);
197/// println!("Has AVX2: {}", caps.has_avx2);
198/// ```
199#[allow(dead_code)]
200pub fn detect_simd_capabilities() -> SimdCapabilities {
201    SimdCapabilities::default()
202}