scirs2_core/simd/detect.rs
1//! CPU feature detection and SIMD capability management
2//!
3//! This module provides runtime detection of SIMD capabilities and manages
4//! CPU feature information through a cached singleton pattern for optimal performance.
5
6use std::sync::OnceLock;
7
/// Snapshot of the SIMD-related CPU features found at runtime.
///
/// The value is produced once by [`get_cpu_features`] and then served from a
/// process-wide cache, so SIMD code paths can branch on these flags without
/// re-running feature detection. Flags that belong to an architecture other
/// than the one the crate was compiled for are always `false`.
#[derive(Debug, Clone)]
pub struct CpuFeatures {
    /// x86: AVX-512 Foundation (512-bit vectors) is available.
    pub has_avx512f: bool,
    /// x86: AVX2 (256-bit vectors) is available.
    pub has_avx2: bool,
    /// x86: SSE (128-bit vectors) is available.
    pub has_sse: bool,
    /// x86: the FMA (fused multiply-add) extension is available.
    ///
    /// Reported as `false` on AArch64: this flag tracks the x86 FMA
    /// extension only, not the fused multiply-add NEON intrinsics.
    pub has_fma: bool,
    /// ARM: NEON (Advanced SIMD, 128-bit vectors) is available.
    pub has_neon: bool,
    /// ARM: SVE (Scalable Vector Extension) is available.
    ///
    /// Only probed on AArch64 builds; every other target reports `false`.
    pub has_sve: bool,
    /// ARM: SVE2 (Scalable Vector Extension 2) is available.
    ///
    /// Only probed on AArch64 builds; every other target reports `false`.
    pub has_sve2: bool,
    /// ARM: the integer dot-product instructions (`UDOT`/`SDOT`) are available.
    ///
    /// These accelerate i8 matrix multiplication. Examples of cores that ship
    /// them: Cortex-A55, A76, A78, Neoverse N1/V1/V2 and Apple M-series.
    pub has_dotprod: bool,
    /// ARM: the BFloat16 arithmetic extension is available.
    ///
    /// Found on, for example, Neoverse V1/V2 and Apple M2 or newer.
    pub has_bf16: bool,
}
40
41static CPU_FEATURES: OnceLock<CpuFeatures> = OnceLock::new();
42
43/// Get CPU features with lazy initialization
44///
45/// This function returns a static reference to CPU features, initializing
46/// them on first call. Subsequent calls return the cached result.
47///
48/// # Returns
49///
50/// A static reference to `CpuFeatures` containing detected CPU capabilities.
51pub fn get_cpu_features() -> &'static CpuFeatures {
52 CPU_FEATURES.get_or_init(|| {
53 #[cfg(target_arch = "x86_64")]
54 {
55 CpuFeatures {
56 has_avx512f: std::arch::is_x86_feature_detected!("avx512f"),
57 has_avx2: std::arch::is_x86_feature_detected!("avx2"),
58 has_sse: std::arch::is_x86_feature_detected!("sse"),
59 has_fma: std::arch::is_x86_feature_detected!("fma"),
60 has_neon: false,
61 has_sve: false,
62 has_sve2: false,
63 has_dotprod: false,
64 has_bf16: false,
65 }
66 }
67 #[cfg(target_arch = "aarch64")]
68 {
69 CpuFeatures {
70 has_avx512f: false,
71 has_avx2: false,
72 has_sse: false,
73 has_fma: false, // ARM uses vfmaq_f32 / vfmaq_f64 — not the x86 FMA extension
74 has_neon: std::arch::is_aarch64_feature_detected!("neon"),
75 has_sve: std::arch::is_aarch64_feature_detected!("sve"),
76 has_sve2: std::arch::is_aarch64_feature_detected!("sve2"),
77 has_dotprod: std::arch::is_aarch64_feature_detected!("dotprod"),
78 has_bf16: std::arch::is_aarch64_feature_detected!("bf16"),
79 }
80 }
81 #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
82 {
83 CpuFeatures {
84 has_avx512f: false,
85 has_avx2: false,
86 has_sse: false,
87 has_fma: false,
88 has_neon: false,
89 has_sve: false,
90 has_sve2: false,
91 has_dotprod: false,
92 has_bf16: false,
93 }
94 }
95 })
96}
97
/// SIMD capability summary combined with memory-hierarchy tuning hints.
///
/// Besides the instruction-set flags, this carries the derived vector widths
/// plus cache-line, cache-size and prefetch figures that kernels can use to
/// pick block sizes.
#[derive(Debug, Clone)]
pub struct SimdCapabilities {
    /// x86: AVX2 (256-bit vectors) is available.
    pub has_avx2: bool,
    /// x86: AVX-512 (512-bit vectors) is available.
    pub has_avx512: bool,
    /// x86: FMA (fused multiply-add) is available.
    pub has_fma: bool,
    /// x86: SSE4.2 is available.
    pub has_sse42: bool,
    /// x86: BMI2 (Bit Manipulation Instructions 2) is available.
    pub has_bmi2: bool,
    /// ARM: NEON (128-bit vectors) is available.
    pub has_neon: bool,
    /// ARM: SVE (Scalable Vector Extension) is available.
    pub has_sve: bool,
    /// ARM: SVE2 is available.
    pub has_sve2: bool,
    /// ARM: the integer dot-product instructions (`UDOT`/`SDOT`) are available.
    pub has_dotprod: bool,
    /// ARM: the BFloat16 arithmetic extension is available.
    pub has_bf16: bool,
    /// How many `f32` lanes one vector operation handles.
    pub vector_width_f32: usize,
    /// How many `f64` lanes one vector operation handles.
    pub vector_width_f64: usize,
    /// Size of a CPU cache line, in bytes.
    pub cache_line_size: usize,
    /// Size of the L1 cache, in bytes.
    pub l1_cache_size: usize,
    /// Size of the L2 cache, in bytes.
    pub l2_cache_size: usize,
    /// How far ahead to prefetch, counted in cache lines.
    pub prefetch_distance: usize,
}
137
138impl Default for SimdCapabilities {
139 fn default() -> Self {
140 let cpu_features = get_cpu_features();
141
142 Self {
143 // x86 features
144 has_avx2: cpu_features.has_avx2,
145 has_avx512: cpu_features.has_avx512f,
146 has_fma: cpu_features.has_fma,
147 has_sse42: cpu_features.has_sse,
148 has_bmi2: false, // Conservative default, would need specific detection
149 // ARM features
150 has_neon: cpu_features.has_neon,
151 has_sve: cpu_features.has_sve,
152 has_sve2: cpu_features.has_sve2,
153 has_dotprod: cpu_features.has_dotprod,
154 has_bf16: cpu_features.has_bf16,
155 vector_width_f32: if cpu_features.has_avx512f {
156 16 // AVX-512 can process 16 f32s
157 } else if cpu_features.has_avx2 {
158 8 // AVX2 can process 8 f32s
159 } else if cpu_features.has_sse || cpu_features.has_neon {
160 4 // SSE/NEON can process 4 f32s
161 } else {
162 1 // Scalar fallback
163 },
164 vector_width_f64: if cpu_features.has_avx512f {
165 8 // AVX-512 can process 8 f64s
166 } else if cpu_features.has_avx2 {
167 4 // AVX2 can process 4 f64s
168 } else if cpu_features.has_sse || cpu_features.has_neon {
169 2 // SSE/NEON can process 2 f64s
170 } else {
171 1 // Scalar fallback
172 },
173 cache_line_size: 64, // Typical cache line size
174 l1_cache_size: 32768, // 32KB typical L1 cache
175 l2_cache_size: 262144, // 256KB typical L2 cache
176 prefetch_distance: 16, // Prefetch 16 cache lines ahead
177 }
178 }
179}
180
181/// Detect SIMD capabilities for the current system
182///
183/// This function returns detailed SIMD capabilities including vector widths,
184/// cache information, and supported instruction sets.
185///
186/// # Returns
187///
188/// A `SimdCapabilities` struct containing detailed system capabilities.
189///
190/// # Examples
191///
192/// ```ignore
193/// use scirs2_core::simd::detect::detect_simd_capabilities;
194///
195/// let caps = detect_simd_capabilities();
196/// println!("Vector width for f32: {}", caps.vector_width_f32);
197/// println!("Has AVX2: {}", caps.has_avx2);
198/// ```
199#[allow(dead_code)]
200pub fn detect_simd_capabilities() -> SimdCapabilities {
201 SimdCapabilities::default()
202}